diff --git a/README.md b/README.md index 13a5182a..c245464a 100644 --- a/README.md +++ b/README.md @@ -22,34 +22,18 @@ go get github.com/unidoc/unidoc ## Overview - * Read and extract PDF metadata - * Merge PDF ([example](https://github.com/unidoc/unidoc-examples/blob/master/pdf/pdf_merge.go)). - * Split PDF ([example](https://github.com/unidoc/unidoc-examples/blob/master/pdf/pdf_split.go)). - * Protect PDF ([example](https://github.com/unidoc/unidoc-examples/blob/master/pdf/pdf_protect.go)). - * Unlock PDF ([example](https://github.com/unidoc/unidoc-examples/blob/master/pdf/pdf_unlock.go)). - * Rotate PDF ([example](https://github.com/unidoc/unidoc-examples/blob/master/pdf/pdf_rotate.go)). - * Crop PDF ([example](https://github.com/unidoc/unidoc-examples/blob/master/pdf/pdf_crop.go)). + * Many [features](http://unidoc.io/features) with documented examples. * Self contained with no external dependencies * Developer friendly +## Roadmap + +Our [roadmap](https://trello.com/b/JcliaYYI) is publicly available and features can be voted upon. + ## Examples See the [unidoc-examples](https://github.com/unidoc/unidoc-examples/tree/master) folder. -## Roadmap - -The following features are on the roadmap, these are all subjects to change. - - * Compress PDF - * Create PDF (high level API) - * Fill out Forms - * Create Forms - * Bindings for Python (and C#/Java if there is interest) - * Create Doc and DocX files - * Convert PDF to Word - * OCR Engine - * And many more... - ## Copying/License UniDoc is licensed as [AGPL][agpl] software (with extra terms as specified in our license). @@ -76,7 +60,7 @@ Contributors need to approve the [Contributor License Agreement](https://docs.go ## Support -Open source users can create a GitHub issue and we will look at it. Commercial users can either create a GitHub issue and also email us at support@unidoc.io and we will assist them directly. +Please email us at support@unidoc.io for any queries. 
## Stay up to date diff --git a/pdf/crossrefs.go b/pdf/crossrefs.go index 61a76dbb..1577f120 100644 --- a/pdf/crossrefs.go +++ b/pdf/crossrefs.go @@ -245,7 +245,18 @@ func (this *PdfParser) lookupByNumber(objNumber int, attemptRepairs bool) (PdfOb obj, err := this.parseIndirectObject() if err != nil { - common.Log.Error("Failed reading xref") + common.Log.Error("Failed reading xref (%s)", err) + // Offset pointing to a non-object. Try to repair the file. + if attemptRepairs { + common.Log.Error("Attempting to repair xrefs (top down)") + xrefTable, err := this.repairRebuildXrefsTopDown() + if err != nil { + common.Log.Error("Failed repair (%s)", err) + return nil, false, err + } + this.xrefs = *xrefTable + return this.lookupByNumber(objNumber, false) + } return nil, false, err } @@ -334,29 +345,6 @@ func (this *PdfParser) Trace(obj PdfObject) (PdfObject, error) { return o, nil } -func (this *PdfParser) rebuildXrefTable() error { - newXrefs := XrefTable{} - for objNum, xref := range this.xrefs { - obj, _, err := this.lookupByNumberWrapper(objNum, false) - if err != nil { - return err - } - actObjNum, actGenNum, err := getObjectNumber(obj) - if err != nil { - return err - } - - xref.objectNumber = int(actObjNum) - xref.generation = int(actGenNum) - newXrefs[int(actObjNum)] = xref - } - - this.xrefs = newXrefs - common.Log.Debug("New xref table built") - printXrefTable(this.xrefs) - return nil -} - func printXrefTable(xrefTable XrefTable) { common.Log.Debug("=X=X=X=") common.Log.Debug("Xref table:") diff --git a/pdf/pdfparser.go b/pdf/pdfparser.go index 389c5afd..29ddbdc9 100644 --- a/pdf/pdfparser.go +++ b/pdf/pdfparser.go @@ -24,7 +24,7 @@ import ( var rePdfVersion = regexp.MustCompile(`%PDF-(\d\.\d)`) var reEOF = regexp.MustCompile("%%EOF") var reXrefTable = regexp.MustCompile(`\s*xref\s*`) -var reStartXref = regexp.MustCompile(`startxref\s*(\d+)`) +var reStartXref = regexp.MustCompile(`startx?ref\s*(\d+)`) var reNumeric = regexp.MustCompile(`^[\+-.]*([0-9.]+)`) var 
reExponential = regexp.MustCompile(`^[\+-.]*([0-9.]+)e[\+-.]*([0-9.]+)`) var reReference = regexp.MustCompile(`^\s*(\d+)\s+(\d+)\s+R`) @@ -1070,13 +1070,22 @@ func (this *PdfParser) loadXrefs() (*PdfObjectDictionary, error) { return nil, errors.New("Startxref not found") } if len(result) > 2 { - // GH: Take the last one? + // GH: Take the last one? Make a test case. common.Log.Error("Multiple startxref (%s)!", b2) return nil, errors.New("Multiple startxref entries?") } - offsetXref, _ := strconv.Atoi(result[1]) + offsetXref, _ := strconv.ParseInt(result[1], 10, 64) common.Log.Debug("startxref at %d", offsetXref) + if offsetXref > fSize { + common.Log.Error("Xref offset outside of file") + common.Log.Error("Attempting repair") + offsetXref, err = this.repairLocateXref() + if err != nil { + common.Log.Error("Repair attempt failed (%s)", err) + return nil, err + } + } // Read the xref. this.rs.Seek(int64(offsetXref), os.SEEK_SET) this.reader = bufio.NewReader(this.rs) diff --git a/pdf/reader.go b/pdf/reader.go index acadf2b0..e9a4ff2a 100644 --- a/pdf/reader.go +++ b/pdf/reader.go @@ -381,10 +381,7 @@ func (this *PdfReader) GetForms() (*PdfObjectDictionary, error) { common.Log.Debug("Has Acro forms") common.Log.Debug("Traverse the Acroforms structure") - nofollowList := map[PdfObjectName]bool{ - "Parent": true, - } - err := this.traverseObjectData(formsDict, nofollowList) + err := this.traverseObjectData(formsDict) if err != nil { common.Log.Error("Unable to traverse AcroForms (%s)", err) return nil, err @@ -442,13 +439,8 @@ func (this *PdfReader) buildPageList(node *PdfIndirectObject, parent *PdfIndirec (*nodeDict)["Parent"] = parent } - // Resolve the object recursively, not following Parents or Kids fields. - // Later can refactor and use only one smart recursive function. - nofollowList := map[PdfObjectName]bool{ - "Parent": true, - "Kids": true, - } - err := this.traverseObjectData(node, nofollowList) + // Resolve the object recursively. 
+ err := this.traverseObjectData(node) if err != nil { return err } @@ -473,20 +465,9 @@ func (this *PdfReader) buildPageList(node *PdfIndirectObject, parent *PdfIndirec } common.Log.Debug("Kids: %s", kids) for idx, child := range *kids { - childRef, ok := child.(*PdfObjectReference) + child, ok := child.(*PdfIndirectObject) if !ok { - return errors.New("Invalid kid, non-reference") - } - - common.Log.Debug("look up ref %s", childRef) - pchild, err := this.parser.LookupByReference(*childRef) - if err != nil { - common.Log.Error("Unable to lookup page ref") - return errors.New("Unable to lookup page ref") - } - child, ok := pchild.(*PdfIndirectObject) - if !ok { - common.Log.Error("Page not indirect object - %s (%s)", childRef, pchild) + common.Log.Error("Page not indirect object - (%s)", child) return errors.New("Page not indirect object") } (*kids)[idx] = child @@ -526,11 +507,10 @@ func (this *PdfReader) resolveReference(ref *PdfObjectReference) (PdfObject, boo /* * Recursively traverse through the page object data and look up * references to indirect objects. - * GH: Consider to define a smarter traversing engine, defining explicitly - * - how deep we can go in terms of following certain Trees by name etc. - * GH: Are we fully protected against circular references? + * + * GH: Are we fully protected against circular references? (Add tests). 
*/ -func (this *PdfReader) traverseObjectData(o PdfObject, nofollowKeys map[PdfObjectName]bool) error { +func (this *PdfReader) traverseObjectData(o PdfObject) error { common.Log.Debug("Traverse object data") if _, isTraversed := this.traversed[o]; isTraversed { return nil @@ -540,37 +520,30 @@ func (this *PdfReader) traverseObjectData(o PdfObject, nofollowKeys map[PdfObjec if io, isIndirectObj := o.(*PdfIndirectObject); isIndirectObj { common.Log.Debug("io: %s", io) common.Log.Debug("- %s", io.PdfObject) - err := this.traverseObjectData(io.PdfObject, nofollowKeys) + err := this.traverseObjectData(io.PdfObject) return err } if so, isStreamObj := o.(*PdfObjectStream); isStreamObj { - err := this.traverseObjectData(so.PdfObjectDictionary, nofollowKeys) + err := this.traverseObjectData(so.PdfObjectDictionary) return err } if dict, isDict := o.(*PdfObjectDictionary); isDict { common.Log.Debug("- dict: %s", dict) for name, v := range *dict { - if nofollowKeys != nil { - if _, nofollow := nofollowKeys[name]; nofollow { - // Do not retraverse up the tree. 
- continue - } - } - if ref, isRef := v.(*PdfObjectReference); isRef { resolvedObj, _, err := this.resolveReference(ref) if err != nil { return err } (*dict)[name] = resolvedObj - err = this.traverseObjectData(resolvedObj, nofollowKeys) + err = this.traverseObjectData(resolvedObj) if err != nil { return err } } else { - err := this.traverseObjectData(v, nofollowKeys) + err := this.traverseObjectData(v) if err != nil { return err } @@ -589,12 +562,12 @@ func (this *PdfReader) traverseObjectData(o PdfObject, nofollowKeys map[PdfObjec } (*arr)[idx] = resolvedObj - err = this.traverseObjectData(resolvedObj, nofollowKeys) + err = this.traverseObjectData(resolvedObj) if err != nil { return err } } else { - err := this.traverseObjectData(v, nofollowKeys) + err := this.traverseObjectData(v) if err != nil { return err } @@ -683,11 +656,8 @@ func (this *PdfReader) GetPage(pageNumber int) (PdfObject, error) { } page := this.pageList[pageNumber-1] - nofollowList := map[PdfObjectName]bool{ - "Parent": true, - } // Look up all references related to page and load everything. - err := this.traverseObjectData(page, nofollowList) + err := this.traverseObjectData(page) if err != nil { return nil, err } diff --git a/pdf/repairs.go b/pdf/repairs.go new file mode 100644 index 00000000..a197e95d --- /dev/null +++ b/pdf/repairs.go @@ -0,0 +1,140 @@ +/* + * This file is subject to the terms and conditions defined in + * file 'LICENSE.md', which is part of this source code package. + */ + +// Routines related to repairing malformed pdf files. + +package pdf + +import ( + "errors" + "fmt" + "os" + "regexp" + + "github.com/unidoc/unidoc/common" +) + +var repairReXrefTable = regexp.MustCompile(`[\r\n]\s*(xref)\s*[\r\n]`) + +// Locates a standard Xref table by looking for the "xref" entry. +// Xref object stream not supported. 
+func (this *PdfParser) repairLocateXref() (int64, error) { + readBuf := int64(1000) + this.rs.Seek(-readBuf, os.SEEK_CUR) + + curOffset, err := this.rs.Seek(0, os.SEEK_CUR) + if err != nil { + return 0, err + } + b2 := make([]byte, readBuf) + this.rs.Read(b2) + + results := repairReXrefTable.FindAllStringIndex(string(b2), -1) + if len(results) < 1 { + common.Log.Error("Repair: xref not found!") + return 0, errors.New("Repair: xref not found") + } + + localOffset := int64(results[len(results)-1][0]) + xrefOffset := curOffset + localOffset + return xrefOffset, nil +} + +// Renumbers the xref table. +// Useful when the cross reference is pointing to an object with the wrong number. +// Update the table. +func (this *PdfParser) rebuildXrefTable() error { + newXrefs := XrefTable{} + for objNum, xref := range this.xrefs { + obj, _, err := this.lookupByNumberWrapper(objNum, false) + if err != nil { + common.Log.Error("Unable to look up object (%s)", err) + common.Log.Error("Xref table completely broken - attempting to repair ") + xrefTable, err := this.repairRebuildXrefsTopDown() + if err != nil { + common.Log.Error("Failed xref rebuild repair (%s)", err) + return err + } + this.xrefs = *xrefTable + common.Log.Debug("Repaired xref table built") + return nil + } + actObjNum, actGenNum, err := getObjectNumber(obj) + if err != nil { + return err + } + + xref.objectNumber = int(actObjNum) + xref.generation = int(actGenNum) + newXrefs[int(actObjNum)] = xref + } + + this.xrefs = newXrefs + common.Log.Debug("New xref table built") + printXrefTable(this.xrefs) + return nil +} + +// Parse the entire file from top down. +// Currently not supporting object streams... +// Also need to detect object streams and load the object numbers. 
+func (this *PdfParser) repairRebuildXrefsTopDown() (*XrefTable, error) { + reRepairIndirectObject := regexp.MustCompile(`^(\d+)\s+(\d+)\s+obj`) + + this.SetFileOffset(0) + + xrefTable := XrefTable{} + for { + this.skipComments() + + curOffset := this.GetFileOffset() + + peakBuf, err := this.reader.Peek(10) + if err != nil { + // EOF + break + } + + // Indirect object? + results := reRepairIndirectObject.FindIndex(peakBuf) + if len(results) > 0 { + obj, err := this.parseIndirectObject() + if err != nil { + common.Log.Error("Unable to parse indirect object (%s)", err) + return nil, err + } + + if indObj, ok := obj.(*PdfIndirectObject); ok { + // Make the entry for the cross ref table. + xrefEntry := XrefObject{} + xrefEntry.xtype = XREF_TABLE_ENTRY + xrefEntry.objectNumber = int(indObj.ObjectNumber) + xrefEntry.generation = int(indObj.GenerationNumber) + xrefEntry.offset = curOffset + xrefTable[int(indObj.ObjectNumber)] = xrefEntry + } else if streamObj, ok := obj.(*PdfObjectStream); ok { + // Make the entry for the cross ref table. + xrefEntry := XrefObject{} + xrefEntry.xtype = XREF_TABLE_ENTRY + xrefEntry.objectNumber = int(streamObj.ObjectNumber) + xrefEntry.generation = int(streamObj.GenerationNumber) + xrefEntry.offset = curOffset + xrefTable[int(streamObj.ObjectNumber)] = xrefEntry + } else { + return nil, fmt.Errorf("Not an indirect object or stream (%T)", obj) // Should never happen. + } + } else if string(peakBuf[0:6]) == "endobj" { + this.reader.Discard(6) + } else { + // Stop once we reach xrefs/trailer section etc. Technically this could fail for complex + // cases, but lets keep it simple for now. Add more complexity when needed (problematic user committed files). + // In general more likely that more complex files would have better understanding of the PDF standard. 
+ common.Log.Debug("Not an object - stop repair rebuilding xref here (%s)", peakBuf) + break + } + } + + return &xrefTable, nil +} diff --git a/pdf/writer.go b/pdf/writer.go index 6219a429..5cf028b1 100644 --- a/pdf/writer.go +++ b/pdf/writer.go @@ -22,6 +22,43 @@ import ( "github.com/unidoc/unidoc/license" ) +var pdfProducer = "" +var pdfCreator = "" + +func getPdfProducer() string { + if len(pdfProducer) > 0 { + return pdfProducer + } + + // Return default. + licenseKey := license.GetLicenseKey() + return fmt.Sprintf("UniDoc Library version %s (%s) - http://unidoc.io", getUniDocVersion(), licenseKey.TypeToString()) +} + +func SetPdfProducer(producer string) { + licenseKey := license.GetLicenseKey() + commercial := licenseKey.Type == license.LicenseTypeCommercial + if !commercial { + // Only commercial users can modify the producer. + return + } + + pdfProducer = producer +} + +func getPdfCreator() string { + if len(pdfCreator) > 0 { + return pdfCreator + } + + // Return default. + return "UniDoc - http://unidoc.io" +} + +func SetPdfCreator(creator string) { + pdfCreator = creator +} + type PdfWriter struct { root *PdfIndirectObject pages *PdfIndirectObject @@ -45,14 +82,10 @@ func NewPdfWriter() PdfWriter { w.objectsMap = map[PdfObject]bool{} w.objects = []PdfObject{} - licenseKey := license.GetLicenseKey() - - producer := fmt.Sprintf("UniDoc Library version %s (%s) - http://unidoc.io", getUniDocVersion(), licenseKey.TypeToString()) - // Creation info. 
infoDict := PdfObjectDictionary{} - infoDict[PdfObjectName("Producer")] = MakeString(producer) - infoDict[PdfObjectName("Creator")] = MakeString("FoxyUtils Online PDF https://foxyutils.com") + infoDict[PdfObjectName("Producer")] = MakeString(getPdfProducer()) + infoDict[PdfObjectName("Creator")] = MakeString(getPdfCreator()) infoObj := PdfIndirectObject{} infoObj.PdfObject = &infoDict w.infoObj = &infoObj @@ -151,7 +184,16 @@ func (this *PdfWriter) addObjects(obj PdfObject) error { if err != nil { return err } - + } else { + // How to handle the parent? Make sure it is present? + if parentObj, parentIsRef := (*dict)["Parent"].(*PdfObjectReference); parentIsRef { + // Parent is a reference. Means we can drop it? + // Could refer to somewhere outside of the scope of the output doc. + // Should be done by the reader already. + // -> ERROR. + common.Log.Error("Parent is a reference object - Cannot be in writer (needs to be resolved)") + return fmt.Errorf("Parent is a reference object - Cannot be in writer (needs to be resolved) - %s", parentObj) + } } } return nil