Merge in master

2025-05-02 22:17:06 +08:00 · 2016-08-17 15:04:29 +00:00 · 2016-08-17 15:04:29 +00:00 · 734cd7ddbf
commit 734cd7ddbf
parent 1103702ce7 e9dec61b51
6 changed files with 234 additions and 101 deletions
--- a/README.md
+++ b/README.md
@ -22,34 +22,18 @@ go get github.com/unidoc/unidoc

 ## Overview

- * Read and extract PDF metadata
- * Merge PDF ([example](https://github.com/unidoc/unidoc-examples/blob/master/pdf/pdf_merge.go)).
- * Split PDF ([example](https://github.com/unidoc/unidoc-examples/blob/master/pdf/pdf_split.go)).
- * Protect PDF ([example](https://github.com/unidoc/unidoc-examples/blob/master/pdf/pdf_protect.go)).
- * Unlock PDF ([example](https://github.com/unidoc/unidoc-examples/blob/master/pdf/pdf_unlock.go)).
- * Rotate PDF ([example](https://github.com/unidoc/unidoc-examples/blob/master/pdf/pdf_rotate.go)).
- * Crop PDF ([example](https://github.com/unidoc/unidoc-examples/blob/master/pdf/pdf_crop.go)).
+ * Many [features](http://unidoc.io/features) with documented examples.
 * Self contained with no external dependencies
 * Developer friendly

+## Roadmap
+
+Our [roadmap](https://trello.com/b/JcliaYYI) is publicly available and features can be voted upon.
+
 ## Examples

 See the [unidoc-examples](https://github.com/unidoc/unidoc-examples/tree/master) folder.

-## Roadmap
-
-The following features are on the roadmap, these are all subjects to change.
-
- * Compress PDF
- * Create PDF (high level API)
- * Fill out Forms
- * Create Forms
- * Bindings for Python (and C#/Java if there is interest)
- * Create Doc and DocX files
- * Convert PDF to Word
- * OCR Engine
- * And many more...
-
 ## Copying/License

 UniDoc is licensed as [AGPL][agpl] software (with extra terms as specified in our license).
@ -76,7 +60,7 @@ Contributors need to approve the [Contributor License Agreement](https://docs.go

 ## Support

-Open source users can create a GitHub issue and we will look at it. Commercial users can either create a GitHub issue and also email us at support@unidoc.io and we will assist them directly.
+Please email us at support@unidoc.io for any queries.

 ## Stay up to date

--- a/pdf/crossrefs.go
+++ b/pdf/crossrefs.go
@ -245,7 +245,18 @@ func (this *PdfParser) lookupByNumber(objNumber int, attemptRepairs bool) (PdfOb

 		obj, err := this.parseIndirectObject()
 		if err != nil {
-			common.Log.Error("Failed reading xref")
+			common.Log.Error("Failed reading xref (%s)", err)
+			// Offset pointing to a non-object.  Try to repair the file.
+			if attemptRepairs {
+				common.Log.Error("Attempting to repair xrefs (top down)")
+				xrefTable, err := this.repairRebuildXrefsTopDown()
+				if err != nil {
+					common.Log.Error("Failed repair (%s)", err)
+					return nil, false, err
+				}
+				this.xrefs = *xrefTable
+				return this.lookupByNumber(objNumber, false)
+			}
 			return nil, false, err
 		}

@ -334,29 +345,6 @@ func (this *PdfParser) Trace(obj PdfObject) (PdfObject, error) {
 	return o, nil
 }

-func (this *PdfParser) rebuildXrefTable() error {
-	newXrefs := XrefTable{}
-	for objNum, xref := range this.xrefs {
-		obj, _, err := this.lookupByNumberWrapper(objNum, false)
-		if err != nil {
-			return err
-		}
-		actObjNum, actGenNum, err := getObjectNumber(obj)
-		if err != nil {
-			return err
-		}
-
-		xref.objectNumber = int(actObjNum)
-		xref.generation = int(actGenNum)
-		newXrefs[int(actObjNum)] = xref
-	}
-
-	this.xrefs = newXrefs
-	common.Log.Debug("New xref table built")
-	printXrefTable(this.xrefs)
-	return nil
-}
-
 func printXrefTable(xrefTable XrefTable) {
 	common.Log.Debug("=X=X=X=")
 	common.Log.Debug("Xref table:")
--- a/pdf/pdfparser.go
+++ b/pdf/pdfparser.go
@ -24,7 +24,7 @@ import (
 var rePdfVersion = regexp.MustCompile(`%PDF-(\d\.\d)`)
 var reEOF = regexp.MustCompile("%%EOF")
 var reXrefTable = regexp.MustCompile(`\s*xref\s*`)
-var reStartXref = regexp.MustCompile(`startxref\s*(\d+)`)
+var reStartXref = regexp.MustCompile(`startx?ref\s*(\d+)`)
 var reNumeric = regexp.MustCompile(`^[\+-.]*([0-9.]+)`)
 var reExponential = regexp.MustCompile(`^[\+-.]*([0-9.]+)e[\+-.]*([0-9.]+)`)
 var reReference = regexp.MustCompile(`^\s*(\d+)\s+(\d+)\s+R`)
@ -1070,13 +1070,22 @@ func (this *PdfParser) loadXrefs() (*PdfObjectDictionary, error) {
 		return nil, errors.New("Startxref not found")
 	}
 	if len(result) > 2 {
-		// GH: Take the last one?
+		// GH: Take the last one?  Make a test case.
 		common.Log.Error("Multiple startxref (%s)!", b2)
 		return nil, errors.New("Multiple startxref entries?")
 	}
-	offsetXref, _ := strconv.Atoi(result[1])
+	offsetXref, _ := strconv.ParseInt(result[1], 10, 64)
 	common.Log.Debug("startxref at %d", offsetXref)

+	// A startxref value beyond the end of the file cannot be valid.  Try to locate the
+	// xref table by scanning for it instead.
+	if offsetXref > fSize {
+		common.Log.Error("Xref offset outside of file")
+		common.Log.Error("Attempting repair")
+		offsetXref, err = this.repairLocateXref()
+		if err != nil {
+			// Pass the error along; the format string expects it.
+			common.Log.Error("Repair attempt failed (%s)", err)
+			return nil, err
+		}
+	}
 	// Read the xref.
 	this.rs.Seek(int64(offsetXref), os.SEEK_SET)
 	this.reader = bufio.NewReader(this.rs)
--- a/pdf/reader.go
+++ b/pdf/reader.go
@ -381,10 +381,7 @@ func (this *PdfReader) GetForms() (*PdfObjectDictionary, error) {
 	common.Log.Debug("Has Acro forms")

 	common.Log.Debug("Traverse the Acroforms structure")
-	nofollowList := map[PdfObjectName]bool{
-		"Parent": true,
-	}
-	err := this.traverseObjectData(formsDict, nofollowList)
+	err := this.traverseObjectData(formsDict)
 	if err != nil {
 		common.Log.Error("Unable to traverse AcroForms (%s)", err)
 		return nil, err
@ -442,13 +439,8 @@ func (this *PdfReader) buildPageList(node *PdfIndirectObject, parent *PdfIndirec
 		(*nodeDict)["Parent"] = parent
 	}

-	// Resolve the object recursively, not following Parents or Kids fields.
-	// Later can refactor and use only one smart recursive function.
-	nofollowList := map[PdfObjectName]bool{
-		"Parent": true,
-		"Kids":   true,
-	}
-	err := this.traverseObjectData(node, nofollowList)
+	// Resolve the object recursively.
+	err := this.traverseObjectData(node)
 	if err != nil {
 		return err
 	}
@ -473,20 +465,9 @@ func (this *PdfReader) buildPageList(node *PdfIndirectObject, parent *PdfIndirec
 	}
 	common.Log.Debug("Kids: %s", kids)
 	for idx, child := range *kids {
-		childRef, ok := child.(*PdfObjectReference)
+		child, ok := child.(*PdfIndirectObject)
 		if !ok {
-			return errors.New("Invalid kid, non-reference")
-		}
-
-		common.Log.Debug("look up ref %s", childRef)
-		pchild, err := this.parser.LookupByReference(*childRef)
-		if err != nil {
-			common.Log.Error("Unable to lookup page ref")
-			return errors.New("Unable to lookup page ref")
-		}
-		child, ok := pchild.(*PdfIndirectObject)
-		if !ok {
-			common.Log.Error("Page not indirect object - %s (%s)", childRef, pchild)
+			common.Log.Error("Page not indirect object - (%s)", child)
 			return errors.New("Page not indirect object")
 		}
 		(*kids)[idx] = child
@ -526,11 +507,10 @@ func (this *PdfReader) resolveReference(ref *PdfObjectReference) (PdfObject, boo
 /*
 * Recursively traverse through the page object data and look up
 * references to indirect objects.
- * GH: Consider to define a smarter traversing engine, defining explicitly
- * - how deep we can go in terms of following certain Trees by name etc.
- * GH: Are we fully protected against circular references?
+ *
+ * GH: Are we fully protected against circular references? (Add tests).
 */
-func (this *PdfReader) traverseObjectData(o PdfObject, nofollowKeys map[PdfObjectName]bool) error {
+func (this *PdfReader) traverseObjectData(o PdfObject) error {
 	common.Log.Debug("Traverse object data")
 	if _, isTraversed := this.traversed[o]; isTraversed {
 		return nil
@ -540,37 +520,30 @@ func (this *PdfReader) traverseObjectData(o PdfObject, nofollowKeys map[PdfObjec
 	if io, isIndirectObj := o.(*PdfIndirectObject); isIndirectObj {
 		common.Log.Debug("io: %s", io)
 		common.Log.Debug("- %s", io.PdfObject)
-		err := this.traverseObjectData(io.PdfObject, nofollowKeys)
+		err := this.traverseObjectData(io.PdfObject)
 		return err
 	}

 	if so, isStreamObj := o.(*PdfObjectStream); isStreamObj {
-		err := this.traverseObjectData(so.PdfObjectDictionary, nofollowKeys)
+		err := this.traverseObjectData(so.PdfObjectDictionary)
 		return err
 	}

 	if dict, isDict := o.(*PdfObjectDictionary); isDict {
 		common.Log.Debug("- dict: %s", dict)
 		for name, v := range *dict {
-			if nofollowKeys != nil {
-				if _, nofollow := nofollowKeys[name]; nofollow {
-					// Do not retraverse up the tree.
-					continue
-				}
-			}
-
 			if ref, isRef := v.(*PdfObjectReference); isRef {
 				resolvedObj, _, err := this.resolveReference(ref)
 				if err != nil {
 					return err
 				}
 				(*dict)[name] = resolvedObj
-				err = this.traverseObjectData(resolvedObj, nofollowKeys)
+				err = this.traverseObjectData(resolvedObj)
 				if err != nil {
 					return err
 				}
 			} else {
-				err := this.traverseObjectData(v, nofollowKeys)
+				err := this.traverseObjectData(v)
 				if err != nil {
 					return err
 				}
@ -589,12 +562,12 @@ func (this *PdfReader) traverseObjectData(o PdfObject, nofollowKeys map[PdfObjec
 				}
 				(*arr)[idx] = resolvedObj

-				err = this.traverseObjectData(resolvedObj, nofollowKeys)
+				err = this.traverseObjectData(resolvedObj)
 				if err != nil {
 					return err
 				}
 			} else {
-				err := this.traverseObjectData(v, nofollowKeys)
+				err := this.traverseObjectData(v)
 				if err != nil {
 					return err
 				}
@ -683,11 +656,8 @@ func (this *PdfReader) GetPage(pageNumber int) (PdfObject, error) {
 	}
 	page := this.pageList[pageNumber-1]

-	nofollowList := map[PdfObjectName]bool{
-		"Parent": true,
-	}
 	// Look up all references related to page and load everything.
-	err := this.traverseObjectData(page, nofollowList)
+	err := this.traverseObjectData(page)
 	if err != nil {
 		return nil, err
 	}
--- a/pdf/repairs.go
+++ b/pdf/repairs.go
@ -0,0 +1,140 @@
+/*
+ * This file is subject to the terms and conditions defined in
+ * file 'LICENSE.md', which is part of this source code package.
+ */
+
+// Routines related to repairing malformed pdf files.
+
+package pdf
+
+import (
+	"errors"
+	"fmt"
+	"os"
+	"regexp"
+
+	"github.com/unidoc/unidoc/common"
+)
+
+var repairReXrefTable = regexp.MustCompile(`[\r\n]\s*(xref)\s*[\r\n]`)
+
+// Locates a standard Xref table by looking for the "xref" entry.
+// Xref object stream not supported.
+//
+// The search window starts 1000 bytes before the current position of the underlying
+// reader (clamped to the start of the file) and is at most 1000 bytes long.  When
+// several candidates are found the last one wins.
+//
+// Returns the absolute file offset at which the match begins, i.e. at the line break
+// in front of the "xref" keyword, or an error if seeking/reading fails or no xref
+// keyword is found.
+func (this *PdfParser) repairLocateXref() (int64, error) {
+	readBuf := int64(1000)
+
+	endOffset, err := this.rs.Seek(0, os.SEEK_CUR)
+	if err != nil {
+		return 0, err
+	}
+
+	// A relative seek to before the start of the file fails, so clamp the window start
+	// instead of blindly stepping back readBuf bytes.
+	curOffset := endOffset - readBuf
+	if curOffset < 0 {
+		curOffset = 0
+	}
+	if _, err := this.rs.Seek(curOffset, os.SEEK_SET); err != nil {
+		return 0, err
+	}
+
+	b2 := make([]byte, readBuf)
+	n, err := this.rs.Read(b2)
+	if err != nil && n == 0 {
+		common.Log.Error("Repair: unable to read xref search window (%s)", err)
+		return 0, err
+	}
+
+	// Only look at the bytes actually read, not the zero padding of the buffer.
+	results := repairReXrefTable.FindAllStringIndex(string(b2[:n]), -1)
+	if len(results) < 1 {
+		common.Log.Error("Repair: xref not found!")
+		return 0, errors.New("Repair: xref not found")
+	}
+
+	localOffset := int64(results[len(results)-1][0])
+	xrefOffset := curOffset + localOffset
+	return xrefOffset, nil
+}
+
+// rebuildXrefTable renumbers the cross reference table.
+//
+// Every entry of the current table is looked up and re-registered under the object
+// and generation numbers that the object found there actually carries.  This helps
+// when an xref entry points at an object with a different number than expected.
+//
+// If any object cannot be looked up, the table is regarded as broken and is instead
+// replaced by the result of a top down scan of the file.
+func (this *PdfParser) rebuildXrefTable() error {
+	renumbered := XrefTable{}
+
+	for num, entry := range this.xrefs {
+		obj, _, lookupErr := this.lookupByNumberWrapper(num, false)
+		if lookupErr != nil {
+			common.Log.Error("Unable to look up object (%s)", lookupErr)
+			common.Log.Error("Xref table completely broken - attempting to repair ")
+
+			rebuilt, repairErr := this.repairRebuildXrefsTopDown()
+			if repairErr != nil {
+				common.Log.Error("Failed xref rebuild repair (%s)", repairErr)
+				return repairErr
+			}
+
+			// The scanned table supersedes everything collected so far.
+			this.xrefs = *rebuilt
+			common.Log.Debug("Repaired xref table built")
+			return nil
+		}
+
+		realNum, realGen, numErr := getObjectNumber(obj)
+		if numErr != nil {
+			return numErr
+		}
+
+		// Keep the entry as is, but file it under the numbers found in the object itself.
+		entry.objectNumber = int(realNum)
+		entry.generation = int(realGen)
+		renumbered[int(realNum)] = entry
+	}
+
+	this.xrefs = renumbered
+	common.Log.Debug("New xref table built")
+	printXrefTable(this.xrefs)
+	return nil
+}
+
+// Matches the header of an indirect object ("<number> <generation> obj") at the start of the input.
+var reRepairIndirectObject = regexp.MustCompile(`^(\d+)\s+(\d+)\s+obj`)
+
+// Parse the entire file from top down and build a fresh xref table from the indirect
+// objects encountered.
+//
+// The parser is moved to the start of the file and each indirect object (or stream)
+// found is parsed and registered under its own object number together with the offset
+// it was found at.  The scan ends at the end of the input or at the first item that is
+// neither an object header nor an "endobj" keyword.
+//
+// Currently not supporting object streams...
+// Also need to detect object streams and load the object numbers.
+//
+// The table is returned to the caller; the parser's own table is not touched here.
+func (this *PdfParser) repairRebuildXrefsTopDown() (*XrefTable, error) {
+	// Look-ahead long enough to hold an object header with e.g. a 10 digit object number
+	// and a 5 digit generation number ("NNNNNNNNNN GGGGG obj").  A shorter window makes
+	// the header of objects with large numbers unrecognizable and ends the scan early.
+	const peekLen = 20
+
+	this.SetFileOffset(0)
+
+	xrefTable := XrefTable{}
+	for {
+		this.skipComments()
+
+		curOffset := this.GetFileOffset()
+
+		// Peek returns the remaining bytes together with an error when fewer than
+		// peekLen bytes are left, so only stop when there is nothing left to inspect.
+		peakBuf, err := this.reader.Peek(peekLen)
+		if err != nil && len(peakBuf) == 0 {
+			// EOF
+			break
+		}
+
+		if reRepairIndirectObject.FindIndex(peakBuf) != nil {
+			// Indirect object.
+			obj, err := this.parseIndirectObject()
+			if err != nil {
+				common.Log.Error("Unable to parse indirect object (%s)", err)
+				return nil, err
+			}
+
+			var objNum, genNum int
+			switch t := obj.(type) {
+			case *PdfIndirectObject:
+				objNum, genNum = int(t.ObjectNumber), int(t.GenerationNumber)
+			case *PdfObjectStream:
+				objNum, genNum = int(t.ObjectNumber), int(t.GenerationNumber)
+			default:
+				return nil, fmt.Errorf("Not an indirect object or stream (%T)", obj) // Should never happen.
+			}
+
+			// Make the entry for the cross ref table.
+			xrefEntry := XrefObject{}
+			xrefEntry.xtype = XREF_TABLE_ENTRY
+			xrefEntry.objectNumber = objNum
+			xrefEntry.generation = genNum
+			xrefEntry.offset = curOffset
+			xrefTable[objNum] = xrefEntry
+		} else if len(peakBuf) >= 6 && string(peakBuf[0:6]) == "endobj" {
+			this.reader.Discard(6)
+		} else {
+			// Stop once we reach xrefs/trailer section etc.  Technically this could fail for complex
+			// cases, but lets keep it simple for now.  Add more complexity when needed (problematic user committed files).
+			// In general more likely that more complex files would have better understanding of the PDF standard.
+			common.Log.Debug("Not an object - stop repair rebuilding xref here (%s)", peakBuf)
+			break
+		}
+	}
+
+	return &xrefTable, nil
+}
--- a/pdf/writer.go
+++ b/pdf/writer.go
@ -22,6 +22,43 @@ import (
 	"github.com/unidoc/unidoc/license"
 )

+// Optional overrides for the Producer and Creator entries of the info dictionary.
+// While a value is empty the corresponding built-in default is used.
+var (
+	pdfProducer = ""
+	pdfCreator  = ""
+)
+
+// getPdfProducer returns the Producer string for written documents: the configured
+// override when one is set, otherwise a default naming the library version and the
+// license type.
+func getPdfProducer() string {
+	if pdfProducer != "" {
+		return pdfProducer
+	}
+
+	// No override set - build the default.
+	key := license.GetLicenseKey()
+	return fmt.Sprintf("UniDoc Library version %s (%s) - http://unidoc.io", getUniDocVersion(), key.TypeToString())
+}
+
+// SetPdfProducer overrides the Producer string used for written documents.
+// The call is silently ignored unless the active license is a commercial one.
+func SetPdfProducer(producer string) {
+	// Only commercial users can modify the producer.
+	if license.GetLicenseKey().Type != license.LicenseTypeCommercial {
+		return
+	}
+
+	pdfProducer = producer
+}
+
+// getPdfCreator returns the Creator string for written documents: the configured
+// override when one is set, otherwise the library default.
+func getPdfCreator() string {
+	if pdfCreator == "" {
+		// No override set - return default.
+		return "UniDoc - http://unidoc.io"
+	}
+
+	return pdfCreator
+}
+
+// SetPdfCreator overrides the Creator string returned by getPdfCreator.
+// Setting an empty string restores the built-in default.
+func SetPdfCreator(creator string) {
+	pdfCreator = creator
+}
+
 type PdfWriter struct {
 	root       *PdfIndirectObject
 	pages      *PdfIndirectObject
@ -45,14 +82,10 @@ func NewPdfWriter() PdfWriter {
 	w.objectsMap = map[PdfObject]bool{}
 	w.objects = []PdfObject{}

-	licenseKey := license.GetLicenseKey()
-
-	producer := fmt.Sprintf("UniDoc Library version %s (%s) - http://unidoc.io", getUniDocVersion(), licenseKey.TypeToString())
-
 	// Creation info.
 	infoDict := PdfObjectDictionary{}
-	infoDict[PdfObjectName("Producer")] = MakeString(producer)
-	infoDict[PdfObjectName("Creator")] = MakeString("FoxyUtils Online PDF https://foxyutils.com")
+	infoDict[PdfObjectName("Producer")] = MakeString(getPdfProducer())
+	infoDict[PdfObjectName("Creator")] = MakeString(getPdfCreator())
 	infoObj := PdfIndirectObject{}
 	infoObj.PdfObject = &infoDict
 	w.infoObj = &infoObj
@ -151,7 +184,16 @@ func (this *PdfWriter) addObjects(obj PdfObject) error {
 				if err != nil {
 					return err
 				}
-
+			} else {
+				// How to handle the parent?  Make sure it is present?
+				if parentObj, parentIsRef := (*dict)["Parent"].(*PdfObjectReference); parentIsRef {
+					// Parent is a reference.  Means we can drop it?
+					// Could refer to somewhere outside of the scope of the output doc.
+					// Should be done by the reader already.
+					// -> ERROR.
+					common.Log.Error("Parent is a reference object - Cannot be in writer (needs to be resolved)")
+					return fmt.Errorf("Parent is a reference object - Cannot be in writer (needs to be resolved) - %s", parentObj)
+				}
 			}
 		}
 		return nil