Merge branch 'v2' of https://github.com/unidoc/unidoc into up_v2_dev

* 'v2' of https://github.com/unidoc/unidoc:
  Enhancements to tolerate more malformed PDFs. Fixes #47
  Minor fix to allow null for colorspace entry in Resource dicts
  Improved xref rebuild repair procedure. Closes #45.
  EOF marker seek more forgiving #46
  Fix issue with unremoved AES decrypted padding bytes #44
  Handle annotations with missing subtype as generic #43
  Annotations fix
2025-05-02 22:17:06 +08:00 · 2017-05-08 15:18:07 +10:00 · 2017-05-08 15:18:07 +10:00 · 4134d42c76
commit 4134d42c76
parent a904ca32e7 5c5ad1bc14
8 changed files with 312 additions and 93 deletions
--- a/pdf/core/crypt.go
+++ b/pdf/core/crypt.go
@ -519,6 +519,7 @@ func (this *PdfCrypt) decryptBytes(buf []byte, filter string, okey []byte) ([]by
 			common.Log.Debug("ERROR AES invalid buf %s", buf)
 			return buf, fmt.Errorf("AES: Buf len < 16 (%d)", len(buf))
 		}
+
 		iv := buf[:16]
 		buf = buf[16:]

@ -534,8 +535,20 @@ func (this *PdfCrypt) decryptBytes(buf []byte, filter string, okey []byte) ([]by
 		common.Log.Trace("chop AES Decrypt (%d): % x", len(buf), buf)
 		mode.CryptBlocks(buf, buf)
 		common.Log.Trace("to (%d): % x", len(buf), buf)
-		//copy(buf[0:], buf[16:])
-		//common.Log.Debug("chop to (%d): % x", len(buf), buf)
+
+		if len(buf) == 0 {
+			common.Log.Trace("Empty buf, returning empty string")
+			return buf, nil
+		}
+
+		// The padded length is indicated by the last values.  Remove those.
+		// A pad length equal to the buffer length is legal: it means the buffer
+		// consists of padding only (i.e. the decrypted data is empty).  Only a pad
+		// length exceeding the buffer length is rejected.
+		padLen := int(buf[len(buf)-1])
+		if padLen > len(buf) {
+			common.Log.Debug("Illegal pad length")
+			return buf, fmt.Errorf("Invalid pad length")
+		}
+		buf = buf[:len(buf)-padLen]
+
 		return buf, nil
 	}
 	return nil, fmt.Errorf("Unsupported crypt filter method (%s)", cfMethod)
--- a/pdf/core/parser.go
+++ b/pdf/core/parser.go
@ -647,7 +647,13 @@ func (this *PdfParser) parsePdfVersion() (int, int, error) {

 	result1 := rePdfVersion.FindStringSubmatch(string(b))
 	if len(result1) < 3 {
-		common.Log.Debug("Error: PDF Version not found!")
+		// Version not found at the expected position: attempt recovery by scanning
+		// the file from the top.  Only give up if that scan fails as well.
+		major, minor, err := this.seekPdfVersionTopDown()
+		if err != nil {
+			common.Log.Debug("Failed recovery - unable to find version")
+			return 0, 0, err
+		}
+
+		return major, minor, nil
 		return 0, 0, errors.New("PDF version not found")
 	}

@ -742,6 +748,7 @@ func (this *PdfParser) parseXrefTable() (*PdfObjectDictionary, error) {
 			continue
 		}
 		if (len(txt) > 6) && (txt[:7] == "trailer") {
+			common.Log.Trace("Found trailer - %s", txt)
 			// Sometimes get "trailer << ...."
 			// Need to rewind to end of trailer text.
 			if len(txt) > 9 {
@ -1001,13 +1008,64 @@ func (this *PdfParser) parseXref() (*PdfObjectDictionary, error) {
 			return nil, err
 		}
 	} else {
-		common.Log.Debug("ERROR: Invalid xref.... starting with \"%s\"", string(bb))
-		return nil, errors.New("Invalid xref format")
+		common.Log.Debug("Warning: Unable to find xref table or stream. Repair attempted: Looking for earliest xref from bottom.")
+		err := this.repairSeekXrefMarker()
+		if err != nil {
+			common.Log.Debug("Repair failed - %v", err)
+			return nil, err
+		}
+
+		trailerDict, err = this.parseXrefTable()
+		if err != nil {
+			return nil, err
+		}
 	}

 	return trailerDict, err
 }

+// Look for EOF marker and seek to its beginning.
+// The file is scanned backwards from its end in chunks of up to 1000 bytes; fSize is the
+// total size of the file in bytes.  Within the first chunk (closest to the end of the file)
+// that contains a match of reEOF, the last match is selected.
+// Returns an error if seeking or reading fails, or if no chunk contains the marker.
+// N.B. Chunks do not overlap, so a marker straddling a chunk boundary is not detected.
+func (this *PdfParser) seekToEOFMarker(fSize int64) error {
+	// Define the starting point (from the end of the file) to search from.
+	var offset int64 = 0
+
+	// Define a buffer length in terms of how many bytes to read from the end of the file.
+	var buflen int64 = 1000
+
+	for offset < fSize {
+		// Shrink the final chunk so that we never seek to before the start of the file.
+		if fSize <= (buflen + offset) {
+			buflen = fSize - offset
+		}
+
+		// Move back enough (as we need to read forward).
+		_, err := this.rs.Seek(-offset-buflen, os.SEEK_END)
+		if err != nil {
+			return err
+		}
+
+		// Read the data.
+		b1 := make([]byte, buflen)
+		_, err = this.rs.Read(b1)
+		if err != nil {
+			common.Log.Debug("Failed reading while looking for EOF marker: %v", err)
+			return err
+		}
+		common.Log.Trace("Looking for EOF marker: \"%s\"", string(b1))
+		ind := reEOF.FindAllStringIndex(string(b1), -1)
+		if ind != nil {
+			// Found it.  Position at the start of the last match in this chunk.
+			lastInd := ind[len(ind)-1]
+			common.Log.Trace("Ind: % d", ind)
+			_, err = this.rs.Seek(-offset-buflen+int64(lastInd[0]), os.SEEK_END)
+			return err
+		}
+		common.Log.Debug("Warning: EOF marker not found! - continue seeking")
+
+		offset += buflen
+	}
+
+	common.Log.Debug("Error: EOF marker was not found.")
+	return errors.New("EOF not found")
+}
+
 //
 // Load the xrefs from the bottom of file prior to parsing the file.
 // 1. Look for %%EOF marker, then
@ -1031,39 +1089,29 @@ func (this *PdfParser) loadXrefs() (*PdfObjectDictionary, error) {
 	this.xrefs = make(XrefTable)
 	this.objstms = make(ObjectStreams)

-	// Look for EOF marker and seek to its beginning.
-	// Define an offset position from the end of the file.
-	var offset int64 = 1000
 	// Get the file size.
 	fSize, err := this.rs.Seek(0, os.SEEK_END)
 	if err != nil {
 		return nil, err
 	}
 	common.Log.Trace("fsize: %d", fSize)
-	if fSize <= offset {
-		offset = fSize
-	}
-	_, err = this.rs.Seek(-offset, os.SEEK_END)
+
+	// Seek the EOF marker.
+	err = this.seekToEOFMarker(fSize)
 	if err != nil {
+		common.Log.Debug("Failed seek to eof marker: %v", err)
 		return nil, err
 	}
-	b1 := make([]byte, offset)
-	this.rs.Read(b1)
-	common.Log.Trace("Looking for EOF marker: \"%s\"", string(b1))
-	ind := reEOF.FindAllStringIndex(string(b1), -1)
-	if ind == nil {
-		common.Log.Debug("Error: EOF marker not found!")
-		return nil, errors.New("EOF marker not found")
-	}
-	lastInd := ind[len(ind)-1]
-	common.Log.Trace("Ind: % d", ind)
-	this.rs.Seek(-offset+int64(lastInd[0]), os.SEEK_END)

 	// Look for startxref and get the xref offset.
-	offset = 64
+	var offset int64 = 64
 	this.rs.Seek(-offset, os.SEEK_CUR)
 	b2 := make([]byte, offset)
-	this.rs.Read(b2)
+	_, err = this.rs.Read(b2)
+	if err != nil {
+		common.Log.Debug("Failed reading while looking for startxref: %v", err)
+		return nil, err
+	}

 	result := reStartXref.FindStringSubmatch(string(b2))
 	if len(result) < 2 {
@ -1071,7 +1119,6 @@ func (this *PdfParser) loadXrefs() (*PdfObjectDictionary, error) {
 		return nil, errors.New("Startxref not found")
 	}
 	if len(result) > 2 {
-		// GH: Take the last one?  Make a test case.
 		common.Log.Debug("ERROR: Multiple startxref (%s)!", b2)
 		return nil, errors.New("Multiple startxref entries?")
 	}
@ -1133,8 +1180,9 @@ func (this *PdfParser) loadXrefs() (*PdfObjectDictionary, error) {

 		ptrailerDict, err := this.parseXref()
 		if err != nil {
-			common.Log.Debug("ERROR: Failed loading another (Prev) trailer")
-			return nil, err
+			common.Log.Debug("Warning: Error - Failed loading another (Prev) trailer")
+			common.Log.Debug("Attempting to continue by ignoring it")
+			break
 		}

 		xx, present = (*ptrailerDict)["Prev"]
--- a/pdf/core/repairs.go
+++ b/pdf/core/repairs.go
@ -13,6 +13,10 @@ import (
 	"os"
 	"regexp"

+	"bufio"
+	"io"
+	"strconv"
+
 	"github.com/unidoc/unidoc/common"
 )

@ -77,9 +81,22 @@ func (this *PdfParser) rebuildXrefTable() error {
 	return nil
 }

+// Parses and returns the object and generation number from a string such as "12 0 obj" -> (12,0,nil).
+// An error is returned if str does not match reIndirectObject, or if either captured number
+// cannot be converted to an int (e.g. because it is out of range).
+func parseObjectNumberFromString(str string) (int, int, error) {
+	result := reIndirectObject.FindStringSubmatch(str)
+	if len(result) < 3 {
+		return 0, 0, errors.New("Unable to detect indirect object signature")
+	}
+
+	// Do not silently accept a failed conversion: Atoi returns a clamped value together
+	// with the error, which would otherwise end up as a bogus object number.
+	on, err := strconv.Atoi(result[1])
+	if err != nil {
+		return 0, 0, err
+	}
+	gn, err := strconv.Atoi(result[2])
+	if err != nil {
+		return 0, 0, err
+	}
+
+	return on, gn, nil
+}
+
 // Parse the entire file from top down.
-// Currently not supporting object streams...
-// Also need to detect object streams and load the object numbers.
+// Goes through the file byte-by-byte looking for "<num> <generation> obj" patterns.
+// N.B. This collects the XREF_TABLE_ENTRY data only.
 func (this *PdfParser) repairRebuildXrefsTopDown() (*XrefTable, error) {
 	if this.repairsAttempted {
 		// Avoid multiple repairs (only try once).
@ -87,60 +104,183 @@ func (this *PdfParser) repairRebuildXrefsTopDown() (*XrefTable, error) {
 	}
 	this.repairsAttempted = true

-	reRepairIndirectObject := regexp.MustCompile(`^(\d+)\s+(\d+)\s+obj`)
+	// Go to beginning, reset reader.
+	this.rs.Seek(0, os.SEEK_SET)
+	this.reader = bufio.NewReader(this.rs)

-	this.SetFileOffset(0)
+	// Keep a running buffer of last bytes.
+	bufLen := 20
+	last := make([]byte, bufLen)

 	xrefTable := XrefTable{}
 	for {
-		this.skipComments()
-
-		curOffset := this.GetFileOffset()
-
-		peakBuf, err := this.reader.Peek(10)
+		b, err := this.reader.ReadByte()
 		if err != nil {
-			// EOF
-			break
+			if err == io.EOF {
+				break
+			} else {
+				return nil, err
+			}
 		}

-		// Indirect object?
-		results := reRepairIndirectObject.FindIndex(peakBuf)
-		if len(results) > 0 {
-			obj, err := this.ParseIndirectObject()
+		// Format:
+		// object number - whitespace - generation number - obj
+		// e.g. "12 0 obj"
+		if b == 'j' && last[bufLen-1] == 'b' && last[bufLen-2] == 'o' && IsWhiteSpace(last[bufLen-3]) {
+			i := bufLen - 4
+			// Go past whitespace
+			for IsWhiteSpace(last[i]) && i > 0 {
+				i--
+			}
+			if i == 0 || !IsDecimalDigit(last[i]) {
+				continue
+			}
+			// Go past generation number
+			for IsDecimalDigit(last[i]) && i > 0 {
+				i--
+			}
+			if i == 0 || !IsWhiteSpace(last[i]) {
+				continue
+			}
+			// Go past whitespace
+			for IsWhiteSpace(last[i]) && i > 0 {
+				i--
+			}
+			if i == 0 || !IsDecimalDigit(last[i]) {
+				continue
+			}
+			// Go past object number.
+			for IsDecimalDigit(last[i]) && i > 0 {
+				i--
+			}
+			if i == 0 {
+				continue // Probably too long to be a valid object...
+			}
+
+			objOffset := this.GetFileOffset() - int64(bufLen-i)
+
+			objstr := append(last[i+1:], b)
+			objNum, genNum, err := parseObjectNumberFromString(string(objstr))
 			if err != nil {
-				common.Log.Debug("ERROR: Unable to parse indirect object (%s)", err)
+				common.Log.Debug("Unable to parse object number: %v", err)
 				return nil, err
 			}

-			if indObj, ok := obj.(*PdfIndirectObject); ok {
+			// Create and insert the XREF entry if not existing, or the generation number is higher.
+			if curXref, has := xrefTable[objNum]; !has || curXref.generation < genNum {
 				// Make the entry for the cross ref table.
 				xrefEntry := XrefObject{}
 				xrefEntry.xtype = XREF_TABLE_ENTRY
-				xrefEntry.objectNumber = int(indObj.ObjectNumber)
-				xrefEntry.generation = int(indObj.GenerationNumber)
-				xrefEntry.offset = curOffset
-				xrefTable[int(indObj.ObjectNumber)] = xrefEntry
-			} else if streamObj, ok := obj.(*PdfObjectStream); ok {
-				// Make the entry for the cross ref table.
-				xrefEntry := XrefObject{}
-				xrefEntry.xtype = XREF_TABLE_ENTRY
-				xrefEntry.objectNumber = int(streamObj.ObjectNumber)
-				xrefEntry.generation = int(streamObj.GenerationNumber)
-				xrefEntry.offset = curOffset
-				xrefTable[int(streamObj.ObjectNumber)] = xrefEntry
-			} else {
-				return nil, fmt.Errorf("Not an indirect object or stream (%T)", obj) // Should never happen.
+				xrefEntry.objectNumber = int(objNum)
+				xrefEntry.generation = int(genNum)
+				xrefEntry.offset = objOffset
+				xrefTable[objNum] = xrefEntry
 			}
-		} else if string(peakBuf[0:6]) == "endobj" {
-			this.reader.Discard(6)
-		} else {
-			// Stop once we reach xrefs/trailer section etc.  Technically this could fail for complex
-			// cases, but lets keep it simple for now.  Add more complexity when needed (problematic user committed files).
-			// In general more likely that more complex files would have better understanding of the PDF standard.
-			common.Log.Debug("Not an object - stop repair rebuilding xref here (%s)", peakBuf)
-			break
 		}
+
+		last = append(last[1:bufLen], b)
 	}

 	return &xrefTable, nil
 }
+
+// Look for first sign of xref table from end of file.
+// The file is scanned backwards from its end in chunks of up to 1000 bytes for "xref"
+// preceded by a whitespace character.  Within the first chunk (closest to the end of the
+// file) that contains a match, the last match is selected and this.reader is reset so that
+// the next byte read is the first non-whitespace byte of the match (the 'x').
+// Returns an error if seeking or reading fails, or if no chunk contains the marker.
+// N.B. Chunks do not overlap, so a marker straddling a chunk boundary is not detected.
+func (this *PdfParser) repairSeekXrefMarker() error {
+	// Get the file size.
+	fSize, err := this.rs.Seek(0, os.SEEK_END)
+	if err != nil {
+		return err
+	}
+
+	reXrefTableStart := regexp.MustCompile(`\sxref\s*`)
+
+	// Define the starting point (from the end of the file) to search from.
+	var offset int64 = 0
+
+	// Define a buffer length in terms of how many bytes to read from the end of the file.
+	var buflen int64 = 1000
+
+	for offset < fSize {
+		// Shrink the final chunk so that we never seek to before the start of the file.
+		if fSize <= (buflen + offset) {
+			buflen = fSize - offset
+		}
+
+		// Move back enough (as we need to read forward).
+		_, err := this.rs.Seek(-offset-buflen, os.SEEK_END)
+		if err != nil {
+			return err
+		}
+
+		// Read the data.
+		b1 := make([]byte, buflen)
+		_, err = this.rs.Read(b1)
+		if err != nil {
+			common.Log.Debug("Failed reading while looking for xref: %v", err)
+			return err
+		}
+
+		common.Log.Trace("Looking for xref : \"%s\"", string(b1))
+		ind := reXrefTableStart.FindAllStringIndex(string(b1), -1)
+		if ind != nil {
+			// Found it.  The match starts at the whitespace preceding "xref".
+			lastInd := ind[len(ind)-1]
+			common.Log.Trace("Ind: % d", ind)
+			_, err = this.rs.Seek(-offset-buflen+int64(lastInd[0]), os.SEEK_END)
+			if err != nil {
+				return err
+			}
+			this.reader = bufio.NewReader(this.rs)
+			// Go past whitespace, finish at 'x'.
+			for {
+				bb, err := this.reader.Peek(1)
+				if err != nil {
+					return err
+				}
+				common.Log.Trace("B: %d %c", bb[0], bb[0])
+				if !IsWhiteSpace(bb[0]) {
+					break
+				}
+				// Cannot fail: the successful Peek guarantees one buffered byte.
+				this.reader.Discard(1)
+			}
+
+			return nil
+		}
+		common.Log.Debug("Warning: Xref table marker not found! - continue seeking")
+
+		offset += buflen
+	}
+
+	common.Log.Debug("Error: Xref table marker was not found.")
+	return errors.New("xref not found")
+}
+
+// Called when Pdf version not found normally.  Looks for the PDF version by scanning top-down.
+// The file is read byte-by-byte from the beginning and the first occurrence of "PDF-M.N"
+// (M and N single decimal digits, as in "%PDF-1.7") yields (M, N, nil).
+// Returns an error if seeking or reading fails, or if the end of the file is reached
+// without finding such a signature.
+func (this *PdfParser) seekPdfVersionTopDown() (int, int, error) {
+	// Go to beginning, reset reader.
+	_, err := this.rs.Seek(0, os.SEEK_SET)
+	if err != nil {
+		return 0, 0, err
+	}
+	this.reader = bufio.NewReader(this.rs)
+
+	// Keep a running buffer of last bytes.
+	bufLen := 20
+	last := make([]byte, bufLen)
+
+	for {
+		b, err := this.reader.ReadByte()
+		if err != nil {
+			if err == io.EOF {
+				break
+			}
+			return 0, 0, err
+		}
+
+		// Format:
+		// "PDF-" - major version digit - '.' - minor version digit
+		// e.g. "PDF-1.7", where b is the candidate minor version digit and the
+		// preceding bytes are taken from the running buffer.
+		if IsDecimalDigit(b) && last[bufLen-1] == '.' && IsDecimalDigit(last[bufLen-2]) && last[bufLen-3] == '-' &&
+			last[bufLen-4] == 'F' && last[bufLen-5] == 'D' && last[bufLen-6] == 'P' {
+			major := int(last[bufLen-2] - '0')
+			minor := int(b - '0')
+			return major, minor, nil
+		}
+
+		// Slide the window: drop the oldest byte, append the newest.
+		last = append(last[1:bufLen], b)
+	}
+
+	return 0, 0, errors.New("Version not found")
+}
--- a/pdf/model/annotations.go
+++ b/pdf/model/annotations.go
@ -680,10 +680,13 @@ func (r *PdfReader) newPdfAnnotationFromIndirectObject(container *PdfIndirectObj

 	subtypeObj, has := (*d)["Subtype"]
 	if !has {
-		return nil, fmt.Errorf("Missing Subtype")
+		common.Log.Debug("WARNING: Compatibility issue - annotation Subtype missing - assuming no subtype")
+		annot.context = nil
+		return annot, nil
 	}
 	subtype, ok := subtypeObj.(*PdfObjectName)
 	if !ok {
+		common.Log.Debug("ERROR: Invalid Subtype object type != name (%T)", subtypeObj)
 		return nil, fmt.Errorf("Invalid Subtype object type != name (%T)", subtypeObj)
 	}
 	switch *subtype {
@ -1597,6 +1600,8 @@ func (this *PdfAnnotation) GetContainingPdfObject() PdfObject {
 	return this.primitive
 }

+// Note: Call the sub-annotation's ToPdfObject to set both the generic and non-generic information.
+// TODO/FIXME: Consider doing it here instead.
 func (this *PdfAnnotation) ToPdfObject() PdfObject {
 	container := this.primitive
 	d := container.PdfObject.(*PdfObjectDictionary)
--- a/pdf/model/page.go
+++ b/pdf/model/page.go
@ -23,23 +23,22 @@ import (

 // PDF page object (7.7.3.3 - Table 30).
 type PdfPage struct {
-	Parent       PdfObject
-	LastModified *PdfDate
-	Resources    *PdfPageResources
-	CropBox      *PdfRectangle
-	MediaBox     *PdfRectangle
-	BleedBox     *PdfRectangle
-	TrimBox      *PdfRectangle
-	ArtBox       *PdfRectangle
-	BoxColorInfo PdfObject
-	Contents     PdfObject
-	Rotate       *int64
-	Group        PdfObject
-	Thumb        PdfObject
-	B            PdfObject
-	Dur          PdfObject
-	Trans        PdfObject
-	//Annots               PdfObject
+	Parent               PdfObject
+	LastModified         *PdfDate
+	Resources            *PdfPageResources
+	CropBox              *PdfRectangle
+	MediaBox             *PdfRectangle
+	BleedBox             *PdfRectangle
+	TrimBox              *PdfRectangle
+	ArtBox               *PdfRectangle
+	BoxColorInfo         PdfObject
+	Contents             PdfObject
+	Rotate               *int64
+	Group                PdfObject
+	Thumb                PdfObject
+	B                    PdfObject
+	Dur                  PdfObject
+	Trans                PdfObject
 	AA                   PdfObject
 	Metadata             PdfObject
 	PieceInfo            PdfObject
@ -52,7 +51,7 @@ type PdfPage struct {
 	PresSteps            PdfObject
 	UserUnit             PdfObject
 	VP                   PdfObject
-	//Annotations
+
 	Annotations []*PdfAnnotation

 	// Primitive container.
@ -477,7 +476,12 @@ func (this *PdfPage) GetPageDict() *PdfObjectDictionary {
 	if this.Annotations != nil {
 		arr := PdfObjectArray{}
 		for _, annot := range this.Annotations {
-			arr = append(arr, annot.GetContext().ToPdfObject())
+			if subannot := annot.GetContext(); subannot != nil {
+				arr = append(arr, subannot.ToPdfObject())
+			} else {
+				// Generic annotation dict (without subtype).
+				arr = append(arr, annot.ToPdfObject())
+			}
 		}
 		p.Set("Annots", &arr)
 	}
--- a/pdf/model/reader.go
+++ b/pdf/model/reader.go
@ -462,11 +462,6 @@ func (this *PdfReader) loadForms() (*PdfAcroForm, error) {
 	common.Log.Trace("Has Acro forms")
 	// Load it.

-	acroForm, err := this.newPdfAcroFormFromDict(formsDict)
-	if err != nil {
-		return nil, err
-	}
-
 	// Ensure we have access to everything.
 	common.Log.Trace("Traverse the Acroforms structure")
 	err = this.traverseObjectData(formsDict)
@ -475,6 +470,12 @@ func (this *PdfReader) loadForms() (*PdfAcroForm, error) {
 		return nil, err
 	}

+	// Create the acro forms object.
+	acroForm, err := this.newPdfAcroFormFromDict(formsDict)
+	if err != nil {
+		return nil, err
+	}
+
 	return acroForm, nil
 }

--- a/pdf/model/resources.go
+++ b/pdf/model/resources.go
@ -40,7 +40,7 @@ func NewPdfPageResourcesFromDict(dict *PdfObjectDictionary) (*PdfPageResources,
 	if obj, isDefined := (*dict)["ExtGState"]; isDefined {
 		r.ExtGState = obj
 	}
-	if obj, isDefined := (*dict)["ColorSpace"]; isDefined {
+	if obj, isDefined := (*dict)["ColorSpace"]; isDefined && !isNullObject(obj) {
 		colorspaces, err := newPdfPageResourcesColorspacesFromPdfObject(obj)
 		if err != nil {
 			return nil, err
--- a/pdf/model/utils.go
+++ b/pdf/model/utils.go
@ -28,6 +28,14 @@ func getNumberAsFloat(obj PdfObject) (float64, error) {
 	return 0, errors.New("Not a number")
 }

+// isNullObject reports whether obj holds a *PdfObjectNull.
+func isNullObject(obj PdfObject) bool {
+	_, isNull := obj.(*PdfObjectNull)
+	return isNull
+}
+
 // Convert a list of pdf objects representing floats or integers to a slice of float64 values.
 func getNumbersAsFloat(objects []PdfObject) ([]float64, error) {
 	floats := []float64{}