From 57e576344c6233f8d4e0dbef3e6d9ce78df5d34f Mon Sep 17 00:00:00 2001 From: Gunnsteinn Hall Date: Fri, 21 Apr 2017 12:08:58 +0000 Subject: [PATCH 1/7] Annotations fix https://github.com/unidoc/unidoc/issues/41 --- pdf/model/page.go | 35 +++++++++++++++++------------------ pdf/model/reader.go | 11 ++++++----- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/pdf/model/page.go b/pdf/model/page.go index 933c596a..4e7946cc 100644 --- a/pdf/model/page.go +++ b/pdf/model/page.go @@ -22,23 +22,22 @@ import ( // PDF page object (7.7.3.3 - Table 30). type PdfPage struct { - Parent PdfObject - LastModified *PdfDate - Resources *PdfPageResources - CropBox *PdfRectangle - MediaBox *PdfRectangle - BleedBox *PdfRectangle - TrimBox *PdfRectangle - ArtBox *PdfRectangle - BoxColorInfo PdfObject - Contents PdfObject - Rotate *int64 - Group PdfObject - Thumb PdfObject - B PdfObject - Dur PdfObject - Trans PdfObject - //Annots PdfObject + Parent PdfObject + LastModified *PdfDate + Resources *PdfPageResources + CropBox *PdfRectangle + MediaBox *PdfRectangle + BleedBox *PdfRectangle + TrimBox *PdfRectangle + ArtBox *PdfRectangle + BoxColorInfo PdfObject + Contents PdfObject + Rotate *int64 + Group PdfObject + Thumb PdfObject + B PdfObject + Dur PdfObject + Trans PdfObject AA PdfObject Metadata PdfObject PieceInfo PdfObject @@ -51,7 +50,7 @@ type PdfPage struct { PresSteps PdfObject UserUnit PdfObject VP PdfObject - //Annotations + Annotations []*PdfAnnotation // Primitive container. diff --git a/pdf/model/reader.go b/pdf/model/reader.go index 4504162f..352c9bf3 100644 --- a/pdf/model/reader.go +++ b/pdf/model/reader.go @@ -462,11 +462,6 @@ func (this *PdfReader) loadForms() (*PdfAcroForm, error) { common.Log.Trace("Has Acro forms") // Load it. - acroForm, err := this.newPdfAcroFormFromDict(formsDict) - if err != nil { - return nil, err - } - // Ensure we have access to everything. 
common.Log.Trace("Traverse the Acroforms structure") err = this.traverseObjectData(formsDict) @@ -475,6 +470,12 @@ func (this *PdfReader) loadForms() (*PdfAcroForm, error) { return nil, err } + // Create the acro forms object. + acroForm, err := this.newPdfAcroFormFromDict(formsDict) + if err != nil { + return nil, err + } + return acroForm, nil } From 93459267450ebd8015a33eb1eb9508ed8bb52d40 Mon Sep 17 00:00:00 2001 From: Gunnsteinn Hall Date: Mon, 24 Apr 2017 18:18:46 +0000 Subject: [PATCH 2/7] Handle annotations with missing subtype as generic #43 --- pdf/model/annotations.go | 7 ++++++- pdf/model/page.go | 7 ++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/pdf/model/annotations.go b/pdf/model/annotations.go index ba89a05d..99d47bff 100644 --- a/pdf/model/annotations.go +++ b/pdf/model/annotations.go @@ -680,10 +680,13 @@ func (r *PdfReader) newPdfAnnotationFromIndirectObject(container *PdfIndirectObj subtypeObj, has := (*d)["Subtype"] if !has { - return nil, fmt.Errorf("Missing Subtype") + common.Log.Debug("WARNING: Compatibility issue - annotation Subtype missing - assuming no subtype") + annot.context = nil + return annot, nil } subtype, ok := subtypeObj.(*PdfObjectName) if !ok { + common.Log.Debug("ERROR: Invalid Subtype object type != name (%T)", subtypeObj) return nil, fmt.Errorf("Invalid Subtype object type != name (%T)", subtypeObj) } switch *subtype { @@ -1597,6 +1600,8 @@ func (this *PdfAnnotation) GetContainingPdfObject() PdfObject { return this.primitive } +// Note: Call the sub-annotation's ToPdfObject to set both the generic and non-generic information. +// TODO/FIXME: Consider doing it here instead. 
func (this *PdfAnnotation) ToPdfObject() PdfObject { container := this.primitive d := container.PdfObject.(*PdfObjectDictionary) diff --git a/pdf/model/page.go b/pdf/model/page.go index 4e7946cc..76d18df6 100644 --- a/pdf/model/page.go +++ b/pdf/model/page.go @@ -475,7 +475,12 @@ func (this *PdfPage) GetPageDict() *PdfObjectDictionary { if this.Annotations != nil { arr := PdfObjectArray{} for _, annot := range this.Annotations { - arr = append(arr, annot.GetContext().ToPdfObject()) + if subannot := annot.GetContext(); subannot != nil { + arr = append(arr, subannot.ToPdfObject()) + } else { + // Generic annotation dict (without subtype). + arr = append(arr, annot.ToPdfObject()) + } } p.Set("Annots", &arr) } From d25fa23fb0df3c1d3590cb5da97e80c30cd15f9e Mon Sep 17 00:00:00 2001 From: Gunnsteinn Hall Date: Mon, 24 Apr 2017 21:36:07 +0000 Subject: [PATCH 3/7] Fix issue with unremoved AES decrypted padding bytes #44 --- pdf/core/crypt.go | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/pdf/core/crypt.go b/pdf/core/crypt.go index c7b08d33..aff8ce85 100644 --- a/pdf/core/crypt.go +++ b/pdf/core/crypt.go @@ -519,6 +519,7 @@ func (this *PdfCrypt) decryptBytes(buf []byte, filter string, okey []byte) ([]by common.Log.Debug("ERROR AES invalid buf %s", buf) return buf, fmt.Errorf("AES: Buf len < 16 (%d)", len(buf)) } + iv := buf[:16] buf = buf[16:] @@ -534,8 +535,20 @@ func (this *PdfCrypt) decryptBytes(buf []byte, filter string, okey []byte) ([]by common.Log.Trace("chop AES Decrypt (%d): % x", len(buf), buf) mode.CryptBlocks(buf, buf) common.Log.Trace("to (%d): % x", len(buf), buf) - //copy(buf[0:], buf[16:]) - //common.Log.Debug("chop to (%d): % x", len(buf), buf) + + if len(buf) == 0 { + common.Log.Trace("Empty buf, returning empty string") + return buf, nil + } + + // The padded length is indicated by the last values. Remove those. 
+ padLen := int(buf[len(buf)-1]) + if padLen >= len(buf) { + common.Log.Debug("Illegal pad length") + return buf, fmt.Errorf("Invalid pad length") + } + buf = buf[:len(buf)-padLen] + return buf, nil } return nil, fmt.Errorf("Unsupported crypt filter method (%s)", cfMethod) From 532f564b3a5c7a53f74c0aeeedd8fd97a6a25756 Mon Sep 17 00:00:00 2001 From: Gunnsteinn Hall Date: Wed, 26 Apr 2017 08:53:18 +0000 Subject: [PATCH 4/7] EOF marker seek more forgiving #46 --- pdf/core/parser.go | 72 +++++++++++++++++++++++++++++++++------------- 1 file changed, 52 insertions(+), 20 deletions(-) diff --git a/pdf/core/parser.go b/pdf/core/parser.go index b5d9de51..e8fe9d4d 100644 --- a/pdf/core/parser.go +++ b/pdf/core/parser.go @@ -1008,6 +1008,48 @@ func (this *PdfParser) parseXref() (*PdfObjectDictionary, error) { return trailerDict, err } +// Look for EOF marker and seek to its beginning. +// Define an offset position from the end of the file. +func (this *PdfParser) seekToEOFMarker(fSize int64) error { + // Define the starting point (from the end of the file) to search from. + var offset int64 = 0 + + // Define a buffer length in terms of how many bytes to read from the end of the file. + var buflen int64 = 1000 + + for offset < fSize { + if fSize <= (buflen + offset) { + buflen = fSize - offset + } + + // Move back enough (as we need to read forward). + _, err := this.rs.Seek(-offset-buflen, os.SEEK_END) + if err != nil { + return err + } + + // Read the data. + b1 := make([]byte, buflen) + this.rs.Read(b1) + common.Log.Trace("Looking for EOF marker: \"%s\"", string(b1)) + ind := reEOF.FindAllStringIndex(string(b1), -1) + if ind != nil { + // Found it. + lastInd := ind[len(ind)-1] + common.Log.Trace("Ind: % d", ind) + this.rs.Seek(-offset-buflen+int64(lastInd[0]), os.SEEK_END) + return nil + } else { + common.Log.Debug("Warning: EOF marker not found! 
- continue seeking") + } + + offset += buflen + } + + common.Log.Debug("Error: EOF marker was not found.") + return errors.New("EOF not found") +} + // // Load the xrefs from the bottom of file prior to parsing the file. // 1. Look for %%EOF marker, then @@ -1031,39 +1073,29 @@ func (this *PdfParser) loadXrefs() (*PdfObjectDictionary, error) { this.xrefs = make(XrefTable) this.objstms = make(ObjectStreams) - // Look for EOF marker and seek to its beginning. - // Define an offset position from the end of the file. - var offset int64 = 1000 // Get the file size. fSize, err := this.rs.Seek(0, os.SEEK_END) if err != nil { return nil, err } common.Log.Trace("fsize: %d", fSize) - if fSize <= offset { - offset = fSize - } - _, err = this.rs.Seek(-offset, os.SEEK_END) + + // Seek the EOF marker. + err = this.seekToEOFMarker(fSize) if err != nil { + common.Log.Debug("Failed seek to eof marker: %v", err) return nil, err } - b1 := make([]byte, offset) - this.rs.Read(b1) - common.Log.Trace("Looking for EOF marker: \"%s\"", string(b1)) - ind := reEOF.FindAllStringIndex(string(b1), -1) - if ind == nil { - common.Log.Debug("Error: EOF marker not found!") - return nil, errors.New("EOF marker not found") - } - lastInd := ind[len(ind)-1] - common.Log.Trace("Ind: % d", ind) - this.rs.Seek(-offset+int64(lastInd[0]), os.SEEK_END) // Look for startxref and get the xref offset. - offset = 64 + var offset int64 = 64 this.rs.Seek(-offset, os.SEEK_CUR) b2 := make([]byte, offset) - this.rs.Read(b2) + _, err = this.rs.Read(b2) + if err != nil { + common.Log.Debug("Failed reading while looking for startxref: %v", err) + return nil, err + } result := reStartXref.FindStringSubmatch(string(b2)) if len(result) < 2 { From 3f4e84a2e4d93768f051d7d6571b02a2554d6f2c Mon Sep 17 00:00:00 2001 From: Gunnsteinn Hall Date: Wed, 26 Apr 2017 23:11:35 +0000 Subject: [PATCH 5/7] Improved xref rebuild repair procedure. Closes #45. 
--- pdf/core/repairs.go | 117 +++++++++++++++++++++++++++++--------------- 1 file changed, 78 insertions(+), 39 deletions(-) diff --git a/pdf/core/repairs.go b/pdf/core/repairs.go index 9c3d0674..3a31e5d0 100644 --- a/pdf/core/repairs.go +++ b/pdf/core/repairs.go @@ -13,6 +13,10 @@ import ( "os" "regexp" + "bufio" + "io" + "strconv" + "github.com/unidoc/unidoc/common" ) @@ -77,9 +81,22 @@ func (this *PdfParser) rebuildXrefTable() error { return nil } +// Parses and returns the object and generation number from a string such as "12 0 obj" -> (12,0,nil). +func parseObjectNumberFromString(str string) (int, int, error) { + result := reIndirectObject.FindStringSubmatch(str) + if len(result) < 3 { + return 0, 0, errors.New("Unable to detect indirect object signature") + } + + on, _ := strconv.Atoi(result[1]) + gn, _ := strconv.Atoi(result[2]) + + return on, gn, nil +} + // Parse the entire file from top down. -// Currently not supporting object streams... -// Also need to detect object streams and load the object numbers. +// Goes through the file byte-by-byte looking for " obj" patterns. +// N.B. This collects the XREF_TABLE_ENTRY data only. func (this *PdfParser) repairRebuildXrefsTopDown() (*XrefTable, error) { if this.repairsAttempted { // Avoid multiple repairs (only try once). @@ -87,59 +104,81 @@ func (this *PdfParser) repairRebuildXrefsTopDown() (*XrefTable, error) { } this.repairsAttempted = true - reRepairIndirectObject := regexp.MustCompile(`^(\d+)\s+(\d+)\s+obj`) + // Go to beginning, reset reader. + this.rs.Seek(0, os.SEEK_SET) + this.reader = bufio.NewReader(this.rs) - this.SetFileOffset(0) + // Keep a running buffer of last bytes. + bufLen := 20 + last := make([]byte, bufLen) xrefTable := XrefTable{} for { - this.skipComments() - - curOffset := this.GetFileOffset() - - peakBuf, err := this.reader.Peek(10) + b, err := this.reader.ReadByte() if err != nil { - // EOF - break + if err == io.EOF { + break + } else { + return nil, err + } } - // Indirect object? 
- results := reRepairIndirectObject.FindIndex(peakBuf) - if len(results) > 0 { - obj, err := this.ParseIndirectObject() + // Format: + // object number - whitespace - generation number - obj + // e.g. "12 0 obj" + if b == 'j' && last[bufLen-1] == 'b' && last[bufLen-2] == 'o' && IsWhiteSpace(last[bufLen-3]) { + i := bufLen - 4 + // Go past whitespace + for IsWhiteSpace(last[i]) && i > 0 { + i-- + } + if i == 0 || !IsDecimalDigit(last[i]) { + continue + } + // Go past generation number + for IsDecimalDigit(last[i]) && i > 0 { + i-- + } + if i == 0 || !IsWhiteSpace(last[i]) { + continue + } + // Go past whitespace + for IsWhiteSpace(last[i]) && i > 0 { + i-- + } + if i == 0 || !IsDecimalDigit(last[i]) { + continue + } + // Go past object number. + for IsDecimalDigit(last[i]) && i > 0 { + i-- + } + if i == 0 { + continue // Probably too long to be a valid object... + } + + objOffset := this.GetFileOffset() - int64(bufLen-i) + + objstr := append(last[i+1:], b) + objNum, genNum, err := parseObjectNumberFromString(string(objstr)) if err != nil { - common.Log.Debug("ERROR: Unable to parse indirect object (%s)", err) + common.Log.Debug("Unable to parse object number: %v", err) return nil, err } - if indObj, ok := obj.(*PdfIndirectObject); ok { + // Create and insert the XREF entry if not existing, or the generation number is higher. + if curXref, has := xrefTable[objNum]; !has || curXref.generation < genNum { // Make the entry for the cross ref table. xrefEntry := XrefObject{} xrefEntry.xtype = XREF_TABLE_ENTRY - xrefEntry.objectNumber = int(indObj.ObjectNumber) - xrefEntry.generation = int(indObj.GenerationNumber) - xrefEntry.offset = curOffset - xrefTable[int(indObj.ObjectNumber)] = xrefEntry - } else if streamObj, ok := obj.(*PdfObjectStream); ok { - // Make the entry for the cross ref table. 
- xrefEntry := XrefObject{} - xrefEntry.xtype = XREF_TABLE_ENTRY - xrefEntry.objectNumber = int(streamObj.ObjectNumber) - xrefEntry.generation = int(streamObj.GenerationNumber) - xrefEntry.offset = curOffset - xrefTable[int(streamObj.ObjectNumber)] = xrefEntry - } else { - return nil, fmt.Errorf("Not an indirect object or stream (%T)", obj) // Should never happen. + xrefEntry.objectNumber = int(objNum) + xrefEntry.generation = int(genNum) + xrefEntry.offset = objOffset + xrefTable[objNum] = xrefEntry } - } else if string(peakBuf[0:6]) == "endobj" { - this.reader.Discard(6) - } else { - // Stop once we reach xrefs/trailer section etc. Technically this could fail for complex - // cases, but lets keep it simple for now. Add more complexity when needed (problematic user committed files). - // In general more likely that more complex files would have better understanding of the PDF standard. - common.Log.Debug("Not an object - stop repair rebuilding xref here (%s)", peakBuf) - break } + + last = append(last[1:bufLen], b) } return &xrefTable, nil From 78c6f01d03793454af25bfc50d10a5ad1e219cde Mon Sep 17 00:00:00 2001 From: Gunnsteinn Hall Date: Wed, 26 Apr 2017 23:23:29 +0000 Subject: [PATCH 6/7] Minor fix to allow null for colorspace entry in Resource dicts --- pdf/model/resources.go | 2 +- pdf/model/utils.go | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/pdf/model/resources.go b/pdf/model/resources.go index 56263be6..cc479f53 100644 --- a/pdf/model/resources.go +++ b/pdf/model/resources.go @@ -40,7 +40,7 @@ func NewPdfPageResourcesFromDict(dict *PdfObjectDictionary) (*PdfPageResources, if obj, isDefined := (*dict)["ExtGState"]; isDefined { r.ExtGState = obj } - if obj, isDefined := (*dict)["ColorSpace"]; isDefined { + if obj, isDefined := (*dict)["ColorSpace"]; isDefined && !isNullObject(obj) { colorspaces, err := newPdfPageResourcesColorspacesFromPdfObject(obj) if err != nil { return nil, err diff --git a/pdf/model/utils.go 
b/pdf/model/utils.go index 834beb32..14e3650b 100644 --- a/pdf/model/utils.go +++ b/pdf/model/utils.go @@ -28,6 +28,14 @@ func getNumberAsFloat(obj PdfObject) (float64, error) { return 0, errors.New("Not a number") } +func isNullObject(obj PdfObject) bool { + if _, isNull := obj.(*PdfObjectNull); isNull { + return true + } else { + return false + } +} + // Convert a list of pdf objects representing floats or integers to a slice of float64 values. func getNumbersAsFloat(objects []PdfObject) ([]float64, error) { floats := []float64{} From 5c5ad1bc14aee9b2e72f8babd5665ad7d7f6b804 Mon Sep 17 00:00:00 2001 From: Gunnsteinn Hall Date: Thu, 27 Apr 2017 22:25:00 +0000 Subject: [PATCH 7/7] Enhancements to tolerate more malformed PDFs. Fixes #47 --- pdf/core/parser.go | 28 +++++++++--- pdf/core/repairs.go | 101 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 123 insertions(+), 6 deletions(-) diff --git a/pdf/core/parser.go b/pdf/core/parser.go index e8fe9d4d..ed7ae4b3 100644 --- a/pdf/core/parser.go +++ b/pdf/core/parser.go @@ -647,7 +647,13 @@ func (this *PdfParser) parsePdfVersion() (int, int, error) { result1 := rePdfVersion.FindStringSubmatch(string(b)) if len(result1) < 3 { - common.Log.Debug("Error: PDF Version not found!") + major, minor, err := this.seekPdfVersionTopDown() + if err == nil { + common.Log.Debug("Failed recovery - unable to find version") + return 0, 0, err + } + + return major, minor, nil return 0, 0, errors.New("PDF version not found") } @@ -742,6 +748,7 @@ func (this *PdfParser) parseXrefTable() (*PdfObjectDictionary, error) { continue } if (len(txt) > 6) && (txt[:7] == "trailer") { + common.Log.Trace("Found trailer - %s", txt) // Sometimes get "trailer << ...." // Need to rewind to end of trailer text. if len(txt) > 9 { @@ -1001,8 +1008,17 @@ func (this *PdfParser) parseXref() (*PdfObjectDictionary, error) { return nil, err } } else { - common.Log.Debug("ERROR: Invalid xref.... 
starting with \"%s\"", string(bb)) - return nil, errors.New("Invalid xref format") + common.Log.Debug("Warning: Unable to find xref table or stream. Repair attempted: Looking for earliest xref from bottom.") + err := this.repairSeekXrefMarker() + if err != nil { + common.Log.Debug("Repair failed - %v", err) + return nil, err + } + + trailerDict, err = this.parseXrefTable() + if err != nil { + return nil, err + } } return trailerDict, err @@ -1103,7 +1119,6 @@ func (this *PdfParser) loadXrefs() (*PdfObjectDictionary, error) { return nil, errors.New("Startxref not found") } if len(result) > 2 { - // GH: Take the last one? Make a test case. common.Log.Debug("ERROR: Multiple startxref (%s)!", b2) return nil, errors.New("Multiple startxref entries?") } @@ -1165,8 +1180,9 @@ func (this *PdfParser) loadXrefs() (*PdfObjectDictionary, error) { ptrailerDict, err := this.parseXref() if err != nil { - common.Log.Debug("ERROR: Failed loading another (Prev) trailer") - return nil, err + common.Log.Debug("Warning: Error - Failed loading another (Prev) trailer") + common.Log.Debug("Attempting to continue by ignoring it") + break } xx, present = (*ptrailerDict)["Prev"] diff --git a/pdf/core/repairs.go b/pdf/core/repairs.go index 3a31e5d0..94be6dcb 100644 --- a/pdf/core/repairs.go +++ b/pdf/core/repairs.go @@ -183,3 +183,104 @@ func (this *PdfParser) repairRebuildXrefsTopDown() (*XrefTable, error) { return &xrefTable, nil } + +// Look for first sign of xref table from end of file. +func (this *PdfParser) repairSeekXrefMarker() error { + // Get the file size. + fSize, err := this.rs.Seek(0, os.SEEK_END) + if err != nil { + return err + } + + reXrefTableStart := regexp.MustCompile(`\sxref\s*`) + + // Define the starting point (from the end of the file) to search from. + var offset int64 = 0 + + // Define a buffer length in terms of how many bytes to read from the end of the file. 
+ var buflen int64 = 1000 + + for offset < fSize { + if fSize <= (buflen + offset) { + buflen = fSize - offset + } + + // Move back enough (as we need to read forward). + _, err := this.rs.Seek(-offset-buflen, os.SEEK_END) + if err != nil { + return err + } + + // Read the data. + b1 := make([]byte, buflen) + this.rs.Read(b1) + + common.Log.Trace("Looking for xref : \"%s\"", string(b1)) + ind := reXrefTableStart.FindAllStringIndex(string(b1), -1) + if ind != nil { + // Found it. + lastInd := ind[len(ind)-1] + common.Log.Trace("Ind: % d", ind) + this.rs.Seek(-offset-buflen+int64(lastInd[0]), os.SEEK_END) + this.reader = bufio.NewReader(this.rs) + // Go past whitespace, finish at 'x'. + for { + bb, err := this.reader.Peek(1) + if err != nil { + return err + } + common.Log.Trace("B: %d %c", bb[0], bb[0]) + if !IsWhiteSpace(bb[0]) { + break + } + this.reader.Discard(1) + } + + return nil + } else { + common.Log.Debug("Warning: EOF marker not found! - continue seeking") + } + + offset += buflen + } + + common.Log.Debug("Error: Xref table marker was not found.") + return errors.New("xref not found ") +} + +// Called when Pdf version not found normally. Looks for the PDF version by scanning top-down. +// %PDF-1.7 +func (this *PdfParser) seekPdfVersionTopDown() (int, int, error) { + // Go to beginning, reset reader. + this.rs.Seek(0, os.SEEK_SET) + this.reader = bufio.NewReader(this.rs) + + // Keep a running buffer of last bytes. + bufLen := 20 + last := make([]byte, bufLen) + + for { + b, err := this.reader.ReadByte() + if err != nil { + if err == io.EOF { + break + } else { + return 0, 0, err + } + } + + // Format: + // "PDF-" - major version digit - "." - minor version digit + // e.g. "%PDF-1.7" (the leading '%' itself is not checked) + if IsDecimalDigit(b) && last[bufLen-1] == '.' 
&& IsDecimalDigit(last[bufLen-2]) && last[bufLen-3] == '-' && + last[bufLen-4] == 'F' && last[bufLen-5] == 'D' && last[bufLen-6] == 'P' { + major := int(last[bufLen-2] - '0') + minor := int(b - '0') + return major, minor, nil + } + + last = append(last[1:bufLen], b) + } + + return 0, 0, errors.New("Version not found") +}