Merge branch 'v2' of https://github.com/unidoc/unidoc into up_v2_dev

* 'v2' of https://github.com/unidoc/unidoc:
  Enhancements to tolerate more malformed PDFs.  Fixes #47
  Minor fix to allow null for colorspace entry in Resource dicts
  Improved xref rebuild repair procedure. Closes #45.
  EOF marker seek more forgiving #46
  Fix issue with unremoved AES decrypted padding bytes #44
  Handle annotations with missing subtype as generic #43
  Annotations fix
This commit is contained in:
Peter Williams 2017-05-08 15:18:07 +10:00
commit 4134d42c76
8 changed files with 312 additions and 93 deletions

View File

@ -519,6 +519,7 @@ func (this *PdfCrypt) decryptBytes(buf []byte, filter string, okey []byte) ([]by
common.Log.Debug("ERROR AES invalid buf %s", buf)
return buf, fmt.Errorf("AES: Buf len < 16 (%d)", len(buf))
}
iv := buf[:16]
buf = buf[16:]
@ -534,8 +535,20 @@ func (this *PdfCrypt) decryptBytes(buf []byte, filter string, okey []byte) ([]by
common.Log.Trace("chop AES Decrypt (%d): % x", len(buf), buf)
mode.CryptBlocks(buf, buf)
common.Log.Trace("to (%d): % x", len(buf), buf)
//copy(buf[0:], buf[16:])
//common.Log.Debug("chop to (%d): % x", len(buf), buf)
if len(buf) == 0 {
common.Log.Trace("Empty buf, returning empty string")
return buf, nil
}
// The padded length is indicated by the last values. Remove those.
padLen := int(buf[len(buf)-1])
if padLen >= len(buf) {
common.Log.Debug("Illegal pad length")
return buf, fmt.Errorf("Invalid pad length")
}
buf = buf[:len(buf)-padLen]
return buf, nil
}
return nil, fmt.Errorf("Unsupported crypt filter method (%s)", cfMethod)

View File

@ -647,7 +647,13 @@ func (this *PdfParser) parsePdfVersion() (int, int, error) {
result1 := rePdfVersion.FindStringSubmatch(string(b))
if len(result1) < 3 {
	common.Log.Debug("Error: PDF Version not found!")

	// The version was not matched in the bytes examined above; fall back to
	// scanning the file from the top for a version signature.
	major, minor, err := this.seekPdfVersionTopDown()
	if err != nil {
		// Recovery failed as well: report the error from the scan.
		common.Log.Debug("Failed recovery - unable to find version")
		return 0, 0, err
	}

	// Recovery succeeded: use the version found by the top-down scan.
	return major, minor, nil
}
@ -742,6 +748,7 @@ func (this *PdfParser) parseXrefTable() (*PdfObjectDictionary, error) {
continue
}
if (len(txt) > 6) && (txt[:7] == "trailer") {
common.Log.Trace("Found trailer - %s", txt)
// Sometimes get "trailer << ...."
// Need to rewind to end of trailer text.
if len(txt) > 9 {
@ -1001,13 +1008,64 @@ func (this *PdfParser) parseXref() (*PdfObjectDictionary, error) {
return nil, err
}
} else {
common.Log.Debug("ERROR: Invalid xref.... starting with \"%s\"", string(bb))
return nil, errors.New("Invalid xref format")
common.Log.Debug("Warning: Unable to find xref table or stream. Repair attempted: Looking for earliest xref from bottom.")
err := this.repairSeekXrefMarker()
if err != nil {
common.Log.Debug("Repair failed - %v", err)
return nil, err
}
trailerDict, err = this.parseXrefTable()
if err != nil {
return nil, err
}
}
return trailerDict, err
}
// seekToEOFMarker looks for the last EOF marker (as matched by reEOF) and seeks this.rs to
// the beginning of that marker.
//
// fSize is the size of the file in bytes. The file is scanned backwards from its end in
// fixed-size chunks. An error is returned if a seek or read fails, or if the entire file
// has been scanned without finding a marker.
func (this *PdfParser) seekToEOFMarker(fSize int64) error {
	// Number of bytes (counted from the end of the file) that have already been scanned.
	var offset int64 = 0

	// Number of new bytes to scan in each step.
	var buflen int64 = 1000

	// Number of bytes of the previously scanned chunk that are read again, so that a marker
	// split across two chunks is not missed.
	// NOTE(review): assumes a reEOF match is no longer than overlap+1 bytes (e.g. "%%EOF") - confirm.
	const overlap int64 = 8

	for offset < fSize {
		if fSize <= (buflen + offset) {
			buflen = fSize - offset
		}

		// Nothing has been scanned yet on the first step, so there is nothing to overlap with.
		extra := overlap
		if extra > offset {
			extra = offset
		}

		// Move back enough (as we need to read forward).
		chunkStart := -offset - buflen
		if _, err := this.rs.Seek(chunkStart, os.SEEK_END); err != nil {
			return err
		}

		// Read the data. A short read is only treated as a failure when it comes with an error.
		b1 := make([]byte, buflen+extra)
		n, err := this.rs.Read(b1)
		if err != nil && n < len(b1) {
			common.Log.Debug("Failed reading while looking for EOF marker: %v", err)
			return err
		}
		b1 = b1[:n]
		common.Log.Trace("Looking for EOF marker: \"%s\"", string(b1))

		ind := reEOF.FindAllStringIndex(string(b1), -1)
		if ind == nil {
			common.Log.Debug("Warning: EOF marker not found! - continue seeking")
			offset += buflen
			continue
		}

		// Found it. Use the last match, i.e. the one closest to the end of the file.
		lastInd := ind[len(ind)-1]
		common.Log.Trace("Ind: % d", ind)
		if _, err := this.rs.Seek(chunkStart+int64(lastInd[0]), os.SEEK_END); err != nil {
			return err
		}
		return nil
	}

	common.Log.Debug("Error: EOF marker was not found.")
	return errors.New("EOF not found")
}
//
// Load the xrefs from the bottom of file prior to parsing the file.
// 1. Look for %%EOF marker, then
@ -1031,39 +1089,29 @@ func (this *PdfParser) loadXrefs() (*PdfObjectDictionary, error) {
this.xrefs = make(XrefTable)
this.objstms = make(ObjectStreams)
// Look for EOF marker and seek to its beginning.
// Define an offset position from the end of the file.
var offset int64 = 1000
// Get the file size.
fSize, err := this.rs.Seek(0, os.SEEK_END)
if err != nil {
return nil, err
}
common.Log.Trace("fsize: %d", fSize)
if fSize <= offset {
offset = fSize
}
_, err = this.rs.Seek(-offset, os.SEEK_END)
// Seek the EOF marker.
err = this.seekToEOFMarker(fSize)
if err != nil {
common.Log.Debug("Failed seek to eof marker: %v", err)
return nil, err
}
b1 := make([]byte, offset)
this.rs.Read(b1)
common.Log.Trace("Looking for EOF marker: \"%s\"", string(b1))
ind := reEOF.FindAllStringIndex(string(b1), -1)
if ind == nil {
common.Log.Debug("Error: EOF marker not found!")
return nil, errors.New("EOF marker not found")
}
lastInd := ind[len(ind)-1]
common.Log.Trace("Ind: % d", ind)
this.rs.Seek(-offset+int64(lastInd[0]), os.SEEK_END)
// Look for startxref and get the xref offset.
offset = 64
var offset int64 = 64
this.rs.Seek(-offset, os.SEEK_CUR)
b2 := make([]byte, offset)
this.rs.Read(b2)
_, err = this.rs.Read(b2)
if err != nil {
common.Log.Debug("Failed reading while looking for startxref: %v", err)
return nil, err
}
result := reStartXref.FindStringSubmatch(string(b2))
if len(result) < 2 {
@ -1071,7 +1119,6 @@ func (this *PdfParser) loadXrefs() (*PdfObjectDictionary, error) {
return nil, errors.New("Startxref not found")
}
if len(result) > 2 {
// GH: Take the last one? Make a test case.
common.Log.Debug("ERROR: Multiple startxref (%s)!", b2)
return nil, errors.New("Multiple startxref entries?")
}
@ -1133,8 +1180,9 @@ func (this *PdfParser) loadXrefs() (*PdfObjectDictionary, error) {
ptrailerDict, err := this.parseXref()
if err != nil {
common.Log.Debug("ERROR: Failed loading another (Prev) trailer")
return nil, err
common.Log.Debug("Warning: Error - Failed loading another (Prev) trailer")
common.Log.Debug("Attempting to continue by ignoring it")
break
}
xx, present = (*ptrailerDict)["Prev"]

View File

@ -13,6 +13,10 @@ import (
"os"
"regexp"
"bufio"
"io"
"strconv"
"github.com/unidoc/unidoc/common"
)
@ -77,9 +81,22 @@ func (this *PdfParser) rebuildXrefTable() error {
return nil
}
// parseObjectNumberFromString parses and returns the object and generation number from a
// string such as "12 0 obj" -> (12, 0, nil).
//
// An error is returned if str does not match reIndirectObject, or if either captured number
// cannot be converted to an int (for example when the digit sequence overflows int).
func parseObjectNumberFromString(str string) (int, int, error) {
	result := reIndirectObject.FindStringSubmatch(str)
	if len(result) < 3 {
		return 0, 0, errors.New("Unable to detect indirect object signature")
	}

	// Do not discard conversion errors: a number that does not fit in an int would otherwise
	// be returned silently as a bogus object/generation number.
	on, err := strconv.Atoi(result[1])
	if err != nil {
		return 0, 0, err
	}
	gn, err := strconv.Atoi(result[2])
	if err != nil {
		return 0, 0, err
	}

	return on, gn, nil
}
// Parse the entire file from top down.
// Currently not supporting object streams...
// Also need to detect object streams and load the object numbers.
// Goes through the file byte-by-byte looking for "<num> <generation> obj" patterns.
// N.B. This collects the XREF_TABLE_ENTRY data only.
func (this *PdfParser) repairRebuildXrefsTopDown() (*XrefTable, error) {
if this.repairsAttempted {
// Avoid multiple repairs (only try once).
@ -87,60 +104,183 @@ func (this *PdfParser) repairRebuildXrefsTopDown() (*XrefTable, error) {
}
this.repairsAttempted = true
reRepairIndirectObject := regexp.MustCompile(`^(\d+)\s+(\d+)\s+obj`)
// Go to beginning, reset reader.
this.rs.Seek(0, os.SEEK_SET)
this.reader = bufio.NewReader(this.rs)
this.SetFileOffset(0)
// Keep a running buffer of last bytes.
bufLen := 20
last := make([]byte, bufLen)
xrefTable := XrefTable{}
for {
this.skipComments()
curOffset := this.GetFileOffset()
peakBuf, err := this.reader.Peek(10)
b, err := this.reader.ReadByte()
if err != nil {
// EOF
break
if err == io.EOF {
break
} else {
return nil, err
}
}
// Indirect object?
results := reRepairIndirectObject.FindIndex(peakBuf)
if len(results) > 0 {
obj, err := this.ParseIndirectObject()
// Format:
// object number - whitespace - generation number - obj
// e.g. "12 0 obj"
if b == 'j' && last[bufLen-1] == 'b' && last[bufLen-2] == 'o' && IsWhiteSpace(last[bufLen-3]) {
i := bufLen - 4
// Go past whitespace
for IsWhiteSpace(last[i]) && i > 0 {
i--
}
if i == 0 || !IsDecimalDigit(last[i]) {
continue
}
// Go past generation number
for IsDecimalDigit(last[i]) && i > 0 {
i--
}
if i == 0 || !IsWhiteSpace(last[i]) {
continue
}
// Go past whitespace
for IsWhiteSpace(last[i]) && i > 0 {
i--
}
if i == 0 || !IsDecimalDigit(last[i]) {
continue
}
// Go past object number.
for IsDecimalDigit(last[i]) && i > 0 {
i--
}
if i == 0 {
continue // Probably too long to be a valid object...
}
objOffset := this.GetFileOffset() - int64(bufLen-i)
objstr := append(last[i+1:], b)
objNum, genNum, err := parseObjectNumberFromString(string(objstr))
if err != nil {
common.Log.Debug("ERROR: Unable to parse indirect object (%s)", err)
common.Log.Debug("Unable to parse object number: %v", err)
return nil, err
}
if indObj, ok := obj.(*PdfIndirectObject); ok {
// Create and insert the XREF entry if not existing, or the generation number is higher.
if curXref, has := xrefTable[objNum]; !has || curXref.generation < genNum {
// Make the entry for the cross ref table.
xrefEntry := XrefObject{}
xrefEntry.xtype = XREF_TABLE_ENTRY
xrefEntry.objectNumber = int(indObj.ObjectNumber)
xrefEntry.generation = int(indObj.GenerationNumber)
xrefEntry.offset = curOffset
xrefTable[int(indObj.ObjectNumber)] = xrefEntry
} else if streamObj, ok := obj.(*PdfObjectStream); ok {
// Make the entry for the cross ref table.
xrefEntry := XrefObject{}
xrefEntry.xtype = XREF_TABLE_ENTRY
xrefEntry.objectNumber = int(streamObj.ObjectNumber)
xrefEntry.generation = int(streamObj.GenerationNumber)
xrefEntry.offset = curOffset
xrefTable[int(streamObj.ObjectNumber)] = xrefEntry
} else {
return nil, fmt.Errorf("Not an indirect object or stream (%T)", obj) // Should never happen.
xrefEntry.objectNumber = int(objNum)
xrefEntry.generation = int(genNum)
xrefEntry.offset = objOffset
xrefTable[objNum] = xrefEntry
}
} else if string(peakBuf[0:6]) == "endobj" {
this.reader.Discard(6)
} else {
// Stop once we reach xrefs/trailer section etc. Technically this could fail for complex
// cases, but lets keep it simple for now. Add more complexity when needed (problematic user committed files).
// In general more likely that more complex files would have better understanding of the PDF standard.
common.Log.Debug("Not an object - stop repair rebuilding xref here (%s)", peakBuf)
break
}
last = append(last[1:bufLen], b)
}
return &xrefTable, nil
}
// repairSeekXrefMarker looks for an "xref" table marker, scanning backwards from the end of
// the file in fixed-size chunks.
//
// On success this.rs is positioned at the last marker found in the first chunk (from the
// end) that contains one, this.reader is reset to read from that position, and any leading
// whitespace is skipped so that the next byte to be read is the 'x' of "xref".
// An error is returned if a seek, read or peek fails, or if no marker exists in the file.
func (this *PdfParser) repairSeekXrefMarker() error {
	// Get the file size.
	fSize, err := this.rs.Seek(0, os.SEEK_END)
	if err != nil {
		return err
	}

	reXrefTableStart := regexp.MustCompile(`\sxref\s*`)

	// Number of bytes (counted from the end of the file) that have already been scanned.
	var offset int64 = 0

	// Number of new bytes to scan in each step.
	var buflen int64 = 1000

	// Number of bytes of the previously scanned chunk that are read again, so that a marker
	// split across two chunks is not missed (the mandatory part of the pattern is 5 bytes).
	const overlap int64 = 8

	for offset < fSize {
		if fSize <= (buflen + offset) {
			buflen = fSize - offset
		}

		// Nothing has been scanned yet on the first step, so there is nothing to overlap with.
		extra := overlap
		if extra > offset {
			extra = offset
		}

		// Move back enough (as we need to read forward).
		chunkStart := -offset - buflen
		if _, err := this.rs.Seek(chunkStart, os.SEEK_END); err != nil {
			return err
		}

		// Read the data.
		b1 := make([]byte, buflen+extra)
		if _, err := io.ReadFull(this.rs, b1); err != nil {
			common.Log.Debug("Failed reading while looking for xref: %v", err)
			return err
		}
		common.Log.Trace("Looking for xref : \"%s\"", string(b1))

		ind := reXrefTableStart.FindAllStringIndex(string(b1), -1)
		if ind == nil {
			common.Log.Debug("Warning: xref marker not found! - continue seeking")
			offset += buflen
			continue
		}

		// Found it. Use the last match in this chunk.
		lastInd := ind[len(ind)-1]
		common.Log.Trace("Ind: % d", ind)
		if _, err := this.rs.Seek(chunkStart+int64(lastInd[0]), os.SEEK_END); err != nil {
			return err
		}
		this.reader = bufio.NewReader(this.rs)

		// The match starts with a whitespace character. Go past whitespace, finish at 'x'.
		for {
			bb, err := this.reader.Peek(1)
			if err != nil {
				return err
			}
			common.Log.Trace("B: %d %c", bb[0], bb[0])
			if !IsWhiteSpace(bb[0]) {
				break
			}
			this.reader.Discard(1)
		}

		return nil
	}

	common.Log.Debug("Error: Xref table marker was not found.")
	return errors.New("xref not found ")
}
// seekPdfVersionTopDown is called when the PDF version is not found normally. It looks for
// the version by scanning the file byte-by-byte from the top.
//
// The signature searched for is "PDF-<major>.<minor>" with a single decimal digit for each
// number, as in "%PDF-1.7". The leading '%' is not checked. The first occurrence wins.
// Returns (major, minor, nil) on success, or an error if seeking or reading fails or if the
// end of the file is reached without finding a signature.
func (this *PdfParser) seekPdfVersionTopDown() (int, int, error) {
	// Go to beginning, reset reader.
	if _, err := this.rs.Seek(0, os.SEEK_SET); err != nil {
		return 0, 0, err
	}
	this.reader = bufio.NewReader(this.rs)

	// Keep a running buffer of last bytes (the most recent byte is at the end).
	bufLen := 20
	last := make([]byte, bufLen)

	for {
		b, err := this.reader.ReadByte()
		if err != nil {
			if err == io.EOF {
				break
			}
			return 0, 0, err
		}

		// Format:
		// "PDF-" - major digit - '.' - minor digit
		// e.g. "PDF-1.7", where b is the minor digit and the rest is in the running buffer.
		if IsDecimalDigit(b) && last[bufLen-1] == '.' && IsDecimalDigit(last[bufLen-2]) && last[bufLen-3] == '-' &&
			last[bufLen-4] == 'F' && last[bufLen-5] == 'D' && last[bufLen-6] == 'P' {
			major := int(last[bufLen-2] - '0')
			minor := int(b - '0')
			return major, minor, nil
		}

		// Slide the buffer: drop the oldest byte and add the newest one at the end.
		last = append(last[1:bufLen], b)
	}

	return 0, 0, errors.New("Version not found")
}

View File

@ -680,10 +680,13 @@ func (r *PdfReader) newPdfAnnotationFromIndirectObject(container *PdfIndirectObj
subtypeObj, has := (*d)["Subtype"]
if !has {
return nil, fmt.Errorf("Missing Subtype")
common.Log.Debug("WARNING: Compatibility issue - annotation Subtype missing - assuming no subtype")
annot.context = nil
return annot, nil
}
subtype, ok := subtypeObj.(*PdfObjectName)
if !ok {
common.Log.Debug("ERROR: Invalid Subtype object type != name (%T)", subtypeObj)
return nil, fmt.Errorf("Invalid Subtype object type != name (%T)", subtypeObj)
}
switch *subtype {
@ -1597,6 +1600,8 @@ func (this *PdfAnnotation) GetContainingPdfObject() PdfObject {
return this.primitive
}
// Note: Call the sub-annotation's ToPdfObject to set both the generic and non-generic information.
// TODO/FIXME: Consider doing it here instead.
func (this *PdfAnnotation) ToPdfObject() PdfObject {
container := this.primitive
d := container.PdfObject.(*PdfObjectDictionary)

View File

@ -23,23 +23,22 @@ import (
// PDF page object (7.7.3.3 - Table 30).
type PdfPage struct {
Parent PdfObject
LastModified *PdfDate
Resources *PdfPageResources
CropBox *PdfRectangle
MediaBox *PdfRectangle
BleedBox *PdfRectangle
TrimBox *PdfRectangle
ArtBox *PdfRectangle
BoxColorInfo PdfObject
Contents PdfObject
Rotate *int64
Group PdfObject
Thumb PdfObject
B PdfObject
Dur PdfObject
Trans PdfObject
//Annots PdfObject
Parent PdfObject
LastModified *PdfDate
Resources *PdfPageResources
CropBox *PdfRectangle
MediaBox *PdfRectangle
BleedBox *PdfRectangle
TrimBox *PdfRectangle
ArtBox *PdfRectangle
BoxColorInfo PdfObject
Contents PdfObject
Rotate *int64
Group PdfObject
Thumb PdfObject
B PdfObject
Dur PdfObject
Trans PdfObject
AA PdfObject
Metadata PdfObject
PieceInfo PdfObject
@ -52,7 +51,7 @@ type PdfPage struct {
PresSteps PdfObject
UserUnit PdfObject
VP PdfObject
//Annotations
Annotations []*PdfAnnotation
// Primitive container.
@ -477,7 +476,12 @@ func (this *PdfPage) GetPageDict() *PdfObjectDictionary {
if this.Annotations != nil {
arr := PdfObjectArray{}
for _, annot := range this.Annotations {
arr = append(arr, annot.GetContext().ToPdfObject())
if subannot := annot.GetContext(); subannot != nil {
arr = append(arr, subannot.ToPdfObject())
} else {
// Generic annotation dict (without subtype).
arr = append(arr, annot.ToPdfObject())
}
}
p.Set("Annots", &arr)
}

View File

@ -462,11 +462,6 @@ func (this *PdfReader) loadForms() (*PdfAcroForm, error) {
common.Log.Trace("Has Acro forms")
// Load it.
acroForm, err := this.newPdfAcroFormFromDict(formsDict)
if err != nil {
return nil, err
}
// Ensure we have access to everything.
common.Log.Trace("Traverse the Acroforms structure")
err = this.traverseObjectData(formsDict)
@ -475,6 +470,12 @@ func (this *PdfReader) loadForms() (*PdfAcroForm, error) {
return nil, err
}
// Create the acro forms object.
acroForm, err := this.newPdfAcroFormFromDict(formsDict)
if err != nil {
return nil, err
}
return acroForm, nil
}

View File

@ -40,7 +40,7 @@ func NewPdfPageResourcesFromDict(dict *PdfObjectDictionary) (*PdfPageResources,
if obj, isDefined := (*dict)["ExtGState"]; isDefined {
r.ExtGState = obj
}
if obj, isDefined := (*dict)["ColorSpace"]; isDefined {
if obj, isDefined := (*dict)["ColorSpace"]; isDefined && !isNullObject(obj) {
colorspaces, err := newPdfPageResourcesColorspacesFromPdfObject(obj)
if err != nil {
return nil, err

View File

@ -28,6 +28,14 @@ func getNumberAsFloat(obj PdfObject) (float64, error) {
return 0, errors.New("Not a number")
}
// isNullObject reports whether obj is a PDF null object, i.e. whether its dynamic type
// is *PdfObjectNull.
func isNullObject(obj PdfObject) bool {
	_, isNull := obj.(*PdfObjectNull)
	return isNull
}
// Convert a list of pdf objects representing floats or integers to a slice of float64 values.
func getNumbersAsFloat(objects []PdfObject) ([]float64, error) {
floats := []float64{}