Merge pull request #28 from unidoc/ud-42

UD-42 (Repair broken xrefs)
This commit is contained in:
Alfred Hall 2016-08-17 12:12:35 +00:00 committed by GitHub
commit e9dec61b51
5 changed files with 190 additions and 78 deletions

View File

@ -245,7 +245,18 @@ func (this *PdfParser) lookupByNumber(objNumber int, attemptRepairs bool) (PdfOb
obj, err := this.parseIndirectObject() obj, err := this.parseIndirectObject()
if err != nil { if err != nil {
common.Log.Error("Failed reading xref") common.Log.Error("Failed reading xref (%s)", err)
// Offset pointing to a non-object. Try to repair the file.
if attemptRepairs {
common.Log.Error("Attempting to repair xrefs (top down)")
xrefTable, err := this.repairRebuildXrefsTopDown()
if err != nil {
common.Log.Error("Failed repair (%s)", err)
return nil, false, err
}
this.xrefs = *xrefTable
return this.lookupByNumber(objNumber, false)
}
return nil, false, err return nil, false, err
} }
@ -334,29 +345,6 @@ func (this *PdfParser) Trace(obj PdfObject) (PdfObject, error) {
return o, nil return o, nil
} }
func (this *PdfParser) rebuildXrefTable() error {
newXrefs := XrefTable{}
for objNum, xref := range this.xrefs {
obj, _, err := this.lookupByNumberWrapper(objNum, false)
if err != nil {
return err
}
actObjNum, actGenNum, err := getObjectNumber(obj)
if err != nil {
return err
}
xref.objectNumber = int(actObjNum)
xref.generation = int(actGenNum)
newXrefs[int(actObjNum)] = xref
}
this.xrefs = newXrefs
common.Log.Debug("New xref table built")
printXrefTable(this.xrefs)
return nil
}
func printXrefTable(xrefTable XrefTable) { func printXrefTable(xrefTable XrefTable) {
common.Log.Debug("=X=X=X=") common.Log.Debug("=X=X=X=")
common.Log.Debug("Xref table:") common.Log.Debug("Xref table:")

View File

@ -24,7 +24,7 @@ import (
var rePdfVersion = regexp.MustCompile(`%PDF-(\d\.\d)`) var rePdfVersion = regexp.MustCompile(`%PDF-(\d\.\d)`)
var reEOF = regexp.MustCompile("%%EOF") var reEOF = regexp.MustCompile("%%EOF")
var reXrefTable = regexp.MustCompile(`\s*xref\s*`) var reXrefTable = regexp.MustCompile(`\s*xref\s*`)
var reStartXref = regexp.MustCompile(`startxref\s*(\d+)`) var reStartXref = regexp.MustCompile(`startx?ref\s*(\d+)`)
var reNumeric = regexp.MustCompile(`^[\+-.]*([0-9.]+)`) var reNumeric = regexp.MustCompile(`^[\+-.]*([0-9.]+)`)
var reExponential = regexp.MustCompile(`^[\+-.]*([0-9.]+)e[\+-.]*([0-9.]+)`) var reExponential = regexp.MustCompile(`^[\+-.]*([0-9.]+)e[\+-.]*([0-9.]+)`)
var reReference = regexp.MustCompile(`^\s*(\d+)\s+(\d+)\s+R`) var reReference = regexp.MustCompile(`^\s*(\d+)\s+(\d+)\s+R`)
@ -1070,13 +1070,22 @@ func (this *PdfParser) loadXrefs() (*PdfObjectDictionary, error) {
return nil, errors.New("Startxref not found") return nil, errors.New("Startxref not found")
} }
if len(result) > 2 { if len(result) > 2 {
// GH: Take the last one? // GH: Take the last one? Make a test case.
common.Log.Error("Multiple startxref (%s)!", b2) common.Log.Error("Multiple startxref (%s)!", b2)
return nil, errors.New("Multiple startxref entries?") return nil, errors.New("Multiple startxref entries?")
} }
offsetXref, _ := strconv.Atoi(result[1]) offsetXref, _ := strconv.ParseInt(result[1], 10, 64)
common.Log.Debug("startxref at %d", offsetXref) common.Log.Debug("startxref at %d", offsetXref)
if offsetXref > fSize {
common.Log.Error("Xref offset outside of file")
common.Log.Error("Attempting repair")
offsetXref, err = this.repairLocateXref()
if err != nil {
common.Log.Error("Repair attempt failed (%s)")
return nil, err
}
}
// Read the xref. // Read the xref.
this.rs.Seek(int64(offsetXref), os.SEEK_SET) this.rs.Seek(int64(offsetXref), os.SEEK_SET)
this.reader = bufio.NewReader(this.rs) this.reader = bufio.NewReader(this.rs)

View File

@ -191,10 +191,7 @@ func (this *PdfReader) GetOutlines() ([]*PdfIndirectObject, error) {
return outlinesList, err return outlinesList, err
} }
common.Log.Debug("Traverse outlines") common.Log.Debug("Traverse outlines")
nofollowList := map[PdfObjectName]bool{ err = this.traverseObjectData(outlinesObj)
"Parent": true,
}
err = this.traverseObjectData(outlinesObj, nofollowList)
if err != nil { if err != nil {
return nil, err return nil, err
} }
@ -269,10 +266,7 @@ func (this *PdfReader) GetForms() (*PdfObjectDictionary, error) {
common.Log.Debug("Has Acro forms") common.Log.Debug("Has Acro forms")
common.Log.Debug("Traverse the Acroforms structure") common.Log.Debug("Traverse the Acroforms structure")
nofollowList := map[PdfObjectName]bool{ err := this.traverseObjectData(formsDict)
"Parent": true,
}
err := this.traverseObjectData(formsDict, nofollowList)
if err != nil { if err != nil {
common.Log.Error("Unable to traverse AcroForms (%s)", err) common.Log.Error("Unable to traverse AcroForms (%s)", err)
return nil, err return nil, err
@ -317,13 +311,8 @@ func (this *PdfReader) buildToc(node *PdfIndirectObject, parent *PdfIndirectObje
(*nodeDict)["Parent"] = parent (*nodeDict)["Parent"] = parent
} }
// Resolve the object recursively, not following Parents or Kids fields. // Resolve the object recursively.
// Later can refactor and use only one smart recursive function. err := this.traverseObjectData(node)
nofollowList := map[PdfObjectName]bool{
"Parent": true,
"Kids": true,
}
err := this.traverseObjectData(node, nofollowList)
if err != nil { if err != nil {
return err return err
} }
@ -348,21 +337,9 @@ func (this *PdfReader) buildToc(node *PdfIndirectObject, parent *PdfIndirectObje
} }
common.Log.Debug("Kids: %s", kids) common.Log.Debug("Kids: %s", kids)
for idx, child := range *kids { for idx, child := range *kids {
child, ok := child.(*PdfIndirectObject)
childRef, ok := child.(*PdfObjectReference)
if !ok { if !ok {
return errors.New("Invalid kid, non-reference") common.Log.Error("Page not indirect object - (%s)", child)
}
common.Log.Debug("look up ref %s", childRef)
pchild, err := this.parser.LookupByReference(*childRef)
if err != nil {
common.Log.Error("Unable to lookup page ref")
return errors.New("Unable to lookup page ref")
}
child, ok := pchild.(*PdfIndirectObject)
if !ok {
common.Log.Error("Page not indirect object - %s (%s)", childRef, pchild)
return errors.New("Page not indirect object") return errors.New("Page not indirect object")
} }
(*kids)[idx] = child (*kids)[idx] = child
@ -402,11 +379,10 @@ func (this *PdfReader) resolveReference(ref *PdfObjectReference) (PdfObject, boo
/* /*
* Recursively traverse through the page object data and look up * Recursively traverse through the page object data and look up
* references to indirect objects. * references to indirect objects.
* GH: Consider to define a smarter traversing engine, defining explicitly *
* - how deep we can go in terms of following certain Trees by name etc. * GH: Are we fully protected against circular references? (Add tests).
* GH: Are we fully protected against circular references?
*/ */
func (this *PdfReader) traverseObjectData(o PdfObject, nofollowKeys map[PdfObjectName]bool) error { func (this *PdfReader) traverseObjectData(o PdfObject) error {
common.Log.Debug("Traverse object data") common.Log.Debug("Traverse object data")
if _, isTraversed := this.traversed[o]; isTraversed { if _, isTraversed := this.traversed[o]; isTraversed {
return nil return nil
@ -416,37 +392,30 @@ func (this *PdfReader) traverseObjectData(o PdfObject, nofollowKeys map[PdfObjec
if io, isIndirectObj := o.(*PdfIndirectObject); isIndirectObj { if io, isIndirectObj := o.(*PdfIndirectObject); isIndirectObj {
common.Log.Debug("io: %s", io) common.Log.Debug("io: %s", io)
common.Log.Debug("- %s", io.PdfObject) common.Log.Debug("- %s", io.PdfObject)
err := this.traverseObjectData(io.PdfObject, nofollowKeys) err := this.traverseObjectData(io.PdfObject)
return err return err
} }
if so, isStreamObj := o.(*PdfObjectStream); isStreamObj { if so, isStreamObj := o.(*PdfObjectStream); isStreamObj {
err := this.traverseObjectData(so.PdfObjectDictionary, nofollowKeys) err := this.traverseObjectData(so.PdfObjectDictionary)
return err return err
} }
if dict, isDict := o.(*PdfObjectDictionary); isDict { if dict, isDict := o.(*PdfObjectDictionary); isDict {
common.Log.Debug("- dict: %s", dict) common.Log.Debug("- dict: %s", dict)
for name, v := range *dict { for name, v := range *dict {
if nofollowKeys != nil {
if _, nofollow := nofollowKeys[name]; nofollow {
// Do not retraverse up the tree.
continue
}
}
if ref, isRef := v.(*PdfObjectReference); isRef { if ref, isRef := v.(*PdfObjectReference); isRef {
resolvedObj, _, err := this.resolveReference(ref) resolvedObj, _, err := this.resolveReference(ref)
if err != nil { if err != nil {
return err return err
} }
(*dict)[name] = resolvedObj (*dict)[name] = resolvedObj
err = this.traverseObjectData(resolvedObj, nofollowKeys) err = this.traverseObjectData(resolvedObj)
if err != nil { if err != nil {
return err return err
} }
} else { } else {
err := this.traverseObjectData(v, nofollowKeys) err := this.traverseObjectData(v)
if err != nil { if err != nil {
return err return err
} }
@ -465,12 +434,12 @@ func (this *PdfReader) traverseObjectData(o PdfObject, nofollowKeys map[PdfObjec
} }
(*arr)[idx] = resolvedObj (*arr)[idx] = resolvedObj
err = this.traverseObjectData(resolvedObj, nofollowKeys) err = this.traverseObjectData(resolvedObj)
if err != nil { if err != nil {
return err return err
} }
} else { } else {
err := this.traverseObjectData(v, nofollowKeys) err := this.traverseObjectData(v)
if err != nil { if err != nil {
return err return err
} }
@ -557,11 +526,8 @@ func (this *PdfReader) GetPage(pageNumber int) (PdfObject, error) {
} }
page := this.pageList[pageNumber-1] page := this.pageList[pageNumber-1]
nofollowList := map[PdfObjectName]bool{
"Parent": true,
}
// Look up all references related to page and load everything. // Look up all references related to page and load everything.
err := this.traverseObjectData(page, nofollowList) err := this.traverseObjectData(page)
if err != nil { if err != nil {
return nil, err return nil, err
} }

140
pdf/repairs.go Normal file
View File

@ -0,0 +1,140 @@
/*
* This file is subject to the terms and conditions defined in
* file 'LICENSE.md', which is part of this source code package.
*/
// Routines related to repairing malformed pdf files.
package pdf
import (
"errors"
"fmt"
"os"
"regexp"
"github.com/unidoc/unidoc/common"
)
var repairReXrefTable = regexp.MustCompile(`[\r\n]\s*(xref)\s*[\r\n]`)
// Locates a standard Xref table by looking for the "xref" entry.
// Xref object stream not supported.
//
// The search window is the (up to) 1000 bytes that precede the current
// position of the underlying reader; the caller is expected to have left the
// reader positioned after the region to scan. If fewer than 1000 bytes
// precede the current position, the window is shrunk to what is available
// rather than attempting to seek before the start of the data.
//
// Returns the absolute offset at which the last match of repairReXrefTable
// starts, or an error if seeking/reading fails or no match is found.
func (this *PdfParser) repairLocateXref() (int64, error) {
	readBuf := int64(1000)

	// Find out where we are so that the window can be clamped.
	endOffset, err := this.rs.Seek(0, os.SEEK_CUR)
	if err != nil {
		return 0, err
	}
	if readBuf > endOffset {
		readBuf = endOffset
	}

	// Step back to the start of the window; the returned position is the
	// base offset that local match positions are relative to.
	curOffset, err := this.rs.Seek(-readBuf, os.SEEK_CUR)
	if err != nil {
		return 0, err
	}

	b2 := make([]byte, readBuf)
	n, err := this.rs.Read(b2)
	if err != nil && n == 0 {
		return 0, err
	}
	// Only search the bytes that were actually read, not the zero padding
	// left behind by a short read.
	b2 = b2[:n]

	results := repairReXrefTable.FindAllIndex(b2, -1)
	if len(results) < 1 {
		common.Log.Error("Repair: xref not found!")
		return 0, errors.New("Repair: xref not found")
	}

	// Use the last occurrence in the window.
	localOffset := int64(results[len(results)-1][0])
	xrefOffset := curOffset + localOffset
	return xrefOffset, nil
}
// Renumbers the xref table.
// Useful when the cross reference is pointing to an object with the wrong number.
//
// Each entry of the current table is looked up and re-keyed under the object
// number reported by getObjectNumber for the object found there. If any
// lookup fails, the table is instead replaced wholesale by the result of
// repairRebuildXrefsTopDown.
func (this *PdfParser) rebuildXrefTable() error {
	rebuilt := XrefTable{}

	for number, entry := range this.xrefs {
		resolved, _, lookupErr := this.lookupByNumberWrapper(number, false)
		if lookupErr != nil {
			// The table cannot be trusted at all; fall back to a full rescan.
			common.Log.Error("Unable to look up object (%s)", lookupErr)
			common.Log.Error("Xref table completely broken - attempting to repair ")

			repaired, repairErr := this.repairRebuildXrefsTopDown()
			if repairErr != nil {
				common.Log.Error("Failed xref rebuild repair (%s)", repairErr)
				return repairErr
			}

			this.xrefs = *repaired
			common.Log.Debug("Repaired xref table built")
			return nil
		}

		actualNum, actualGen, numErr := getObjectNumber(resolved)
		if numErr != nil {
			return numErr
		}

		// entry is a copy of the map value; patch it and file it under the
		// object number found in the object itself.
		entry.objectNumber = int(actualNum)
		entry.generation = int(actualGen)
		rebuilt[int(actualNum)] = entry
	}

	this.xrefs = rebuilt
	common.Log.Debug("New xref table built")
	printXrefTable(this.xrefs)
	return nil
}
// Matches the header of an indirect object ("<number> <generation> obj") at
// the very start of the inspected data.
var repairReIndirectObject = regexp.MustCompile(`^(\d+)\s+(\d+)\s+obj`)

// Parse the entire file from top down.
// Currently not supporting object streams...
// Also need to detect object streams and load the object numbers.
//
// Starting at offset 0, repeatedly parses indirect objects and records, for
// each one, an XREF_TABLE_ENTRY keyed by its object number with the offset at
// which the object started. Scanning stops at the first position that holds
// neither an indirect object header nor an "endobj" keyword, or when fewer
// than 10 bytes remain. Returns the table built so far, or an error if an
// object fails to parse.
func (this *PdfParser) repairRebuildXrefsTopDown() (*XrefTable, error) {
	const (
		// Fewer remaining bytes than this is treated as end of data.
		minPeekLen = 10
		// Look-ahead size. Must be long enough to hold a complete object
		// header: a 10 byte look-ahead cannot contain "10000 0 obj", so object
		// numbers with five or more digits would end the scan prematurely.
		peekLen = 32
	)

	this.SetFileOffset(0)

	xrefTable := XrefTable{}
	for {
		this.skipComments()

		curOffset := this.GetFileOffset()

		// Peek returns whatever is available together with an error when
		// fewer than peekLen bytes are left; keep going as long as there is
		// enough data to inspect.
		peakBuf, err := this.reader.Peek(peekLen)
		if err != nil && len(peakBuf) < minPeekLen {
			// EOF
			break
		}

		switch {
		case repairReIndirectObject.Match(peakBuf):
			// Indirect object.
			obj, err := this.parseIndirectObject()
			if err != nil {
				common.Log.Error("Unable to parse indirect object (%s)", err)
				return nil, err
			}

			var objNum, genNum int
			switch t := obj.(type) {
			case *PdfIndirectObject:
				objNum, genNum = int(t.ObjectNumber), int(t.GenerationNumber)
			case *PdfObjectStream:
				objNum, genNum = int(t.ObjectNumber), int(t.GenerationNumber)
			default:
				return nil, fmt.Errorf("Not an indirect object or stream (%T)", obj) // Should never happen.
			}

			// Make the entry for the cross ref table.
			xrefTable[objNum] = XrefObject{
				xtype:        XREF_TABLE_ENTRY,
				objectNumber: objNum,
				generation:   genNum,
				offset:       curOffset,
			}
		case string(peakBuf[0:6]) == "endobj":
			this.reader.Discard(6)
		default:
			// Stop once we reach xrefs/trailer section etc. Technically this could fail for complex
			// cases, but lets keep it simple for now. Add more complexity when needed (problematic user committed files).
			// In general more likely that more complex files would have better understanding of the PDF standard.
			common.Log.Debug("Not an object - stop repair rebuilding xref here (%s)", peakBuf)
			return &xrefTable, nil
		}
	}

	return &xrefTable, nil
}

View File

@ -184,7 +184,16 @@ func (this *PdfWriter) addObjects(obj PdfObject) error {
if err != nil { if err != nil {
return err return err
} }
} else {
// How to handle the parent? Make sure it is present?
if parentObj, parentIsRef := (*dict)["Parent"].(*PdfObjectReference); parentIsRef {
// Parent is a reference. Means we can drop it?
// Could refer to somewhere outside of the scope of the output doc.
// Should be done by the reader already.
// -> ERROR.
common.Log.Error("Parent is a reference object - Cannot be in writer (needs to be resolved)")
return fmt.Errorf("Parent is a reference object - Cannot be in writer (needs to be resolved) - %s", parentObj)
}
} }
} }
return nil return nil