mirror of
https://github.com/unidoc/unipdf.git
synced 2025-04-27 13:48:51 +08:00
288 lines
7.4 KiB
Go
288 lines
7.4 KiB
Go
/*
 * This file is subject to the terms and conditions defined in
 * file 'LICENSE.md', which is part of this source code package.
 */
|
|
|
|
// Routines related to repairing malformed pdf files.
|
|
|
|
package core
|
|
|
|
import (
|
|
"errors"
|
|
"fmt"
|
|
"os"
|
|
"regexp"
|
|
|
|
"bufio"
|
|
"io"
|
|
"strconv"
|
|
|
|
"github.com/unidoc/unipdf/v3/common"
|
|
)
|
|
|
|
// repairReXrefTable matches the "xref" keyword standing alone on a line: a CR or LF,
// optional whitespace, "xref", optional whitespace and a closing CR or LF.
var repairReXrefTable = regexp.MustCompile(`[\r\n]\s*(xref)\s*[\r\n]`)
|
|
|
|
// Locates a standard Xref table by looking for the "xref" entry.
|
|
// Xref object stream not supported.
|
|
func (parser *PdfParser) repairLocateXref() (int64, error) {
|
|
readBuf := int64(1000)
|
|
parser.rs.Seek(-readBuf, os.SEEK_CUR)
|
|
|
|
curOffset, err := parser.rs.Seek(0, os.SEEK_CUR)
|
|
if err != nil {
|
|
return 0, err
|
|
}
|
|
b2 := make([]byte, readBuf)
|
|
parser.rs.Read(b2)
|
|
|
|
results := repairReXrefTable.FindAllStringIndex(string(b2), -1)
|
|
if len(results) < 1 {
|
|
common.Log.Debug("ERROR: Repair: xref not found!")
|
|
return 0, errors.New("repair: xref not found")
|
|
}
|
|
|
|
localOffset := int64(results[len(results)-1][0])
|
|
xrefOffset := curOffset + localOffset
|
|
return xrefOffset, nil
|
|
}
|
|
|
|
// Renumbers the xref table.
|
|
// Useful when the cross reference is pointing to an object with the wrong number.
|
|
// Update the table.
|
|
func (parser *PdfParser) rebuildXrefTable() error {
|
|
newXrefs := XrefTable{}
|
|
newXrefs.ObjectMap = map[int]XrefObject{}
|
|
for objNum, xref := range parser.xrefs.ObjectMap {
|
|
obj, _, err := parser.lookupByNumberWrapper(objNum, false)
|
|
if err != nil {
|
|
common.Log.Debug("ERROR: Unable to look up object (%s)", err)
|
|
common.Log.Debug("ERROR: Xref table completely broken - attempting to repair ")
|
|
xrefTable, err := parser.repairRebuildXrefsTopDown()
|
|
if err != nil {
|
|
common.Log.Debug("ERROR: Failed xref rebuild repair (%s)", err)
|
|
return err
|
|
}
|
|
parser.xrefs = *xrefTable
|
|
common.Log.Debug("Repaired xref table built")
|
|
return nil
|
|
}
|
|
actObjNum, actGenNum, err := getObjectNumber(obj)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
xref.ObjectNumber = int(actObjNum)
|
|
xref.Generation = int(actGenNum)
|
|
newXrefs.ObjectMap[int(actObjNum)] = xref
|
|
}
|
|
|
|
parser.xrefs = newXrefs
|
|
common.Log.Debug("New xref table built")
|
|
printXrefTable(parser.xrefs)
|
|
return nil
|
|
}
|
|
|
|
// Parses and returns the object and generation number from a string such as "12 0 obj" -> (12,0,nil).
|
|
func parseObjectNumberFromString(str string) (int, int, error) {
|
|
result := reIndirectObject.FindStringSubmatch(str)
|
|
if len(result) < 3 {
|
|
return 0, 0, errors.New("unable to detect indirect object signature")
|
|
}
|
|
|
|
on, _ := strconv.Atoi(result[1])
|
|
gn, _ := strconv.Atoi(result[2])
|
|
|
|
return on, gn, nil
|
|
}
|
|
|
|
// Parse the entire file from top down.
|
|
// Goes through the file byte-by-byte looking for "<num> <generation> obj" patterns.
|
|
// N.B. This collects the XrefTypeTableEntry data only.
|
|
func (parser *PdfParser) repairRebuildXrefsTopDown() (*XrefTable, error) {
|
|
if parser.repairsAttempted {
|
|
// Avoid multiple repairs (only try once).
|
|
return nil, fmt.Errorf("repair failed")
|
|
}
|
|
parser.repairsAttempted = true
|
|
|
|
// Go to beginning, reset reader.
|
|
parser.rs.Seek(0, os.SEEK_SET)
|
|
parser.reader = bufio.NewReader(parser.rs)
|
|
|
|
// Keep a running buffer of last bytes.
|
|
bufLen := 20
|
|
last := make([]byte, bufLen)
|
|
|
|
xrefTable := XrefTable{}
|
|
xrefTable.ObjectMap = make(map[int]XrefObject)
|
|
for {
|
|
b, err := parser.reader.ReadByte()
|
|
if err != nil {
|
|
if err == io.EOF {
|
|
break
|
|
} else {
|
|
return nil, err
|
|
}
|
|
}
|
|
|
|
// Format:
|
|
// object number - whitespace - generation number - obj
|
|
// e.g. "12 0 obj"
|
|
if b == 'j' && last[bufLen-1] == 'b' && last[bufLen-2] == 'o' && IsWhiteSpace(last[bufLen-3]) {
|
|
i := bufLen - 4
|
|
// Go past whitespace
|
|
for IsWhiteSpace(last[i]) && i > 0 {
|
|
i--
|
|
}
|
|
if i == 0 || !IsDecimalDigit(last[i]) {
|
|
continue
|
|
}
|
|
// Go past generation number
|
|
for IsDecimalDigit(last[i]) && i > 0 {
|
|
i--
|
|
}
|
|
if i == 0 || !IsWhiteSpace(last[i]) {
|
|
continue
|
|
}
|
|
// Go past whitespace
|
|
for IsWhiteSpace(last[i]) && i > 0 {
|
|
i--
|
|
}
|
|
if i == 0 || !IsDecimalDigit(last[i]) {
|
|
continue
|
|
}
|
|
// Go past object number.
|
|
for IsDecimalDigit(last[i]) && i > 0 {
|
|
i--
|
|
}
|
|
if i == 0 {
|
|
continue // Probably too long to be a valid object...
|
|
}
|
|
|
|
objOffset := parser.GetFileOffset() - int64(bufLen-i)
|
|
|
|
objstr := append(last[i+1:], b)
|
|
objNum, genNum, err := parseObjectNumberFromString(string(objstr))
|
|
if err != nil {
|
|
common.Log.Debug("Unable to parse object number: %v", err)
|
|
return nil, err
|
|
}
|
|
|
|
// Create and insert the XREF entry if not existing, or the generation number is higher.
|
|
if curXref, has := xrefTable.ObjectMap[objNum]; !has || curXref.Generation < genNum {
|
|
// Make the entry for the cross ref table.
|
|
xrefEntry := XrefObject{}
|
|
xrefEntry.XType = XrefTypeTableEntry
|
|
xrefEntry.ObjectNumber = int(objNum)
|
|
xrefEntry.Generation = int(genNum)
|
|
xrefEntry.Offset = objOffset
|
|
xrefTable.ObjectMap[objNum] = xrefEntry
|
|
}
|
|
}
|
|
|
|
last = append(last[1:bufLen], b)
|
|
}
|
|
|
|
return &xrefTable, nil
|
|
}
|
|
|
|
// Look for first sign of xref table from end of file.
|
|
func (parser *PdfParser) repairSeekXrefMarker() error {
|
|
// Get the file size.
|
|
fSize, err := parser.rs.Seek(0, os.SEEK_END)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
reXrefTableStart := regexp.MustCompile(`\sxref\s*`)
|
|
|
|
// Define the starting point (from the end of the file) to search from.
|
|
var offset int64
|
|
|
|
// Define an buffer length in terms of how many bytes to read from the end of the file.
|
|
var buflen int64 = 1000
|
|
|
|
for offset < fSize {
|
|
if fSize <= (buflen + offset) {
|
|
buflen = fSize - offset
|
|
}
|
|
|
|
// Move back enough (as we need to read forward).
|
|
_, err := parser.rs.Seek(-offset-buflen, os.SEEK_END)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
// Read the data.
|
|
b1 := make([]byte, buflen)
|
|
parser.rs.Read(b1)
|
|
|
|
common.Log.Trace("Looking for xref : \"%s\"", string(b1))
|
|
ind := reXrefTableStart.FindAllStringIndex(string(b1), -1)
|
|
if ind != nil {
|
|
// Found it.
|
|
lastInd := ind[len(ind)-1]
|
|
common.Log.Trace("Ind: % d", ind)
|
|
parser.rs.Seek(-offset-buflen+int64(lastInd[0]), os.SEEK_END)
|
|
parser.reader = bufio.NewReader(parser.rs)
|
|
// Go past whitespace, finish at 'x'.
|
|
for {
|
|
bb, err := parser.reader.Peek(1)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
common.Log.Trace("B: %d %c", bb[0], bb[0])
|
|
if !IsWhiteSpace(bb[0]) {
|
|
break
|
|
}
|
|
parser.reader.Discard(1)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
common.Log.Debug("Warning: EOF marker not found! - continue seeking")
|
|
offset += buflen
|
|
}
|
|
|
|
common.Log.Debug("Error: Xref table marker was not found.")
|
|
return errors.New("xref not found ")
|
|
}
|
|
|
|
// Called when Pdf version not found normally. Looks for the PDF version by scanning top-down.
|
|
// %PDF-1.7
|
|
func (parser *PdfParser) seekPdfVersionTopDown() (int, int, error) {
|
|
// Go to beginning, reset reader.
|
|
parser.rs.Seek(0, os.SEEK_SET)
|
|
parser.reader = bufio.NewReader(parser.rs)
|
|
|
|
// Keep a running buffer of last bytes.
|
|
bufLen := 20
|
|
last := make([]byte, bufLen)
|
|
|
|
for {
|
|
b, err := parser.reader.ReadByte()
|
|
if err != nil {
|
|
if err == io.EOF {
|
|
break
|
|
} else {
|
|
return 0, 0, err
|
|
}
|
|
}
|
|
|
|
// Format:
|
|
// object number - whitespace - generation number - obj
|
|
// e.g. "12 0 obj"
|
|
if IsDecimalDigit(b) && last[bufLen-1] == '.' && IsDecimalDigit(last[bufLen-2]) && last[bufLen-3] == '-' &&
|
|
last[bufLen-4] == 'F' && last[bufLen-5] == 'D' && last[bufLen-6] == 'P' {
|
|
major := int(last[bufLen-2] - '0')
|
|
minor := int(b - '0')
|
|
return major, minor, nil
|
|
}
|
|
|
|
last = append(last[1:bufLen], b)
|
|
}
|
|
|
|
return 0, 0, errors.New("version not found")
|
|
}
|