mirror of
https://github.com/unidoc/unipdf.git
synced 2025-04-29 13:48:54 +08:00
1017 lines
26 KiB
Go
1017 lines
26 KiB
Go
/*
|
|
* This file is subject to the terms and conditions defined in
|
|
* file 'LICENSE.md', which is part of this source code package.
|
|
*/
|
|
|
|
package fdf
|
|
|
|
import (
|
|
"bufio"
|
|
"bytes"
|
|
"encoding/hex"
|
|
"errors"
|
|
"fmt"
|
|
"io"
|
|
"regexp"
|
|
"sort"
|
|
"strconv"
|
|
"strings"
|
|
|
|
"github.com/unidoc/unipdf/v3/common"
|
|
"github.com/unidoc/unipdf/v3/core"
|
|
)
|
|
|
|
// Regular expressions used to recognise the textual signatures of FDF constructs.
var (
	// reFdfVersion captures the major and minor digit of a "%FDF-x.y" header.
	reFdfVersion = regexp.MustCompile(`%FDF-(\d)\.(\d)`)

	// reEOF matches the literal end-of-file marker.
	reEOF = regexp.MustCompile("%%EOF")

	// reNumeric matches a plain number at the start of the input and captures
	// its digits/periods (any leading sign characters are skipped).
	reNumeric = regexp.MustCompile(`^[\+-.]*([0-9.]+)`)

	// reExponential matches a number written with a lowercase 'e' exponent and
	// captures the mantissa and the exponent digits.
	reExponential = regexp.MustCompile(`^[\+-.]*([0-9.]+)e[\+-.]*([0-9.]+)`)

	// reReference matches an indirect reference ("<obj> <gen> R") at the start
	// of the input and captures the object and generation numbers.
	reReference = regexp.MustCompile(`^\s*(\d+)\s+(\d+)\s+R`)

	// reIndirectObject matches an indirect object header ("<obj> <gen> obj")
	// anywhere in the input and captures the object and generation numbers.
	reIndirectObject = regexp.MustCompile(`(\d+)\s+(\d+)\s+obj`)
)
|
|
|
|
// fdfParser parses a FDF file and provides access to the object structure of the FDF.
|
|
type fdfParser struct {
|
|
majorVersion int
|
|
minorVersion int
|
|
|
|
objCache map[int64]core.PdfObject
|
|
|
|
rs io.ReadSeeker
|
|
reader *bufio.Reader
|
|
fileSize int64
|
|
|
|
trailerDict *core.PdfObjectDictionary
|
|
}
|
|
|
|
// Skip over any spaces.
|
|
func (parser *fdfParser) skipSpaces() (int, error) {
|
|
cnt := 0
|
|
for {
|
|
b, err := parser.reader.ReadByte()
|
|
if err != nil {
|
|
return 0, err
|
|
}
|
|
if core.IsWhiteSpace(b) {
|
|
cnt++
|
|
} else {
|
|
parser.reader.UnreadByte()
|
|
break
|
|
}
|
|
}
|
|
|
|
return cnt, nil
|
|
}
|
|
|
|
// Skip over comments and spaces. Can handle multi-line comments.
|
|
func (parser *fdfParser) skipComments() error {
|
|
if _, err := parser.skipSpaces(); err != nil {
|
|
return err
|
|
}
|
|
|
|
isFirst := true
|
|
for {
|
|
bb, err := parser.reader.Peek(1)
|
|
if err != nil {
|
|
common.Log.Debug("Error %s", err.Error())
|
|
return err
|
|
}
|
|
if isFirst && bb[0] != '%' {
|
|
// Not a comment clearly.
|
|
return nil
|
|
}
|
|
isFirst = false
|
|
if (bb[0] != '\r') && (bb[0] != '\n') {
|
|
parser.reader.ReadByte()
|
|
} else {
|
|
break
|
|
}
|
|
}
|
|
|
|
// Call recursively to handle multiline comments.
|
|
return parser.skipComments()
|
|
}
|
|
|
|
// Read a comment starting with '%'.
|
|
func (parser *fdfParser) readComment() (string, error) {
|
|
var r bytes.Buffer
|
|
|
|
_, err := parser.skipSpaces()
|
|
if err != nil {
|
|
return r.String(), err
|
|
}
|
|
|
|
isFirst := true
|
|
for {
|
|
bb, err := parser.reader.Peek(1)
|
|
if err != nil {
|
|
common.Log.Debug("Error %s", err.Error())
|
|
return r.String(), err
|
|
}
|
|
if isFirst && bb[0] != '%' {
|
|
return r.String(), errors.New("comment should start with %")
|
|
}
|
|
isFirst = false
|
|
if (bb[0] != '\r') && (bb[0] != '\n') {
|
|
b, _ := parser.reader.ReadByte()
|
|
r.WriteByte(b)
|
|
} else {
|
|
break
|
|
}
|
|
}
|
|
return r.String(), nil
|
|
}
|
|
|
|
// Read a single line of text from current position.
|
|
func (parser *fdfParser) readTextLine() (string, error) {
|
|
var r bytes.Buffer
|
|
for {
|
|
bb, err := parser.reader.Peek(1)
|
|
if err != nil {
|
|
common.Log.Debug("Error %s", err.Error())
|
|
return r.String(), err
|
|
}
|
|
if (bb[0] != '\r') && (bb[0] != '\n') {
|
|
b, _ := parser.reader.ReadByte()
|
|
r.WriteByte(b)
|
|
} else {
|
|
break
|
|
}
|
|
}
|
|
return r.String(), nil
|
|
}
|
|
|
|
// Parse a name starting with '/'.
|
|
func (parser *fdfParser) parseName() (core.PdfObjectName, error) {
|
|
var r bytes.Buffer
|
|
nameStarted := false
|
|
for {
|
|
bb, err := parser.reader.Peek(1)
|
|
if err == io.EOF {
|
|
break // Can happen when loading from object stream.
|
|
}
|
|
if err != nil {
|
|
return core.PdfObjectName(r.String()), err
|
|
}
|
|
|
|
if !nameStarted {
|
|
// Should always start with '/', otherwise not valid.
|
|
if bb[0] == '/' {
|
|
nameStarted = true
|
|
parser.reader.ReadByte()
|
|
} else if bb[0] == '%' {
|
|
parser.readComment()
|
|
parser.skipSpaces()
|
|
} else {
|
|
common.Log.Debug("ERROR Name starting with %s (% x)", bb, bb)
|
|
return core.PdfObjectName(r.String()), fmt.Errorf("invalid name: (%c)", bb[0])
|
|
}
|
|
} else {
|
|
if core.IsWhiteSpace(bb[0]) {
|
|
break
|
|
} else if (bb[0] == '/') || (bb[0] == '[') || (bb[0] == '(') || (bb[0] == ']') || (bb[0] == '<') || (bb[0] == '>') {
|
|
break // Looks like start of next statement.
|
|
} else if bb[0] == '#' {
|
|
hexcode, err := parser.reader.Peek(3)
|
|
if err != nil {
|
|
return core.PdfObjectName(r.String()), err
|
|
}
|
|
parser.reader.Discard(3)
|
|
|
|
code, err := hex.DecodeString(string(hexcode[1:3]))
|
|
if err != nil {
|
|
return core.PdfObjectName(r.String()), err
|
|
}
|
|
r.Write(code)
|
|
} else {
|
|
b, _ := parser.reader.ReadByte()
|
|
r.WriteByte(b)
|
|
}
|
|
}
|
|
}
|
|
return core.PdfObjectName(r.String()), nil
|
|
}
|
|
|
|
// Numeric objects.
|
|
// Section 7.3.3.
|
|
// Integer or Float.
|
|
//
|
|
// An integer shall be written as one or more decimal digits optionally
|
|
// preceded by a sign. The value shall be interpreted as a signed
|
|
// decimal integer and shall be converted to an integer object.
|
|
//
|
|
// A real value shall be written as one or more decimal digits with an
|
|
// optional sign and a leading, trailing, or embedded PERIOD (2Eh)
|
|
// (decimal point). The value shall be interpreted as a real number
|
|
// and shall be converted to a real object.
|
|
//
|
|
// Regarding exponential numbers: 7.3.3 Numeric Objects:
|
|
// A conforming writer shall not use the PostScript syntax for numbers
|
|
// with non-decimal radices (such as 16#FFFE) or in exponential format
|
|
// (such as 6.02E23).
|
|
// Nonetheless, we sometimes get numbers with exponential format, so
|
|
// we will support it in the reader (no confusion with other types, so
|
|
// no compromise).
|
|
func (parser *fdfParser) parseNumber() (core.PdfObject, error) {
|
|
isFloat := false
|
|
allowSigns := true
|
|
var r bytes.Buffer
|
|
for {
|
|
common.Log.Trace("Parsing number \"%s\"", r.String())
|
|
bb, err := parser.reader.Peek(1)
|
|
if err == io.EOF {
|
|
// GH: EOF handling. Handle EOF like end of line. Can happen with
|
|
// encoded object streams that the object is at the end.
|
|
// In other cases, we will get the EOF error elsewhere at any rate.
|
|
break // Handle like EOF
|
|
}
|
|
if err != nil {
|
|
common.Log.Debug("ERROR %s", err)
|
|
return nil, err
|
|
}
|
|
if allowSigns && (bb[0] == '-' || bb[0] == '+') {
|
|
// Only appear in the beginning, otherwise serves as a delimiter.
|
|
b, _ := parser.reader.ReadByte()
|
|
r.WriteByte(b)
|
|
allowSigns = false // Only allowed in beginning, and after e (exponential).
|
|
} else if core.IsDecimalDigit(bb[0]) {
|
|
b, _ := parser.reader.ReadByte()
|
|
r.WriteByte(b)
|
|
} else if bb[0] == '.' {
|
|
b, _ := parser.reader.ReadByte()
|
|
r.WriteByte(b)
|
|
isFloat = true
|
|
} else if bb[0] == 'e' {
|
|
// Exponential number format.
|
|
b, _ := parser.reader.ReadByte()
|
|
r.WriteByte(b)
|
|
isFloat = true
|
|
allowSigns = true
|
|
} else {
|
|
break
|
|
}
|
|
}
|
|
|
|
if isFloat {
|
|
fVal, err := strconv.ParseFloat(r.String(), 64)
|
|
o := core.PdfObjectFloat(fVal)
|
|
return &o, err
|
|
} else {
|
|
intVal, err := strconv.ParseInt(r.String(), 10, 64)
|
|
o := core.PdfObjectInteger(intVal)
|
|
return &o, err
|
|
}
|
|
}
|
|
|
|
// A string starts with '(' and ends with ')'.
|
|
func (parser *fdfParser) parseString() (*core.PdfObjectString, error) {
|
|
parser.reader.ReadByte()
|
|
|
|
var r bytes.Buffer
|
|
count := 1
|
|
for {
|
|
bb, err := parser.reader.Peek(1)
|
|
if err != nil {
|
|
return core.MakeString(r.String()), err
|
|
}
|
|
|
|
if bb[0] == '\\' { // Escape sequence.
|
|
parser.reader.ReadByte() // Skip the escape \ byte.
|
|
b, err := parser.reader.ReadByte()
|
|
if err != nil {
|
|
return core.MakeString(r.String()), err
|
|
}
|
|
|
|
// Octal '\ddd' number (base 8).
|
|
if core.IsOctalDigit(b) {
|
|
bb, err := parser.reader.Peek(2)
|
|
if err != nil {
|
|
return core.MakeString(r.String()), err
|
|
}
|
|
|
|
var numeric []byte
|
|
numeric = append(numeric, b)
|
|
for _, val := range bb {
|
|
if core.IsOctalDigit(val) {
|
|
numeric = append(numeric, val)
|
|
} else {
|
|
break
|
|
}
|
|
}
|
|
parser.reader.Discard(len(numeric) - 1)
|
|
|
|
common.Log.Trace("Numeric string \"%s\"", numeric)
|
|
code, err := strconv.ParseUint(string(numeric), 8, 32)
|
|
if err != nil {
|
|
return core.MakeString(r.String()), err
|
|
}
|
|
r.WriteByte(byte(code))
|
|
continue
|
|
}
|
|
|
|
switch b {
|
|
case 'n':
|
|
r.WriteRune('\n')
|
|
case 'r':
|
|
r.WriteRune('\r')
|
|
case 't':
|
|
r.WriteRune('\t')
|
|
case 'b':
|
|
r.WriteRune('\b')
|
|
case 'f':
|
|
r.WriteRune('\f')
|
|
case '(':
|
|
r.WriteRune('(')
|
|
case ')':
|
|
r.WriteRune(')')
|
|
case '\\':
|
|
r.WriteRune('\\')
|
|
}
|
|
|
|
continue
|
|
} else if bb[0] == '(' {
|
|
count++
|
|
} else if bb[0] == ')' {
|
|
count--
|
|
if count == 0 {
|
|
parser.reader.ReadByte()
|
|
break
|
|
}
|
|
}
|
|
|
|
b, _ := parser.reader.ReadByte()
|
|
r.WriteByte(b)
|
|
}
|
|
|
|
return core.MakeString(r.String()), nil
|
|
}
|
|
|
|
// Starts with '<' ends with '>'.
|
|
// Currently not converting the hex codes to characters.
|
|
func (parser *fdfParser) parseHexString() (*core.PdfObjectString, error) {
|
|
parser.reader.ReadByte()
|
|
|
|
var r bytes.Buffer
|
|
for {
|
|
bb, err := parser.reader.Peek(1)
|
|
if err != nil {
|
|
return core.MakeHexString(""), err
|
|
}
|
|
|
|
if bb[0] == '>' {
|
|
parser.reader.ReadByte()
|
|
break
|
|
}
|
|
|
|
b, _ := parser.reader.ReadByte()
|
|
if !core.IsWhiteSpace(b) {
|
|
r.WriteByte(b)
|
|
}
|
|
}
|
|
|
|
if r.Len()%2 == 1 {
|
|
r.WriteRune('0')
|
|
}
|
|
|
|
buf, err := hex.DecodeString(r.String())
|
|
if err != nil {
|
|
common.Log.Debug("ERROR Parsing hex string: '%s' - returning an empty string", r.String())
|
|
return core.MakeHexString(""), nil
|
|
}
|
|
return core.MakeHexString(string(buf)), nil
|
|
}
|
|
|
|
// Starts with '[' ends with ']'. Can contain any kinds of direct objects.
|
|
func (parser *fdfParser) parseArray() (*core.PdfObjectArray, error) {
|
|
arr := core.MakeArray()
|
|
|
|
parser.reader.ReadByte()
|
|
|
|
for {
|
|
parser.skipSpaces()
|
|
|
|
bb, err := parser.reader.Peek(1)
|
|
if err != nil {
|
|
return arr, err
|
|
}
|
|
|
|
if bb[0] == ']' {
|
|
parser.reader.ReadByte()
|
|
break
|
|
}
|
|
|
|
obj, err := parser.parseObject()
|
|
if err != nil {
|
|
return arr, err
|
|
}
|
|
arr.Append(obj)
|
|
}
|
|
|
|
return arr, nil
|
|
}
|
|
|
|
// Parse bool object.
|
|
func (parser *fdfParser) parseBool() (core.PdfObjectBool, error) {
|
|
bb, err := parser.reader.Peek(4)
|
|
if err != nil {
|
|
return core.PdfObjectBool(false), err
|
|
}
|
|
if (len(bb) >= 4) && (string(bb[:4]) == "true") {
|
|
parser.reader.Discard(4)
|
|
return core.PdfObjectBool(true), nil
|
|
}
|
|
|
|
bb, err = parser.reader.Peek(5)
|
|
if err != nil {
|
|
return core.PdfObjectBool(false), err
|
|
}
|
|
if (len(bb) >= 5) && (string(bb[:5]) == "false") {
|
|
parser.reader.Discard(5)
|
|
return core.PdfObjectBool(false), nil
|
|
}
|
|
|
|
return core.PdfObjectBool(false), errors.New("unexpected boolean string")
|
|
}
|
|
|
|
// Parse reference to an indirect object.
|
|
func parseReference(refStr string) (core.PdfObjectReference, error) {
|
|
objref := core.PdfObjectReference{}
|
|
|
|
result := reReference.FindStringSubmatch(string(refStr))
|
|
if len(result) < 3 {
|
|
common.Log.Debug("Error parsing reference")
|
|
return objref, errors.New("unable to parse reference")
|
|
}
|
|
|
|
objNum, err := strconv.Atoi(result[1])
|
|
if err != nil {
|
|
common.Log.Debug("Error parsing object number '%s' - Using object num = 0", result[1])
|
|
return objref, nil
|
|
}
|
|
objref.ObjectNumber = int64(objNum)
|
|
|
|
genNum, err := strconv.Atoi(result[2])
|
|
if err != nil {
|
|
common.Log.Debug("Error parsing generation number '%s' - Using gen = 0", result[2])
|
|
return objref, nil
|
|
}
|
|
objref.GenerationNumber = int64(genNum)
|
|
|
|
return objref, nil
|
|
}
|
|
|
|
// Parse null object.
|
|
func (parser *fdfParser) parseNull() (core.PdfObjectNull, error) {
|
|
_, err := parser.reader.Discard(4)
|
|
return core.PdfObjectNull{}, err
|
|
}
|
|
|
|
// Detect the signature at the current file position and parse
|
|
// the corresponding object.
|
|
func (parser *fdfParser) parseObject() (core.PdfObject, error) {
|
|
common.Log.Trace("Read direct object")
|
|
parser.skipSpaces()
|
|
for {
|
|
bb, err := parser.reader.Peek(2)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
common.Log.Trace("Peek string: %s", string(bb))
|
|
// Determine type.
|
|
if bb[0] == '/' {
|
|
name, err := parser.parseName()
|
|
common.Log.Trace("->Name: '%s'", name)
|
|
return &name, err
|
|
} else if bb[0] == '(' {
|
|
common.Log.Trace("->String!")
|
|
return parser.parseString()
|
|
} else if bb[0] == '[' {
|
|
common.Log.Trace("->Array!")
|
|
return parser.parseArray()
|
|
} else if (bb[0] == '<') && (bb[1] == '<') {
|
|
common.Log.Trace("->Dict!")
|
|
return parser.parseDict()
|
|
} else if bb[0] == '<' {
|
|
common.Log.Trace("->Hex string!")
|
|
return parser.parseHexString()
|
|
} else if bb[0] == '%' {
|
|
parser.readComment()
|
|
parser.skipSpaces()
|
|
} else {
|
|
common.Log.Trace("->Number or ref?")
|
|
// Reference or number?
|
|
// Let's peek farther to find out.
|
|
bb, _ = parser.reader.Peek(15)
|
|
peekStr := string(bb)
|
|
common.Log.Trace("Peek str: %s", peekStr)
|
|
|
|
if (len(peekStr) > 3) && (peekStr[:4] == "null") {
|
|
null, err := parser.parseNull()
|
|
return &null, err
|
|
} else if (len(peekStr) > 4) && (peekStr[:5] == "false") {
|
|
b, err := parser.parseBool()
|
|
return &b, err
|
|
} else if (len(peekStr) > 3) && (peekStr[:4] == "true") {
|
|
b, err := parser.parseBool()
|
|
return &b, err
|
|
}
|
|
|
|
// Match reference.
|
|
result1 := reReference.FindStringSubmatch(string(peekStr))
|
|
if len(result1) > 1 {
|
|
bb, _ = parser.reader.ReadBytes('R')
|
|
common.Log.Trace("-> !Ref: '%s'", string(bb[:]))
|
|
ref, err := parseReference(string(bb))
|
|
return &ref, err
|
|
}
|
|
|
|
result2 := reNumeric.FindStringSubmatch(string(peekStr))
|
|
if len(result2) > 1 {
|
|
// Number object.
|
|
common.Log.Trace("-> Number!")
|
|
return parser.parseNumber()
|
|
}
|
|
|
|
result2 = reExponential.FindStringSubmatch(string(peekStr))
|
|
if len(result2) > 1 {
|
|
// Number object (exponential)
|
|
common.Log.Trace("-> Exponential Number!")
|
|
common.Log.Trace("% s", result2)
|
|
return parser.parseNumber()
|
|
}
|
|
|
|
common.Log.Debug("ERROR Unknown (peek \"%s\")", peekStr)
|
|
return nil, errors.New("object parsing error - unexpected pattern")
|
|
}
|
|
}
|
|
}
|
|
|
|
// Reads and parses a FDF dictionary object enclosed with '<<' and '>>'
|
|
func (parser *fdfParser) parseDict() (*core.PdfObjectDictionary, error) {
|
|
common.Log.Trace("Reading FDF Dict!")
|
|
|
|
dict := core.MakeDict()
|
|
|
|
// Pass the '<<'
|
|
c, _ := parser.reader.ReadByte()
|
|
if c != '<' {
|
|
return nil, errors.New("invalid dict")
|
|
}
|
|
c, _ = parser.reader.ReadByte()
|
|
if c != '<' {
|
|
return nil, errors.New("invalid dict")
|
|
}
|
|
|
|
for {
|
|
parser.skipSpaces()
|
|
parser.skipComments()
|
|
|
|
bb, err := parser.reader.Peek(2)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
common.Log.Trace("Dict peek: %s (% x)!", string(bb), string(bb))
|
|
if (bb[0] == '>') && (bb[1] == '>') {
|
|
common.Log.Trace("EOF dictionary")
|
|
parser.reader.ReadByte()
|
|
parser.reader.ReadByte()
|
|
break
|
|
}
|
|
common.Log.Trace("Parse the name!")
|
|
|
|
keyName, err := parser.parseName()
|
|
common.Log.Trace("Key: %s", keyName)
|
|
if err != nil {
|
|
common.Log.Debug("ERROR Returning name err %s", err)
|
|
return nil, err
|
|
}
|
|
|
|
if len(keyName) > 4 && keyName[len(keyName)-4:] == "null" {
|
|
// Some writers have a bug where the null is appended without
|
|
// space. For example "\Boundsnull"
|
|
newKey := keyName[0 : len(keyName)-4]
|
|
common.Log.Debug("Taking care of null bug (%s)", keyName)
|
|
common.Log.Debug("New key \"%s\" = null", newKey)
|
|
parser.skipSpaces()
|
|
bb, _ := parser.reader.Peek(1)
|
|
if bb[0] == '/' {
|
|
dict.Set(newKey, core.MakeNull())
|
|
continue
|
|
}
|
|
}
|
|
|
|
parser.skipSpaces()
|
|
|
|
val, err := parser.parseObject()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
dict.Set(keyName, val)
|
|
|
|
common.Log.Trace("dict[%s] = %s", keyName, val.String())
|
|
}
|
|
common.Log.Trace("returning FDF Dict!")
|
|
|
|
return dict, nil
|
|
}
|
|
|
|
// Parse the FDF version from the beginning of the file.
|
|
// Returns the major and minor parts of the version.
|
|
// E.g. for "FDF-1.4" would return 1 and 4.
|
|
func (parser *fdfParser) parseFdfVersion() (int, int, error) {
|
|
parser.rs.Seek(0, io.SeekStart)
|
|
offset := 20
|
|
b := make([]byte, offset)
|
|
parser.rs.Read(b)
|
|
|
|
result1 := reFdfVersion.FindStringSubmatch(string(b))
|
|
if len(result1) < 3 {
|
|
major, minor, err := parser.seekFdfVersionTopDown()
|
|
if err != nil {
|
|
common.Log.Debug("Failed recovery - unable to find version")
|
|
return 0, 0, err
|
|
}
|
|
|
|
return major, minor, nil
|
|
}
|
|
|
|
majorVersion, err := strconv.Atoi(result1[1])
|
|
if err != nil {
|
|
return 0, 0, err
|
|
}
|
|
|
|
minorVersion, err := strconv.Atoi(result1[2])
|
|
if err != nil {
|
|
return 0, 0, err
|
|
}
|
|
|
|
common.Log.Debug("Fdf version %d.%d", majorVersion, minorVersion)
|
|
|
|
return int(majorVersion), int(minorVersion), nil
|
|
}
|
|
|
|
// Look for EOF marker and seek to its beginning.
|
|
// Define an offset position from the end of the file.
|
|
func (parser *fdfParser) seekToEOFMarker(fSize int64) error {
|
|
// Define the starting point (from the end of the file) to search from.
|
|
offset := int64(0)
|
|
|
|
// Define an buffer length in terms of how many bytes to read from the end of the file.
|
|
buflen := int64(1000)
|
|
|
|
for offset < fSize {
|
|
if fSize <= (buflen + offset) {
|
|
buflen = fSize - offset
|
|
}
|
|
|
|
// Move back enough (as we need to read forward).
|
|
_, err := parser.rs.Seek(-offset-buflen, io.SeekEnd)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
// Read the data.
|
|
b1 := make([]byte, buflen)
|
|
parser.rs.Read(b1)
|
|
common.Log.Trace("Looking for EOF marker: \"%s\"", string(b1))
|
|
ind := reEOF.FindAllStringIndex(string(b1), -1)
|
|
if ind != nil {
|
|
// Found it.
|
|
lastInd := ind[len(ind)-1]
|
|
common.Log.Trace("Ind: % d", ind)
|
|
parser.rs.Seek(-offset-buflen+int64(lastInd[0]), io.SeekEnd)
|
|
return nil
|
|
}
|
|
common.Log.Debug("Warning: EOF marker not found! - continue seeking")
|
|
offset += buflen
|
|
}
|
|
|
|
common.Log.Debug("Error: EOF marker was not found.")
|
|
return errors.New("EOF not found")
|
|
}
|
|
|
|
// Parse an indirect object from the input stream. Can also be an object stream.
|
|
// Returns the indirect object (*PdfIndirectObject) or the stream object (*PdfObjectStream).
|
|
func (parser *fdfParser) parseIndirectObject() (core.PdfObject, error) {
|
|
indirect := core.PdfIndirectObject{}
|
|
|
|
common.Log.Trace("-Read indirect obj")
|
|
bb, err := parser.reader.Peek(20)
|
|
if err != nil {
|
|
common.Log.Debug("ERROR: Fail to read indirect obj")
|
|
return &indirect, err
|
|
}
|
|
common.Log.Trace("(indirect obj peek \"%s\"", string(bb))
|
|
|
|
indices := reIndirectObject.FindStringSubmatchIndex(string(bb))
|
|
if len(indices) < 6 {
|
|
common.Log.Debug("ERROR: Unable to find object signature (%s)", string(bb))
|
|
return &indirect, errors.New("unable to detect indirect object signature")
|
|
}
|
|
parser.reader.Discard(indices[0]) // Take care of any small offset.
|
|
common.Log.Trace("Offsets % d", indices)
|
|
|
|
// Read the object header.
|
|
hlen := indices[1] - indices[0]
|
|
hb := make([]byte, hlen)
|
|
_, err = parser.readAtLeast(hb, hlen)
|
|
if err != nil {
|
|
common.Log.Debug("ERROR: unable to read - %s", err)
|
|
return nil, err
|
|
}
|
|
common.Log.Trace("textline: %s", hb)
|
|
|
|
result := reIndirectObject.FindStringSubmatch(string(hb))
|
|
if len(result) < 3 {
|
|
common.Log.Debug("ERROR: Unable to find object signature (%s)", string(hb))
|
|
return &indirect, errors.New("unable to detect indirect object signature")
|
|
}
|
|
|
|
on, _ := strconv.Atoi(result[1])
|
|
gn, _ := strconv.Atoi(result[2])
|
|
indirect.ObjectNumber = int64(on)
|
|
indirect.GenerationNumber = int64(gn)
|
|
|
|
for {
|
|
bb, err := parser.reader.Peek(2)
|
|
if err != nil {
|
|
return &indirect, err
|
|
}
|
|
common.Log.Trace("Ind. peek: %s (% x)!", string(bb), string(bb))
|
|
|
|
if core.IsWhiteSpace(bb[0]) {
|
|
parser.skipSpaces()
|
|
} else if bb[0] == '%' {
|
|
parser.skipComments()
|
|
} else if (bb[0] == '<') && (bb[1] == '<') {
|
|
common.Log.Trace("Call ParseDict")
|
|
indirect.PdfObject, err = parser.parseDict()
|
|
common.Log.Trace("EOF Call ParseDict: %v", err)
|
|
if err != nil {
|
|
return &indirect, err
|
|
}
|
|
common.Log.Trace("Parsed dictionary... finished.")
|
|
} else if (bb[0] == '/') || (bb[0] == '(') || (bb[0] == '[') || (bb[0] == '<') {
|
|
indirect.PdfObject, err = parser.parseObject()
|
|
if err != nil {
|
|
return &indirect, err
|
|
}
|
|
common.Log.Trace("Parsed object ... finished.")
|
|
} else {
|
|
if bb[0] == 'e' {
|
|
lineStr, err := parser.readTextLine()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if len(lineStr) >= 6 && lineStr[0:6] == "endobj" {
|
|
break
|
|
}
|
|
} else if bb[0] == 's' {
|
|
bb, _ = parser.reader.Peek(10)
|
|
if string(bb[:6]) == "stream" {
|
|
discardBytes := 6
|
|
if len(bb) > 6 {
|
|
if core.IsWhiteSpace(bb[discardBytes]) && bb[discardBytes] != '\r' && bb[discardBytes] != '\n' {
|
|
// If any other white space character... should not happen!
|
|
// Skip it..
|
|
common.Log.Debug("Non-conformant FDF not ending stream line properly with EOL marker")
|
|
discardBytes++
|
|
}
|
|
if bb[discardBytes] == '\r' {
|
|
discardBytes++
|
|
if bb[discardBytes] == '\n' {
|
|
discardBytes++
|
|
}
|
|
} else if bb[discardBytes] == '\n' {
|
|
discardBytes++
|
|
}
|
|
}
|
|
|
|
parser.reader.Discard(discardBytes)
|
|
|
|
dict, isDict := indirect.PdfObject.(*core.PdfObjectDictionary)
|
|
if !isDict {
|
|
return nil, errors.New("stream object missing dictionary")
|
|
}
|
|
common.Log.Trace("Stream dict %s", dict)
|
|
|
|
pstreamLength, ok := dict.Get("Length").(*core.PdfObjectInteger)
|
|
if !ok {
|
|
return nil, errors.New("stream length needs to be an integer")
|
|
}
|
|
streamLength := *pstreamLength
|
|
if streamLength < 0 {
|
|
return nil, errors.New("stream needs to be longer than 0")
|
|
}
|
|
|
|
// Make sure is less than actual file size.
|
|
if int64(streamLength) > parser.fileSize {
|
|
common.Log.Debug("ERROR: Stream length cannot be larger than file size")
|
|
return nil, errors.New("invalid stream length, larger than file size")
|
|
}
|
|
|
|
stream := make([]byte, streamLength)
|
|
_, err = parser.readAtLeast(stream, int(streamLength))
|
|
if err != nil {
|
|
common.Log.Debug("ERROR stream (%d): %X", len(stream), stream)
|
|
common.Log.Debug("ERROR: %v", err)
|
|
return nil, err
|
|
}
|
|
|
|
streamobj := core.PdfObjectStream{}
|
|
streamobj.Stream = stream
|
|
streamobj.PdfObjectDictionary = indirect.PdfObject.(*core.PdfObjectDictionary)
|
|
streamobj.ObjectNumber = indirect.ObjectNumber
|
|
streamobj.GenerationNumber = indirect.GenerationNumber
|
|
|
|
parser.skipSpaces()
|
|
parser.reader.Discard(9) // endstream
|
|
parser.skipSpaces()
|
|
return &streamobj, nil
|
|
}
|
|
}
|
|
|
|
indirect.PdfObject, err = parser.parseObject()
|
|
return &indirect, err
|
|
}
|
|
}
|
|
common.Log.Trace("Returning indirect!")
|
|
return &indirect, nil
|
|
}
|
|
|
|
// newParserFromString parses an FDF from a string.
|
|
// Useful for testing purposes.
|
|
func newParserFromString(txt string) (*fdfParser, error) {
|
|
parser := fdfParser{}
|
|
buf := []byte(txt)
|
|
|
|
bufReader := bytes.NewReader(buf)
|
|
parser.rs = bufReader
|
|
parser.objCache = map[int64]core.PdfObject{}
|
|
|
|
bufferedReader := bufio.NewReader(bufReader)
|
|
parser.reader = bufferedReader
|
|
|
|
parser.fileSize = int64(len(txt))
|
|
|
|
return &parser, parser.parse()
|
|
}
|
|
|
|
// Root returns the Root of the FDF document.
|
|
func (parser *fdfParser) Root() (*core.PdfObjectDictionary, error) {
|
|
if parser.trailerDict != nil {
|
|
if rootDict, ok := parser.trace(parser.trailerDict.Get("Root")).(*core.PdfObjectDictionary); ok {
|
|
if fdfDict, ok := parser.trace(rootDict.Get("FDF")).(*core.PdfObjectDictionary); ok {
|
|
return fdfDict, nil
|
|
}
|
|
}
|
|
}
|
|
|
|
var keys []int64
|
|
for objNum := range parser.objCache {
|
|
keys = append(keys, objNum)
|
|
}
|
|
sort.Slice(keys, func(i, j int) bool { return keys[i] < keys[j] })
|
|
|
|
for _, objNum := range keys {
|
|
obj := parser.objCache[objNum]
|
|
if rootDict, ok := parser.trace(obj).(*core.PdfObjectDictionary); ok {
|
|
if fdfDict, ok := parser.trace(rootDict.Get("FDF")).(*core.PdfObjectDictionary); ok {
|
|
return fdfDict, nil
|
|
}
|
|
}
|
|
}
|
|
|
|
return nil, errors.New("FDF not found")
|
|
}
|
|
|
|
// newParser creates a new parser for a FDF file via ReadSeeker. Loads the cross reference stream and trailer.
|
|
// An error is returned on failure.
|
|
func newParser(rs io.ReadSeeker) (*fdfParser, error) {
|
|
parser := &fdfParser{}
|
|
|
|
parser.rs = rs
|
|
parser.objCache = map[int64]core.PdfObject{}
|
|
|
|
// Read from top to bottom...
|
|
// 1. Get the version
|
|
// 2. Sequentially parse indirect objects, until does not match
|
|
|
|
majorVersion, minorVersion, err := parser.parseFdfVersion()
|
|
if err != nil {
|
|
common.Log.Error("Unable to parse version: %v", err)
|
|
return nil, err
|
|
}
|
|
parser.majorVersion = majorVersion
|
|
parser.minorVersion = minorVersion
|
|
|
|
err = parser.parse()
|
|
return parser, err
|
|
}
|
|
|
|
// trace resolves a PdfObject to direct object, looking up and resolving references as needed.
|
|
func (parser *fdfParser) trace(obj core.PdfObject) core.PdfObject {
|
|
switch t := obj.(type) {
|
|
case *core.PdfObjectReference:
|
|
indObj, ok := parser.objCache[t.ObjectNumber].(*core.PdfIndirectObject)
|
|
if ok {
|
|
return indObj.PdfObject
|
|
}
|
|
common.Log.Debug("Type error")
|
|
return nil
|
|
case *core.PdfIndirectObject:
|
|
return t.PdfObject
|
|
}
|
|
|
|
return obj
|
|
}
|
|
|
|
// parse runs through the file and parses indirect objects and loads into cache.
|
|
func (parser *fdfParser) parse() error {
|
|
// Go to beginning, reset reader.
|
|
parser.rs.Seek(0, io.SeekStart)
|
|
parser.reader = bufio.NewReader(parser.rs)
|
|
|
|
// Parse indirect objects sequentially.
|
|
for {
|
|
parser.skipComments()
|
|
|
|
bb, err := parser.reader.Peek(20)
|
|
if err != nil {
|
|
common.Log.Debug("ERROR: Fail to read indirect obj")
|
|
return err
|
|
}
|
|
|
|
if strings.HasPrefix(string(bb), "trailer") {
|
|
// End
|
|
parser.reader.Discard(7)
|
|
parser.skipSpaces()
|
|
parser.skipComments()
|
|
trailerDict, _ := parser.parseDict()
|
|
parser.trailerDict = trailerDict
|
|
break
|
|
}
|
|
|
|
indices := reIndirectObject.FindStringSubmatchIndex(string(bb))
|
|
if len(indices) < 6 {
|
|
common.Log.Debug("ERROR: Unable to find object signature (%s)", string(bb))
|
|
return errors.New("unable to detect indirect object signature")
|
|
}
|
|
|
|
indObj, err := parser.parseIndirectObject()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
switch o := indObj.(type) {
|
|
case *core.PdfIndirectObject:
|
|
parser.objCache[o.ObjectNumber] = o
|
|
case *core.PdfObjectStream:
|
|
parser.objCache[o.ObjectNumber] = o
|
|
default:
|
|
return errors.New("type error")
|
|
}
|
|
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// Called when Fdf version not found normally. Looks for the PDF version by scanning top-down.
|
|
// %FDF-1.4
|
|
func (parser *fdfParser) seekFdfVersionTopDown() (int, int, error) {
|
|
// Go to beginning, reset reader.
|
|
parser.rs.Seek(0, io.SeekStart)
|
|
parser.reader = bufio.NewReader(parser.rs)
|
|
|
|
// Keep a running buffer of last bytes.
|
|
bufLen := 20
|
|
last := make([]byte, bufLen)
|
|
|
|
for {
|
|
b, err := parser.reader.ReadByte()
|
|
if err != nil {
|
|
if err == io.EOF {
|
|
break
|
|
} else {
|
|
return 0, 0, err
|
|
}
|
|
}
|
|
|
|
// Format:
|
|
// object number - whitespace - generation number - obj
|
|
// e.g. "12 0 obj"
|
|
if core.IsDecimalDigit(b) && last[bufLen-1] == '.' && core.IsDecimalDigit(last[bufLen-2]) && last[bufLen-3] == '-' &&
|
|
last[bufLen-4] == 'F' && last[bufLen-5] == 'D' && last[bufLen-6] == 'P' {
|
|
major := int(last[bufLen-2] - '0')
|
|
minor := int(b - '0')
|
|
return major, minor, nil
|
|
}
|
|
|
|
last = append(last[1:bufLen], b)
|
|
}
|
|
|
|
return 0, 0, errors.New("version not found")
|
|
}
|