2016-07-09 14:09:27 +00:00
|
|
|
/*
|
|
|
|
* This file is subject to the terms and conditions defined in
|
2016-07-29 17:23:39 +00:00
|
|
|
* file 'LICENSE.md', which is part of this source code package.
|
2016-07-09 14:09:27 +00:00
|
|
|
*/
|
|
|
|
|
2016-09-08 17:53:45 +00:00
|
|
|
package core
|
2016-07-09 14:09:27 +00:00
|
|
|
|
|
|
|
import (
|
2017-07-23 20:20:05 +00:00
|
|
|
"errors"
|
2016-11-28 14:54:38 +00:00
|
|
|
"fmt"
|
2018-06-27 12:25:59 +10:00
|
|
|
"reflect"
|
2016-07-09 14:09:27 +00:00
|
|
|
"sort"
|
|
|
|
|
|
|
|
"github.com/unidoc/unidoc/common"
|
|
|
|
)
|
|
|
|
|
2017-07-23 20:20:05 +00:00
|
|
|
// checkBounds validates that the half-open range [a, b) can be used to slice a
// sequence of length sliceLen, i.e. that slice[a:b] would not panic.
//
// The checks are applied in a fixed order and the first failure is reported:
//  1. a must lie in [0, sliceLen],
//  2. b must not be smaller than a,
//  3. b must not exceed sliceLen.
//
// It returns nil when the range is usable.
func checkBounds(sliceLen, a, b int) error {
	switch {
	case a < 0, a > sliceLen:
		return errors.New("Slice index a out of bounds")
	case b < a:
		return errors.New("Invalid slice index b < a")
	case b > sliceLen:
		return errors.New("Slice index b out of bounds")
	default:
		return nil
	}
}
|
|
|
|
|
2016-11-28 14:54:38 +00:00
|
|
|
// Inspect analyzes the document object structure.
|
2017-08-03 15:33:51 +00:00
|
|
|
func (parser *PdfParser) Inspect() (map[string]int, error) {
|
|
|
|
return parser.inspect()
|
2016-07-09 14:09:27 +00:00
|
|
|
}
|
|
|
|
|
2017-08-05 00:56:05 +00:00
|
|
|
// GetObjectNums returns a sorted list of object numbers of the PDF objects in the file.
|
|
|
|
func (parser *PdfParser) GetObjectNums() []int {
|
|
|
|
objNums := []int{}
|
|
|
|
for _, x := range parser.xrefs {
|
|
|
|
objNums = append(objNums, x.objectNumber)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Sort the object numbers to give consistent ordering of PDF objects in output.
|
|
|
|
// Needed since parser.xrefs is a map.
|
|
|
|
sort.Ints(objNums)
|
|
|
|
|
|
|
|
return objNums
|
|
|
|
}
|
|
|
|
|
2016-07-09 14:09:27 +00:00
|
|
|
func getUniDocVersion() string {
|
|
|
|
return common.Version
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Inspect object types.
|
|
|
|
* Go through all objects in the cross ref table and detect the types.
|
2016-09-08 17:53:45 +00:00
|
|
|
* Mostly for debugging purposes and inspecting odd PDF files.
|
2016-07-09 14:09:27 +00:00
|
|
|
*/
|
2017-08-03 15:33:51 +00:00
|
|
|
func (parser *PdfParser) inspect() (map[string]int, error) {
|
2017-03-02 18:06:32 +00:00
|
|
|
common.Log.Trace("--------INSPECT ----------")
|
|
|
|
common.Log.Trace("Xref table:")
|
2016-07-09 14:09:27 +00:00
|
|
|
|
|
|
|
objTypes := map[string]int{}
|
|
|
|
objCount := 0
|
|
|
|
failedCount := 0
|
|
|
|
|
|
|
|
keys := []int{}
|
2017-08-03 15:33:51 +00:00
|
|
|
for k := range parser.xrefs {
|
2016-07-09 14:09:27 +00:00
|
|
|
keys = append(keys, k)
|
|
|
|
}
|
|
|
|
sort.Ints(keys)
|
|
|
|
|
|
|
|
i := 0
|
|
|
|
for _, k := range keys {
|
2017-08-03 15:33:51 +00:00
|
|
|
xref := parser.xrefs[k]
|
2016-07-09 14:09:27 +00:00
|
|
|
if xref.objectNumber == 0 {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
objCount++
|
2017-03-02 18:06:32 +00:00
|
|
|
common.Log.Trace("==========")
|
|
|
|
common.Log.Trace("Looking up object number: %d", xref.objectNumber)
|
2017-08-03 15:33:51 +00:00
|
|
|
o, err := parser.LookupByNumber(xref.objectNumber)
|
2016-07-09 14:09:27 +00:00
|
|
|
if err != nil {
|
2017-03-02 18:06:32 +00:00
|
|
|
common.Log.Trace("ERROR: Fail to lookup obj %d (%s)", xref.objectNumber, err)
|
2016-07-09 14:09:27 +00:00
|
|
|
failedCount++
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
2017-03-02 18:06:32 +00:00
|
|
|
common.Log.Trace("obj: %s", o)
|
2016-07-09 14:09:27 +00:00
|
|
|
|
|
|
|
iobj, isIndirect := o.(*PdfIndirectObject)
|
|
|
|
if isIndirect {
|
2017-03-02 18:06:32 +00:00
|
|
|
common.Log.Trace("IND OOBJ %d: %s", xref.objectNumber, iobj)
|
2016-07-09 14:09:27 +00:00
|
|
|
dict, isDict := iobj.PdfObject.(*PdfObjectDictionary)
|
|
|
|
if isDict {
|
2016-11-28 22:21:20 +00:00
|
|
|
// Check if has Type parameter.
|
2017-07-08 21:04:13 +00:00
|
|
|
if ot, has := dict.Get("Type").(*PdfObjectName); has {
|
2016-07-09 14:09:27 +00:00
|
|
|
otype := string(*ot)
|
2017-03-02 18:06:32 +00:00
|
|
|
common.Log.Trace("---> Obj type: %s", otype)
|
2016-07-09 14:09:27 +00:00
|
|
|
_, isDefined := objTypes[otype]
|
|
|
|
if isDefined {
|
|
|
|
objTypes[otype]++
|
|
|
|
} else {
|
|
|
|
objTypes[otype] = 1
|
|
|
|
}
|
2017-07-08 21:04:13 +00:00
|
|
|
} else if ot, has := dict.Get("Subtype").(*PdfObjectName); has {
|
2016-11-28 22:21:20 +00:00
|
|
|
// Check if subtype
|
|
|
|
otype := string(*ot)
|
2017-03-02 18:06:32 +00:00
|
|
|
common.Log.Trace("---> Obj subtype: %s", otype)
|
2016-11-28 22:21:20 +00:00
|
|
|
_, isDefined := objTypes[otype]
|
|
|
|
if isDefined {
|
|
|
|
objTypes[otype]++
|
|
|
|
} else {
|
|
|
|
objTypes[otype] = 1
|
|
|
|
}
|
2016-07-09 14:09:27 +00:00
|
|
|
}
|
2017-07-08 21:04:13 +00:00
|
|
|
if val, has := dict.Get("S").(*PdfObjectName); has && *val == "JavaScript" {
|
2016-11-28 22:21:20 +00:00
|
|
|
// Check if Javascript.
|
|
|
|
_, isDefined := objTypes["JavaScript"]
|
|
|
|
if isDefined {
|
|
|
|
objTypes["JavaScript"]++
|
|
|
|
} else {
|
|
|
|
objTypes["JavaScript"] = 1
|
|
|
|
}
|
2016-07-09 14:09:27 +00:00
|
|
|
}
|
2016-11-28 22:21:20 +00:00
|
|
|
|
2016-07-09 14:09:27 +00:00
|
|
|
}
|
|
|
|
} else if sobj, isStream := o.(*PdfObjectStream); isStream {
|
2017-07-08 21:04:13 +00:00
|
|
|
if otype, ok := sobj.PdfObjectDictionary.Get("Type").(*PdfObjectName); ok {
|
2017-03-02 18:06:32 +00:00
|
|
|
common.Log.Trace("--> Stream object type: %s", *otype)
|
2016-07-09 14:09:27 +00:00
|
|
|
k := string(*otype)
|
|
|
|
if _, isDefined := objTypes[k]; isDefined {
|
|
|
|
objTypes[k]++
|
|
|
|
} else {
|
|
|
|
objTypes[k] = 1
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else { // Direct.
|
|
|
|
dict, isDict := o.(*PdfObjectDictionary)
|
|
|
|
if isDict {
|
2017-07-08 21:04:13 +00:00
|
|
|
ot, isName := dict.Get("Type").(*PdfObjectName)
|
2016-07-09 14:09:27 +00:00
|
|
|
if isName {
|
|
|
|
otype := string(*ot)
|
2017-03-02 18:06:32 +00:00
|
|
|
common.Log.Trace("--- obj type %s", otype)
|
2016-07-09 14:09:27 +00:00
|
|
|
objTypes[otype]++
|
|
|
|
}
|
|
|
|
}
|
2017-03-02 18:06:32 +00:00
|
|
|
common.Log.Trace("DIRECT OBJ %d: %s", xref.objectNumber, o)
|
2016-07-09 14:09:27 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
i++
|
|
|
|
}
|
2017-03-02 18:06:32 +00:00
|
|
|
common.Log.Trace("--------EOF INSPECT ----------")
|
|
|
|
common.Log.Trace("=======")
|
|
|
|
common.Log.Trace("Object count: %d", objCount)
|
|
|
|
common.Log.Trace("Failed lookup: %d", failedCount)
|
2016-07-09 14:09:27 +00:00
|
|
|
for t, c := range objTypes {
|
2017-03-02 18:06:32 +00:00
|
|
|
common.Log.Trace("%s: %d", t, c)
|
2016-07-09 14:09:27 +00:00
|
|
|
}
|
2017-03-02 18:06:32 +00:00
|
|
|
common.Log.Trace("=======")
|
2016-07-09 14:09:27 +00:00
|
|
|
|
2017-08-03 15:33:51 +00:00
|
|
|
if len(parser.xrefs) < 1 {
|
2016-10-31 21:48:25 +00:00
|
|
|
common.Log.Debug("ERROR: This document is invalid (xref table missing!)")
|
2016-11-28 14:54:38 +00:00
|
|
|
return nil, fmt.Errorf("Invalid document (xref table missing)")
|
2016-07-09 14:09:27 +00:00
|
|
|
}
|
2016-11-28 14:54:38 +00:00
|
|
|
|
2016-07-09 14:09:27 +00:00
|
|
|
fontObjs, ok := objTypes["Font"]
|
|
|
|
if !ok || fontObjs < 2 {
|
2017-03-02 18:06:32 +00:00
|
|
|
common.Log.Trace("This document is probably scanned!")
|
2016-07-09 14:09:27 +00:00
|
|
|
} else {
|
2017-03-02 18:06:32 +00:00
|
|
|
common.Log.Trace("This document is valid for extraction!")
|
2016-07-09 14:09:27 +00:00
|
|
|
}
|
2016-11-28 14:54:38 +00:00
|
|
|
|
|
|
|
return objTypes, nil
|
2016-07-09 14:09:27 +00:00
|
|
|
}
|
2017-04-19 11:46:53 +00:00
|
|
|
|
|
|
|
// absInt returns the absolute value of x.
// NOTE: negating the most negative int wraps around, so that single input is
// returned unchanged (still negative).
func absInt(x int) int {
	magnitude := x
	if magnitude < 0 {
		magnitude = -magnitude
	}
	return magnitude
}
|
|
|
|
|
|
|
|
// GetString returns the string represented by `obj` if `obj` is a PdfObjectString or an error if it isn't.
|
|
|
|
func GetString(obj PdfObject) (string, error) {
|
|
|
|
if s, ok := obj.(*PdfObjectString); ok {
|
|
|
|
return string(*s), nil
|
|
|
|
}
|
2018-07-15 16:28:56 +10:00
|
|
|
return "", ErrTypeError
|
2018-06-27 12:25:59 +10:00
|
|
|
}
|
|
|
|
|
2018-06-27 14:34:42 +10:00
|
|
|
// GetStringBytes returns the bytes represented by `obj` if `obj` is a PdfObjectString or an error if it isn't.
|
2018-06-27 12:25:59 +10:00
|
|
|
func GetStringBytes(obj PdfObject) ([]byte, error) {
|
|
|
|
if s, ok := obj.(*PdfObjectString); ok {
|
|
|
|
return []byte(*s), nil
|
|
|
|
}
|
2018-07-15 16:28:56 +10:00
|
|
|
return []byte{}, ErrTypeError
|
2018-06-27 12:25:59 +10:00
|
|
|
}
|
|
|
|
|
2018-06-27 14:34:42 +10:00
|
|
|
// GetName returns the string represented by `obj` if `obj` is a PdfObjectName or an error if it isn't.
|
2018-06-27 12:25:59 +10:00
|
|
|
func GetName(obj PdfObject) (string, error) {
|
|
|
|
if s, ok := obj.(*PdfObjectName); ok {
|
|
|
|
return string(*s), nil
|
|
|
|
}
|
2018-07-15 16:28:56 +10:00
|
|
|
return "", ErrTypeError
|
2018-06-27 12:25:59 +10:00
|
|
|
}
|
|
|
|
|
|
|
|
// GetInteger returns the int represented by `obj` if `obj` is a PdfObjectInteger or an error if it isn't.
|
|
|
|
func GetInteger(obj PdfObject) (int, error) {
|
|
|
|
if i, ok := obj.(*PdfObjectInteger); ok {
|
|
|
|
return int(*i), nil
|
|
|
|
}
|
2018-07-15 16:28:56 +10:00
|
|
|
return 0, ErrTypeError
|
2018-06-27 12:25:59 +10:00
|
|
|
}
|
|
|
|
|
|
|
|
// GetArray returns the slice of PdfObjects represented by `obj` if `obj` is a PdfObjectArray or an
|
|
|
|
// error if it isn't.
|
|
|
|
func GetArray(obj PdfObject) ([]PdfObject, error) {
|
|
|
|
if s, ok := obj.(*PdfObjectArray); ok {
|
|
|
|
return []PdfObject(*s), nil
|
|
|
|
}
|
2018-07-15 16:28:56 +10:00
|
|
|
return nil, ErrTypeError
|
2018-06-27 12:25:59 +10:00
|
|
|
}
|
|
|
|
|
|
|
|
// EqualObjects returns true if `obj1` and `obj2` have the same contents.
|
|
|
|
// NOTE: It is a good idea to flatten obj1 and obj2 with FlattenObject before calling this function
|
|
|
|
// so that contents, rather than references, can be compared.
|
|
|
|
func EqualObjects(obj1, obj2 PdfObject) bool {
|
|
|
|
return equalObjects(obj1, obj2, 0)
|
|
|
|
}
|
|
|
|
|
|
|
|
// equalObjects returns true if `obj1` and `obj2` have the same contents.
|
|
|
|
// It recursively checks the contents of indirect objects, arrays and dicts to a depth of
|
|
|
|
// TraceMaxDepth. `depth` is the current recusion depth.
|
|
|
|
func equalObjects(obj1, obj2 PdfObject, depth int) bool {
|
|
|
|
if depth > TraceMaxDepth {
|
|
|
|
common.Log.Error("Trace depth level beyond %d - error!", TraceMaxDepth)
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
|
|
|
if obj1 == nil && obj2 == nil {
|
|
|
|
return true
|
|
|
|
} else if obj1 == nil || obj2 == nil {
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
if reflect.TypeOf(obj1) != reflect.TypeOf(obj2) {
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
|
|
|
// obj1 and obj2 are non-nil and of the same type
|
|
|
|
switch t1 := obj1.(type) {
|
|
|
|
case *PdfObjectNull, *PdfObjectReference:
|
|
|
|
return true
|
|
|
|
case *PdfObjectName:
|
|
|
|
return *t1 == *(obj2.(*PdfObjectName))
|
|
|
|
case *PdfObjectString:
|
|
|
|
return *t1 == *(obj2.(*PdfObjectString))
|
|
|
|
case *PdfObjectInteger:
|
|
|
|
return *t1 == *(obj2.(*PdfObjectInteger))
|
|
|
|
case *PdfObjectBool:
|
|
|
|
return *t1 == *(obj2.(*PdfObjectBool))
|
|
|
|
case *PdfObjectFloat:
|
|
|
|
return *t1 == *(obj2.(*PdfObjectFloat))
|
|
|
|
case *PdfIndirectObject:
|
|
|
|
return equalObjects(TraceToDirectObject(obj1), TraceToDirectObject(obj2), depth+1)
|
|
|
|
case *PdfObjectArray:
|
|
|
|
t2 := obj2.(*PdfObjectArray)
|
|
|
|
if len(*t1) != len(*t2) {
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
for i, o1 := range *t1 {
|
|
|
|
if !equalObjects(o1, (*t2)[i], depth+1) {
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return true
|
|
|
|
case *PdfObjectDictionary:
|
|
|
|
t2 := obj2.(*PdfObjectDictionary)
|
|
|
|
d1, d2 := (*t1).dict, (*t2).dict
|
|
|
|
if len(d1) != len(d2) {
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
for k, o1 := range d1 {
|
|
|
|
o2, ok := d2[k]
|
|
|
|
if !ok || !equalObjects(o1, o2, depth+1) {
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return true
|
|
|
|
case *PdfObjectStream:
|
|
|
|
t2 := obj2.(*PdfObjectStream)
|
|
|
|
return equalObjects((*t1).PdfObjectDictionary, (*t2).PdfObjectDictionary, depth+1)
|
|
|
|
default:
|
|
|
|
common.Log.Error("ERROR: Unknown type: %T - should never happen!", obj1)
|
|
|
|
}
|
|
|
|
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
|
|
|
// FlattenObject returns the contents of `obj`. In other words, `obj` with indirect objects replaced
|
|
|
|
// by their values.
|
|
|
|
// The replacements are made recursively to a depth of TraceMaxDepth.
|
|
|
|
// NOTE: Dicts are sorted to make objects with same contents have the same PDF object strings.
|
|
|
|
func FlattenObject(obj PdfObject) PdfObject {
|
|
|
|
return flattenObject(obj, 0)
|
|
|
|
}
|
|
|
|
|
|
|
|
// flattenObject returns `obj` with indirect objects recursively replaced by their values.
|
|
|
|
// `depth` is the recursion depth.
|
|
|
|
func flattenObject(obj PdfObject, depth int) PdfObject {
|
|
|
|
if depth > TraceMaxDepth {
|
|
|
|
common.Log.Error("Trace depth level beyond %d - error!", TraceMaxDepth)
|
2018-06-29 18:09:44 +10:00
|
|
|
return MakeNull()
|
2018-06-27 12:25:59 +10:00
|
|
|
}
|
|
|
|
switch t := obj.(type) {
|
|
|
|
case *PdfIndirectObject:
|
|
|
|
obj = flattenObject((*t).PdfObject, depth+1)
|
|
|
|
case *PdfObjectArray:
|
|
|
|
for i, o := range *t {
|
|
|
|
(*t)[i] = flattenObject(o, depth+1)
|
|
|
|
}
|
|
|
|
case *PdfObjectDictionary:
|
|
|
|
for k, o := range (*t).dict {
|
|
|
|
(*t).dict[k] = flattenObject(o, depth+1)
|
|
|
|
}
|
|
|
|
sort.Slice((*t).keys, func(i, j int) bool { return (*t).keys[i] < (*t).keys[j] })
|
|
|
|
}
|
|
|
|
return obj
|
2017-04-19 11:46:53 +00:00
|
|
|
}
|