unipdf/core/utils.go

/*
 * This file is subject to the terms and conditions defined in
 * file 'LICENSE.md', which is part of this source code package.
 */

package core

import (
	"errors"
	"fmt"
	"reflect"
	"sort"

	"github.com/unidoc/unipdf/v3/common"
)

// Check slice range to make sure within bounds for accessing:
//    slice[a:b] where sliceLen=len(slice).
func checkBounds(sliceLen, a, b int) error {
	if a < 0 || a > sliceLen {
		return errors.New("slice index a out of bounds")
	}
	if b < a {
		return errors.New("invalid slice index b < a")
	}
	if b > sliceLen {
		return errors.New("slice index b out of bounds")
	}

	return nil
}

// Inspect analyzes the document object structure. Returns a map of object types (by name) with the instance count
// as value.
func (parser *PdfParser) Inspect() (map[string]int, error) {
	return parser.inspect()
}

// GetObjectNums returns a sorted list of object numbers of the PDF objects in the file.
func (parser *PdfParser) GetObjectNums() []int {
	var objNums []int
	for _, x := range parser.xrefs.ObjectMap {
		objNums = append(objNums, x.ObjectNumber)
	}

	// Sort the object numbers to give consistent ordering of PDF objects in output.
	// Needed since parser.xrefs is a map.
	sort.Ints(objNums)

	return objNums
}

// ResolveReference resolves reference if `o` is a *PdfObjectReference and returns the object referenced to.
// Otherwise returns back `o`.
func ResolveReference(obj PdfObject) PdfObject {
	if ref, isRef := obj.(*PdfObjectReference); isRef {
		return ref.Resolve()
	}
	return obj
}

// ResolveReferencesDeep recursively traverses through object `o`, looking up and replacing
// references with indirect objects.
// Optionally a map of already deep-resolved objects can be provided via `traversed`. The `traversed` map
// is updated while traversing the objects to avoid traversing same objects multiple times.
func ResolveReferencesDeep(o PdfObject, traversed map[PdfObject]struct{}) error {
	if traversed == nil {
		traversed = map[PdfObject]struct{}{}
	}
	return resolveReferencesDeep(o, 0, traversed)
}

func resolveReferencesDeep(o PdfObject, depth int, traversed map[PdfObject]struct{}) error {
	common.Log.Trace("Traverse object data (depth = %d)", depth)
	if _, isTraversed := traversed[o]; isTraversed {
		common.Log.Trace("-Already traversed...")
		return nil
	}
	traversed[o] = struct{}{}

	switch t := o.(type) {
	case *PdfIndirectObject:
		io := t
		common.Log.Trace("io: %s", io)
		common.Log.Trace("- %s", io.PdfObject)
		return resolveReferencesDeep(io.PdfObject, depth+1, traversed)
	case *PdfObjectStream:
		so := t
		return resolveReferencesDeep(so.PdfObjectDictionary, depth+1, traversed)
	case *PdfObjectDictionary:
		dict := t
		common.Log.Trace("- dict: %s", dict)
		for _, name := range dict.Keys() {
			v := dict.Get(name)
			if ref, isRef := v.(*PdfObjectReference); isRef {
				resolvedObj := ref.Resolve()
				dict.Set(name, resolvedObj)
				err := resolveReferencesDeep(resolvedObj, depth+1, traversed)
				if err != nil {
					return err
				}
			} else {
				err := resolveReferencesDeep(v, depth+1, traversed)
				if err != nil {
					return err
				}
			}
		}
		return nil
	case *PdfObjectArray:
		arr := t
		common.Log.Trace("- array: %s", arr)
		for idx, v := range arr.Elements() {
			if ref, isRef := v.(*PdfObjectReference); isRef {
				resolvedObj := ref.Resolve()
				arr.Set(idx, resolvedObj)
				err := resolveReferencesDeep(resolvedObj, depth+1, traversed)
				if err != nil {
					return err
				}
			} else {
				err := resolveReferencesDeep(v, depth+1, traversed)
				if err != nil {
					return err
				}
			}
		}
		return nil
	case *PdfObjectReference:
		common.Log.Debug("ERROR: Tracing a reference!")
		return errors.New("error tracing a reference")
	}

	return nil
}

func getUniDocVersion() string {
	return common.Version
}

/*
 * Inspect object types.
 * Go through all objects in the cross ref table and detect the types.
 * Mostly for debugging purposes and inspecting odd PDF files.
 */
func (parser *PdfParser) inspect() (map[string]int, error) {
	common.Log.Trace("--------INSPECT ----------")
	common.Log.Trace("Xref table:")

	objTypes := map[string]int{}
	objCount := 0
	failedCount := 0

	var keys []int
	for k := range parser.xrefs.ObjectMap {
		keys = append(keys, k)
	}
	sort.Ints(keys)

	i := 0
	for _, k := range keys {
		xref := parser.xrefs.ObjectMap[k]
		if xref.ObjectNumber == 0 {
			continue
		}
		objCount++
		common.Log.Trace("==========")
		common.Log.Trace("Looking up object number: %d", xref.ObjectNumber)
		o, err := parser.LookupByNumber(xref.ObjectNumber)
		if err != nil {
			common.Log.Trace("ERROR: Fail to lookup obj %d (%s)", xref.ObjectNumber, err)
			failedCount++
			continue
		}

		common.Log.Trace("obj: %s", o)

		iobj, isIndirect := o.(*PdfIndirectObject)
		if isIndirect {
			common.Log.Trace("IND OOBJ %d: %s", xref.ObjectNumber, iobj)
			dict, isDict := iobj.PdfObject.(*PdfObjectDictionary)
			if isDict {
				// Check if has Type parameter.
				if ot, has := dict.Get("Type").(*PdfObjectName); has {
					otype := string(*ot)
					common.Log.Trace("---> Obj type: %s", otype)
					_, isDefined := objTypes[otype]
					if isDefined {
						objTypes[otype]++
					} else {
						objTypes[otype] = 1
					}
				} else if ot, has := dict.Get("Subtype").(*PdfObjectName); has {
					// Check if subtype
					otype := string(*ot)
					common.Log.Trace("---> Obj subtype: %s", otype)
					_, isDefined := objTypes[otype]
					if isDefined {
						objTypes[otype]++
					} else {
						objTypes[otype] = 1
					}
				}
				if val, has := dict.Get("S").(*PdfObjectName); has && *val == "JavaScript" {
					// Check if Javascript.
					_, isDefined := objTypes["JavaScript"]
					if isDefined {
						objTypes["JavaScript"]++
					} else {
						objTypes["JavaScript"] = 1
					}
				}

			}
		} else if sobj, isStream := o.(*PdfObjectStream); isStream {
			if otype, ok := sobj.PdfObjectDictionary.Get("Type").(*PdfObjectName); ok {
				common.Log.Trace("--> Stream object type: %s", *otype)
				k := string(*otype)
				if _, isDefined := objTypes[k]; isDefined {
					objTypes[k]++
				} else {
					objTypes[k] = 1
				}
			}
		} else { // Direct.
			dict, isDict := o.(*PdfObjectDictionary)
			if isDict {
				ot, isName := dict.Get("Type").(*PdfObjectName)
				if isName {
					otype := string(*ot)
					common.Log.Trace("--- obj type %s", otype)
					objTypes[otype]++
				}
			}
			common.Log.Trace("DIRECT OBJ %d: %s", xref.ObjectNumber, o)
		}

		i++
	}
	common.Log.Trace("--------EOF INSPECT ----------")
	common.Log.Trace("=======")
	common.Log.Trace("Object count: %d", objCount)
	common.Log.Trace("Failed lookup: %d", failedCount)
	for t, c := range objTypes {
		common.Log.Trace("%s: %d", t, c)
	}
	common.Log.Trace("=======")

	if len(parser.xrefs.ObjectMap) < 1 {
		common.Log.Debug("ERROR: This document is invalid (xref table missing!)")
		return nil, fmt.Errorf("invalid document (xref table missing)")
	}

	fontObjs, ok := objTypes["Font"]
	if !ok || fontObjs < 2 {
		common.Log.Trace("This document is probably scanned!")
	} else {
		common.Log.Trace("This document is valid for extraction!")
	}

	return objTypes, nil
}

func absInt(x int) int {
	if x < 0 {
		return -x
	}
	return x
}

// EqualObjects returns true if `obj1` and `obj2` have the same contents.
//
// NOTE: It is a good idea to flatten obj1 and obj2 with FlattenObject before calling this function
// so that contents, rather than references, can be compared.
func EqualObjects(obj1, obj2 PdfObject) bool {
	return equalObjects(obj1, obj2, 0)
}

// equalObjects returns true if `obj1` and `obj2` have the same contents.
// It recursively checks the contents of indirect objects, arrays and dicts to a depth of
// TraceMaxDepth. `depth` is the current recusion depth.
func equalObjects(obj1, obj2 PdfObject, depth int) bool {
	if depth > traceMaxDepth {
		common.Log.Error("Trace depth level beyond %d - error!", traceMaxDepth)
		return false
	}

	if obj1 == nil && obj2 == nil {
		return true
	} else if obj1 == nil || obj2 == nil {
		return false
	}
	if reflect.TypeOf(obj1) != reflect.TypeOf(obj2) {
		return false
	}

	// obj1 and obj2 are non-nil and of the same type
	switch t1 := obj1.(type) {
	case *PdfObjectNull, *PdfObjectReference:
		return true
	case *PdfObjectName:
		return *t1 == *(obj2.(*PdfObjectName))
	case *PdfObjectString:
		return *t1 == *(obj2.(*PdfObjectString))
	case *PdfObjectInteger:
		return *t1 == *(obj2.(*PdfObjectInteger))
	case *PdfObjectBool:
		return *t1 == *(obj2.(*PdfObjectBool))
	case *PdfObjectFloat:
		return *t1 == *(obj2.(*PdfObjectFloat))
	case *PdfIndirectObject:
		return equalObjects(TraceToDirectObject(obj1), TraceToDirectObject(obj2), depth+1)
	case *PdfObjectArray:
		t2 := obj2.(*PdfObjectArray)
		if len((*t1).vec) != len((*t2).vec) {
			return false
		}
		for i, o1 := range (*t1).vec {
			if !equalObjects(o1, (*t2).vec[i], depth+1) {
				return false
			}
		}
		return true
	case *PdfObjectDictionary:
		t2 := obj2.(*PdfObjectDictionary)
		d1, d2 := (*t1).dict, (*t2).dict
		if len(d1) != len(d2) {
			return false
		}
		for k, o1 := range d1 {
			o2, ok := d2[k]
			if !ok || !equalObjects(o1, o2, depth+1) {
				return false
			}
		}
		return true
	case *PdfObjectStream:
		t2 := obj2.(*PdfObjectStream)
		return equalObjects((*t1).PdfObjectDictionary, (*t2).PdfObjectDictionary, depth+1)
	default:
		common.Log.Error("ERROR: Unknown type: %T - should never happen!", obj1)
	}

	return false
}

// FlattenObject returns the contents of `obj`. In other words, `obj` with indirect objects replaced
// by their values.
// The replacements are made recursively to a depth of traceMaxDepth.
// NOTE: Dicts are sorted to make objects with same contents have the same PDF object strings.
func FlattenObject(obj PdfObject) PdfObject {
	return flattenObject(obj, 0)
}

// flattenObject returns `obj` with indirect objects recursively replaced by their values.
// `depth` is the recursion depth.
func flattenObject(obj PdfObject, depth int) PdfObject {
	if depth > traceMaxDepth {
		common.Log.Error("Trace depth level beyond %d - error!", traceMaxDepth)
		return MakeNull()
	}
	switch t := obj.(type) {
	case *PdfIndirectObject:
		obj = flattenObject((*t).PdfObject, depth+1)
	case *PdfObjectArray:
		for i, o := range (*t).vec {
			(*t).vec[i] = flattenObject(o, depth+1)
		}
	case *PdfObjectDictionary:
		for k, o := range (*t).dict {
			(*t).dict[k] = flattenObject(o, depth+1)
		}
		sort.Slice((*t).keys, func(i, j int) bool { return (*t).keys[i] < (*t).keys[j] })
	}
	return obj
}