2016-07-09 14:09:27 +00:00
|
|
|
/*
|
|
|
|
* This file is subject to the terms and conditions defined in
|
2016-07-29 17:23:39 +00:00
|
|
|
* file 'LICENSE.md', which is part of this source code package.
|
2016-07-09 14:09:27 +00:00
|
|
|
*/
|
|
|
|
|
2016-09-08 17:53:45 +00:00
|
|
|
package core
|
2016-07-09 14:09:27 +00:00
|
|
|
|
|
|
|
import (
|
2016-11-28 14:54:38 +00:00
|
|
|
"fmt"
|
2016-07-09 14:09:27 +00:00
|
|
|
"sort"
|
|
|
|
|
|
|
|
"github.com/unidoc/unidoc/common"
|
|
|
|
)
|
|
|
|
|
2016-11-28 14:54:38 +00:00
|
|
|
// Inspect analyzes the document object structure.
|
2016-12-02 15:17:19 +00:00
|
|
|
func (this *PdfParser) Inspect() (map[string]int, error) {
|
|
|
|
return this.inspect()
|
2016-07-09 14:09:27 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
func getUniDocVersion() string {
|
|
|
|
return common.Version
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Inspect object types.
|
|
|
|
* Go through all objects in the cross ref table and detect the types.
|
2016-09-08 17:53:45 +00:00
|
|
|
* Mostly for debugging purposes and inspecting odd PDF files.
|
2016-07-09 14:09:27 +00:00
|
|
|
*/
|
2016-11-28 14:54:38 +00:00
|
|
|
func (this *PdfParser) inspect() (map[string]int, error) {
|
2017-03-02 18:06:32 +00:00
|
|
|
common.Log.Trace("--------INSPECT ----------")
|
|
|
|
common.Log.Trace("Xref table:")
|
2016-07-09 14:09:27 +00:00
|
|
|
|
|
|
|
objTypes := map[string]int{}
|
|
|
|
objCount := 0
|
|
|
|
failedCount := 0
|
|
|
|
|
|
|
|
keys := []int{}
|
|
|
|
for k, _ := range this.xrefs {
|
|
|
|
keys = append(keys, k)
|
|
|
|
}
|
|
|
|
sort.Ints(keys)
|
|
|
|
|
|
|
|
i := 0
|
|
|
|
for _, k := range keys {
|
|
|
|
xref := this.xrefs[k]
|
|
|
|
if xref.objectNumber == 0 {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
objCount++
|
2017-03-02 18:06:32 +00:00
|
|
|
common.Log.Trace("==========")
|
|
|
|
common.Log.Trace("Looking up object number: %d", xref.objectNumber)
|
2016-07-09 14:09:27 +00:00
|
|
|
o, err := this.LookupByNumber(xref.objectNumber)
|
|
|
|
if err != nil {
|
2017-03-02 18:06:32 +00:00
|
|
|
common.Log.Trace("ERROR: Fail to lookup obj %d (%s)", xref.objectNumber, err)
|
2016-07-09 14:09:27 +00:00
|
|
|
failedCount++
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
2017-03-02 18:06:32 +00:00
|
|
|
common.Log.Trace("obj: %s", o)
|
2016-07-09 14:09:27 +00:00
|
|
|
|
|
|
|
iobj, isIndirect := o.(*PdfIndirectObject)
|
|
|
|
if isIndirect {
|
2017-03-02 18:06:32 +00:00
|
|
|
common.Log.Trace("IND OOBJ %d: %s", xref.objectNumber, iobj)
|
2016-07-09 14:09:27 +00:00
|
|
|
dict, isDict := iobj.PdfObject.(*PdfObjectDictionary)
|
|
|
|
if isDict {
|
2016-11-28 22:21:20 +00:00
|
|
|
// Check if has Type parameter.
|
|
|
|
if ot, has := (*dict)["Type"].(*PdfObjectName); has {
|
2016-07-09 14:09:27 +00:00
|
|
|
otype := string(*ot)
|
2017-03-02 18:06:32 +00:00
|
|
|
common.Log.Trace("---> Obj type: %s", otype)
|
2016-07-09 14:09:27 +00:00
|
|
|
_, isDefined := objTypes[otype]
|
|
|
|
if isDefined {
|
|
|
|
objTypes[otype]++
|
|
|
|
} else {
|
|
|
|
objTypes[otype] = 1
|
|
|
|
}
|
2016-11-28 22:21:20 +00:00
|
|
|
} else if ot, has := (*dict)["Subtype"].(*PdfObjectName); has {
|
|
|
|
// Check if subtype
|
|
|
|
otype := string(*ot)
|
2017-03-02 18:06:32 +00:00
|
|
|
common.Log.Trace("---> Obj subtype: %s", otype)
|
2016-11-28 22:21:20 +00:00
|
|
|
_, isDefined := objTypes[otype]
|
|
|
|
if isDefined {
|
|
|
|
objTypes[otype]++
|
|
|
|
} else {
|
|
|
|
objTypes[otype] = 1
|
|
|
|
}
|
2016-07-09 14:09:27 +00:00
|
|
|
}
|
2016-11-28 22:21:20 +00:00
|
|
|
if val, has := (*dict)["S"].(*PdfObjectName); has && *val == "JavaScript" {
|
|
|
|
// Check if Javascript.
|
|
|
|
_, isDefined := objTypes["JavaScript"]
|
|
|
|
if isDefined {
|
|
|
|
objTypes["JavaScript"]++
|
|
|
|
} else {
|
|
|
|
objTypes["JavaScript"] = 1
|
|
|
|
}
|
2016-07-09 14:09:27 +00:00
|
|
|
}
|
2016-11-28 22:21:20 +00:00
|
|
|
|
2016-07-09 14:09:27 +00:00
|
|
|
}
|
|
|
|
} else if sobj, isStream := o.(*PdfObjectStream); isStream {
|
|
|
|
if otype, ok := (*(sobj.PdfObjectDictionary))["Type"].(*PdfObjectName); ok {
|
2017-03-02 18:06:32 +00:00
|
|
|
common.Log.Trace("--> Stream object type: %s", *otype)
|
2016-07-09 14:09:27 +00:00
|
|
|
k := string(*otype)
|
|
|
|
if _, isDefined := objTypes[k]; isDefined {
|
|
|
|
objTypes[k]++
|
|
|
|
} else {
|
|
|
|
objTypes[k] = 1
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else { // Direct.
|
|
|
|
dict, isDict := o.(*PdfObjectDictionary)
|
|
|
|
if isDict {
|
|
|
|
ot, isName := (*dict)["Type"].(*PdfObjectName)
|
|
|
|
if isName {
|
|
|
|
otype := string(*ot)
|
2017-03-02 18:06:32 +00:00
|
|
|
common.Log.Trace("--- obj type %s", otype)
|
2016-07-09 14:09:27 +00:00
|
|
|
objTypes[otype]++
|
|
|
|
}
|
|
|
|
}
|
2017-03-02 18:06:32 +00:00
|
|
|
common.Log.Trace("DIRECT OBJ %d: %s", xref.objectNumber, o)
|
2016-07-09 14:09:27 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
i++
|
|
|
|
}
|
2017-03-02 18:06:32 +00:00
|
|
|
common.Log.Trace("--------EOF INSPECT ----------")
|
|
|
|
common.Log.Trace("=======")
|
|
|
|
common.Log.Trace("Object count: %d", objCount)
|
|
|
|
common.Log.Trace("Failed lookup: %d", failedCount)
|
2016-07-09 14:09:27 +00:00
|
|
|
for t, c := range objTypes {
|
2017-03-02 18:06:32 +00:00
|
|
|
common.Log.Trace("%s: %d", t, c)
|
2016-07-09 14:09:27 +00:00
|
|
|
}
|
2017-03-02 18:06:32 +00:00
|
|
|
common.Log.Trace("=======")
|
2016-07-09 14:09:27 +00:00
|
|
|
|
|
|
|
if len(this.xrefs) < 1 {
|
2016-10-31 21:48:25 +00:00
|
|
|
common.Log.Debug("ERROR: This document is invalid (xref table missing!)")
|
2016-11-28 14:54:38 +00:00
|
|
|
return nil, fmt.Errorf("Invalid document (xref table missing)")
|
2016-07-09 14:09:27 +00:00
|
|
|
}
|
2016-11-28 14:54:38 +00:00
|
|
|
|
2016-07-09 14:09:27 +00:00
|
|
|
fontObjs, ok := objTypes["Font"]
|
|
|
|
if !ok || fontObjs < 2 {
|
2017-03-02 18:06:32 +00:00
|
|
|
common.Log.Trace("This document is probably scanned!")
|
2016-07-09 14:09:27 +00:00
|
|
|
} else {
|
2017-03-02 18:06:32 +00:00
|
|
|
common.Log.Trace("This document is valid for extraction!")
|
2016-07-09 14:09:27 +00:00
|
|
|
}
|
2016-11-28 14:54:38 +00:00
|
|
|
|
|
|
|
return objTypes, nil
|
2016-07-09 14:09:27 +00:00
|
|
|
}
|
2017-04-19 11:46:53 +00:00
|
|
|
|
|
|
|
// absInt returns the absolute value of x.
//
// Note: negating the most negative int overflows and wraps back to the same
// value, so for that single input the result is still negative.
func absInt(x int) int {
	if x < 0 {
		return -x
	}
	return x
}
|