2016-07-09 14:09:27 +00:00
|
|
|
/*
|
|
|
|
* This file is subject to the terms and conditions defined in
|
2016-07-29 17:23:39 +00:00
|
|
|
* file 'LICENSE.md', which is part of this source code package.
|
2016-07-09 14:09:27 +00:00
|
|
|
*/
|
|
|
|
|
|
|
|
package pdf
|
|
|
|
|
|
|
|
import (
|
|
|
|
"errors"
|
|
|
|
"fmt"
|
|
|
|
"io"
|
2016-08-17 00:07:56 +00:00
|
|
|
"strings"
|
2016-07-17 19:59:17 +00:00
|
|
|
|
|
|
|
"github.com/unidoc/unidoc/common"
|
2016-07-09 14:09:27 +00:00
|
|
|
)
|
|
|
|
|
|
|
|
type PdfReader struct {
|
2016-08-17 14:05:29 +00:00
|
|
|
parser *PdfParser
|
|
|
|
root PdfObject
|
|
|
|
pages *PdfObjectDictionary
|
|
|
|
pageList []*PdfIndirectObject
|
|
|
|
PageList []*PdfPage
|
|
|
|
pageCount int
|
|
|
|
catalog *PdfObjectDictionary
|
2016-08-16 09:36:24 +00:00
|
|
|
outlineTree *PdfOutlineTreeNode
|
|
|
|
forms *PdfObjectDictionary
|
2016-07-09 14:09:27 +00:00
|
|
|
|
|
|
|
// For tracking traversal (cache).
|
|
|
|
traversed map[PdfObject]bool
|
|
|
|
}
|
|
|
|
|
|
|
|
func NewPdfReader(rs io.ReadSeeker) (*PdfReader, error) {
|
|
|
|
pdfReader := &PdfReader{}
|
|
|
|
pdfReader.traversed = map[PdfObject]bool{}
|
|
|
|
|
|
|
|
// Create the parser, loads the cross reference table and trailer.
|
|
|
|
parser, err := NewParser(rs)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
pdfReader.parser = parser
|
|
|
|
|
|
|
|
isEncrypted, err := pdfReader.IsEncrypted()
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
// Load pdf doc structure if not encrypted.
|
|
|
|
if !isEncrypted {
|
|
|
|
err = pdfReader.loadStructure()
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return pdfReader, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func (this *PdfReader) IsEncrypted() (bool, error) {
|
|
|
|
return this.parser.IsEncrypted()
|
|
|
|
}
|
|
|
|
|
|
|
|
// Decrypt the PDF file with a specified password. Also tries to
|
|
|
|
// decrypt with an empty password. Returns true if successful,
|
|
|
|
// false otherwise.
|
|
|
|
func (this *PdfReader) Decrypt(password []byte) (bool, error) {
|
|
|
|
success, err := this.parser.Decrypt(password)
|
|
|
|
if err != nil {
|
|
|
|
return false, err
|
|
|
|
}
|
|
|
|
if !success {
|
|
|
|
return false, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
err = this.loadStructure()
|
|
|
|
if err != nil {
|
2016-07-17 19:59:17 +00:00
|
|
|
common.Log.Error("Fail to load structure (%s)", err)
|
2016-07-09 14:09:27 +00:00
|
|
|
return false, err
|
|
|
|
}
|
|
|
|
|
|
|
|
return true, nil
|
|
|
|
}
|
|
|
|
|
2016-08-15 01:31:35 +00:00
|
|
|
// Loads the structure of the pdf file: pages, outlines, etc.
|
2016-07-09 14:09:27 +00:00
|
|
|
func (this *PdfReader) loadStructure() error {
|
|
|
|
if this.parser.crypter != nil && !this.parser.crypter.authenticated {
|
|
|
|
return fmt.Errorf("File need to be decrypted first")
|
|
|
|
}
|
|
|
|
|
2016-08-15 01:31:35 +00:00
|
|
|
// Catalog.
|
2016-07-09 14:09:27 +00:00
|
|
|
root, ok := (*(this.parser.trailer))["Root"].(*PdfObjectReference)
|
|
|
|
if !ok {
|
|
|
|
return fmt.Errorf("Invalid Root (trailer: %s)", *(this.parser.trailer))
|
|
|
|
}
|
|
|
|
oc, err := this.parser.LookupByReference(*root)
|
|
|
|
if err != nil {
|
2016-07-17 19:59:17 +00:00
|
|
|
common.Log.Error("Failed to read root element catalog: %s", err)
|
2016-07-09 14:09:27 +00:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
pcatalog, ok := oc.(*PdfIndirectObject)
|
|
|
|
if !ok {
|
2016-07-17 19:59:17 +00:00
|
|
|
common.Log.Error("Missing catalog: (root %q) (trailer %s)", oc, *(this.parser.trailer))
|
2016-07-09 14:09:27 +00:00
|
|
|
return errors.New("Missing catalog")
|
|
|
|
}
|
|
|
|
catalog, ok := (*pcatalog).PdfObject.(*PdfObjectDictionary)
|
|
|
|
if !ok {
|
2016-07-17 19:59:17 +00:00
|
|
|
common.Log.Error("Invalid catalog (%s)", pcatalog.PdfObject)
|
2016-07-09 14:09:27 +00:00
|
|
|
return errors.New("Invalid catalog")
|
|
|
|
}
|
2016-07-17 19:59:17 +00:00
|
|
|
common.Log.Debug("Catalog: %s", catalog)
|
2016-07-09 14:09:27 +00:00
|
|
|
|
2016-08-15 01:31:35 +00:00
|
|
|
// Pages.
|
2016-07-09 14:09:27 +00:00
|
|
|
pagesRef, ok := (*catalog)["Pages"].(*PdfObjectReference)
|
|
|
|
if !ok {
|
|
|
|
return errors.New("Pages in catalog should be a reference")
|
|
|
|
}
|
|
|
|
op, err := this.parser.LookupByReference(*pagesRef)
|
|
|
|
if err != nil {
|
2016-07-17 19:59:17 +00:00
|
|
|
common.Log.Error("Failed to read pages")
|
2016-07-09 14:09:27 +00:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
ppages, ok := op.(*PdfIndirectObject)
|
|
|
|
if !ok {
|
2016-07-17 19:59:17 +00:00
|
|
|
common.Log.Error("Pages object invalid")
|
|
|
|
common.Log.Error("op: %p", ppages)
|
2016-07-09 14:09:27 +00:00
|
|
|
return errors.New("Pages object invalid")
|
|
|
|
}
|
|
|
|
pages, ok := ppages.PdfObject.(*PdfObjectDictionary)
|
|
|
|
if !ok {
|
2016-07-17 19:59:17 +00:00
|
|
|
common.Log.Error("Pages object invalid (%s)", ppages)
|
2016-07-09 14:09:27 +00:00
|
|
|
return errors.New("Pages object invalid")
|
|
|
|
}
|
|
|
|
pageCount, ok := (*pages)["Count"].(*PdfObjectInteger)
|
|
|
|
if !ok {
|
2016-07-17 19:59:17 +00:00
|
|
|
common.Log.Error("Pages count object invalid")
|
2016-07-09 14:09:27 +00:00
|
|
|
return errors.New("Pages count invalid")
|
|
|
|
}
|
|
|
|
|
|
|
|
this.root = root
|
|
|
|
this.catalog = catalog
|
|
|
|
this.pages = pages
|
|
|
|
this.pageCount = int(*pageCount)
|
|
|
|
this.pageList = []*PdfIndirectObject{}
|
|
|
|
|
2016-08-15 01:31:35 +00:00
|
|
|
err = this.buildPageList(ppages, nil)
|
2016-07-09 14:09:27 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2016-07-17 19:59:17 +00:00
|
|
|
common.Log.Debug("---")
|
|
|
|
common.Log.Debug("TOC")
|
|
|
|
common.Log.Debug("Pages")
|
|
|
|
common.Log.Debug("%d: %s", len(this.pageList), this.pageList)
|
2016-07-09 14:09:27 +00:00
|
|
|
|
2016-08-17 14:05:29 +00:00
|
|
|
// Outlines.
|
2016-08-16 17:57:23 +00:00
|
|
|
this.outlineTree, err = this.loadOutlines()
|
2016-08-16 09:36:24 +00:00
|
|
|
if err != nil {
|
|
|
|
common.Log.Error("Failed to build outline tree (%s)", err)
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
2016-07-09 14:09:27 +00:00
|
|
|
// Get forms.
|
|
|
|
this.forms, err = this.GetForms()
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2016-08-15 01:31:35 +00:00
|
|
|
//
|
|
|
|
// Trace to object. Keeps a list of already visited references to avoid circular references.
|
|
|
|
//
|
|
|
|
// Example circular reference.
|
|
|
|
// 1 0 obj << /Next 2 0 R >>
|
|
|
|
// 2 0 obj << /Next 1 0 R >>
|
|
|
|
//
|
|
|
|
func (this *PdfReader) traceToObjectWrapper(obj PdfObject, refList map[*PdfObjectReference]bool) (PdfObject, error) {
|
|
|
|
// Keep a list of references to avoid circular references.
|
|
|
|
|
|
|
|
ref, isRef := obj.(*PdfObjectReference)
|
|
|
|
if isRef {
|
|
|
|
// Make sure not already visited (circular ref).
|
|
|
|
if _, alreadyTraversed := refList[ref]; alreadyTraversed {
|
|
|
|
return nil, errors.New("Circular reference")
|
|
|
|
}
|
|
|
|
refList[ref] = true
|
|
|
|
obj, err := this.parser.LookupByReference(*ref)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
return this.traceToObjectWrapper(obj, refList)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Not a reference, an object. Can be indirect or any direct pdf object (other than reference).
|
|
|
|
return obj, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func (this *PdfReader) traceToObject(obj PdfObject) (PdfObject, error) {
|
|
|
|
refList := map[*PdfObjectReference]bool{}
|
|
|
|
return this.traceToObjectWrapper(obj, refList)
|
|
|
|
}
|
|
|
|
|
2016-08-16 17:57:23 +00:00
|
|
|
func (this *PdfReader) loadOutlines() (*PdfOutlineTreeNode, error) {
|
2016-08-15 01:31:35 +00:00
|
|
|
if this.parser.crypter != nil && !this.parser.crypter.authenticated {
|
|
|
|
return nil, fmt.Errorf("File need to be decrypted first")
|
|
|
|
}
|
|
|
|
|
|
|
|
outlines := &PdfOutline{}
|
|
|
|
|
2016-08-16 09:36:24 +00:00
|
|
|
// Has outlines? Otherwise return an empty outlines structure.
|
2016-08-15 01:31:35 +00:00
|
|
|
catalog := this.catalog
|
|
|
|
outlinesObj, hasOutlines := (*catalog)["Outlines"]
|
|
|
|
if !hasOutlines {
|
2016-08-16 09:36:24 +00:00
|
|
|
return &outlines.PdfOutlineTreeNode, nil
|
2016-08-15 01:31:35 +00:00
|
|
|
}
|
|
|
|
|
2016-08-17 00:07:56 +00:00
|
|
|
common.Log.Debug("-Has outlines")
|
2016-08-16 09:36:24 +00:00
|
|
|
// Trace references to the object.
|
2016-08-15 01:31:35 +00:00
|
|
|
outlineRootObj, err := this.traceToObject(outlinesObj)
|
|
|
|
if err != nil {
|
|
|
|
common.Log.Error("Failed to read outlines")
|
|
|
|
return nil, err
|
|
|
|
}
|
2016-08-17 00:07:56 +00:00
|
|
|
common.Log.Debug("Outline root: %v", outlineRootObj)
|
2016-08-15 01:31:35 +00:00
|
|
|
|
|
|
|
outlineRoot, ok := outlineRootObj.(*PdfIndirectObject)
|
|
|
|
if !ok {
|
2016-08-16 09:36:24 +00:00
|
|
|
return &outlines.PdfOutlineTreeNode, errors.New("Outline root should be an indirect object")
|
2016-08-15 01:31:35 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
dict, ok := outlineRoot.PdfObject.(*PdfObjectDictionary)
|
|
|
|
if !ok {
|
2016-08-16 09:36:24 +00:00
|
|
|
return &outlines.PdfOutlineTreeNode, errors.New("Outline indirect object should contain a dictionary")
|
2016-08-15 01:31:35 +00:00
|
|
|
}
|
|
|
|
|
2016-08-17 00:07:56 +00:00
|
|
|
common.Log.Debug("Outline root dict: %v", dict)
|
|
|
|
|
2016-08-16 09:36:24 +00:00
|
|
|
outlineTree, err := this.buildOutlineTree(dict)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
2016-08-17 00:07:56 +00:00
|
|
|
common.Log.Debug("Resulting outline tree: %v", outlineTree)
|
2016-08-15 01:31:35 +00:00
|
|
|
|
2016-08-16 17:57:23 +00:00
|
|
|
return outlineTree, nil
|
2016-08-15 01:31:35 +00:00
|
|
|
}
|
|
|
|
|
2016-08-16 17:57:23 +00:00
|
|
|
// Recursive build outline tree.
|
2016-08-16 09:36:24 +00:00
|
|
|
func (this *PdfReader) buildOutlineTree(obj PdfObject) (*PdfOutlineTreeNode, error) {
|
2016-08-15 01:31:35 +00:00
|
|
|
dict, ok := TraceToDirectObject(obj).(*PdfObjectDictionary)
|
|
|
|
if !ok {
|
2016-08-16 09:36:24 +00:00
|
|
|
return nil, errors.New("Not a dictionary object")
|
2016-08-15 01:31:35 +00:00
|
|
|
}
|
2016-08-17 00:07:56 +00:00
|
|
|
common.Log.Debug("build outline tree: dict: %v", dict)
|
2016-08-15 01:31:35 +00:00
|
|
|
|
2016-08-16 09:36:24 +00:00
|
|
|
if _, hasTitle := (*dict)["Title"]; hasTitle {
|
2016-08-17 00:07:56 +00:00
|
|
|
// Outline item has a title.
|
2016-08-16 09:36:24 +00:00
|
|
|
outlineItem, err := newPdfOutlineItemFromDict(dict)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
// Resolve the reference to next
|
|
|
|
if nextObj, hasNext := (*dict)["Next"]; hasNext {
|
2016-08-17 00:07:56 +00:00
|
|
|
nextObj, err = this.traceToObject(nextObj)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
2016-08-16 09:36:24 +00:00
|
|
|
nextDict, ok := TraceToDirectObject(nextObj).(*PdfObjectDictionary)
|
|
|
|
if !ok {
|
2016-08-17 00:07:56 +00:00
|
|
|
return nil, fmt.Errorf("Next not a dictionary object (%T)", nextObj)
|
2016-08-16 09:36:24 +00:00
|
|
|
}
|
|
|
|
outlineItem.Next, err = this.buildOutlineTree(nextDict)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if firstObj, hasChildren := (*dict)["First"]; hasChildren {
|
2016-08-17 00:07:56 +00:00
|
|
|
firstObj, err = this.traceToObject(firstObj)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
2016-08-16 09:36:24 +00:00
|
|
|
firstDict, ok := TraceToDirectObject(firstObj).(*PdfObjectDictionary)
|
|
|
|
if !ok {
|
2016-08-17 00:07:56 +00:00
|
|
|
return nil, fmt.Errorf("First not a dictionary object (%T)", firstObj)
|
2016-08-16 09:36:24 +00:00
|
|
|
}
|
|
|
|
outlineItem.First, err = this.buildOutlineTree(firstDict)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
}
|
2016-08-17 00:07:56 +00:00
|
|
|
return &outlineItem.PdfOutlineTreeNode, nil
|
2016-08-16 09:36:24 +00:00
|
|
|
} else {
|
2016-08-17 00:07:56 +00:00
|
|
|
// Outline dictionary (structure element).
|
2016-08-16 09:36:24 +00:00
|
|
|
outline, err := newPdfOutlineFromDict(dict)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
if firstObj, hasChildren := (*dict)["First"]; hasChildren {
|
2016-08-17 00:07:56 +00:00
|
|
|
firstObj, err = this.traceToObject(firstObj)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
2016-08-16 09:36:24 +00:00
|
|
|
firstDict, ok := TraceToDirectObject(firstObj).(*PdfObjectDictionary)
|
|
|
|
if !ok {
|
2016-08-17 00:07:56 +00:00
|
|
|
return nil, fmt.Errorf("First not a dictionary object (%T)", firstObj)
|
2016-08-16 09:36:24 +00:00
|
|
|
}
|
|
|
|
outline.First, err = this.buildOutlineTree(firstDict)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
}
|
2016-08-17 00:07:56 +00:00
|
|
|
return &outline.PdfOutlineTreeNode, nil
|
2016-08-15 01:31:35 +00:00
|
|
|
}
|
2016-08-16 09:36:24 +00:00
|
|
|
}
|
|
|
|
|
2016-08-17 00:07:56 +00:00
|
|
|
func (this *PdfReader) GetOutlinesFlattened() ([]*PdfOutlineTreeNode, []string, error) {
|
|
|
|
outlineNodeList := []*PdfOutlineTreeNode{}
|
|
|
|
flattenedTitleList := []string{}
|
|
|
|
|
|
|
|
// Recursive flattening function.
|
|
|
|
var flattenFunc func(*PdfOutlineTreeNode, *[]*PdfOutlineTreeNode, *[]string, int)
|
|
|
|
flattenFunc = func(node *PdfOutlineTreeNode, outlineList *[]*PdfOutlineTreeNode, titleList *[]string, depth int) {
|
|
|
|
if node == nil {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
if node.context == nil {
|
|
|
|
common.Log.Error("Missing node.context") // Should not happen ever.
|
|
|
|
return
|
|
|
|
}
|
2016-08-16 17:57:23 +00:00
|
|
|
|
2016-08-17 00:07:56 +00:00
|
|
|
if item, isItem := node.context.(*PdfOutlineItem); isItem {
|
|
|
|
*outlineList = append(*outlineList, &item.PdfOutlineTreeNode)
|
|
|
|
title := strings.Repeat(" ", depth*2) + string(*item.Title)
|
|
|
|
*titleList = append(*titleList, title)
|
|
|
|
if item.Next != nil {
|
|
|
|
flattenFunc(item.Next, outlineList, titleList, depth)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if node.First != nil {
|
|
|
|
title := strings.Repeat(" ", depth*2) + "+"
|
|
|
|
*titleList = append(*titleList, title)
|
|
|
|
flattenFunc(node.First, outlineList, titleList, depth+1)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
flattenFunc(this.outlineTree, &outlineNodeList, &flattenedTitleList, 0)
|
|
|
|
return outlineNodeList, flattenedTitleList, nil
|
2016-08-15 01:31:35 +00:00
|
|
|
}
|
|
|
|
|
2016-07-09 14:09:27 +00:00
|
|
|
// Get document form data.
|
|
|
|
func (this *PdfReader) GetForms() (*PdfObjectDictionary, error) {
|
|
|
|
if this.parser.crypter != nil && !this.parser.crypter.authenticated {
|
|
|
|
return nil, fmt.Errorf("File need to be decrypted first")
|
|
|
|
}
|
|
|
|
// Has forms?
|
|
|
|
catalog := this.catalog
|
|
|
|
|
|
|
|
var formsDict *PdfObjectDictionary
|
|
|
|
|
|
|
|
if dict, hasFormsDict := (*catalog)["AcroForm"].(*PdfObjectDictionary); hasFormsDict {
|
2016-07-17 19:59:17 +00:00
|
|
|
common.Log.Debug("Has Acro forms - dictionary under Catalog")
|
2016-07-09 14:09:27 +00:00
|
|
|
formsDict = dict
|
|
|
|
} else if formsRef, hasFormsRef := (*catalog)["AcroForm"].(*PdfObjectReference); hasFormsRef {
|
2016-07-17 19:59:17 +00:00
|
|
|
common.Log.Debug("Has Acro forms - Indirect object")
|
2016-07-09 14:09:27 +00:00
|
|
|
formsObj, err := this.parser.LookupByReference(*formsRef)
|
|
|
|
if err != nil {
|
2016-07-17 19:59:17 +00:00
|
|
|
common.Log.Error("Failed to read forms")
|
2016-07-09 14:09:27 +00:00
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
if iobj, ok := formsObj.(*PdfIndirectObject); ok {
|
|
|
|
if dict, ok := iobj.PdfObject.(*PdfObjectDictionary); ok {
|
|
|
|
formsDict = dict
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if formsDict == nil {
|
2016-07-17 19:59:17 +00:00
|
|
|
common.Log.Debug("Does not have forms")
|
2016-07-09 14:09:27 +00:00
|
|
|
return nil, nil
|
|
|
|
}
|
|
|
|
|
2016-07-17 19:59:17 +00:00
|
|
|
common.Log.Debug("Has Acro forms")
|
2016-07-09 14:09:27 +00:00
|
|
|
|
2016-07-17 19:59:17 +00:00
|
|
|
common.Log.Debug("Traverse the Acroforms structure")
|
2016-08-16 16:22:01 +00:00
|
|
|
err := this.traverseObjectData(formsDict)
|
2016-07-09 14:09:27 +00:00
|
|
|
if err != nil {
|
2016-07-17 19:59:17 +00:00
|
|
|
common.Log.Error("Unable to traverse AcroForms (%s)", err)
|
2016-07-09 14:09:27 +00:00
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
return formsDict, nil
|
|
|
|
}
|
|
|
|
|
2016-08-15 01:31:35 +00:00
|
|
|
func (this *PdfReader) lookupPageByObject(obj PdfObject) (*PdfPage, error) {
|
|
|
|
// can be indirect, direct, or reference
|
|
|
|
// look up the corresponding page
|
|
|
|
return nil, errors.New("Page not found")
|
|
|
|
}
|
|
|
|
|
2016-07-09 14:09:27 +00:00
|
|
|
// Build the table of contents.
|
|
|
|
// tree, ex: Pages -> Pages -> Pages -> Page
|
|
|
|
// Traverse through the whole thing recursively.
|
2016-08-15 01:31:35 +00:00
|
|
|
func (this *PdfReader) buildPageList(node *PdfIndirectObject, parent *PdfIndirectObject) error {
|
2016-07-09 14:09:27 +00:00
|
|
|
if node == nil {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
nodeDict, ok := node.PdfObject.(*PdfObjectDictionary)
|
|
|
|
if !ok {
|
|
|
|
return errors.New("Node not a dictionary")
|
|
|
|
}
|
|
|
|
|
|
|
|
objType, ok := (*nodeDict)["Type"].(*PdfObjectName)
|
|
|
|
if !ok {
|
|
|
|
return errors.New("Node missing Type (Required)")
|
|
|
|
}
|
2016-08-15 01:31:35 +00:00
|
|
|
common.Log.Debug("buildPageList node type: %s", *objType)
|
2016-07-09 14:09:27 +00:00
|
|
|
if *objType == "Page" {
|
2016-08-16 09:36:24 +00:00
|
|
|
p, err := this.newPdfPageFromDict(nodeDict)
|
2016-08-15 01:31:35 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
2016-07-09 14:09:27 +00:00
|
|
|
if parent != nil {
|
|
|
|
// Set the parent (in case missing or incorrect).
|
|
|
|
(*nodeDict)["Parent"] = parent
|
|
|
|
}
|
|
|
|
this.pageList = append(this.pageList, node)
|
2016-08-15 01:31:35 +00:00
|
|
|
this.PageList = append(this.PageList, p)
|
|
|
|
|
2016-07-09 14:09:27 +00:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
if *objType != "Pages" {
|
2016-07-17 19:59:17 +00:00
|
|
|
common.Log.Error("Table of content containing non Page/Pages object! (%s)", objType)
|
2016-07-09 14:09:27 +00:00
|
|
|
return errors.New("Table of content containing non Page/Pages object!")
|
|
|
|
}
|
|
|
|
|
|
|
|
// A Pages object. Update the parent.
|
|
|
|
if parent != nil {
|
|
|
|
(*nodeDict)["Parent"] = parent
|
|
|
|
}
|
|
|
|
|
2016-08-16 16:22:01 +00:00
|
|
|
// Resolve the object recursively.
|
|
|
|
err := this.traverseObjectData(node)
|
2016-07-09 14:09:27 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
kidsObj, err := this.parser.Trace((*nodeDict)["Kids"])
|
|
|
|
if err != nil {
|
2016-07-17 19:59:17 +00:00
|
|
|
common.Log.Error("Failed loading Kids object")
|
2016-07-09 14:09:27 +00:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
var kids *PdfObjectArray
|
|
|
|
kids, ok = kidsObj.(*PdfObjectArray)
|
|
|
|
if !ok {
|
|
|
|
kidsIndirect, isIndirect := kidsObj.(*PdfIndirectObject)
|
|
|
|
if !isIndirect {
|
|
|
|
return errors.New("Invalid Kids object")
|
|
|
|
}
|
|
|
|
kids, ok = kidsIndirect.PdfObject.(*PdfObjectArray)
|
|
|
|
if !ok {
|
|
|
|
return errors.New("Invalid Kids indirect object")
|
|
|
|
}
|
|
|
|
}
|
2016-07-17 19:59:17 +00:00
|
|
|
common.Log.Debug("Kids: %s", kids)
|
2016-07-09 14:09:27 +00:00
|
|
|
for idx, child := range *kids {
|
2016-08-16 16:22:01 +00:00
|
|
|
child, ok := child.(*PdfIndirectObject)
|
2016-07-09 14:09:27 +00:00
|
|
|
if !ok {
|
2016-08-16 16:22:01 +00:00
|
|
|
common.Log.Error("Page not indirect object - (%s)", child)
|
2016-07-09 14:09:27 +00:00
|
|
|
return errors.New("Page not indirect object")
|
|
|
|
}
|
|
|
|
(*kids)[idx] = child
|
2016-08-15 01:31:35 +00:00
|
|
|
err = this.buildPageList(child, node)
|
2016-07-09 14:09:27 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// Get the number of pages in the document.
|
|
|
|
func (this *PdfReader) GetNumPages() (int, error) {
|
|
|
|
if this.parser.crypter != nil && !this.parser.crypter.authenticated {
|
|
|
|
return -1, fmt.Errorf("File need to be decrypted first")
|
|
|
|
}
|
|
|
|
return len(this.pageList), nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// Resolves a reference, returning the object and indicates whether or not
|
|
|
|
// it was cached.
|
|
|
|
func (this *PdfReader) resolveReference(ref *PdfObjectReference) (PdfObject, bool, error) {
|
|
|
|
cachedObj, isCached := this.parser.ObjCache[int(ref.ObjectNumber)]
|
|
|
|
if !isCached {
|
2016-07-17 19:59:17 +00:00
|
|
|
common.Log.Debug("Reader Lookup ref: %s", ref)
|
2016-07-09 14:09:27 +00:00
|
|
|
obj, err := this.parser.LookupByReference(*ref)
|
|
|
|
if err != nil {
|
|
|
|
return nil, false, err
|
|
|
|
}
|
|
|
|
this.parser.ObjCache[int(ref.ObjectNumber)] = obj
|
|
|
|
return obj, false, nil
|
|
|
|
}
|
|
|
|
return cachedObj, true, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Recursively traverse through the page object data and look up
|
|
|
|
* references to indirect objects.
|
2016-08-16 16:22:01 +00:00
|
|
|
*
|
|
|
|
* GH: Are we fully protected against circular references? (Add tests).
|
2016-07-09 14:09:27 +00:00
|
|
|
*/
|
2016-08-16 16:22:01 +00:00
|
|
|
func (this *PdfReader) traverseObjectData(o PdfObject) error {
|
2016-07-17 19:59:17 +00:00
|
|
|
common.Log.Debug("Traverse object data")
|
2016-07-09 14:09:27 +00:00
|
|
|
if _, isTraversed := this.traversed[o]; isTraversed {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
this.traversed[o] = true
|
|
|
|
|
|
|
|
if io, isIndirectObj := o.(*PdfIndirectObject); isIndirectObj {
|
2016-07-17 19:59:17 +00:00
|
|
|
common.Log.Debug("io: %s", io)
|
|
|
|
common.Log.Debug("- %s", io.PdfObject)
|
2016-08-16 16:22:01 +00:00
|
|
|
err := this.traverseObjectData(io.PdfObject)
|
2016-07-09 14:09:27 +00:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
if so, isStreamObj := o.(*PdfObjectStream); isStreamObj {
|
2016-08-16 16:22:01 +00:00
|
|
|
err := this.traverseObjectData(so.PdfObjectDictionary)
|
2016-07-09 14:09:27 +00:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
if dict, isDict := o.(*PdfObjectDictionary); isDict {
|
2016-07-17 19:59:17 +00:00
|
|
|
common.Log.Debug("- dict: %s", dict)
|
2016-07-09 14:09:27 +00:00
|
|
|
for name, v := range *dict {
|
|
|
|
if ref, isRef := v.(*PdfObjectReference); isRef {
|
|
|
|
resolvedObj, _, err := this.resolveReference(ref)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
(*dict)[name] = resolvedObj
|
2016-08-16 16:22:01 +00:00
|
|
|
err = this.traverseObjectData(resolvedObj)
|
2016-07-09 14:09:27 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
} else {
|
2016-08-16 16:22:01 +00:00
|
|
|
err := this.traverseObjectData(v)
|
2016-07-09 14:09:27 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
if arr, isArray := o.(*PdfObjectArray); isArray {
|
2016-07-17 19:59:17 +00:00
|
|
|
common.Log.Debug("- array: %s", arr)
|
2016-07-09 14:09:27 +00:00
|
|
|
for idx, v := range *arr {
|
|
|
|
if ref, isRef := v.(*PdfObjectReference); isRef {
|
|
|
|
resolvedObj, _, err := this.resolveReference(ref)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
(*arr)[idx] = resolvedObj
|
|
|
|
|
2016-08-16 16:22:01 +00:00
|
|
|
err = this.traverseObjectData(resolvedObj)
|
2016-07-09 14:09:27 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
} else {
|
2016-08-16 16:22:01 +00:00
|
|
|
err := this.traverseObjectData(v)
|
2016-07-09 14:09:27 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
if _, isRef := o.(*PdfObjectReference); isRef {
|
2016-07-17 19:59:17 +00:00
|
|
|
common.Log.Error("Reader tracing a reference!")
|
2016-07-09 14:09:27 +00:00
|
|
|
return errors.New("Reader tracing a reference!")
|
|
|
|
}
|
|
|
|
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// Get outlines referring to a specific page. Only checks the outermost
|
|
|
|
// outlines.
|
2016-08-17 00:07:56 +00:00
|
|
|
/*
|
2016-07-09 14:09:27 +00:00
|
|
|
func (this *PdfReader) GetOutlinesForPage(page PdfObject) ([]*PdfIndirectObject, error) {
|
|
|
|
if this.parser.crypter != nil && !this.parser.crypter.authenticated {
|
|
|
|
return nil, fmt.Errorf("File need to be decrypted first")
|
|
|
|
}
|
|
|
|
pageOutlines := []*PdfIndirectObject{}
|
|
|
|
|
|
|
|
for _, outlineObj := range this.outlines {
|
|
|
|
dict, ok := (*outlineObj).PdfObject.(*PdfObjectDictionary)
|
|
|
|
if !ok {
|
2016-07-17 19:59:17 +00:00
|
|
|
common.Log.Error("Invalid outlines entry")
|
2016-07-09 14:09:27 +00:00
|
|
|
return pageOutlines, fmt.Errorf("Invalid outlines entry")
|
|
|
|
}
|
|
|
|
|
|
|
|
if dest, hasDest := (*dict)["Dest"].(*PdfObjectArray); hasDest {
|
|
|
|
if len(*dest) > 0 {
|
|
|
|
if (*dest)[0] == page {
|
|
|
|
pageOutlines = append(pageOutlines, outlineObj)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// Action: GoTo destination (page) can refer directly to a page.
|
|
|
|
// TODO: Support more potential actions. Make generic.
|
|
|
|
// Can we make those sub conditionals cleaner? Some kind of
|
|
|
|
// generic tree traversal / unmarshalling.
|
|
|
|
if dict, hasAdict := (*dict)["A"].(*PdfObjectDictionary); hasAdict {
|
|
|
|
if s, hasS := (*dict)["S"].(*PdfObjectName); hasS {
|
|
|
|
if *s == "GoTo" {
|
|
|
|
if d, hasD := (*dict)["D"].(*PdfObjectArray); hasD {
|
|
|
|
if len(*d) > 0 {
|
|
|
|
if (*d)[0] == page {
|
|
|
|
pageOutlines = append(pageOutlines, outlineObj)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if a, hasA := (*dict)["A"].(*PdfIndirectObject); hasA {
|
|
|
|
if dict, ok := a.PdfObject.(*PdfObjectDictionary); ok {
|
|
|
|
if s, hasS := (*dict)["S"].(*PdfObjectName); hasS {
|
|
|
|
if *s == "GoTo" {
|
|
|
|
if d, hasD := (*dict)["D"].(*PdfObjectArray); hasD {
|
|
|
|
if len(*d) > 0 {
|
|
|
|
if (*d)[0] == page {
|
|
|
|
pageOutlines = append(pageOutlines, outlineObj)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return pageOutlines, nil
|
|
|
|
}
|
2016-08-17 00:07:56 +00:00
|
|
|
*/
|
2016-07-09 14:09:27 +00:00
|
|
|
|
|
|
|
// Get a page by the page number.
|
|
|
|
// Indirect object with type /Page.
|
|
|
|
func (this *PdfReader) GetPage(pageNumber int) (PdfObject, error) {
|
|
|
|
if this.parser.crypter != nil && !this.parser.crypter.authenticated {
|
|
|
|
return nil, fmt.Errorf("File need to be decrypted first")
|
|
|
|
}
|
|
|
|
if len(this.pageList) < pageNumber {
|
|
|
|
return nil, errors.New("Invalid page number (page count too short)")
|
|
|
|
}
|
|
|
|
page := this.pageList[pageNumber-1]
|
|
|
|
|
|
|
|
// Look up all references related to page and load everything.
|
2016-08-16 16:22:01 +00:00
|
|
|
err := this.traverseObjectData(page)
|
2016-07-09 14:09:27 +00:00
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
2016-07-17 19:59:17 +00:00
|
|
|
common.Log.Debug("Page: %T %s", page, page)
|
|
|
|
common.Log.Debug("- %T %s", page.PdfObject, page.PdfObject)
|
2016-07-09 14:09:27 +00:00
|
|
|
|
|
|
|
return page, nil
|
|
|
|
}
|