mirror of
https://github.com/unidoc/unipdf.git
synced 2025-04-26 13:48:55 +08:00
919 lines
26 KiB
Go
919 lines
26 KiB
Go
/*
|
|
* This file is subject to the terms and conditions defined in
|
|
* file 'LICENSE.md', which is part of this source code package.
|
|
*/
|
|
|
|
package model
|
|
|
|
import (
|
|
"errors"
|
|
"fmt"
|
|
"io"
|
|
"strings"
|
|
|
|
"github.com/unidoc/unipdf/v3/common"
|
|
"github.com/unidoc/unipdf/v3/core"
|
|
"github.com/unidoc/unipdf/v3/core/security"
|
|
)
|
|
|
|
// PdfReader represents a PDF file reader. It is a frontend to the lower level parsing mechanism and provides
|
|
// a higher level access to work with PDF structure and information, such as the page structure etc.
|
|
type PdfReader struct {
|
|
parser *core.PdfParser
|
|
root core.PdfObject
|
|
pagesContainer *core.PdfIndirectObject
|
|
pages *core.PdfObjectDictionary
|
|
pageList []*core.PdfIndirectObject
|
|
PageList []*PdfPage
|
|
pageCount int
|
|
catalog *core.PdfObjectDictionary
|
|
outlineTree *PdfOutlineTreeNode
|
|
AcroForm *PdfAcroForm
|
|
|
|
modelManager *modelManager
|
|
|
|
// Lazy loading: When enabled reference objects need to be resolved (via lookup, disk access) rather
|
|
// than loading entire document into memory on load.
|
|
isLazy bool
|
|
|
|
// For tracking traversal (cache).
|
|
traversed map[core.PdfObject]struct{}
|
|
rs io.ReadSeeker
|
|
}
|
|
|
|
// NewPdfReader returns a new PdfReader for an input io.ReadSeeker interface. Can be used to read PDF from
|
|
// memory or file. Immediately loads and traverses the PDF structure including pages and page contents (if
|
|
// not encrypted). Loads entire document structure into memory.
|
|
// Alternatively a lazy-loading reader can be created with NewPdfReaderLazy which loads only references,
|
|
// and references are loaded from disk into memory on an as-needed basis.
|
|
func NewPdfReader(rs io.ReadSeeker) (*PdfReader, error) {
|
|
pdfReader := &PdfReader{
|
|
rs: rs,
|
|
traversed: map[core.PdfObject]struct{}{},
|
|
modelManager: newModelManager(),
|
|
isLazy: false,
|
|
}
|
|
|
|
// Create the parser, loads the cross reference table and trailer.
|
|
parser, err := core.NewParser(rs)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
pdfReader.parser = parser
|
|
|
|
isEncrypted, err := pdfReader.IsEncrypted()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// Load pdf doc structure if not encrypted.
|
|
if !isEncrypted {
|
|
err = pdfReader.loadStructure()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
|
|
return pdfReader, nil
|
|
}
|
|
|
|
// NewPdfReaderLazy creates a new PdfReader for `rs` in lazy-loading mode. The difference
|
|
// from NewPdfReader is that in lazy-loading mode, objects are only loaded into memory when needed
|
|
// rather than entire structure being loaded into memory on reader creation.
|
|
// Note that it may make sense to use the lazy-load reader when processing only parts of files,
|
|
// rather than loading entire file into memory. Example: splitting a few pages from a large PDF file.
|
|
func NewPdfReaderLazy(rs io.ReadSeeker) (*PdfReader, error) {
|
|
pdfReader := &PdfReader{
|
|
rs: rs,
|
|
traversed: map[core.PdfObject]struct{}{},
|
|
modelManager: newModelManager(),
|
|
isLazy: true,
|
|
}
|
|
|
|
// Create the parser, loads the cross reference table and trailer.
|
|
parser, err := core.NewParser(rs)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
pdfReader.parser = parser
|
|
|
|
isEncrypted, err := pdfReader.IsEncrypted()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// Load pdf doc structure if not encrypted.
|
|
if !isEncrypted {
|
|
err = pdfReader.loadStructure()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
|
|
return pdfReader, nil
|
|
}
|
|
|
|
// PdfVersion returns version of the PDF file.
|
|
func (r *PdfReader) PdfVersion() core.Version {
|
|
return r.parser.PdfVersion()
|
|
}
|
|
|
|
// IsEncrypted returns true if the PDF file is encrypted.
|
|
func (r *PdfReader) IsEncrypted() (bool, error) {
|
|
return r.parser.IsEncrypted()
|
|
}
|
|
|
|
// GetEncryptionMethod returns a descriptive information string about the encryption method used.
|
|
func (r *PdfReader) GetEncryptionMethod() string {
|
|
crypter := r.parser.GetCrypter()
|
|
return crypter.String()
|
|
}
|
|
|
|
// Decrypt decrypts the PDF file with a specified password. Also tries to
|
|
// decrypt with an empty password. Returns true if successful,
|
|
// false otherwise.
|
|
func (r *PdfReader) Decrypt(password []byte) (bool, error) {
|
|
success, err := r.parser.Decrypt(password)
|
|
if err != nil {
|
|
return false, err
|
|
}
|
|
if !success {
|
|
return false, nil
|
|
}
|
|
|
|
err = r.loadStructure()
|
|
if err != nil {
|
|
common.Log.Debug("ERROR: Fail to load structure (%s)", err)
|
|
return false, err
|
|
}
|
|
|
|
return true, nil
|
|
}
|
|
|
|
// CheckAccessRights checks access rights and permissions for a specified password. If either user/owner
|
|
// password is specified, full rights are granted, otherwise the access rights are specified by the
|
|
// Permissions flag.
|
|
//
|
|
// The bool flag indicates that the user can access and view the file.
|
|
// The AccessPermissions shows what access the user has for editing etc.
|
|
// An error is returned if there was a problem performing the authentication.
|
|
func (r *PdfReader) CheckAccessRights(password []byte) (bool, security.Permissions, error) {
|
|
return r.parser.CheckAccessRights(password)
|
|
}
|
|
|
|
// Loads the structure of the pdf file: pages, outlines, etc.
|
|
func (r *PdfReader) loadStructure() error {
|
|
if r.parser.GetCrypter() != nil && !r.parser.IsAuthenticated() {
|
|
return fmt.Errorf("file need to be decrypted first")
|
|
}
|
|
|
|
trailerDict := r.parser.GetTrailer()
|
|
if trailerDict == nil {
|
|
return fmt.Errorf("missing trailer")
|
|
}
|
|
|
|
// Catalog.
|
|
root, ok := trailerDict.Get("Root").(*core.PdfObjectReference)
|
|
if !ok {
|
|
return fmt.Errorf("invalid Root (trailer: %s)", trailerDict)
|
|
}
|
|
oc, err := r.parser.LookupByReference(*root)
|
|
if err != nil {
|
|
common.Log.Debug("ERROR: Failed to read root element catalog: %s", err)
|
|
return err
|
|
}
|
|
pcatalog, ok := oc.(*core.PdfIndirectObject)
|
|
if !ok {
|
|
common.Log.Debug("ERROR: Missing catalog: (root %q) (trailer %s)", oc, *trailerDict)
|
|
return errors.New("missing catalog")
|
|
}
|
|
catalog, ok := (*pcatalog).PdfObject.(*core.PdfObjectDictionary)
|
|
if !ok {
|
|
common.Log.Debug("ERROR: Invalid catalog (%s)", pcatalog.PdfObject)
|
|
return errors.New("invalid catalog")
|
|
}
|
|
common.Log.Trace("Catalog: %s", catalog)
|
|
|
|
// Pages.
|
|
pagesRef, ok := catalog.Get("Pages").(*core.PdfObjectReference)
|
|
if !ok {
|
|
return errors.New("pages in catalog should be a reference")
|
|
}
|
|
op, err := r.parser.LookupByReference(*pagesRef)
|
|
if err != nil {
|
|
common.Log.Debug("ERROR: Failed to read pages")
|
|
return err
|
|
}
|
|
ppages, ok := op.(*core.PdfIndirectObject)
|
|
if !ok {
|
|
common.Log.Debug("ERROR: Pages object invalid")
|
|
common.Log.Debug("op: %p", ppages)
|
|
return errors.New("pages object invalid")
|
|
}
|
|
pages, ok := ppages.PdfObject.(*core.PdfObjectDictionary)
|
|
if !ok {
|
|
common.Log.Debug("ERROR: Pages object invalid (%s)", ppages)
|
|
return errors.New("pages object invalid")
|
|
}
|
|
pageCount, ok := core.GetInt(pages.Get("Count"))
|
|
if !ok {
|
|
common.Log.Debug("ERROR: Pages count object invalid")
|
|
return errors.New("pages count invalid")
|
|
}
|
|
if _, ok = core.GetName(pages.Get("Type")); !ok {
|
|
common.Log.Debug("Pages dict Type field not set. Setting Type to Pages.")
|
|
pages.Set("Type", core.MakeName("Pages"))
|
|
}
|
|
|
|
r.root = root
|
|
r.catalog = catalog
|
|
r.pages = pages
|
|
r.pagesContainer = ppages
|
|
r.pageCount = int(*pageCount)
|
|
r.pageList = []*core.PdfIndirectObject{}
|
|
|
|
traversedPageNodes := map[core.PdfObject]struct{}{}
|
|
err = r.buildPageList(ppages, nil, traversedPageNodes)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
common.Log.Trace("---")
|
|
common.Log.Trace("TOC")
|
|
common.Log.Trace("Pages")
|
|
common.Log.Trace("%d: %s", len(r.pageList), r.pageList)
|
|
|
|
// Outlines.
|
|
r.outlineTree, err = r.loadOutlines()
|
|
if err != nil {
|
|
common.Log.Debug("ERROR: Failed to build outline tree (%s)", err)
|
|
return err
|
|
}
|
|
|
|
// Load interactive forms and fields.
|
|
r.AcroForm, err = r.loadForms()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (r *PdfReader) loadOutlines() (*PdfOutlineTreeNode, error) {
|
|
if r.parser.GetCrypter() != nil && !r.parser.IsAuthenticated() {
|
|
return nil, fmt.Errorf("file need to be decrypted first")
|
|
}
|
|
|
|
// Has outlines? Otherwise return an empty outlines structure.
|
|
catalog := r.catalog
|
|
outlinesObj := catalog.Get("Outlines")
|
|
if outlinesObj == nil {
|
|
return nil, nil
|
|
}
|
|
|
|
common.Log.Trace("-Has outlines")
|
|
// Trace references to the object.
|
|
outlineRootObj := core.ResolveReference(outlinesObj)
|
|
common.Log.Trace("Outline root: %v", outlineRootObj)
|
|
|
|
if isNull := core.IsNullObject(outlineRootObj); isNull {
|
|
common.Log.Trace("Outline root is null - no outlines")
|
|
return nil, nil
|
|
}
|
|
|
|
outlineRoot, ok := outlineRootObj.(*core.PdfIndirectObject)
|
|
if !ok {
|
|
if _, ok := core.GetDict(outlineRootObj); !ok {
|
|
common.Log.Debug("Invalid outline root - skipping")
|
|
return nil, nil
|
|
}
|
|
|
|
common.Log.Debug("Outline root is a dict. Should be an indirect object")
|
|
outlineRoot = core.MakeIndirectObject(outlineRootObj)
|
|
}
|
|
|
|
dict, ok := outlineRoot.PdfObject.(*core.PdfObjectDictionary)
|
|
if !ok {
|
|
return nil, errors.New("outline indirect object should contain a dictionary")
|
|
}
|
|
|
|
common.Log.Trace("Outline root dict: %v", dict)
|
|
|
|
outlineTree, _, err := r.buildOutlineTree(outlineRoot, nil, nil, nil)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
common.Log.Trace("Resulting outline tree: %v", outlineTree)
|
|
|
|
return outlineTree, nil
|
|
}
|
|
|
|
// Recursive build outline tree.
|
|
// prev PdfObject,
|
|
// Input: The indirect object containing an Outlines or Outline item dictionary.
|
|
// Parent, Prev are the parent or previous node in the hierarchy.
|
|
// The function returns the corresponding tree node and the last node which is used
|
|
// for setting the Last pointer of the tree node structures.
|
|
func (r *PdfReader) buildOutlineTree(obj core.PdfObject, parent *PdfOutlineTreeNode, prev *PdfOutlineTreeNode, visited map[core.PdfObject]struct{}) (*PdfOutlineTreeNode, *PdfOutlineTreeNode, error) {
|
|
if visited == nil {
|
|
visited = map[core.PdfObject]struct{}{}
|
|
}
|
|
visited[obj] = struct{}{}
|
|
|
|
container, isInd := obj.(*core.PdfIndirectObject)
|
|
if !isInd {
|
|
return nil, nil, fmt.Errorf("outline container not an indirect object %T", obj)
|
|
}
|
|
dict, ok := container.PdfObject.(*core.PdfObjectDictionary)
|
|
if !ok {
|
|
return nil, nil, errors.New("not a dictionary object")
|
|
}
|
|
common.Log.Trace("build outline tree: dict: %v (%v) p: %p", dict, container, container)
|
|
|
|
if obj := dict.Get("Title"); obj != nil {
|
|
// Outline item has a title. (required)
|
|
outlineItem, err := r.newPdfOutlineItemFromIndirectObject(container)
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
outlineItem.Parent = parent
|
|
outlineItem.Prev = prev
|
|
|
|
// Build outline tree for node children.
|
|
firstObj := core.ResolveReference(dict.Get("First"))
|
|
if _, processed := visited[firstObj]; firstObj != nil && firstObj != container && !processed {
|
|
if !core.IsNullObject(firstObj) {
|
|
first, last, err := r.buildOutlineTree(firstObj, &outlineItem.PdfOutlineTreeNode, nil, visited)
|
|
if err != nil {
|
|
common.Log.Debug("DEBUG: could not build outline item tree: %v. Skipping node children.", err)
|
|
} else {
|
|
outlineItem.First = first
|
|
outlineItem.Last = last
|
|
}
|
|
}
|
|
}
|
|
|
|
// Build outline tree for the next item.
|
|
nextObj := core.ResolveReference(dict.Get("Next"))
|
|
if _, processed := visited[nextObj]; nextObj != nil && nextObj != container && !processed {
|
|
if !core.IsNullObject(nextObj) {
|
|
next, last, err := r.buildOutlineTree(nextObj, parent, &outlineItem.PdfOutlineTreeNode, visited)
|
|
if err != nil {
|
|
common.Log.Debug("DEBUG: could not build outline tree for Next node: %v. Skipping node.", err)
|
|
} else {
|
|
outlineItem.Next = next
|
|
return &outlineItem.PdfOutlineTreeNode, last, nil
|
|
}
|
|
}
|
|
}
|
|
|
|
return &outlineItem.PdfOutlineTreeNode, &outlineItem.PdfOutlineTreeNode, nil
|
|
}
|
|
|
|
// Outline dictionary (structure element).
|
|
outline, err := newPdfOutlineFromIndirectObject(container)
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
outline.Parent = parent
|
|
|
|
if firstObj := dict.Get("First"); firstObj != nil {
|
|
// Has children...
|
|
firstObj = core.ResolveReference(firstObj)
|
|
firstObjDirect := core.TraceToDirectObject(firstObj)
|
|
if _, isNull := firstObjDirect.(*core.PdfObjectNull); !isNull && firstObjDirect != nil {
|
|
first, last, err := r.buildOutlineTree(firstObj, &outline.PdfOutlineTreeNode, nil, visited)
|
|
if err != nil {
|
|
common.Log.Debug("DEBUG: could not build outline tree: %v. Skipping node children.", err)
|
|
} else {
|
|
outline.First = first
|
|
outline.Last = last
|
|
}
|
|
}
|
|
}
|
|
|
|
return &outline.PdfOutlineTreeNode, &outline.PdfOutlineTreeNode, nil
|
|
}
|
|
|
|
// GetOutlineTree returns the outline tree.
|
|
func (r *PdfReader) GetOutlineTree() *PdfOutlineTreeNode {
|
|
return r.outlineTree
|
|
}
|
|
|
|
// GetOutlinesFlattened returns a flattened list of tree nodes and titles.
|
|
// NOTE: for most use cases, it is recommended to use the high-level GetOutlines
|
|
// method instead, which also provides information regarding the destination
|
|
// of the outline items.
|
|
func (r *PdfReader) GetOutlinesFlattened() ([]*PdfOutlineTreeNode, []string, error) {
|
|
var outlineNodeList []*PdfOutlineTreeNode
|
|
var flattenedTitleList []string
|
|
|
|
// Recursive flattening function.
|
|
var flattenFunc func(*PdfOutlineTreeNode, *[]*PdfOutlineTreeNode, *[]string, int)
|
|
flattenFunc = func(node *PdfOutlineTreeNode, outlineList *[]*PdfOutlineTreeNode, titleList *[]string, depth int) {
|
|
if node == nil {
|
|
return
|
|
}
|
|
if node.context == nil {
|
|
common.Log.Debug("ERROR: Missing node.context") // Should not happen ever.
|
|
return
|
|
}
|
|
|
|
item, isItem := node.context.(*PdfOutlineItem)
|
|
if isItem {
|
|
*outlineList = append(*outlineList, &item.PdfOutlineTreeNode)
|
|
title := strings.Repeat(" ", depth*2) + item.Title.Decoded()
|
|
*titleList = append(*titleList, title)
|
|
}
|
|
|
|
if node.First != nil {
|
|
title := strings.Repeat(" ", depth*2) + "+"
|
|
*titleList = append(*titleList, title)
|
|
flattenFunc(node.First, outlineList, titleList, depth+1)
|
|
}
|
|
|
|
if isItem && item.Next != nil {
|
|
flattenFunc(item.Next, outlineList, titleList, depth)
|
|
}
|
|
}
|
|
flattenFunc(r.outlineTree, &outlineNodeList, &flattenedTitleList, 0)
|
|
return outlineNodeList, flattenedTitleList, nil
|
|
}
|
|
|
|
// GetOutlines returns a high-level Outline object, based on the outline tree
|
|
// of the reader.
|
|
func (r *PdfReader) GetOutlines() (*Outline, error) {
|
|
if r == nil {
|
|
return nil, errors.New("cannot create outline from nil reader")
|
|
}
|
|
|
|
outlineTree := r.GetOutlineTree()
|
|
if outlineTree == nil {
|
|
return nil, errors.New("the specified reader does not have an outline tree")
|
|
}
|
|
|
|
var traverseFunc func(node *PdfOutlineTreeNode, entries *[]*OutlineItem)
|
|
traverseFunc = func(node *PdfOutlineTreeNode, entries *[]*OutlineItem) {
|
|
if node == nil {
|
|
return
|
|
}
|
|
if node.context == nil {
|
|
common.Log.Debug("ERROR: missing outline entry context")
|
|
return
|
|
}
|
|
|
|
// Check if node is an outline item.
|
|
var entry *OutlineItem
|
|
if item, ok := node.context.(*PdfOutlineItem); ok {
|
|
// Search for outline destination object.
|
|
destObj := item.Dest
|
|
if (destObj == nil || core.IsNullObject(destObj)) && item.A != nil {
|
|
if actionDict, ok := core.GetDict(item.A); ok {
|
|
destObj, _ = core.GetArray(actionDict.Get("D"))
|
|
}
|
|
}
|
|
|
|
// Parse outline destination object.
|
|
var dest OutlineDest
|
|
if destObj != nil && !core.IsNullObject(destObj) {
|
|
if d, err := newOutlineDestFromPdfObject(destObj, r); err == nil {
|
|
dest = *d
|
|
} else {
|
|
common.Log.Debug("WARN: could not parse outline dest (%v): %v", destObj, err)
|
|
}
|
|
}
|
|
|
|
entry = NewOutlineItem(item.Title.Decoded(), dest)
|
|
*entries = append(*entries, entry)
|
|
|
|
// Traverse next node.
|
|
if item.Next != nil {
|
|
traverseFunc(item.Next, entries)
|
|
}
|
|
}
|
|
|
|
// Check if node has children.
|
|
if node.First != nil {
|
|
if entry != nil {
|
|
entries = &entry.Entries
|
|
}
|
|
|
|
// Traverse node children.
|
|
traverseFunc(node.First, entries)
|
|
}
|
|
}
|
|
|
|
outline := NewOutline()
|
|
traverseFunc(outlineTree, &outline.Entries)
|
|
return outline, nil
|
|
}
|
|
|
|
// AcroFormRepairOptions groups the options accepted by RepairAcroForm for
// rebuilding the AcroForm. It does not declare any fields at the moment.
type AcroFormRepairOptions struct{}
|
|
|
|
// RepairAcroForm attempts to rebuild the AcroForm fields using the widget
|
|
// annotations present in the document pages. Pass nil for the opts parameter
|
|
// in order to use the default options.
|
|
// NOTE: Currently, the opts parameter is declared in order to enable adding
|
|
// future options, but passing nil will always result in the default options
|
|
// being used.
|
|
func (r *PdfReader) RepairAcroForm(opts *AcroFormRepairOptions) error {
|
|
var fields []*PdfField
|
|
fieldCache := map[*core.PdfIndirectObject]struct{}{}
|
|
for _, page := range r.PageList {
|
|
annotations, err := page.GetAnnotations()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
for _, annotation := range annotations {
|
|
var field *PdfField
|
|
switch t := annotation.GetContext().(type) {
|
|
case *PdfAnnotationWidget:
|
|
if t.parent != nil {
|
|
field = t.parent
|
|
break
|
|
}
|
|
if parentObj, ok := core.GetIndirect(t.Parent); ok {
|
|
field, err = r.newPdfFieldFromIndirectObject(parentObj, nil)
|
|
if err == nil {
|
|
break
|
|
}
|
|
common.Log.Debug("WARN: could not parse form field %+v: %v", parentObj, err)
|
|
}
|
|
if t.container != nil {
|
|
field, err = r.newPdfFieldFromIndirectObject(t.container, nil)
|
|
if err == nil {
|
|
break
|
|
}
|
|
common.Log.Debug("WARN: could not parse form field %+v: %v", t.container, err)
|
|
}
|
|
}
|
|
if field == nil {
|
|
continue
|
|
}
|
|
if _, ok := fieldCache[field.container]; ok {
|
|
continue
|
|
}
|
|
fieldCache[field.container] = struct{}{}
|
|
fields = append(fields, field)
|
|
}
|
|
}
|
|
|
|
if len(fields) == 0 {
|
|
return nil
|
|
}
|
|
if r.AcroForm == nil {
|
|
r.AcroForm = NewPdfAcroForm()
|
|
}
|
|
r.AcroForm.Fields = &fields
|
|
return nil
|
|
}
|
|
|
|
// AcroFormNeedsRepair returns true if the document contains widget annotations
|
|
// linked to fields which are not referenced in the AcroForm. The AcroForm can
|
|
// be repaired using the RepairAcroForm method of the reader.
|
|
func (r *PdfReader) AcroFormNeedsRepair() (bool, error) {
|
|
var fields []*PdfField
|
|
if r.AcroForm != nil {
|
|
fields = r.AcroForm.AllFields()
|
|
}
|
|
|
|
fieldMap := make(map[*PdfField]struct{}, len(fields))
|
|
for _, field := range fields {
|
|
fieldMap[field] = struct{}{}
|
|
}
|
|
|
|
for _, page := range r.PageList {
|
|
annotations, err := page.GetAnnotations()
|
|
if err != nil {
|
|
return false, err
|
|
}
|
|
|
|
for _, annotation := range annotations {
|
|
widget, ok := annotation.GetContext().(*PdfAnnotationWidget)
|
|
if !ok {
|
|
continue
|
|
}
|
|
|
|
field := widget.Field()
|
|
if field == nil {
|
|
return true, nil
|
|
}
|
|
if _, ok := fieldMap[field]; !ok {
|
|
return true, nil
|
|
}
|
|
}
|
|
}
|
|
|
|
return false, nil
|
|
}
|
|
|
|
// loadForms loads the AcroForm.
|
|
func (r *PdfReader) loadForms() (*PdfAcroForm, error) {
|
|
if r.parser.GetCrypter() != nil && !r.parser.IsAuthenticated() {
|
|
return nil, fmt.Errorf("file need to be decrypted first")
|
|
}
|
|
|
|
// Has forms?
|
|
catalog := r.catalog
|
|
obj := catalog.Get("AcroForm")
|
|
if obj == nil {
|
|
// Nothing to load.
|
|
return nil, nil
|
|
}
|
|
|
|
formsContainer, _ := core.GetIndirect(obj)
|
|
|
|
obj = core.TraceToDirectObject(obj)
|
|
if core.IsNullObject(obj) {
|
|
common.Log.Trace("Acroform is a null object (empty)\n")
|
|
return nil, nil
|
|
}
|
|
|
|
formsDict, ok := core.GetDict(obj)
|
|
if !ok {
|
|
common.Log.Debug("Invalid AcroForm entry %T", obj)
|
|
common.Log.Debug("Does not have forms")
|
|
return nil, fmt.Errorf("invalid acroform entry %T", obj)
|
|
}
|
|
common.Log.Trace("Has Acro forms")
|
|
// Load it.
|
|
|
|
// Ensure we have access to everything.
|
|
common.Log.Trace("Traverse the Acroforms structure")
|
|
if !r.isLazy {
|
|
err := r.traverseObjectData(formsDict)
|
|
if err != nil {
|
|
common.Log.Debug("ERROR: Unable to traverse AcroForms (%s)", err)
|
|
return nil, err
|
|
}
|
|
}
|
|
|
|
// Create the acro forms object.
|
|
acroForm, err := r.newPdfAcroFormFromDict(formsContainer, formsDict)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return acroForm, nil
|
|
}
|
|
|
|
func (r *PdfReader) lookupPageByObject(obj core.PdfObject) (*PdfPage, error) {
|
|
// can be indirect, direct, or reference
|
|
// look up the corresponding page
|
|
return nil, errors.New("page not found")
|
|
}
|
|
|
|
// Build the table of contents.
|
|
// tree, ex: Pages -> Pages -> Pages -> Page
|
|
// Traverse through the whole thing recursively.
|
|
func (r *PdfReader) buildPageList(node *core.PdfIndirectObject, parent *core.PdfIndirectObject, traversedPageNodes map[core.PdfObject]struct{}) error {
|
|
if node == nil {
|
|
return nil
|
|
}
|
|
|
|
if _, alreadyTraversed := traversedPageNodes[node]; alreadyTraversed {
|
|
common.Log.Debug("Cyclic recursion, skipping (%v)", node.ObjectNumber)
|
|
return nil
|
|
}
|
|
traversedPageNodes[node] = struct{}{}
|
|
|
|
nodeDict, ok := node.PdfObject.(*core.PdfObjectDictionary)
|
|
if !ok {
|
|
return errors.New("node not a dictionary")
|
|
}
|
|
|
|
objType, ok := (*nodeDict).Get("Type").(*core.PdfObjectName)
|
|
if !ok {
|
|
if nodeDict.Get("Kids") == nil {
|
|
return errors.New("node missing Type (Required)")
|
|
}
|
|
|
|
common.Log.Debug("ERROR: node missing Type, but has Kids. Assuming Pages node.")
|
|
objType = core.MakeName("Pages")
|
|
nodeDict.Set("Type", objType)
|
|
}
|
|
common.Log.Trace("buildPageList node type: %s (%+v)", *objType, node)
|
|
if *objType == "Page" {
|
|
p, err := r.newPdfPageFromDict(nodeDict)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
p.setContainer(node)
|
|
|
|
if parent != nil {
|
|
// Set the parent (in case missing or incorrect).
|
|
nodeDict.Set("Parent", parent)
|
|
}
|
|
r.pageList = append(r.pageList, node)
|
|
r.PageList = append(r.PageList, p)
|
|
|
|
return nil
|
|
}
|
|
if *objType != "Pages" {
|
|
common.Log.Debug("ERROR: Table of content containing non Page/Pages object! (%s)", objType)
|
|
return errors.New("table of content containing non Page/Pages object")
|
|
}
|
|
|
|
// A Pages object. Update the parent.
|
|
if parent != nil {
|
|
nodeDict.Set("Parent", parent)
|
|
}
|
|
|
|
// Resolve the object recursively.
|
|
if !r.isLazy {
|
|
err := r.traverseObjectData(node)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
kidsObj, err := r.parser.Resolve(nodeDict.Get("Kids"))
|
|
if err != nil {
|
|
common.Log.Debug("ERROR: Failed loading Kids object")
|
|
return err
|
|
}
|
|
|
|
var kids *core.PdfObjectArray
|
|
kids, ok = kidsObj.(*core.PdfObjectArray)
|
|
if !ok {
|
|
kidsIndirect, isIndirect := kidsObj.(*core.PdfIndirectObject)
|
|
if !isIndirect {
|
|
return errors.New("invalid Kids object")
|
|
}
|
|
kids, ok = kidsIndirect.PdfObject.(*core.PdfObjectArray)
|
|
if !ok {
|
|
return errors.New("invalid Kids indirect object")
|
|
}
|
|
}
|
|
common.Log.Trace("Kids: %s", kids)
|
|
for idx, child := range kids.Elements() {
|
|
child, ok := core.GetIndirect(child)
|
|
if !ok {
|
|
common.Log.Debug("ERROR: Page not indirect object - (%s)", child)
|
|
return errors.New("page not indirect object")
|
|
}
|
|
kids.Set(idx, child)
|
|
err = r.buildPageList(child, node, traversedPageNodes)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// GetNumPages returns the number of pages in the document.
|
|
func (r *PdfReader) GetNumPages() (int, error) {
|
|
if r.parser.GetCrypter() != nil && !r.parser.IsAuthenticated() {
|
|
return 0, fmt.Errorf("file need to be decrypted first")
|
|
}
|
|
return len(r.pageList), nil
|
|
}
|
|
|
|
// Resolves a reference, returning the object and indicates whether or not
|
|
// it was cached.
|
|
func (r *PdfReader) resolveReference(ref *core.PdfObjectReference) (core.PdfObject, bool, error) {
|
|
cachedObj, isCached := r.parser.ObjCache[int(ref.ObjectNumber)]
|
|
if !isCached {
|
|
common.Log.Trace("Reader Lookup ref: %s", ref)
|
|
obj, err := r.parser.LookupByReference(*ref)
|
|
if err != nil {
|
|
return nil, false, err
|
|
}
|
|
r.parser.ObjCache[int(ref.ObjectNumber)] = obj
|
|
return obj, false, nil
|
|
}
|
|
return cachedObj, true, nil
|
|
}
|
|
|
|
/*
|
|
* Recursively traverse through the page object data and look up
|
|
* references to indirect objects.
|
|
*
|
|
* GH: Are we fully protected against circular references? (Add tests).
|
|
*/
|
|
func (r *PdfReader) traverseObjectData(o core.PdfObject) error {
|
|
return core.ResolveReferencesDeep(o, r.traversed)
|
|
}
|
|
|
|
// PageFromIndirectObject returns the PdfPage and page number for a given indirect object.
|
|
func (r *PdfReader) PageFromIndirectObject(ind *core.PdfIndirectObject) (*PdfPage, int, error) {
|
|
if len(r.PageList) != len(r.pageList) {
|
|
return nil, 0, errors.New("page list invalid")
|
|
}
|
|
|
|
for i, pageind := range r.pageList {
|
|
if pageind == ind {
|
|
return r.PageList[i], i + 1, nil
|
|
}
|
|
}
|
|
return nil, 0, errors.New("page not found")
|
|
}
|
|
|
|
// GetPage returns the PdfPage model for the specified page number.
|
|
func (r *PdfReader) GetPage(pageNumber int) (*PdfPage, error) {
|
|
if r.parser.GetCrypter() != nil && !r.parser.IsAuthenticated() {
|
|
return nil, fmt.Errorf("file needs to be decrypted first")
|
|
}
|
|
if len(r.pageList) < pageNumber {
|
|
return nil, errors.New("invalid page number (page count too short)")
|
|
}
|
|
idx := pageNumber - 1
|
|
if idx < 0 {
|
|
return nil, fmt.Errorf("page numbering must start at 1")
|
|
}
|
|
page := r.PageList[idx]
|
|
return page, nil
|
|
}
|
|
|
|
// GetOCProperties returns the optional content properties PdfObject.
|
|
func (r *PdfReader) GetOCProperties() (core.PdfObject, error) {
|
|
dict := r.catalog
|
|
obj := dict.Get("OCProperties")
|
|
obj = core.ResolveReference(obj)
|
|
|
|
// Resolve all references...
|
|
// Should be pretty safe. Should not be referencing to pages or
|
|
// any large structures. Local structures and references
|
|
// to OC Groups.
|
|
if !r.isLazy {
|
|
err := r.traverseObjectData(obj)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
|
|
return obj, nil
|
|
}
|
|
|
|
// GetNamedDestinations returns the Names entry in the PDF catalog.
|
|
// See section 12.3.2.3 "Named Destinations" (p. 367 PDF32000_2008).
|
|
func (r *PdfReader) GetNamedDestinations() (core.PdfObject, error) {
|
|
obj := core.ResolveReference(r.catalog.Get("Names"))
|
|
if obj == nil {
|
|
return nil, nil
|
|
}
|
|
|
|
// Resolve references.
|
|
if !r.isLazy {
|
|
err := r.traverseObjectData(obj)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
|
|
return obj, nil
|
|
}
|
|
|
|
// GetPageLabels returns the PageLabels entry in the PDF catalog.
|
|
// See section 12.4.2 "Page Labels" (p. 382 PDF32000_2008).
|
|
func (r *PdfReader) GetPageLabels() (core.PdfObject, error) {
|
|
obj := core.ResolveReference(r.catalog.Get("PageLabels"))
|
|
if obj == nil {
|
|
return nil, nil
|
|
}
|
|
|
|
// Resolve references.
|
|
if !r.isLazy {
|
|
err := r.traverseObjectData(obj)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
|
|
return obj, nil
|
|
}
|
|
|
|
// Inspect inspects the object types, subtypes and content in the PDF file returning a map of
|
|
// object type to number of instances of each.
|
|
func (r *PdfReader) Inspect() (map[string]int, error) {
|
|
return r.parser.Inspect()
|
|
}
|
|
|
|
// GetObjectNums returns the object numbers of the PDF objects in the file
|
|
// Numbered objects are either indirect objects or stream objects.
|
|
// e.g. objNums := pdfReader.GetObjectNums()
|
|
// The underlying objects can then be accessed with
|
|
// pdfReader.GetIndirectObjectByNumber(objNums[0]) for the first available object.
|
|
func (r *PdfReader) GetObjectNums() []int {
|
|
return r.parser.GetObjectNums()
|
|
}
|
|
|
|
// GetIndirectObjectByNumber retrieves and returns a specific PdfObject by object number.
|
|
func (r *PdfReader) GetIndirectObjectByNumber(number int) (core.PdfObject, error) {
|
|
obj, err := r.parser.LookupByNumber(number)
|
|
return obj, err
|
|
}
|
|
|
|
// GetTrailer returns the PDF's trailer dictionary.
|
|
func (r *PdfReader) GetTrailer() (*core.PdfObjectDictionary, error) {
|
|
trailerDict := r.parser.GetTrailer()
|
|
if trailerDict == nil {
|
|
return nil, errors.New("trailer missing")
|
|
}
|
|
|
|
return trailerDict, nil
|
|
}
|