/* * This file is subject to the terms and conditions defined in * file 'LICENSE.md', which is part of this source code package. */ package model import ( "errors" "fmt" "io" "strings" "github.com/unidoc/unipdf/v3/common" "github.com/unidoc/unipdf/v3/core" "github.com/unidoc/unipdf/v3/core/security" ) // PdfReader represents a PDF file reader. It is a frontend to the lower level parsing mechanism and provides // a higher level access to work with PDF structure and information, such as the page structure etc. type PdfReader struct { parser *core.PdfParser root core.PdfObject pagesContainer *core.PdfIndirectObject pages *core.PdfObjectDictionary pageList []*core.PdfIndirectObject PageList []*PdfPage pageCount int catalog *core.PdfObjectDictionary outlineTree *PdfOutlineTreeNode AcroForm *PdfAcroForm modelManager *modelManager // Lazy loading: When enabled reference objects need to be resolved (via lookup, disk access) rather // than loading entire document into memory on load. isLazy bool // For tracking traversal (cache). traversed map[core.PdfObject]struct{} rs io.ReadSeeker } // NewPdfReader returns a new PdfReader for an input io.ReadSeeker interface. Can be used to read PDF from // memory or file. Immediately loads and traverses the PDF structure including pages and page contents (if // not encrypted). Loads entire document structure into memory. // Alternatively a lazy-loading reader can be created with NewPdfReaderLazy which loads only references, // and references are loaded from disk into memory on an as-needed basis. func NewPdfReader(rs io.ReadSeeker) (*PdfReader, error) { pdfReader := &PdfReader{ rs: rs, traversed: map[core.PdfObject]struct{}{}, modelManager: newModelManager(), isLazy: false, } // Create the parser, loads the cross reference table and trailer. parser, err := core.NewParser(rs) if err != nil { return nil, err } pdfReader.parser = parser isEncrypted, err := pdfReader.IsEncrypted() if err != nil { return nil, err } // Load pdf doc structure if not encrypted. if !isEncrypted { err = pdfReader.loadStructure() if err != nil { return nil, err } } return pdfReader, nil } // NewPdfReaderLazy creates a new PdfReader for `rs` in lazy-loading mode. The difference // from NewPdfReader is that in lazy-loading mode, objects are only loaded into memory when needed // rather than entire structure being loaded into memory on reader creation. // Note that it may make sense to use the lazy-load reader when processing only parts of files, // rather than loading entire file into memory. Example: splitting a few pages from a large PDF file. func NewPdfReaderLazy(rs io.ReadSeeker) (*PdfReader, error) { pdfReader := &PdfReader{ rs: rs, traversed: map[core.PdfObject]struct{}{}, modelManager: newModelManager(), isLazy: true, } // Create the parser, loads the cross reference table and trailer. parser, err := core.NewParser(rs) if err != nil { return nil, err } pdfReader.parser = parser isEncrypted, err := pdfReader.IsEncrypted() if err != nil { return nil, err } // Load pdf doc structure if not encrypted. if !isEncrypted { err = pdfReader.loadStructure() if err != nil { return nil, err } } return pdfReader, nil } // PdfVersion returns version of the PDF file. func (r *PdfReader) PdfVersion() core.Version { return r.parser.PdfVersion() } // IsEncrypted returns true if the PDF file is encrypted. func (r *PdfReader) IsEncrypted() (bool, error) { return r.parser.IsEncrypted() } // GetEncryptionMethod returns a descriptive information string about the encryption method used. func (r *PdfReader) GetEncryptionMethod() string { crypter := r.parser.GetCrypter() return crypter.String() } // Decrypt decrypts the PDF file with a specified password. Also tries to // decrypt with an empty password. Returns true if successful, // false otherwise. func (r *PdfReader) Decrypt(password []byte) (bool, error) { success, err := r.parser.Decrypt(password) if err != nil { return false, err } if !success { return false, nil } err = r.loadStructure() if err != nil { common.Log.Debug("ERROR: Fail to load structure (%s)", err) return false, err } return true, nil } // CheckAccessRights checks access rights and permissions for a specified password. If either user/owner // password is specified, full rights are granted, otherwise the access rights are specified by the // Permissions flag. // // The bool flag indicates that the user can access and view the file. // The AccessPermissions shows what access the user has for editing etc. // An error is returned if there was a problem performing the authentication. func (r *PdfReader) CheckAccessRights(password []byte) (bool, security.Permissions, error) { return r.parser.CheckAccessRights(password) } // Loads the structure of the pdf file: pages, outlines, etc. func (r *PdfReader) loadStructure() error { if r.parser.GetCrypter() != nil && !r.parser.IsAuthenticated() { return fmt.Errorf("file need to be decrypted first") } trailerDict := r.parser.GetTrailer() if trailerDict == nil { return fmt.Errorf("missing trailer") } // Catalog. root, ok := trailerDict.Get("Root").(*core.PdfObjectReference) if !ok { return fmt.Errorf("invalid Root (trailer: %s)", trailerDict) } oc, err := r.parser.LookupByReference(*root) if err != nil { common.Log.Debug("ERROR: Failed to read root element catalog: %s", err) return err } pcatalog, ok := oc.(*core.PdfIndirectObject) if !ok { common.Log.Debug("ERROR: Missing catalog: (root %q) (trailer %s)", oc, *trailerDict) return errors.New("missing catalog") } catalog, ok := (*pcatalog).PdfObject.(*core.PdfObjectDictionary) if !ok { common.Log.Debug("ERROR: Invalid catalog (%s)", pcatalog.PdfObject) return errors.New("invalid catalog") } common.Log.Trace("Catalog: %s", catalog) // Pages. pagesRef, ok := catalog.Get("Pages").(*core.PdfObjectReference) if !ok { return errors.New("pages in catalog should be a reference") } op, err := r.parser.LookupByReference(*pagesRef) if err != nil { common.Log.Debug("ERROR: Failed to read pages") return err } ppages, ok := op.(*core.PdfIndirectObject) if !ok { common.Log.Debug("ERROR: Pages object invalid") common.Log.Debug("op: %p", ppages) return errors.New("pages object invalid") } pages, ok := ppages.PdfObject.(*core.PdfObjectDictionary) if !ok { common.Log.Debug("ERROR: Pages object invalid (%s)", ppages) return errors.New("pages object invalid") } pageCount, ok := core.GetInt(pages.Get("Count")) if !ok { common.Log.Debug("ERROR: Pages count object invalid") return errors.New("pages count invalid") } if _, ok = core.GetName(pages.Get("Type")); !ok { common.Log.Debug("Pages dict Type field not set. Setting Type to Pages.") pages.Set("Type", core.MakeName("Pages")) } r.root = root r.catalog = catalog r.pages = pages r.pagesContainer = ppages r.pageCount = int(*pageCount) r.pageList = []*core.PdfIndirectObject{} traversedPageNodes := map[core.PdfObject]struct{}{} err = r.buildPageList(ppages, nil, traversedPageNodes) if err != nil { return err } common.Log.Trace("---") common.Log.Trace("TOC") common.Log.Trace("Pages") common.Log.Trace("%d: %s", len(r.pageList), r.pageList) // Outlines. r.outlineTree, err = r.loadOutlines() if err != nil { common.Log.Debug("ERROR: Failed to build outline tree (%s)", err) return err } // Load interactive forms and fields. r.AcroForm, err = r.loadForms() if err != nil { return err } return nil } func (r *PdfReader) loadOutlines() (*PdfOutlineTreeNode, error) { if r.parser.GetCrypter() != nil && !r.parser.IsAuthenticated() { return nil, fmt.Errorf("file need to be decrypted first") } // Has outlines? Otherwise return an empty outlines structure. catalog := r.catalog outlinesObj := catalog.Get("Outlines") if outlinesObj == nil { return nil, nil } common.Log.Trace("-Has outlines") // Trace references to the object. outlineRootObj := core.ResolveReference(outlinesObj) common.Log.Trace("Outline root: %v", outlineRootObj) if isNull := core.IsNullObject(outlineRootObj); isNull { common.Log.Trace("Outline root is null - no outlines") return nil, nil } outlineRoot, ok := outlineRootObj.(*core.PdfIndirectObject) if !ok { if _, ok := core.GetDict(outlineRootObj); !ok { common.Log.Debug("Invalid outline root - skipping") return nil, nil } common.Log.Debug("Outline root is a dict. Should be an indirect object") outlineRoot = core.MakeIndirectObject(outlineRootObj) } dict, ok := outlineRoot.PdfObject.(*core.PdfObjectDictionary) if !ok { return nil, errors.New("outline indirect object should contain a dictionary") } common.Log.Trace("Outline root dict: %v", dict) outlineTree, _, err := r.buildOutlineTree(outlineRoot, nil, nil, nil) if err != nil { return nil, err } common.Log.Trace("Resulting outline tree: %v", outlineTree) return outlineTree, nil } // Recursive build outline tree. // prev PdfObject, // Input: The indirect object containing an Outlines or Outline item dictionary. // Parent, Prev are the parent or previous node in the hierarchy. // The function returns the corresponding tree node and the last node which is used // for setting the Last pointer of the tree node structures. func (r *PdfReader) buildOutlineTree(obj core.PdfObject, parent *PdfOutlineTreeNode, prev *PdfOutlineTreeNode, visited map[core.PdfObject]struct{}) (*PdfOutlineTreeNode, *PdfOutlineTreeNode, error) { if visited == nil { visited = map[core.PdfObject]struct{}{} } visited[obj] = struct{}{} container, isInd := obj.(*core.PdfIndirectObject) if !isInd { return nil, nil, fmt.Errorf("outline container not an indirect object %T", obj) } dict, ok := container.PdfObject.(*core.PdfObjectDictionary) if !ok { return nil, nil, errors.New("not a dictionary object") } common.Log.Trace("build outline tree: dict: %v (%v) p: %p", dict, container, container) if obj := dict.Get("Title"); obj != nil { // Outline item has a title. (required) outlineItem, err := r.newPdfOutlineItemFromIndirectObject(container) if err != nil { return nil, nil, err } outlineItem.Parent = parent outlineItem.Prev = prev // Build outline tree for node children. firstObj := core.ResolveReference(dict.Get("First")) if _, processed := visited[firstObj]; firstObj != nil && firstObj != container && !processed { if !core.IsNullObject(firstObj) { first, last, err := r.buildOutlineTree(firstObj, &outlineItem.PdfOutlineTreeNode, nil, visited) if err != nil { common.Log.Debug("DEBUG: could not build outline item tree: %v. Skipping node children.", err) } else { outlineItem.First = first outlineItem.Last = last } } } // Build outline tree for the next item. nextObj := core.ResolveReference(dict.Get("Next")) if _, processed := visited[nextObj]; nextObj != nil && nextObj != container && !processed { if !core.IsNullObject(nextObj) { next, last, err := r.buildOutlineTree(nextObj, parent, &outlineItem.PdfOutlineTreeNode, visited) if err != nil { common.Log.Debug("DEBUG: could not build outline tree for Next node: %v. Skipping node.", err) } else { outlineItem.Next = next return &outlineItem.PdfOutlineTreeNode, last, nil } } } return &outlineItem.PdfOutlineTreeNode, &outlineItem.PdfOutlineTreeNode, nil } // Outline dictionary (structure element). outline, err := newPdfOutlineFromIndirectObject(container) if err != nil { return nil, nil, err } outline.Parent = parent if firstObj := dict.Get("First"); firstObj != nil { // Has children... firstObj = core.ResolveReference(firstObj) firstObjDirect := core.TraceToDirectObject(firstObj) if _, isNull := firstObjDirect.(*core.PdfObjectNull); !isNull && firstObjDirect != nil { first, last, err := r.buildOutlineTree(firstObj, &outline.PdfOutlineTreeNode, nil, visited) if err != nil { common.Log.Debug("DEBUG: could not build outline tree: %v. Skipping node children.", err) } else { outline.First = first outline.Last = last } } } return &outline.PdfOutlineTreeNode, &outline.PdfOutlineTreeNode, nil } // GetOutlineTree returns the outline tree. func (r *PdfReader) GetOutlineTree() *PdfOutlineTreeNode { return r.outlineTree } // GetOutlinesFlattened returns a flattened list of tree nodes and titles. // NOTE: for most use cases, it is recommended to use the high-level GetOutlines // method instead, which also provides information regarding the destination // of the outline items. func (r *PdfReader) GetOutlinesFlattened() ([]*PdfOutlineTreeNode, []string, error) { var outlineNodeList []*PdfOutlineTreeNode var flattenedTitleList []string // Recursive flattening function. var flattenFunc func(*PdfOutlineTreeNode, *[]*PdfOutlineTreeNode, *[]string, int) flattenFunc = func(node *PdfOutlineTreeNode, outlineList *[]*PdfOutlineTreeNode, titleList *[]string, depth int) { if node == nil { return } if node.context == nil { common.Log.Debug("ERROR: Missing node.context") // Should not happen ever. return } item, isItem := node.context.(*PdfOutlineItem) if isItem { *outlineList = append(*outlineList, &item.PdfOutlineTreeNode) title := strings.Repeat(" ", depth*2) + item.Title.Decoded() *titleList = append(*titleList, title) } if node.First != nil { title := strings.Repeat(" ", depth*2) + "+" *titleList = append(*titleList, title) flattenFunc(node.First, outlineList, titleList, depth+1) } if isItem && item.Next != nil { flattenFunc(item.Next, outlineList, titleList, depth) } } flattenFunc(r.outlineTree, &outlineNodeList, &flattenedTitleList, 0) return outlineNodeList, flattenedTitleList, nil } // GetOutlines returns a high-level Outline object, based on the outline tree // of the reader. func (r *PdfReader) GetOutlines() (*Outline, error) { if r == nil { return nil, errors.New("cannot create outline from nil reader") } outlineTree := r.GetOutlineTree() if outlineTree == nil { return nil, errors.New("the specified reader does not have an outline tree") } var traverseFunc func(node *PdfOutlineTreeNode, entries *[]*OutlineItem) traverseFunc = func(node *PdfOutlineTreeNode, entries *[]*OutlineItem) { if node == nil { return } if node.context == nil { common.Log.Debug("ERROR: missing outline entry context") return } // Check if node is an outline item. var entry *OutlineItem if item, ok := node.context.(*PdfOutlineItem); ok { // Search for outline destination object. destObj := item.Dest if (destObj == nil || core.IsNullObject(destObj)) && item.A != nil { if actionDict, ok := core.GetDict(item.A); ok { destObj, _ = core.GetArray(actionDict.Get("D")) } } // Parse outline destination object. var dest OutlineDest if destObj != nil && !core.IsNullObject(destObj) { if d, err := newOutlineDestFromPdfObject(destObj, r); err == nil { dest = *d } else { common.Log.Debug("WARN: could not parse outline dest (%v): %v", destObj, err) } } entry = NewOutlineItem(item.Title.Decoded(), dest) *entries = append(*entries, entry) // Traverse next node. if item.Next != nil { traverseFunc(item.Next, entries) } } // Check if node has children. if node.First != nil { if entry != nil { entries = &entry.Entries } // Traverse node children. traverseFunc(node.First, entries) } } outline := NewOutline() traverseFunc(outlineTree, &outline.Entries) return outline, nil } // AcroFormRepairOptions contains options for rebuilding the AcroForm. type AcroFormRepairOptions struct { } // RepairAcroForm attempts to rebuild the AcroForm fields using the widget // annotations present in the document pages. Pass nil for the opts parameter // in order to use the default options. // NOTE: Currently, the opts parameter is declared in order to enable adding // future options, but passing nil will always result in the default options // being used. func (r *PdfReader) RepairAcroForm(opts *AcroFormRepairOptions) error { var fields []*PdfField fieldCache := map[*core.PdfIndirectObject]struct{}{} for _, page := range r.PageList { annotations, err := page.GetAnnotations() if err != nil { return err } for _, annotation := range annotations { var field *PdfField switch t := annotation.GetContext().(type) { case *PdfAnnotationWidget: if t.parent != nil { field = t.parent break } if parentObj, ok := core.GetIndirect(t.Parent); ok { field, err = r.newPdfFieldFromIndirectObject(parentObj, nil) if err == nil { break } common.Log.Debug("WARN: could not parse form field %+v: %v", parentObj, err) } if t.container != nil { field, err = r.newPdfFieldFromIndirectObject(t.container, nil) if err == nil { break } common.Log.Debug("WARN: could not parse form field %+v: %v", t.container, err) } } if field == nil { continue } if _, ok := fieldCache[field.container]; ok { continue } fieldCache[field.container] = struct{}{} fields = append(fields, field) } } if len(fields) == 0 { return nil } if r.AcroForm == nil { r.AcroForm = NewPdfAcroForm() } r.AcroForm.Fields = &fields return nil } // AcroFormNeedsRepair returns true if the document contains widget annotations // linked to fields which are not referenced in the AcroForm. The AcroForm can // be repaired using the RepairAcroForm method of the reader. func (r *PdfReader) AcroFormNeedsRepair() (bool, error) { var fields []*PdfField if r.AcroForm != nil { fields = r.AcroForm.AllFields() } fieldMap := make(map[*PdfField]struct{}, len(fields)) for _, field := range fields { fieldMap[field] = struct{}{} } for _, page := range r.PageList { annotations, err := page.GetAnnotations() if err != nil { return false, err } for _, annotation := range annotations { widget, ok := annotation.GetContext().(*PdfAnnotationWidget) if !ok { continue } field := widget.Field() if field == nil { return true, nil } if _, ok := fieldMap[field]; !ok { return true, nil } } } return false, nil } // loadForms loads the AcroForm. func (r *PdfReader) loadForms() (*PdfAcroForm, error) { if r.parser.GetCrypter() != nil && !r.parser.IsAuthenticated() { return nil, fmt.Errorf("file need to be decrypted first") } // Has forms? catalog := r.catalog obj := catalog.Get("AcroForm") if obj == nil { // Nothing to load. return nil, nil } formsContainer, _ := core.GetIndirect(obj) obj = core.TraceToDirectObject(obj) if core.IsNullObject(obj) { common.Log.Trace("Acroform is a null object (empty)\n") return nil, nil } formsDict, ok := core.GetDict(obj) if !ok { common.Log.Debug("Invalid AcroForm entry %T", obj) common.Log.Debug("Does not have forms") return nil, fmt.Errorf("invalid acroform entry %T", obj) } common.Log.Trace("Has Acro forms") // Load it. // Ensure we have access to everything. common.Log.Trace("Traverse the Acroforms structure") if !r.isLazy { err := r.traverseObjectData(formsDict) if err != nil { common.Log.Debug("ERROR: Unable to traverse AcroForms (%s)", err) return nil, err } } // Create the acro forms object. acroForm, err := r.newPdfAcroFormFromDict(formsContainer, formsDict) if err != nil { return nil, err } return acroForm, nil } func (r *PdfReader) lookupPageByObject(obj core.PdfObject) (*PdfPage, error) { // can be indirect, direct, or reference // look up the corresponding page return nil, errors.New("page not found") } // Build the table of contents. // tree, ex: Pages -> Pages -> Pages -> Page // Traverse through the whole thing recursively. func (r *PdfReader) buildPageList(node *core.PdfIndirectObject, parent *core.PdfIndirectObject, traversedPageNodes map[core.PdfObject]struct{}) error { if node == nil { return nil } if _, alreadyTraversed := traversedPageNodes[node]; alreadyTraversed { common.Log.Debug("Cyclic recursion, skipping (%v)", node.ObjectNumber) return nil } traversedPageNodes[node] = struct{}{} nodeDict, ok := node.PdfObject.(*core.PdfObjectDictionary) if !ok { return errors.New("node not a dictionary") } objType, ok := (*nodeDict).Get("Type").(*core.PdfObjectName) if !ok { if nodeDict.Get("Kids") == nil { return errors.New("node missing Type (Required)") } common.Log.Debug("ERROR: node missing Type, but has Kids. Assuming Pages node.") objType = core.MakeName("Pages") nodeDict.Set("Type", objType) } common.Log.Trace("buildPageList node type: %s (%+v)", *objType, node) if *objType == "Page" { p, err := r.newPdfPageFromDict(nodeDict) if err != nil { return err } p.setContainer(node) if parent != nil { // Set the parent (in case missing or incorrect). nodeDict.Set("Parent", parent) } r.pageList = append(r.pageList, node) r.PageList = append(r.PageList, p) return nil } if *objType != "Pages" { common.Log.Debug("ERROR: Table of content containing non Page/Pages object! (%s)", objType) return errors.New("table of content containing non Page/Pages object") } // A Pages object. Update the parent. if parent != nil { nodeDict.Set("Parent", parent) } // Resolve the object recursively. if !r.isLazy { err := r.traverseObjectData(node) if err != nil { return err } } kidsObj, err := r.parser.Resolve(nodeDict.Get("Kids")) if err != nil { common.Log.Debug("ERROR: Failed loading Kids object") return err } var kids *core.PdfObjectArray kids, ok = kidsObj.(*core.PdfObjectArray) if !ok { kidsIndirect, isIndirect := kidsObj.(*core.PdfIndirectObject) if !isIndirect { return errors.New("invalid Kids object") } kids, ok = kidsIndirect.PdfObject.(*core.PdfObjectArray) if !ok { return errors.New("invalid Kids indirect object") } } common.Log.Trace("Kids: %s", kids) for idx, child := range kids.Elements() { child, ok := core.GetIndirect(child) if !ok { common.Log.Debug("ERROR: Page not indirect object - (%s)", child) return errors.New("page not indirect object") } kids.Set(idx, child) err = r.buildPageList(child, node, traversedPageNodes) if err != nil { return err } } return nil } // GetNumPages returns the number of pages in the document. func (r *PdfReader) GetNumPages() (int, error) { if r.parser.GetCrypter() != nil && !r.parser.IsAuthenticated() { return 0, fmt.Errorf("file need to be decrypted first") } return len(r.pageList), nil } // Resolves a reference, returning the object and indicates whether or not // it was cached. func (r *PdfReader) resolveReference(ref *core.PdfObjectReference) (core.PdfObject, bool, error) { cachedObj, isCached := r.parser.ObjCache[int(ref.ObjectNumber)] if !isCached { common.Log.Trace("Reader Lookup ref: %s", ref) obj, err := r.parser.LookupByReference(*ref) if err != nil { return nil, false, err } r.parser.ObjCache[int(ref.ObjectNumber)] = obj return obj, false, nil } return cachedObj, true, nil } /* * Recursively traverse through the page object data and look up * references to indirect objects. * * GH: Are we fully protected against circular references? (Add tests). */ func (r *PdfReader) traverseObjectData(o core.PdfObject) error { return core.ResolveReferencesDeep(o, r.traversed) } // PageFromIndirectObject returns the PdfPage and page number for a given indirect object. func (r *PdfReader) PageFromIndirectObject(ind *core.PdfIndirectObject) (*PdfPage, int, error) { if len(r.PageList) != len(r.pageList) { return nil, 0, errors.New("page list invalid") } for i, pageind := range r.pageList { if pageind == ind { return r.PageList[i], i + 1, nil } } return nil, 0, errors.New("page not found") } // GetPage returns the PdfPage model for the specified page number. func (r *PdfReader) GetPage(pageNumber int) (*PdfPage, error) { if r.parser.GetCrypter() != nil && !r.parser.IsAuthenticated() { return nil, fmt.Errorf("file needs to be decrypted first") } if len(r.pageList) < pageNumber { return nil, errors.New("invalid page number (page count too short)") } idx := pageNumber - 1 if idx < 0 { return nil, fmt.Errorf("page numbering must start at 1") } page := r.PageList[idx] return page, nil } // GetOCProperties returns the optional content properties PdfObject. func (r *PdfReader) GetOCProperties() (core.PdfObject, error) { dict := r.catalog obj := dict.Get("OCProperties") obj = core.ResolveReference(obj) // Resolve all references... // Should be pretty safe. Should not be referencing to pages or // any large structures. Local structures and references // to OC Groups. if !r.isLazy { err := r.traverseObjectData(obj) if err != nil { return nil, err } } return obj, nil } // GetNamedDestinations returns the Names entry in the PDF catalog. // See section 12.3.2.3 "Named Destinations" (p. 367 PDF32000_2008). func (r *PdfReader) GetNamedDestinations() (core.PdfObject, error) { obj := core.ResolveReference(r.catalog.Get("Names")) if obj == nil { return nil, nil } // Resolve references. if !r.isLazy { err := r.traverseObjectData(obj) if err != nil { return nil, err } } return obj, nil } // GetPageLabels returns the PageLabels entry in the PDF catalog. // See section 12.4.2 "Page Labels" (p. 382 PDF32000_2008). func (r *PdfReader) GetPageLabels() (core.PdfObject, error) { obj := core.ResolveReference(r.catalog.Get("PageLabels")) if obj == nil { return nil, nil } // Resolve references. if !r.isLazy { err := r.traverseObjectData(obj) if err != nil { return nil, err } } return obj, nil } // Inspect inspects the object types, subtypes and content in the PDF file returning a map of // object type to number of instances of each. func (r *PdfReader) Inspect() (map[string]int, error) { return r.parser.Inspect() } // GetObjectNums returns the object numbers of the PDF objects in the file // Numbered objects are either indirect objects or stream objects. // e.g. objNums := pdfReader.GetObjectNums() // The underlying objects can then be accessed with // pdfReader.GetIndirectObjectByNumber(objNums[0]) for the first available object. func (r *PdfReader) GetObjectNums() []int { return r.parser.GetObjectNums() } // GetIndirectObjectByNumber retrieves and returns a specific PdfObject by object number. func (r *PdfReader) GetIndirectObjectByNumber(number int) (core.PdfObject, error) { obj, err := r.parser.LookupByNumber(number) return obj, err } // GetTrailer returns the PDF's trailer dictionary. func (r *PdfReader) GetTrailer() (*core.PdfObjectDictionary, error) { trailerDict := r.parser.GetTrailer() if trailerDict == nil { return nil, errors.New("trailer missing") } return trailerDict, nil }