From d2941b5477aceb41fbf663c19ec5cc966e34ecf0 Mon Sep 17 00:00:00 2001 From: Adrian-George Bostan Date: Wed, 20 May 2020 19:04:02 +0300 Subject: [PATCH 1/4] Add reader method for checking if the AcroForm needs repair (#356) * Add AcroFormNeedsRepair method * Add AcroForm repair check test case --- model/form_test.go | 33 +++++++++++++++++++++++++++++++++ model/reader.go | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+) diff --git a/model/form_test.go b/model/form_test.go index 9c7d0c8c..b4abca38 100644 --- a/model/form_test.go +++ b/model/form_test.go @@ -176,3 +176,36 @@ func TestRepairAcroForm(t *testing.T) { repaired := *reader.AcroForm.Fields require.ElementsMatch(t, original, repaired) } + +func TestAcroFormNeedsRepair(t *testing.T) { + f, err := os.Open("./testdata/OoPdfFormExample.pdf") + require.NoError(t, err) + defer f.Close() + + reader, err := NewPdfReader(f) + require.NoError(t, err) + + // Original AcroForm repair status check. + needsRepair, err := reader.AcroFormNeedsRepair() + require.NoError(t, err) + require.Equal(t, needsRepair, false) + + // Nil AcroForm repair status check. + reader.AcroForm = nil + needsRepair, err = reader.AcroFormNeedsRepair() + require.NoError(t, err) + require.Equal(t, needsRepair, true) + + // Repaired AcroForm repair status check. + require.NoError(t, reader.RepairAcroForm(nil)) + needsRepair, err = reader.AcroFormNeedsRepair() + require.NoError(t, err) + require.Equal(t, needsRepair, false) + + // Missing AcroForm fields repair status check. 
+ fields := (*reader.AcroForm.Fields)[1:] + reader.AcroForm.Fields = &fields + needsRepair, err = reader.AcroFormNeedsRepair() + require.NoError(t, err) + require.Equal(t, needsRepair, true) +} diff --git a/model/reader.go b/model/reader.go index 43362c90..01a8cc8a 100644 --- a/model/reader.go +++ b/model/reader.go @@ -570,6 +570,45 @@ func (r *PdfReader) RepairAcroForm(opts *AcroFormRepairOptions) error { return nil } +// AcroFormNeedsRepair returns true if the document contains widget annotations +// linked to fields which are not referenced in the AcroForm. The AcroForm can +// be repaired using the RepairAcroForm method of the reader. +func (r *PdfReader) AcroFormNeedsRepair() (bool, error) { + var fields []*PdfField + if r.AcroForm != nil { + fields = r.AcroForm.AllFields() + } + + fieldMap := make(map[*PdfField]struct{}, len(fields)) + for _, field := range fields { + fieldMap[field] = struct{}{} + } + + for _, page := range r.PageList { + annotations, err := page.GetAnnotations() + if err != nil { + return false, err + } + + for _, annotation := range annotations { + widget, ok := annotation.GetContext().(*PdfAnnotationWidget) + if !ok { + continue + } + + field := widget.Field() + if field == nil { + return true, nil + } + if _, ok := fieldMap[field]; !ok { + return true, nil + } + } + } + + return false, nil +} + // loadForms loads the AcroForm. 
func (r *PdfReader) loadForms() (*PdfAcroForm, error) { if r.parser.GetCrypter() != nil && !r.parser.IsAuthenticated() { From 033f410eac592889e82c8cd2a73683d8466ac06d Mon Sep 17 00:00:00 2001 From: Adrian-George Bostan Date: Wed, 20 May 2020 20:58:54 +0300 Subject: [PATCH 2/4] Account for inverted annotation rects when calculating appearance bounds (#357) --- annotator/field_appearance.go | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/annotator/field_appearance.go b/annotator/field_appearance.go index e7268f70..07b2d7ef 100644 --- a/annotator/field_appearance.go +++ b/annotator/field_appearance.go @@ -168,16 +168,12 @@ func genFieldTextAppearance(wa *model.PdfAnnotationWidget, ftxt *model.PdfFieldT if !ok { return nil, errors.New("invalid Rect") } - rect, err := array.ToFloat64Array() + rect, err := model.NewPdfRectangle(*array) if err != nil { return nil, err } - if len(rect) != 4 { - return nil, errors.New("len(Rect) != 4") - } - - width := rect[2] - rect[0] - height := rect[3] - rect[1] + width := rect.Width() + height := rect.Height() if mkDict, has := core.GetDict(wa.MK); has { bsDict, _ := core.GetDict(wa.BS) From 5efaa02e238028af8b013cfeb6c833360e646b90 Mon Sep 17 00:00:00 2001 From: Adrian-George Bostan Date: Fri, 22 May 2020 19:19:43 +0300 Subject: [PATCH 3/4] Use page indirect object for internal outline destinations (#359) * Use page indirect object for internal outlines * Use page indirect object in creator outline destinations * Adapt creator test case to test outline creation and retrieval --- creator/creator.go | 21 +++++++++++++---- creator/creator_test.go | 31 +++++++++++++++++++++---- model/outline.go | 51 +++++++++++++++++++++++++++++------------ 3 files changed, 78 insertions(+), 25 deletions(-) diff --git a/creator/creator.go b/creator/creator.go index 5df43512..6a081553 100644 --- a/creator/creator.go +++ b/creator/creator.go @@ -481,6 +481,13 @@ func (c *Creator) Finalize() error { adjustOutlineDest = func(item 
*model.OutlineItem) { item.Dest.Page += int64(genpages) + // Get page indirect object. + if page := int(item.Dest.Page); page >= 0 && page < len(c.pages) { + item.Dest.PageObj = c.pages[page].GetPageAsIndirectObject() + } else { + common.Log.Debug("WARN: could not get page container for page %d", page) + } + // Reverse the Y axis of the destination coordinates. // The user passes in the annotation coordinates as if // position 0, 0 is at the top left of the page. @@ -501,15 +508,19 @@ func (c *Creator) Finalize() error { // Add outline TOC item. if c.AddTOC { - var tocPage int64 + var tocPage int if hasFrontPage { tocPage = 1 } - c.outline.Insert(0, model.NewOutlineItem( - "Table of Contents", - model.NewOutlineDest(tocPage, 0, c.pageHeight), - )) + // Create TOC outline item. + dest := model.NewOutlineDest(int64(tocPage), 0, c.pageHeight) + if tocPage >= 0 && tocPage < len(c.pages) { + dest.PageObj = c.pages[tocPage].GetPageAsIndirectObject() + } else { + common.Log.Debug("WARN: could not get page container for page %d", tocPage) + } + c.outline.Insert(0, model.NewOutlineItem("Table of Contents", dest)) } } diff --git a/creator/creator_test.go b/creator/creator_test.go index 588c4068..432c16fb 100644 --- a/creator/creator_test.go +++ b/creator/creator_test.go @@ -13,6 +13,7 @@ import ( "bytes" "crypto/md5" "encoding/hex" + "encoding/json" "errors" "fmt" goimage "image" @@ -1040,11 +1041,31 @@ func TestSubchapters(t *testing.T) { addHeadersAndFooters(c) - err := c.WriteToFile(tempFile("3_subchapters.pdf")) - if err != nil { - t.Errorf("Fail: %v\n", err) - return - } + // Finalize creator in order to get final version of the outlines. + require.NoError(t, c.Finalize()) + + // Get outline data as JSON. + srcJson, err := json.Marshal(c.outline) + require.NoError(t, err) + + // Write output file. + outputPath := tempFile("3_subchapters.pdf") + require.NoError(t, c.WriteToFile(outputPath)) + + // Read output file. 
+ outputFile, err := os.Open(outputPath) + require.NoError(t, err) + defer outputFile.Close() + + reader, err := model.NewPdfReader(outputFile) + require.NoError(t, err) + + // Compare outlines JSON data. + dstOutline, err := reader.GetOutlines() + require.NoError(t, err) + dstJson, err := json.Marshal(dstOutline) + require.NoError(t, err) + require.Equal(t, srcJson, dstJson) } // Test creating and drawing a table. diff --git a/model/outline.go b/model/outline.go index 7f603b77..4c8d5ec1 100644 --- a/model/outline.go +++ b/model/outline.go @@ -16,11 +16,12 @@ import ( // OutlineDest represents the destination of an outline item. // It holds the page and the position on the page an outline item points to. type OutlineDest struct { - Page int64 `json:"page"` - Mode string `json:"mode"` - X float64 `json:"x"` - Y float64 `json:"y"` - Zoom float64 `json:"zoom"` + PageObj *core.PdfIndirectObject `json:"-"` + Page int64 `json:"page"` + Mode string `json:"mode"` + X float64 `json:"x"` + Y float64 `json:"y"` + Zoom float64 `json:"zoom"` } // NewOutlineDest returns a new outline destination which can be used @@ -56,10 +57,18 @@ func newOutlineDestFromPdfObject(o core.PdfObject, r *PdfReader) (*OutlineDest, // Page object is provided. Identify page number using the reader. if _, pageNum, err := r.PageFromIndirectObject(pageInd); err == nil { dest.Page = int64(pageNum - 1) + } else { + common.Log.Debug("WARN: could not get page index for page %+v", pageInd) } - } else if pageNum, ok := core.GetIntVal(pageObj); ok { - // Page number is provided. - dest.Page = int64(pageNum) + dest.PageObj = pageInd + } else if pageIdx, ok := core.GetIntVal(pageObj); ok { + // Page index is provided. Get indirect object to page. 
+ if pageIdx >= 0 && pageIdx < len(r.PageList) { + dest.PageObj = r.PageList[pageIdx].GetPageAsIndirectObject() + } else { + common.Log.Debug("WARN: could not get page container for page %d", pageIdx) + } + dest.Page = int64(pageIdx) } else { return nil, fmt.Errorf("invalid outline destination page: %T", pageObj) } @@ -106,14 +115,22 @@ func newOutlineDestFromPdfObject(o core.PdfObject, r *PdfReader) (*OutlineDest, // ToPdfObject returns a PDF object representation of the outline destination. func (od OutlineDest) ToPdfObject() core.PdfObject { - if od.Page < 0 || od.Mode == "" { + if (od.PageObj == nil && od.Page < 0) || od.Mode == "" { return core.MakeNull() } - dest := core.MakeArray( - core.MakeInteger(od.Page), - core.MakeName(od.Mode), - ) + // Add destination page. + dest := core.MakeArray() + if od.PageObj != nil { + // Internal outline. + dest.Append(od.PageObj) + } else { + // External outline. + dest.Append(core.MakeInteger(od.Page)) + } + + // Add destination mode. + dest.Append(core.MakeName(od.Mode)) // See section 12.3.2.2 "Explicit Destinations" (page 374). switch od.Mode { @@ -180,10 +197,11 @@ func (o *Outline) ToPdfOutline() *PdfOutline { // Create outline items. var outlineItems []*PdfOutlineItem + var lenDescendants int64 var prev *PdfOutlineItem for _, item := range o.Entries { - outlineItem, _ := item.ToPdfOutlineItem() + outlineItem, lenChildren := item.ToPdfOutlineItem() outlineItem.Parent = &outline.PdfOutlineTreeNode if prev != nil { @@ -192,15 +210,18 @@ func (o *Outline) ToPdfOutline() *PdfOutline { } outlineItems = append(outlineItems, outlineItem) + lenDescendants += lenChildren prev = outlineItem } // Add outline linked list properties. 
lenOutlineItems := int64(len(outlineItems)) + lenDescendants += int64(lenOutlineItems) + if lenOutlineItems > 0 { outline.First = &outlineItems[0].PdfOutlineTreeNode outline.Last = &outlineItems[lenOutlineItems-1].PdfOutlineTreeNode - outline.Count = &lenOutlineItems + outline.Count = &lenDescendants } return outline From d078608da4a7801fc0695b92ef731c13dc24742e Mon Sep 17 00:00:00 2001 From: Adrian-George Bostan Date: Tue, 26 May 2020 02:34:44 +0300 Subject: [PATCH 4/4] Account for parent CTM when calculating positions of extracted forms (#349) * Take parent CTM into account for form field text * Pass a modified graphics state instance to new text objects --- extractor/text.go | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/extractor/text.go b/extractor/text.go index a91eff75..01c6a06f 100644 --- a/extractor/text.go +++ b/extractor/text.go @@ -47,7 +47,7 @@ func (e *Extractor) ExtractTextWithStats() (extracted string, numChars int, numM // ExtractPageText returns the text contents of `e` (an Extractor for a page) as a PageText. func (e *Extractor) ExtractPageText() (*PageText, int, int, error) { - pt, numChars, numMisses, err := e.extractPageText(e.contents, e.resources, 0) + pt, numChars, numMisses, err := e.extractPageText(e.contents, e.resources, transform.IdentityMatrix(), 0) if err != nil { return nil, numChars, numMisses, err } @@ -60,7 +60,7 @@ func (e *Extractor) ExtractPageText() (*PageText, int, int, error) { // extractPageText returns the text contents of content stream `e` and resouces `resources` as a // PageText. // This can be called on a page or a form XObject. 
-func (e *Extractor) extractPageText(contents string, resources *model.PdfPageResources, level int) ( +func (e *Extractor) extractPageText(contents string, resources *model.PdfPageResources, parentCTM transform.Matrix, level int) ( *PageText, int, int, error) { common.Log.Trace("extractPageText: level=%d", level) pageText := &PageText{} @@ -118,7 +118,10 @@ func (e *Extractor) extractPageText(contents string, resources *model.PdfPageRes pageText.marks = append(pageText.marks, to.marks...) } inTextObj = true - to = newTextObject(e, resources, gs, &state, &fontStack) + + graphicsState := gs + graphicsState.CTM = parentCTM.Mult(graphicsState.CTM) + to = newTextObject(e, resources, graphicsState, &state, &fontStack) case "ET": // End Text // End text object, discarding text matrix. If the current // text object contains text marks, they are added to the @@ -331,8 +334,9 @@ func (e *Extractor) extractPageText(contents string, resources *model.PdfPageRes if formResources == nil { formResources = resources } + tList, numChars, numMisses, err := e.extractPageText(string(formContent), - formResources, level+1) + formResources, parentCTM.Mult(gs.CTM), level+1) if err != nil { common.Log.Debug("ERROR: %v", err) return err @@ -1134,7 +1138,7 @@ func (tm TextMark) String() string { func (pt *PageText) computeViews() { fontHeight := pt.height() // We sort with a y tolerance to allow for subscripts, diacritics etc. - tol := minFloat(fontHeight*0.2, 5.0) + tol := minFloat(fontHeight*0.19, 5.0) common.Log.Trace("ToTextLocation: %d elements fontHeight=%.1f tol=%.1f", len(pt.marks), fontHeight, tol) // Uncomment the 2 following Debug statements to see the effects of sorting. // common.Log.Debug("computeViews: Before sorting %s", pt)