Recurse through form XObjects for text extractions.

2025-04-29 13:48:54 +08:00 · 2018-12-27 20:51:34 +11:00 · 2018-12-27 20:51:34 +11:00 · af99ee41db
commit af99ee41db
parent 686a6e511e
2 changed files with 86 additions and 18 deletions
--- a/pdf/extractor/extractor.go
+++ b/pdf/extractor/extractor.go
@ -5,17 +5,26 @@

 package extractor

-import "github.com/unidoc/unidoc/pdf/model"
+import (
+	"fmt"
+
+	"github.com/unidoc/unidoc/pdf/model"
+)

 // Extractor stores and offers functionality for extracting content from PDF pages.
 type Extractor struct {
-	contents  string
-	resources *model.PdfPageResources
+	// stream contents and resources for page
+	contents      string
+	pageResources *model.PdfPageResources

 	// fontCache is a simple LRU cache that is used to prevent redundant constructions of PdfFont's from
 	// PDF objects. NOTE: This is not a conventional glyph cache. It only caches PdfFont's.
 	fontCache map[string]fontEntry

+	// text results from running extractXYText on forms within the page.
+	// TODO(peterwilliams): Cache this map across all pages in a PDF to speed up processing.
+	formResults map[string]textResult
+
 	// accessCount is used to set fontEntry.access to an incrementing number.
 	accessCount int64

@ -36,9 +45,10 @@ func New(page *model.PdfPage) (*Extractor, error) {
 	// fmt.Println("========================= ::: =========================")

 	e := &Extractor{
-		contents:  contents,
-		resources: page.Resources,
-		fontCache: map[string]fontEntry{},
+		contents:       contents,
+		pageResources:  page.Resources,
+		fontCache:      map[string]fontEntry{},
+		formResults: map[string]textResult{},
 	}
 	return e, nil
 }
--- a/pdf/extractor/text.go
+++ b/pdf/extractor/text.go
@ -40,14 +40,23 @@ func (e *Extractor) ExtractTextWithStats() (extracted string, numChars int, numM
 	return textList.ToText(), numChars, numMisses, nil
 }

-// ExtractXYText returns the text contents of `e` as a TextList.
+// ExtractXYText returns the text contents of `e` (an Extractor for a page) as a TextList.
 func (e *Extractor) ExtractXYText() (*TextList, int, int, error) {
+	return e.extractXYText(e.contents, e.pageResources, 0)
+}
+
+// extractXYText returns the text contents of content stream `contents`, interpreted with
+// resources `resources`, as a TextList.
+// This can be called on a page or a Form XObject.
+func (e *Extractor) extractXYText(contents string, resources *model.PdfPageResources, level int) (*TextList, int, int, error) {
+
+	common.Log.Trace("extractXYText: level=%d", level)
 	textList := &TextList{}
 	state := newTextState()
 	fontStack := fontStacker{}
 	var to *textObject

-	cstreamParser := contentstream.NewContentStreamParser(e.contents)
+	cstreamParser := contentstream.NewContentStreamParser(contents)
 	operations, err := cstreamParser.Parse()
 	if err != nil {
 		common.Log.Debug("ERROR: ExtractXYText parse failed. err=%v", err)
@ -62,6 +71,7 @@ func (e *Extractor) ExtractXYText() (*TextList, int, int, error) {

 			operand := op.Operand

+
 			switch operand {
 			case "q":
 				if !fontStack.empty() {
@ -92,7 +102,7 @@ func (e *Extractor) ExtractXYText() (*TextList, int, int, error) {
 				if to != nil {
 					common.Log.Debug("BT called while in a text object")
 				}
-				to = newTextObject(e, gs, &state, &fontStack)
+				to = newTextObject(e, resources, gs, &state, &fontStack)
 			case "ET": // End Text
 				*textList = append(*textList, to.Texts...)
 				to = nil
@ -188,7 +198,7 @@ func (e *Extractor) ExtractXYText() (*TextList, int, int, error) {
 			case "Tf": // Set font
 				if to == nil {
 					// This is needed for 26-Hazard-Thermal-environment.pdf
-					to = newTextObject(e, gs, &state, &fontStack)
+					to = newTextObject(e, resources, gs, &state, &fontStack)
 				}
 				if ok, err := to.checkOp(op, 2, true); !ok {
 					common.Log.Debug("ERROR: Tf err=%v", err)
@ -264,18 +274,61 @@ func (e *Extractor) ExtractXYText() (*TextList, int, int, error) {
 					return err
 				}
 				to.setHorizScaling(y)
-			}

+			case "Do":
+				// XObject.
+				name := *op.Params[0].(*core.PdfObjectName)
+				_, xtype := resources.GetXObjectByName(name)
+				if xtype != model.XObjectTypeForm {
+					break
+				}
+				// Only process each one once.
+				formResult, ok := e.formResults[string(name)]
+				if !ok {
+					xform, err := resources.GetXObjectFormByName(name)
+					if err != nil {
+						common.Log.Debug("ERROR: %v", err)
+						return err
+					}
+					formContent, err := xform.GetContentStream()
+					if err != nil {
+						common.Log.Debug("ERROR: %v", err)
+						return err
+					}
+					formResources := xform.Resources
+					if formResources == nil {
+						formResources = resources
+					}
+					tList, numChars, numMisses, err := e.extractXYText(string(formContent),
+						formResources, level+1)
+					if err != nil {
+						common.Log.Debug("ERROR: %v", err)
+						return err
+					}
+					formResult = textResult{*tList, numChars, numMisses}
+					e.formResults[string(name)] = formResult
+				}
+
+				*textList = append(*textList, formResult.textList...)
+				state.numChars += formResult.numChars
+				state.numMisses += formResult.numMisses
+			}
 			return nil
 		})

-	err = processor.Process(e.resources)
+	err = processor.Process(resources)
 	if err != nil {
 		common.Log.Debug("ERROR: Processing: err=%v", err)
 	}
 	return textList, state.numChars, state.numMisses, err
 }

+type textResult struct {
+	textList  TextList
+	numChars  int
+	numMisses int
+}
+
 //
 // Text operators
 //
@ -389,7 +442,7 @@ func (to *textObject) setFont(name string, size float64) error {
 			(*to.fontStack)[len(*to.fontStack)-1] = font
 		}
 	} else if err == model.ErrFontNotSupported {
-		// XXX: Do we need to handle this case in a special way?
+		// XXX(peterwilliams97): Do we need to handle this case in a special way?
 		return err
 	} else {
 		return err
@ -570,6 +623,7 @@ type textState struct {
 // textObject represents a PDF text object.
 type textObject struct {
 	e         *Extractor
+	resources *model.PdfPageResources
 	gs        contentstream.GraphicsState
 	fontStack *fontStacker
 	State     *textState
@ -587,10 +641,12 @@ func newTextState() textState {
 }

 // newTextObject returns a default textObject.
-func newTextObject(e *Extractor, gs contentstream.GraphicsState, state *textState,
+func newTextObject(e *Extractor, resources *model.PdfPageResources, gs contentstream.GraphicsState,
+	state *textState,
 	fontStack *fontStacker) *textObject {
 	return &textObject{
 		e:         e,
+		resources: resources,
 		gs:        gs,
 		fontStack: fontStack,
 		State:     state,
@ -797,7 +853,7 @@ func (tl TextList) ToText() string {
 	fontHeight := tl.height()
 	// We sort with a y tolerance to allow for subscripts, diacritics etc.
 	tol := minFloat(fontHeight*0.2, 5.0)
-	common.Log.Trace("ToText: fontHeight=%.1f tol=%.1f", fontHeight, tol)
+	common.Log.Trace("ToText: %d elements fontHeight=%.1f tol=%.1f", len(tl), fontHeight, tol)

 	tl.SortPosition(tol)

@ -1084,7 +1140,7 @@ func combine(parts []string) string {
 }

 // countDiacritic returns the combining diacritic version of `w` (usually itself) and the number of
-// non-diacritics in `w` (0 or 1)
+// non-diacritics in `w` (0 or 1).
 func countDiacritic(w string) (string, int) {
 	runes := []rune(w)
 	if len(runes) != 1 {
@ -1092,7 +1148,8 @@ func countDiacritic(w string) (string, int) {
 	}
 	r := runes[0]
 	c := 1
-	if unicode.Is(unicode.Mn, r) || unicode.Is(unicode.Sk, r) {
+	if (unicode.Is(unicode.Mn, r) || unicode.Is(unicode.Sk, r)) &&
+		r != '\'' && r != '"' && r != '`' {
 		c = 0
 	}
 	if w2, ok := diacritics[r]; ok {
@ -1213,7 +1270,7 @@ func (to *textObject) getFontDirect(name string) (*model.PdfFont, error) {
 // getFontDict returns the font dict with key `name` if it exists in the page's Font resources or
 // an error if it doesn't.
 func (to *textObject) getFontDict(name string) (fontObj core.PdfObject, err error) {
-	resources := to.e.resources
+	resources := to.resources
 	if resources == nil {
 		common.Log.Debug("getFontDict. No resources. name=%#q", name)
 		return nil, nil
@ -1221,6 +1278,7 @@ func (to *textObject) getFontDict(name string) (fontObj core.PdfObject, err erro
 	fontObj, found := resources.GetFontByName(core.PdfObjectName(name))
 	if !found {
 		common.Log.Debug("ERROR: getFontDict: Font not found: name=%#q", name)
+		panic(errors.New("font not in resources"))
 		return nil, errors.New("font not in resources")
 	}
 	return fontObj, nil