Recurse through form XObjects for text extractions.

This commit is contained in:
Peter Williams 2018-12-27 20:51:34 +11:00
parent 686a6e511e
commit af99ee41db
2 changed files with 86 additions and 18 deletions

View File

@ -5,17 +5,26 @@
package extractor
import "github.com/unidoc/unidoc/pdf/model"
import (
"fmt"
"github.com/unidoc/unidoc/pdf/model"
)
// Extractor stores and offers functionality for extracting content from PDF pages.
type Extractor struct {
// stream contents and resources for page
contents string
resources *model.PdfPageResources
pageResources *model.PdfPageResources
// fontCache is a simple LRU cache that is used to prevent redundant constructions of PdfFont's from
// PDF objects. NOTE: This is not a conventional glyph cache. It only caches PdfFont's.
fontCache map[string]fontEntry
// text results from running extractXYText on forms within the page.
// TODO(peterwilliams): Cache this map accross all pages in a PDF to speed up processig.
formResults map[string]textResult
// accessCount is used to set fontEntry.access to an incrementing number.
accessCount int64
@ -37,8 +46,9 @@ func New(page *model.PdfPage) (*Extractor, error) {
e := &Extractor{
contents: contents,
resources: page.Resources,
pageResources: page.Resources,
fontCache: map[string]fontEntry{},
formResults: map[string]textResult{},
}
return e, nil
}

View File

@ -40,14 +40,23 @@ func (e *Extractor) ExtractTextWithStats() (extracted string, numChars int, numM
return textList.ToText(), numChars, numMisses, nil
}
// ExtractXYText returns the text contents of `e` as a TextList.
// ExtractXYText returns the text contents of `e` (an Extractor for a page) as a TextList.
func (e *Extractor) ExtractXYText() (*TextList, int, int, error) {
return e.extractXYText(e.contents, e.pageResources, 0)
}
// extractXYText returns the text contents of content stream `e` and resouces `resources` as a
// TextList.
// This can be called on a page or a Form XObject.
func (e *Extractor) extractXYText(contents string, resources *model.PdfPageResources, level int) (*TextList, int, int, error) {
common.Log.Trace("extractXYText: level=%d", level)
textList := &TextList{}
state := newTextState()
fontStack := fontStacker{}
var to *textObject
cstreamParser := contentstream.NewContentStreamParser(e.contents)
cstreamParser := contentstream.NewContentStreamParser(contents)
operations, err := cstreamParser.Parse()
if err != nil {
common.Log.Debug("ERROR: ExtractXYText parse failed. err=%v", err)
@ -62,6 +71,7 @@ func (e *Extractor) ExtractXYText() (*TextList, int, int, error) {
operand := op.Operand
switch operand {
case "q":
if !fontStack.empty() {
@ -92,7 +102,7 @@ func (e *Extractor) ExtractXYText() (*TextList, int, int, error) {
if to != nil {
common.Log.Debug("BT called while in a text object")
}
to = newTextObject(e, gs, &state, &fontStack)
to = newTextObject(e, resources, gs, &state, &fontStack)
case "ET": // End Text
*textList = append(*textList, to.Texts...)
to = nil
@ -188,7 +198,7 @@ func (e *Extractor) ExtractXYText() (*TextList, int, int, error) {
case "Tf": // Set font
if to == nil {
// This is needed for 26-Hazard-Thermal-environment.pdf
to = newTextObject(e, gs, &state, &fontStack)
to = newTextObject(e, resources, gs, &state, &fontStack)
}
if ok, err := to.checkOp(op, 2, true); !ok {
common.Log.Debug("ERROR: Tf err=%v", err)
@ -264,18 +274,61 @@ func (e *Extractor) ExtractXYText() (*TextList, int, int, error) {
return err
}
to.setHorizScaling(y)
case "Do":
// XObject.
name := *op.Params[0].(*core.PdfObjectName)
_, xtype := resources.GetXObjectByName(name)
if xtype != model.XObjectTypeForm {
break
}
// Only process each one once.
formResult, ok := e.formResults[string(name)]
if !ok {
xform, err := resources.GetXObjectFormByName(name)
if err != nil {
common.Log.Debug("ERROR: %v", err)
return err
}
formContent, err := xform.GetContentStream()
if err != nil {
common.Log.Debug("ERROR: %v", err)
return err
}
formResources := xform.Resources
if formResources == nil {
formResources = resources
}
tList, numChars, numMisses, err := e.extractXYText(string(formContent),
formResources, level+1)
if err != nil {
common.Log.Debug("ERROR: %v", err)
return err
}
formResult = textResult{*tList, numChars, numMisses}
e.formResults[string(name)] = formResult
}
*textList = append(*textList, formResult.textList...)
state.numChars += formResult.numChars
state.numMisses += formResult.numMisses
}
return nil
})
err = processor.Process(e.resources)
err = processor.Process(resources)
if err != nil {
common.Log.Debug("ERROR: Processing: err=%v", err)
}
return textList, state.numChars, state.numMisses, err
}
type textResult struct {
textList TextList
numChars int
numMisses int
}
//
// Text operators
//
@ -389,7 +442,7 @@ func (to *textObject) setFont(name string, size float64) error {
(*to.fontStack)[len(*to.fontStack)-1] = font
}
} else if err == model.ErrFontNotSupported {
// XXX: Do we need to handle this case in a special way?
// XXX(peterwilliams97): Do we need to handle this case in a special way?
return err
} else {
return err
@ -570,6 +623,7 @@ type textState struct {
// textObject represents a PDF text object.
type textObject struct {
e *Extractor
resources *model.PdfPageResources
gs contentstream.GraphicsState
fontStack *fontStacker
State *textState
@ -587,10 +641,12 @@ func newTextState() textState {
}
// newTextObject returns a default textObject.
func newTextObject(e *Extractor, gs contentstream.GraphicsState, state *textState,
func newTextObject(e *Extractor, resources *model.PdfPageResources, gs contentstream.GraphicsState,
state *textState,
fontStack *fontStacker) *textObject {
return &textObject{
e: e,
resources: resources,
gs: gs,
fontStack: fontStack,
State: state,
@ -797,7 +853,7 @@ func (tl TextList) ToText() string {
fontHeight := tl.height()
// We sort with a y tolerance to allow for subscripts, diacritics etc.
tol := minFloat(fontHeight*0.2, 5.0)
common.Log.Trace("ToText: fontHeight=%.1f tol=%.1f", fontHeight, tol)
common.Log.Trace("ToText: %d elements fontHeight=%.1f tol=%.1f", len(tl), fontHeight, tol)
tl.SortPosition(tol)
@ -1084,7 +1140,7 @@ func combine(parts []string) string {
}
// countDiacritic returns the combining diacritic version of `w` (usually itself) and the number of
// non-diacritics in `w` (0 or 1)
// non-diacritics in `w` (0 or 1).
func countDiacritic(w string) (string, int) {
runes := []rune(w)
if len(runes) != 1 {
@ -1092,7 +1148,8 @@ func countDiacritic(w string) (string, int) {
}
r := runes[0]
c := 1
if unicode.Is(unicode.Mn, r) || unicode.Is(unicode.Sk, r) {
if (unicode.Is(unicode.Mn, r) || unicode.Is(unicode.Sk, r)) &&
r != '\'' && r != '"' && r != '`' {
c = 0
}
if w2, ok := diacritics[r]; ok {
@ -1213,7 +1270,7 @@ func (to *textObject) getFontDirect(name string) (*model.PdfFont, error) {
// getFontDict returns the font dict with key `name` if it exists in the page's Font resources or
// an error if it doesn't.
func (to *textObject) getFontDict(name string) (fontObj core.PdfObject, err error) {
resources := to.e.resources
resources := to.resources
if resources == nil {
common.Log.Debug("getFontDict. No resources. name=%#q", name)
return nil, nil
@ -1221,6 +1278,7 @@ func (to *textObject) getFontDict(name string) (fontObj core.PdfObject, err erro
fontObj, found := resources.GetFontByName(core.PdfObjectName(name))
if !found {
common.Log.Debug("ERROR: getFontDict: Font not found: name=%#q", name)
panic(errors.New("font not in resources"))
return nil, errors.New("font not in resources")
}
return fontObj, nil