mirror of
https://github.com/unidoc/unipdf.git
synced 2025-04-29 13:48:54 +08:00
Recurse through form XObjects for text extractions.
This commit is contained in:
parent
686a6e511e
commit
af99ee41db
@ -5,17 +5,26 @@
|
||||
|
||||
package extractor
|
||||
|
||||
import "github.com/unidoc/unidoc/pdf/model"
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/unidoc/unidoc/pdf/model"
|
||||
)
|
||||
|
||||
// Extractor stores and offers functionality for extracting content from PDF pages.
|
||||
type Extractor struct {
|
||||
contents string
|
||||
resources *model.PdfPageResources
|
||||
// stream contents and resources for page
|
||||
contents string
|
||||
pageResources *model.PdfPageResources
|
||||
|
||||
// fontCache is a simple LRU cache that is used to prevent redundant constructions of PdfFont's from
|
||||
// PDF objects. NOTE: This is not a conventional glyph cache. It only caches PdfFont's.
|
||||
fontCache map[string]fontEntry
|
||||
|
||||
// text results from running extractXYText on forms within the page.
|
||||
// TODO(peterwilliams): Cache this map accross all pages in a PDF to speed up processig.
|
||||
formResults map[string]textResult
|
||||
|
||||
// accessCount is used to set fontEntry.access to an incrementing number.
|
||||
accessCount int64
|
||||
|
||||
@ -36,9 +45,10 @@ func New(page *model.PdfPage) (*Extractor, error) {
|
||||
// fmt.Println("========================= ::: =========================")
|
||||
|
||||
e := &Extractor{
|
||||
contents: contents,
|
||||
resources: page.Resources,
|
||||
fontCache: map[string]fontEntry{},
|
||||
contents: contents,
|
||||
pageResources: page.Resources,
|
||||
fontCache: map[string]fontEntry{},
|
||||
formResults: map[string]textResult{},
|
||||
}
|
||||
return e, nil
|
||||
}
|
||||
|
@ -40,14 +40,23 @@ func (e *Extractor) ExtractTextWithStats() (extracted string, numChars int, numM
|
||||
return textList.ToText(), numChars, numMisses, nil
|
||||
}
|
||||
|
||||
// ExtractXYText returns the text contents of `e` as a TextList.
|
||||
// ExtractXYText returns the text contents of `e` (an Extractor for a page) as a TextList.
|
||||
func (e *Extractor) ExtractXYText() (*TextList, int, int, error) {
|
||||
return e.extractXYText(e.contents, e.pageResources, 0)
|
||||
}
|
||||
|
||||
// extractXYText returns the text contents of content stream `e` and resouces `resources` as a
|
||||
// TextList.
|
||||
// This can be called on a page or a Form XObject.
|
||||
func (e *Extractor) extractXYText(contents string, resources *model.PdfPageResources, level int) (*TextList, int, int, error) {
|
||||
|
||||
common.Log.Trace("extractXYText: level=%d", level)
|
||||
textList := &TextList{}
|
||||
state := newTextState()
|
||||
fontStack := fontStacker{}
|
||||
var to *textObject
|
||||
|
||||
cstreamParser := contentstream.NewContentStreamParser(e.contents)
|
||||
cstreamParser := contentstream.NewContentStreamParser(contents)
|
||||
operations, err := cstreamParser.Parse()
|
||||
if err != nil {
|
||||
common.Log.Debug("ERROR: ExtractXYText parse failed. err=%v", err)
|
||||
@ -62,6 +71,7 @@ func (e *Extractor) ExtractXYText() (*TextList, int, int, error) {
|
||||
|
||||
operand := op.Operand
|
||||
|
||||
|
||||
switch operand {
|
||||
case "q":
|
||||
if !fontStack.empty() {
|
||||
@ -92,7 +102,7 @@ func (e *Extractor) ExtractXYText() (*TextList, int, int, error) {
|
||||
if to != nil {
|
||||
common.Log.Debug("BT called while in a text object")
|
||||
}
|
||||
to = newTextObject(e, gs, &state, &fontStack)
|
||||
to = newTextObject(e, resources, gs, &state, &fontStack)
|
||||
case "ET": // End Text
|
||||
*textList = append(*textList, to.Texts...)
|
||||
to = nil
|
||||
@ -188,7 +198,7 @@ func (e *Extractor) ExtractXYText() (*TextList, int, int, error) {
|
||||
case "Tf": // Set font
|
||||
if to == nil {
|
||||
// This is needed for 26-Hazard-Thermal-environment.pdf
|
||||
to = newTextObject(e, gs, &state, &fontStack)
|
||||
to = newTextObject(e, resources, gs, &state, &fontStack)
|
||||
}
|
||||
if ok, err := to.checkOp(op, 2, true); !ok {
|
||||
common.Log.Debug("ERROR: Tf err=%v", err)
|
||||
@ -264,18 +274,61 @@ func (e *Extractor) ExtractXYText() (*TextList, int, int, error) {
|
||||
return err
|
||||
}
|
||||
to.setHorizScaling(y)
|
||||
}
|
||||
|
||||
case "Do":
|
||||
// XObject.
|
||||
name := *op.Params[0].(*core.PdfObjectName)
|
||||
_, xtype := resources.GetXObjectByName(name)
|
||||
if xtype != model.XObjectTypeForm {
|
||||
break
|
||||
}
|
||||
// Only process each one once.
|
||||
formResult, ok := e.formResults[string(name)]
|
||||
if !ok {
|
||||
xform, err := resources.GetXObjectFormByName(name)
|
||||
if err != nil {
|
||||
common.Log.Debug("ERROR: %v", err)
|
||||
return err
|
||||
}
|
||||
formContent, err := xform.GetContentStream()
|
||||
if err != nil {
|
||||
common.Log.Debug("ERROR: %v", err)
|
||||
return err
|
||||
}
|
||||
formResources := xform.Resources
|
||||
if formResources == nil {
|
||||
formResources = resources
|
||||
}
|
||||
tList, numChars, numMisses, err := e.extractXYText(string(formContent),
|
||||
formResources, level+1)
|
||||
if err != nil {
|
||||
common.Log.Debug("ERROR: %v", err)
|
||||
return err
|
||||
}
|
||||
formResult = textResult{*tList, numChars, numMisses}
|
||||
e.formResults[string(name)] = formResult
|
||||
}
|
||||
|
||||
*textList = append(*textList, formResult.textList...)
|
||||
state.numChars += formResult.numChars
|
||||
state.numMisses += formResult.numMisses
|
||||
}
|
||||
return nil
|
||||
})
|
||||
|
||||
err = processor.Process(e.resources)
|
||||
err = processor.Process(resources)
|
||||
if err != nil {
|
||||
common.Log.Debug("ERROR: Processing: err=%v", err)
|
||||
}
|
||||
return textList, state.numChars, state.numMisses, err
|
||||
}
|
||||
|
||||
type textResult struct {
|
||||
textList TextList
|
||||
numChars int
|
||||
numMisses int
|
||||
}
|
||||
|
||||
//
|
||||
// Text operators
|
||||
//
|
||||
@ -389,7 +442,7 @@ func (to *textObject) setFont(name string, size float64) error {
|
||||
(*to.fontStack)[len(*to.fontStack)-1] = font
|
||||
}
|
||||
} else if err == model.ErrFontNotSupported {
|
||||
// XXX: Do we need to handle this case in a special way?
|
||||
// XXX(peterwilliams97): Do we need to handle this case in a special way?
|
||||
return err
|
||||
} else {
|
||||
return err
|
||||
@ -570,6 +623,7 @@ type textState struct {
|
||||
// textObject represents a PDF text object.
|
||||
type textObject struct {
|
||||
e *Extractor
|
||||
resources *model.PdfPageResources
|
||||
gs contentstream.GraphicsState
|
||||
fontStack *fontStacker
|
||||
State *textState
|
||||
@ -587,10 +641,12 @@ func newTextState() textState {
|
||||
}
|
||||
|
||||
// newTextObject returns a default textObject.
|
||||
func newTextObject(e *Extractor, gs contentstream.GraphicsState, state *textState,
|
||||
func newTextObject(e *Extractor, resources *model.PdfPageResources, gs contentstream.GraphicsState,
|
||||
state *textState,
|
||||
fontStack *fontStacker) *textObject {
|
||||
return &textObject{
|
||||
e: e,
|
||||
resources: resources,
|
||||
gs: gs,
|
||||
fontStack: fontStack,
|
||||
State: state,
|
||||
@ -797,7 +853,7 @@ func (tl TextList) ToText() string {
|
||||
fontHeight := tl.height()
|
||||
// We sort with a y tolerance to allow for subscripts, diacritics etc.
|
||||
tol := minFloat(fontHeight*0.2, 5.0)
|
||||
common.Log.Trace("ToText: fontHeight=%.1f tol=%.1f", fontHeight, tol)
|
||||
common.Log.Trace("ToText: %d elements fontHeight=%.1f tol=%.1f", len(tl), fontHeight, tol)
|
||||
|
||||
tl.SortPosition(tol)
|
||||
|
||||
@ -1084,7 +1140,7 @@ func combine(parts []string) string {
|
||||
}
|
||||
|
||||
// countDiacritic returns the combining diacritic version of `w` (usually itself) and the number of
|
||||
// non-diacritics in `w` (0 or 1)
|
||||
// non-diacritics in `w` (0 or 1).
|
||||
func countDiacritic(w string) (string, int) {
|
||||
runes := []rune(w)
|
||||
if len(runes) != 1 {
|
||||
@ -1092,7 +1148,8 @@ func countDiacritic(w string) (string, int) {
|
||||
}
|
||||
r := runes[0]
|
||||
c := 1
|
||||
if unicode.Is(unicode.Mn, r) || unicode.Is(unicode.Sk, r) {
|
||||
if (unicode.Is(unicode.Mn, r) || unicode.Is(unicode.Sk, r)) &&
|
||||
r != '\'' && r != '"' && r != '`' {
|
||||
c = 0
|
||||
}
|
||||
if w2, ok := diacritics[r]; ok {
|
||||
@ -1213,7 +1270,7 @@ func (to *textObject) getFontDirect(name string) (*model.PdfFont, error) {
|
||||
// getFontDict returns the font dict with key `name` if it exists in the page's Font resources or
|
||||
// an error if it doesn't.
|
||||
func (to *textObject) getFontDict(name string) (fontObj core.PdfObject, err error) {
|
||||
resources := to.e.resources
|
||||
resources := to.resources
|
||||
if resources == nil {
|
||||
common.Log.Debug("getFontDict. No resources. name=%#q", name)
|
||||
return nil, nil
|
||||
@ -1221,6 +1278,7 @@ func (to *textObject) getFontDict(name string) (fontObj core.PdfObject, err erro
|
||||
fontObj, found := resources.GetFontByName(core.PdfObjectName(name))
|
||||
if !found {
|
||||
common.Log.Debug("ERROR: getFontDict: Font not found: name=%#q", name)
|
||||
panic(errors.New("font not in resources"))
|
||||
return nil, errors.New("font not in resources")
|
||||
}
|
||||
return fontObj, nil
|
||||
|
Loading…
x
Reference in New Issue
Block a user