Handle more cases of fonts not being set in text extraction code.

This commit is contained in:
Peter Williams 2020-05-28 12:08:15 +10:00
parent 418f859d44
commit 2260e245f7
3 changed files with 36 additions and 7 deletions

View File

@ -51,6 +51,10 @@ Get these files working:
challenging-modified.pdf
transitions_test.pdf
### radical.txt
Evaluate the potential impact of each
s t r a t e g y u s i n g t h e V i s i o n /
TEST FILES
---------

View File

@ -21,7 +21,7 @@ type Extractor struct {
fontCache map[string]fontEntry
// text results from running extractXYText on forms within the page.
// TODO(peterwilliams): Cache this map accross all pages in a PDF to speed up processig.
// TODO(peterwilliams97): Cache this map across all pages in a PDF to speed up processing.
formResults map[string]textResult
// accessCount is used to set fontEntry.access to an incrementing number.

View File

@ -24,6 +24,10 @@ import (
const verbose = false
// maxFormStack is the maximum form stack recursion depth. It has to be low enough to avoid a stack
// overflow and high enough to accommodate customers' PDFs.
const maxFormStack = 10
// ExtractText processes and extracts all text data in content streams and returns as a string.
// It takes into account character encodings in the PDF file, which are decoded by
// CharcodeBytesToUnicode.
@ -67,8 +71,8 @@ func (e *Extractor) extractPageText(contents string, resources *model.PdfPageRes
to := newTextObject(e, resources, contentstream.GraphicsState{}, &state, &savedStates)
var inTextObj bool
if level > 5 {
err := errors.New("stack overflow")
if level > maxFormStack {
err := errors.New("form stack overflow")
common.Log.Debug("ERROR: extractPageText. recursion level=%d err=%w", level, err)
return pageText, state.numChars, state.numMisses, err
}
@ -245,8 +249,7 @@ func (e *Extractor) extractPageText(contents string, resources *model.PdfPageRes
return err
}
err = to.setFont(name, size)
to.invalidFont = err == model.ErrType3FontNotSupported ||
(err != nil && strings.Contains(err.Error(), "unsupported font encoding:"))
to.invalidFont = unsupportedFontErr(err)
if err != nil && !to.invalidFont {
return err
}
@ -364,6 +367,24 @@ func (e *Extractor) extractPageText(contents string, resources *model.PdfPageRes
return pageText, state.numChars, state.numMisses, err
}
// unsupportedFontErr returns true if `err` indicates that the selected font or encoding is not
// supported. A nil `err` returns false.
// The sentinel errors are matched with errors.Is so that wrapped instances are recognized as well.
// Errors that are not compared against a sentinel here are recognized by their message text.
func unsupportedFontErr(err error) bool {
	if err == nil {
		return false
	}
	if errors.Is(err, model.ErrFontNotSupported) ||
		errors.Is(err, model.ErrType1CFontNotSupported) ||
		errors.Is(err, model.ErrType3FontNotSupported) ||
		errors.Is(err, model.ErrTTCmapNotSupported) {
		return true
	}
	errStr := err.Error()
	return strings.Contains(errStr, "unsupported font encoding:") ||
		strings.Contains(errStr, "unexpected subtable format:") ||
		strings.Contains(errStr, "fonts based on PostScript outlines are not supported")
}
// textResult is used for holding results of PDF form processing.
type textResult struct {
pageText PageText
numChars int
@ -1101,11 +1122,15 @@ var spaceMark = TextMark{
// getCurrentFont returns the font on top of the font stack, or DefaultFont if the font stack is
// empty.
func (to *textObject) getCurrentFont() *model.PdfFont {
if to.savedStates.empty() {
var font *model.PdfFont
if !to.savedStates.empty() {
font = to.savedStates.top().tfont
}
if font == nil {
common.Log.Debug("ERROR: No font defined. Using default.")
return model.DefaultFont()
}
return to.savedStates.top().tfont
return font
}
// getFont returns the font named `name` if it exists in the page's resources or an error if it