Changes missed in previous commit.

This commit is contained in:
Peter Williams 2019-01-04 16:07:03 +11:00
parent e251b6b2f2
commit 4aa7e5051e

View File

@ -30,28 +30,28 @@ func (e *Extractor) ExtractText() (string, error) {
return text, err return text, err
} }
// ExtractTextWithStats works like ExtractText but returns the number of characters in the output (`numChars`) and the // ExtractTextWithStats works like ExtractText but returns the number of characters in the output
// the number of characters that were not decoded (`numMisses`). // (`numChars`) and the number of characters that were not decoded (`numMisses`).
func (e *Extractor) ExtractTextWithStats() (extracted string, numChars int, numMisses int, err error) { func (e *Extractor) ExtractTextWithStats() (extracted string, numChars int, numMisses int, err error) {
textList, numChars, numMisses, err := e.ExtractTextList() pageText, numChars, numMisses, err := e.ExtractPageText()
if err != nil { if err != nil {
return "", numChars, numMisses, err return "", numChars, numMisses, err
} }
return textList.ToText(), numChars, numMisses, nil return pageText.ToText(), numChars, numMisses, nil
} }
// ExtractTextList returns the text contents of `e` (an Extractor for a page) as a TextList. // ExtractPageText returns the text contents of `e` (an Extractor for a page) as a PageText.
func (e *Extractor) ExtractTextList() (*TextList, int, int, error) { func (e *Extractor) ExtractPageText() (*PageText, int, int, error) {
return e.extractTextList(e.contents, e.resources, 0) return e.extractPageText(e.contents, e.resources, 0)
} }
// extractTextList returns the text contents of content stream `contents` and resources `resources` as a // extractPageText returns the text contents of content stream `contents` and resources `resources` as a
// TextList. // PageText.
// This can be called on a page or a form XObject. // This can be called on a page or a form XObject.
func (e *Extractor) extractTextList(contents string, resources *model.PdfPageResources, level int) (*TextList, int, int, error) { func (e *Extractor) extractPageText(contents string, resources *model.PdfPageResources, level int) (*PageText, int, int, error) {
common.Log.Trace("extractTextList: level=%d", level) common.Log.Trace("extractPageText: level=%d", level)
textList := &TextList{} pageText := &PageText{}
state := newTextState() state := newTextState()
fontStack := fontStacker{} fontStack := fontStacker{}
var to *textObject var to *textObject
@ -59,8 +59,8 @@ func (e *Extractor) extractTextList(contents string, resources *model.PdfPageRes
cstreamParser := contentstream.NewContentStreamParser(contents) cstreamParser := contentstream.NewContentStreamParser(contents)
operations, err := cstreamParser.Parse() operations, err := cstreamParser.Parse()
if err != nil { if err != nil {
common.Log.Debug("ERROR: extractTextList parse failed. err=%v", err) common.Log.Debug("ERROR: extractPageText parse failed. err=%v", err)
return textList, state.numChars, state.numMisses, err return pageText, state.numChars, state.numMisses, err
} }
processor := contentstream.NewContentStreamProcessor(*operations) processor := contentstream.NewContentStreamProcessor(*operations)
@ -103,7 +103,7 @@ func (e *Extractor) extractTextList(contents string, resources *model.PdfPageRes
} }
to = newTextObject(e, resources, gs, &state, &fontStack) to = newTextObject(e, resources, gs, &state, &fontStack)
case "ET": // End Text case "ET": // End Text
(*textList).marks = append((*textList).marks, to.marks...) (*pageText).marks = append((*pageText).marks, to.marks...)
to = nil to = nil
case "T*": // Move to start of next text line case "T*": // Move to start of next text line
to.nextLine() to.nextLine()
@ -297,7 +297,7 @@ func (e *Extractor) extractTextList(contents string, resources *model.PdfPageRes
if formResources == nil { if formResources == nil {
formResources = resources formResources = resources
} }
tList, numChars, numMisses, err := e.extractTextList(string(formContent), tList, numChars, numMisses, err := e.extractPageText(string(formContent),
formResources, level+1) formResources, level+1)
if err != nil { if err != nil {
common.Log.Debug("ERROR: %v", err) common.Log.Debug("ERROR: %v", err)
@ -307,7 +307,7 @@ func (e *Extractor) extractTextList(contents string, resources *model.PdfPageRes
e.formResults[string(name)] = formResult e.formResults[string(name)] = formResult
} }
(*textList).marks = append((*textList).marks, formResult.textList.marks...) (*pageText).marks = append((*pageText).marks, formResult.pageText.marks...)
state.numChars += formResult.numChars state.numChars += formResult.numChars
state.numMisses += formResult.numMisses state.numMisses += formResult.numMisses
} }
@ -318,11 +318,11 @@ func (e *Extractor) extractTextList(contents string, resources *model.PdfPageRes
if err != nil { if err != nil {
common.Log.Debug("ERROR: Processing: err=%v", err) common.Log.Debug("ERROR: Processing: err=%v", err)
} }
return textList, state.numChars, state.numMisses, err return pageText, state.numChars, state.numMisses, err
} }
type textResult struct { type textResult struct {
textList TextList pageText PageText
numChars int numChars int
numMisses int numMisses int
} }
@ -827,31 +827,31 @@ func (t textMark) Width() float64 {
return math.Abs(t.orientedStart.X - t.orientedEnd.X) return math.Abs(t.orientedStart.X - t.orientedEnd.X)
} }
// TextList represents the layout of text on a device page. // PageText represents the layout of text on a device page.
// Its implementation is opaque to allow for future optimizations. // Its implementation is opaque to allow for future optimizations.
type TextList struct { type PageText struct {
// TextList is currently implemented as a list of texts and their positions on a PDF page. // PageText is currently implemented as a list of texts and their positions on a PDF page.
marks []textMark marks []textMark
} }
// String returns a string describing `tl`. // String returns a string describing `pt`.
func (tl TextList) String() string { func (pt PageText) String() string {
parts := []string{fmt.Sprintf("TextList: %d elements", tl.length())} parts := []string{fmt.Sprintf("PageText: %d elements", pt.length())}
for _, t := range tl.marks { for _, t := range pt.marks {
parts = append(parts, t.String()) parts = append(parts, t.String())
} }
return strings.Join(parts, "\n") return strings.Join(parts, "\n")
} }
// length returns the number of elements in `tl.marks`. // length returns the number of elements in `pt.marks`.
func (tl TextList) length() int { func (pt PageText) length() int {
return len(tl.marks) return len(pt.marks)
} }
// height returns the max height of the elements in `tl.marks`. // height returns the max height of the elements in `pt.marks`.
func (tl TextList) height() float64 { func (pt PageText) height() float64 {
fontHeight := 0.0 fontHeight := 0.0
for _, t := range tl.marks { for _, t := range pt.marks {
if t.height > fontHeight { if t.height > fontHeight {
fontHeight = t.height fontHeight = t.height
} }
@ -859,19 +859,19 @@ func (tl TextList) height() float64 {
return fontHeight return fontHeight
} }
// ToText returns the contents of `tl` as a single string. // ToText returns the contents of `pt` as a single string.
func (tl TextList) ToText() string { func (pt PageText) ToText() string {
fontHeight := tl.height() fontHeight := pt.height()
// We sort with a y tolerance to allow for subscripts, diacritics etc. // We sort with a y tolerance to allow for subscripts, diacritics etc.
tol := minFloat(fontHeight*0.2, 5.0) tol := minFloat(fontHeight*0.2, 5.0)
common.Log.Trace("ToText: %d elements fontHeight=%.1f tol=%.1f", len(tl.marks), fontHeight, tol) common.Log.Trace("ToText: %d elements fontHeight=%.1f tol=%.1f", len(pt.marks), fontHeight, tol)
// Uncomment the 2 following Trace statements to see the effects of sorting. // Uncomment the 2 following Trace statements to see the effects of sorting.
// common.Log.Trace("ToText: Before sorting %s", tl) // common.Log.Trace("ToText: Before sorting %s", pt)
tl.sortPosition(tol) pt.sortPosition(tol)
// common.Log.Trace("ToText: After sorting %s", tl) // common.Log.Trace("ToText: After sorting %s", pt)
lines := tl.toLines(tol) lines := pt.toLines(tol)
texts := make([]string, 0, len(lines)) texts := make([]string, 0, len(lines))
for _, l := range lines { for _, l := range lines {
texts = append(texts, l.text) texts = append(texts, l.text)
@ -882,9 +882,9 @@ func (tl TextList) ToText() string {
// sortPosition sorts a text list by its elements' position on a page. // sortPosition sorts a text list by its elements' position on a page.
// Sorting is by orientation then top to bottom, left to right when page is orientated so that text // Sorting is by orientation then top to bottom, left to right when page is orientated so that text
// is horizontal. // is horizontal.
func (tl *TextList) sortPosition(tol float64) { func (pt *PageText) sortPosition(tol float64) {
sort.SliceStable((*tl).marks, func(i, j int) bool { sort.SliceStable((*pt).marks, func(i, j int) bool {
ti, tj := (*tl).marks[i], (*tl).marks[j] ti, tj := (*pt).marks[i], (*pt).marks[j]
if ti.orient != tj.orient { if ti.orient != tj.orient {
return ti.orient < tj.orient return ti.orient < tj.orient
} }
@ -903,44 +903,44 @@ type textLine struct {
words []string // words in the line. words []string // words in the line.
} }
// toLines returns the text and positions in `tl.marks` as a slice of textLine. // toLines returns the text and positions in `pt.marks` as a slice of textLine.
// NOTE: Caller must sort the text list top-to-bottom, left-to-right (for orientation adjusted so // NOTE: Caller must sort the text list top-to-bottom, left-to-right (for orientation adjusted so
// that text is horizontal) before calling this function. // that text is horizontal) before calling this function.
func (tl TextList) toLines(tol float64) []textLine { func (pt PageText) toLines(tol float64) []textLine {
// We divide `tl.marks` into slices which contain texts with the same orientation, extract the lines // We divide `pt.marks` into slices which contain texts with the same orientation, extract the lines
// for each orientation then return the concatenation of these lines sorted by orientation. // for each orientation then return the concatenation of these lines sorted by orientation.
tlOrient := make(map[int][]textMark, len(tl.marks)) tlOrient := make(map[int][]textMark, len(pt.marks))
for _, t := range tl.marks { for _, t := range pt.marks {
tlOrient[t.orient] = append(tlOrient[t.orient], t) tlOrient[t.orient] = append(tlOrient[t.orient], t)
} }
var lines []textLine var lines []textLine
for _, o := range orientKeys(tlOrient) { for _, o := range orientKeys(tlOrient) {
lines = append(lines, TextList{tlOrient[o]}.toLinesOrient(tol)...) lines = append(lines, PageText{tlOrient[o]}.toLinesOrient(tol)...)
} }
return lines return lines
} }
// toLinesOrient returns the text and positions in `tl.marks` as a slice of textLine. // toLinesOrient returns the text and positions in `pt.marks` as a slice of textLine.
// NOTE: This function only works on text lists where all text is the same orientation so it should // NOTE: This function only works on text lists where all text is the same orientation so it should
// only be called from toLines. // only be called from toLines.
// Caller must sort the text list top-to-bottom, left-to-right (for orientation adjusted so // Caller must sort the text list top-to-bottom, left-to-right (for orientation adjusted so
// that text is horizontal) before calling this function. // that text is horizontal) before calling this function.
func (tl TextList) toLinesOrient(tol float64) []textLine { func (pt PageText) toLinesOrient(tol float64) []textLine {
if len(tl.marks) == 0 { if len(pt.marks) == 0 {
return []textLine{} return []textLine{}
} }
var lines []textLine var lines []textLine
var words []string var words []string
var x []float64 var x []float64
y := tl.marks[0].orientedStart.Y y := pt.marks[0].orientedStart.Y
scanning := false scanning := false
averageCharWidth := exponAve{} averageCharWidth := exponAve{}
wordSpacing := exponAve{} wordSpacing := exponAve{}
lastEndX := 0.0 // lastEndX is tl.marks[i-1].orientedEnd.X lastEndX := 0.0 // lastEndX is pt.marks[i-1].orientedEnd.X
for _, t := range tl.marks { for _, t := range pt.marks {
if t.orientedStart.Y+tol < y { if t.orientedStart.Y+tol < y {
if len(words) > 0 { if len(words) > 0 {
line := newLine(y, x, words) line := newLine(y, x, words)