mirror of
https://github.com/unidoc/unipdf.git
synced 2025-04-30 13:48:51 +08:00
Changes missed in previous commit.
This commit is contained in:
parent
e251b6b2f2
commit
4aa7e5051e
@ -30,28 +30,28 @@ func (e *Extractor) ExtractText() (string, error) {
|
|||||||
return text, err
|
return text, err
|
||||||
}
|
}
|
||||||
|
|
||||||
// ExtractTextWithStats works like ExtractText but returns the number of characters in the output (`numChars`) and the
|
// ExtractTextWithStats works like ExtractText but returns the number of characters in the output
|
||||||
// the number of characters that were not decoded (`numMisses`).
|
// (`numChars`) and the number of characters that were not decoded (`numMisses`).
|
||||||
func (e *Extractor) ExtractTextWithStats() (extracted string, numChars int, numMisses int, err error) {
|
func (e *Extractor) ExtractTextWithStats() (extracted string, numChars int, numMisses int, err error) {
|
||||||
textList, numChars, numMisses, err := e.ExtractTextList()
|
pageText, numChars, numMisses, err := e.ExtractPageText()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", numChars, numMisses, err
|
return "", numChars, numMisses, err
|
||||||
}
|
}
|
||||||
return textList.ToText(), numChars, numMisses, nil
|
return pageText.ToText(), numChars, numMisses, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// ExtractTextList returns the text contents of `e` (an Extractor for a page) as a TextList.
|
// ExtractPageText returns the text contents of `e` (an Extractor for a page) as a PageText.
|
||||||
func (e *Extractor) ExtractTextList() (*TextList, int, int, error) {
|
func (e *Extractor) ExtractPageText() (*PageText, int, int, error) {
|
||||||
return e.extractTextList(e.contents, e.resources, 0)
|
return e.extractPageText(e.contents, e.resources, 0)
|
||||||
}
|
}
|
||||||
|
|
||||||
// extractTextList returns the text contents of content stream `e` and resouces `resources` as a
|
// extractPageText returns the text contents of content stream `e` and resouces `resources` as a
|
||||||
// TextList.
|
// PageText.
|
||||||
// This can be called on a page or a form XObject.
|
// This can be called on a page or a form XObject.
|
||||||
func (e *Extractor) extractTextList(contents string, resources *model.PdfPageResources, level int) (*TextList, int, int, error) {
|
func (e *Extractor) extractPageText(contents string, resources *model.PdfPageResources, level int) (*PageText, int, int, error) {
|
||||||
|
|
||||||
common.Log.Trace("extractTextList: level=%d", level)
|
common.Log.Trace("extractPageText: level=%d", level)
|
||||||
textList := &TextList{}
|
pageText := &PageText{}
|
||||||
state := newTextState()
|
state := newTextState()
|
||||||
fontStack := fontStacker{}
|
fontStack := fontStacker{}
|
||||||
var to *textObject
|
var to *textObject
|
||||||
@ -59,8 +59,8 @@ func (e *Extractor) extractTextList(contents string, resources *model.PdfPageRes
|
|||||||
cstreamParser := contentstream.NewContentStreamParser(contents)
|
cstreamParser := contentstream.NewContentStreamParser(contents)
|
||||||
operations, err := cstreamParser.Parse()
|
operations, err := cstreamParser.Parse()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
common.Log.Debug("ERROR: extractTextList parse failed. err=%v", err)
|
common.Log.Debug("ERROR: extractPageText parse failed. err=%v", err)
|
||||||
return textList, state.numChars, state.numMisses, err
|
return pageText, state.numChars, state.numMisses, err
|
||||||
}
|
}
|
||||||
|
|
||||||
processor := contentstream.NewContentStreamProcessor(*operations)
|
processor := contentstream.NewContentStreamProcessor(*operations)
|
||||||
@ -103,7 +103,7 @@ func (e *Extractor) extractTextList(contents string, resources *model.PdfPageRes
|
|||||||
}
|
}
|
||||||
to = newTextObject(e, resources, gs, &state, &fontStack)
|
to = newTextObject(e, resources, gs, &state, &fontStack)
|
||||||
case "ET": // End Text
|
case "ET": // End Text
|
||||||
(*textList).marks = append((*textList).marks, to.marks...)
|
(*pageText).marks = append((*pageText).marks, to.marks...)
|
||||||
to = nil
|
to = nil
|
||||||
case "T*": // Move to start of next text line
|
case "T*": // Move to start of next text line
|
||||||
to.nextLine()
|
to.nextLine()
|
||||||
@ -297,7 +297,7 @@ func (e *Extractor) extractTextList(contents string, resources *model.PdfPageRes
|
|||||||
if formResources == nil {
|
if formResources == nil {
|
||||||
formResources = resources
|
formResources = resources
|
||||||
}
|
}
|
||||||
tList, numChars, numMisses, err := e.extractTextList(string(formContent),
|
tList, numChars, numMisses, err := e.extractPageText(string(formContent),
|
||||||
formResources, level+1)
|
formResources, level+1)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
common.Log.Debug("ERROR: %v", err)
|
common.Log.Debug("ERROR: %v", err)
|
||||||
@ -307,7 +307,7 @@ func (e *Extractor) extractTextList(contents string, resources *model.PdfPageRes
|
|||||||
e.formResults[string(name)] = formResult
|
e.formResults[string(name)] = formResult
|
||||||
}
|
}
|
||||||
|
|
||||||
(*textList).marks = append((*textList).marks, formResult.textList.marks...)
|
(*pageText).marks = append((*pageText).marks, formResult.pageText.marks...)
|
||||||
state.numChars += formResult.numChars
|
state.numChars += formResult.numChars
|
||||||
state.numMisses += formResult.numMisses
|
state.numMisses += formResult.numMisses
|
||||||
}
|
}
|
||||||
@ -318,11 +318,11 @@ func (e *Extractor) extractTextList(contents string, resources *model.PdfPageRes
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
common.Log.Debug("ERROR: Processing: err=%v", err)
|
common.Log.Debug("ERROR: Processing: err=%v", err)
|
||||||
}
|
}
|
||||||
return textList, state.numChars, state.numMisses, err
|
return pageText, state.numChars, state.numMisses, err
|
||||||
}
|
}
|
||||||
|
|
||||||
type textResult struct {
|
type textResult struct {
|
||||||
textList TextList
|
pageText PageText
|
||||||
numChars int
|
numChars int
|
||||||
numMisses int
|
numMisses int
|
||||||
}
|
}
|
||||||
@ -827,31 +827,31 @@ func (t textMark) Width() float64 {
|
|||||||
return math.Abs(t.orientedStart.X - t.orientedEnd.X)
|
return math.Abs(t.orientedStart.X - t.orientedEnd.X)
|
||||||
}
|
}
|
||||||
|
|
||||||
// TextList represents the layout of text on a device page.
|
// PageText represents the layout of text on a device page.
|
||||||
// It's implementation is opaque to allow for future optimizations.
|
// It's implementation is opaque to allow for future optimizations.
|
||||||
type TextList struct {
|
type PageText struct {
|
||||||
// TextList is currently implemented as a list of texts and their positions on a PDF page.
|
// PageText is currently implemented as a list of texts and their positions on a PDF page.
|
||||||
marks []textMark
|
marks []textMark
|
||||||
}
|
}
|
||||||
|
|
||||||
// String returns a string describing `tl`.
|
// String returns a string describing `pt`.
|
||||||
func (tl TextList) String() string {
|
func (pt PageText) String() string {
|
||||||
parts := []string{fmt.Sprintf("TextList: %d elements", tl.length())}
|
parts := []string{fmt.Sprintf("PageText: %d elements", pt.length())}
|
||||||
for _, t := range tl.marks {
|
for _, t := range pt.marks {
|
||||||
parts = append(parts, t.String())
|
parts = append(parts, t.String())
|
||||||
}
|
}
|
||||||
return strings.Join(parts, "\n")
|
return strings.Join(parts, "\n")
|
||||||
}
|
}
|
||||||
|
|
||||||
// length returns the number of elements in `tl.marks`.
|
// length returns the number of elements in `pt.marks`.
|
||||||
func (tl TextList) length() int {
|
func (pt PageText) length() int {
|
||||||
return len(tl.marks)
|
return len(pt.marks)
|
||||||
}
|
}
|
||||||
|
|
||||||
// height returns the max height of the elements in `tl.marks`.
|
// height returns the max height of the elements in `pt.marks`.
|
||||||
func (tl TextList) height() float64 {
|
func (pt PageText) height() float64 {
|
||||||
fontHeight := 0.0
|
fontHeight := 0.0
|
||||||
for _, t := range tl.marks {
|
for _, t := range pt.marks {
|
||||||
if t.height > fontHeight {
|
if t.height > fontHeight {
|
||||||
fontHeight = t.height
|
fontHeight = t.height
|
||||||
}
|
}
|
||||||
@ -859,19 +859,19 @@ func (tl TextList) height() float64 {
|
|||||||
return fontHeight
|
return fontHeight
|
||||||
}
|
}
|
||||||
|
|
||||||
// ToText returns the contents of `tl` as a single string.
|
// ToText returns the contents of `pt` as a single string.
|
||||||
func (tl TextList) ToText() string {
|
func (pt PageText) ToText() string {
|
||||||
fontHeight := tl.height()
|
fontHeight := pt.height()
|
||||||
// We sort with a y tolerance to allow for subscripts, diacritics etc.
|
// We sort with a y tolerance to allow for subscripts, diacritics etc.
|
||||||
tol := minFloat(fontHeight*0.2, 5.0)
|
tol := minFloat(fontHeight*0.2, 5.0)
|
||||||
common.Log.Trace("ToText: %d elements fontHeight=%.1f tol=%.1f", len(tl.marks), fontHeight, tol)
|
common.Log.Trace("ToText: %d elements fontHeight=%.1f tol=%.1f", len(pt.marks), fontHeight, tol)
|
||||||
|
|
||||||
// Uncomment the 2 following Trace statements to see the effects of sorting/
|
// Uncomment the 2 following Trace statements to see the effects of sorting/
|
||||||
// common.Log.Trace("ToText: Before sorting %s", tl)
|
// common.Log.Trace("ToText: Before sorting %s", pt)
|
||||||
tl.sortPosition(tol)
|
pt.sortPosition(tol)
|
||||||
// common.Log.Trace("ToText: After sorting %s", tl)
|
// common.Log.Trace("ToText: After sorting %s", pt)
|
||||||
|
|
||||||
lines := tl.toLines(tol)
|
lines := pt.toLines(tol)
|
||||||
texts := make([]string, 0, len(lines))
|
texts := make([]string, 0, len(lines))
|
||||||
for _, l := range lines {
|
for _, l := range lines {
|
||||||
texts = append(texts, l.text)
|
texts = append(texts, l.text)
|
||||||
@ -882,9 +882,9 @@ func (tl TextList) ToText() string {
|
|||||||
// sortPosition sorts a text list by its elements' position on a page.
|
// sortPosition sorts a text list by its elements' position on a page.
|
||||||
// Sorting is by orientation then top to bottom, left to right when page is orientated so that text
|
// Sorting is by orientation then top to bottom, left to right when page is orientated so that text
|
||||||
// is horizontal.
|
// is horizontal.
|
||||||
func (tl *TextList) sortPosition(tol float64) {
|
func (pt *PageText) sortPosition(tol float64) {
|
||||||
sort.SliceStable((*tl).marks, func(i, j int) bool {
|
sort.SliceStable((*pt).marks, func(i, j int) bool {
|
||||||
ti, tj := (*tl).marks[i], (*tl).marks[j]
|
ti, tj := (*pt).marks[i], (*pt).marks[j]
|
||||||
if ti.orient != tj.orient {
|
if ti.orient != tj.orient {
|
||||||
return ti.orient < tj.orient
|
return ti.orient < tj.orient
|
||||||
}
|
}
|
||||||
@ -903,44 +903,44 @@ type textLine struct {
|
|||||||
words []string // words in the line.
|
words []string // words in the line.
|
||||||
}
|
}
|
||||||
|
|
||||||
// toLines returns the text and positions in `tl.marks` as a slice of textLine.
|
// toLines returns the text and positions in `pt.marks` as a slice of textLine.
|
||||||
// NOTE: Caller must sort the text list top-to-bottom, left-to-right (for orientation adjusted so
|
// NOTE: Caller must sort the text list top-to-bottom, left-to-right (for orientation adjusted so
|
||||||
// that text is horizontal) before calling this function.
|
// that text is horizontal) before calling this function.
|
||||||
func (tl TextList) toLines(tol float64) []textLine {
|
func (pt PageText) toLines(tol float64) []textLine {
|
||||||
// We divide `tl.marks` into slices which contain texts with the same orientation, extract the lines
|
// We divide `pt.marks` into slices which contain texts with the same orientation, extract the lines
|
||||||
// for each orientation then return the concatention of these lines sorted by orientation.
|
// for each orientation then return the concatention of these lines sorted by orientation.
|
||||||
tlOrient := make(map[int][]textMark, len(tl.marks))
|
tlOrient := make(map[int][]textMark, len(pt.marks))
|
||||||
for _, t := range tl.marks {
|
for _, t := range pt.marks {
|
||||||
tlOrient[t.orient] = append(tlOrient[t.orient], t)
|
tlOrient[t.orient] = append(tlOrient[t.orient], t)
|
||||||
}
|
}
|
||||||
var lines []textLine
|
var lines []textLine
|
||||||
for _, o := range orientKeys(tlOrient) {
|
for _, o := range orientKeys(tlOrient) {
|
||||||
lines = append(lines, TextList{tlOrient[o]}.toLinesOrient(tol)...)
|
lines = append(lines, PageText{tlOrient[o]}.toLinesOrient(tol)...)
|
||||||
}
|
}
|
||||||
return lines
|
return lines
|
||||||
}
|
}
|
||||||
|
|
||||||
// toLinesOrient returns the text and positions in `tl.marks` as a slice of textLine.
|
// toLinesOrient returns the text and positions in `pt.marks` as a slice of textLine.
|
||||||
// NOTE: This function only works on text lists where all text is the same orientation so it should
|
// NOTE: This function only works on text lists where all text is the same orientation so it should
|
||||||
// only be called from toLines.
|
// only be called from toLines.
|
||||||
// Caller must sort the text list top-to-bottom, left-to-right (for orientation adjusted so
|
// Caller must sort the text list top-to-bottom, left-to-right (for orientation adjusted so
|
||||||
// that text is horizontal) before calling this function.
|
// that text is horizontal) before calling this function.
|
||||||
func (tl TextList) toLinesOrient(tol float64) []textLine {
|
func (pt PageText) toLinesOrient(tol float64) []textLine {
|
||||||
if len(tl.marks) == 0 {
|
if len(pt.marks) == 0 {
|
||||||
return []textLine{}
|
return []textLine{}
|
||||||
}
|
}
|
||||||
var lines []textLine
|
var lines []textLine
|
||||||
var words []string
|
var words []string
|
||||||
var x []float64
|
var x []float64
|
||||||
y := tl.marks[0].orientedStart.Y
|
y := pt.marks[0].orientedStart.Y
|
||||||
|
|
||||||
scanning := false
|
scanning := false
|
||||||
|
|
||||||
averageCharWidth := exponAve{}
|
averageCharWidth := exponAve{}
|
||||||
wordSpacing := exponAve{}
|
wordSpacing := exponAve{}
|
||||||
lastEndX := 0.0 // lastEndX is tl.marks[i-1].orientedEnd.X
|
lastEndX := 0.0 // lastEndX is pt.marks[i-1].orientedEnd.X
|
||||||
|
|
||||||
for _, t := range tl.marks {
|
for _, t := range pt.marks {
|
||||||
if t.orientedStart.Y+tol < y {
|
if t.orientedStart.Y+tol < y {
|
||||||
if len(words) > 0 {
|
if len(words) > 0 {
|
||||||
line := newLine(y, x, words)
|
line := newLine(y, x, words)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user