mirror of
https://github.com/unidoc/unipdf.git
synced 2025-04-29 13:48:54 +08:00
Changes missed in previous commit.
This commit is contained in:
parent
e251b6b2f2
commit
4aa7e5051e
@ -30,28 +30,28 @@ func (e *Extractor) ExtractText() (string, error) {
|
||||
return text, err
|
||||
}
|
||||
|
||||
// ExtractTextWithStats works like ExtractText but returns the number of characters in the output (`numChars`) and the
|
||||
// the number of characters that were not decoded (`numMisses`).
|
||||
// ExtractTextWithStats works like ExtractText but returns the number of characters in the output
|
||||
// (`numChars`) and the number of characters that were not decoded (`numMisses`).
|
||||
func (e *Extractor) ExtractTextWithStats() (extracted string, numChars int, numMisses int, err error) {
|
||||
textList, numChars, numMisses, err := e.ExtractTextList()
|
||||
pageText, numChars, numMisses, err := e.ExtractPageText()
|
||||
if err != nil {
|
||||
return "", numChars, numMisses, err
|
||||
}
|
||||
return textList.ToText(), numChars, numMisses, nil
|
||||
return pageText.ToText(), numChars, numMisses, nil
|
||||
}
|
||||
|
||||
// ExtractTextList returns the text contents of `e` (an Extractor for a page) as a TextList.
|
||||
func (e *Extractor) ExtractTextList() (*TextList, int, int, error) {
|
||||
return e.extractTextList(e.contents, e.resources, 0)
|
||||
// ExtractPageText returns the text contents of `e` (an Extractor for a page) as a PageText.
|
||||
func (e *Extractor) ExtractPageText() (*PageText, int, int, error) {
|
||||
return e.extractPageText(e.contents, e.resources, 0)
|
||||
}
|
||||
|
||||
// extractTextList returns the text contents of content stream `e` and resouces `resources` as a
|
||||
// TextList.
|
||||
// extractPageText returns the text contents of content stream `contents` and resources `resources` as a
|
||||
// PageText.
|
||||
// This can be called on a page or a form XObject.
|
||||
func (e *Extractor) extractTextList(contents string, resources *model.PdfPageResources, level int) (*TextList, int, int, error) {
|
||||
func (e *Extractor) extractPageText(contents string, resources *model.PdfPageResources, level int) (*PageText, int, int, error) {
|
||||
|
||||
common.Log.Trace("extractTextList: level=%d", level)
|
||||
textList := &TextList{}
|
||||
common.Log.Trace("extractPageText: level=%d", level)
|
||||
pageText := &PageText{}
|
||||
state := newTextState()
|
||||
fontStack := fontStacker{}
|
||||
var to *textObject
|
||||
@ -59,8 +59,8 @@ func (e *Extractor) extractTextList(contents string, resources *model.PdfPageRes
|
||||
cstreamParser := contentstream.NewContentStreamParser(contents)
|
||||
operations, err := cstreamParser.Parse()
|
||||
if err != nil {
|
||||
common.Log.Debug("ERROR: extractTextList parse failed. err=%v", err)
|
||||
return textList, state.numChars, state.numMisses, err
|
||||
common.Log.Debug("ERROR: extractPageText parse failed. err=%v", err)
|
||||
return pageText, state.numChars, state.numMisses, err
|
||||
}
|
||||
|
||||
processor := contentstream.NewContentStreamProcessor(*operations)
|
||||
@ -103,7 +103,7 @@ func (e *Extractor) extractTextList(contents string, resources *model.PdfPageRes
|
||||
}
|
||||
to = newTextObject(e, resources, gs, &state, &fontStack)
|
||||
case "ET": // End Text
|
||||
(*textList).marks = append((*textList).marks, to.marks...)
|
||||
(*pageText).marks = append((*pageText).marks, to.marks...)
|
||||
to = nil
|
||||
case "T*": // Move to start of next text line
|
||||
to.nextLine()
|
||||
@ -297,7 +297,7 @@ func (e *Extractor) extractTextList(contents string, resources *model.PdfPageRes
|
||||
if formResources == nil {
|
||||
formResources = resources
|
||||
}
|
||||
tList, numChars, numMisses, err := e.extractTextList(string(formContent),
|
||||
tList, numChars, numMisses, err := e.extractPageText(string(formContent),
|
||||
formResources, level+1)
|
||||
if err != nil {
|
||||
common.Log.Debug("ERROR: %v", err)
|
||||
@ -307,7 +307,7 @@ func (e *Extractor) extractTextList(contents string, resources *model.PdfPageRes
|
||||
e.formResults[string(name)] = formResult
|
||||
}
|
||||
|
||||
(*textList).marks = append((*textList).marks, formResult.textList.marks...)
|
||||
(*pageText).marks = append((*pageText).marks, formResult.pageText.marks...)
|
||||
state.numChars += formResult.numChars
|
||||
state.numMisses += formResult.numMisses
|
||||
}
|
||||
@ -318,11 +318,11 @@ func (e *Extractor) extractTextList(contents string, resources *model.PdfPageRes
|
||||
if err != nil {
|
||||
common.Log.Debug("ERROR: Processing: err=%v", err)
|
||||
}
|
||||
return textList, state.numChars, state.numMisses, err
|
||||
return pageText, state.numChars, state.numMisses, err
|
||||
}
|
||||
|
||||
type textResult struct {
|
||||
textList TextList
|
||||
pageText PageText
|
||||
numChars int
|
||||
numMisses int
|
||||
}
|
||||
@ -827,31 +827,31 @@ func (t textMark) Width() float64 {
|
||||
return math.Abs(t.orientedStart.X - t.orientedEnd.X)
|
||||
}
|
||||
|
||||
// TextList represents the layout of text on a device page.
|
||||
// PageText represents the layout of text on a device page.
|
||||
// Its implementation is opaque to allow for future optimizations.
|
||||
type TextList struct {
|
||||
// TextList is currently implemented as a list of texts and their positions on a PDF page.
|
||||
type PageText struct {
|
||||
// PageText is currently implemented as a list of texts and their positions on a PDF page.
|
||||
marks []textMark
|
||||
}
|
||||
|
||||
// String returns a string describing `tl`.
|
||||
func (tl TextList) String() string {
|
||||
parts := []string{fmt.Sprintf("TextList: %d elements", tl.length())}
|
||||
for _, t := range tl.marks {
|
||||
// String returns a string describing `pt`.
|
||||
func (pt PageText) String() string {
|
||||
parts := []string{fmt.Sprintf("PageText: %d elements", pt.length())}
|
||||
for _, t := range pt.marks {
|
||||
parts = append(parts, t.String())
|
||||
}
|
||||
return strings.Join(parts, "\n")
|
||||
}
|
||||
|
||||
// length returns the number of elements in `tl.marks`.
|
||||
func (tl TextList) length() int {
|
||||
return len(tl.marks)
|
||||
// length returns the number of elements in `pt.marks`.
|
||||
func (pt PageText) length() int {
|
||||
return len(pt.marks)
|
||||
}
|
||||
|
||||
// height returns the max height of the elements in `tl.marks`.
|
||||
func (tl TextList) height() float64 {
|
||||
// height returns the max height of the elements in `pt.marks`.
|
||||
func (pt PageText) height() float64 {
|
||||
fontHeight := 0.0
|
||||
for _, t := range tl.marks {
|
||||
for _, t := range pt.marks {
|
||||
if t.height > fontHeight {
|
||||
fontHeight = t.height
|
||||
}
|
||||
@ -859,19 +859,19 @@ func (tl TextList) height() float64 {
|
||||
return fontHeight
|
||||
}
|
||||
|
||||
// ToText returns the contents of `tl` as a single string.
|
||||
func (tl TextList) ToText() string {
|
||||
fontHeight := tl.height()
|
||||
// ToText returns the contents of `pt` as a single string.
|
||||
func (pt PageText) ToText() string {
|
||||
fontHeight := pt.height()
|
||||
// We sort with a y tolerance to allow for subscripts, diacritics etc.
|
||||
tol := minFloat(fontHeight*0.2, 5.0)
|
||||
common.Log.Trace("ToText: %d elements fontHeight=%.1f tol=%.1f", len(tl.marks), fontHeight, tol)
|
||||
common.Log.Trace("ToText: %d elements fontHeight=%.1f tol=%.1f", len(pt.marks), fontHeight, tol)
|
||||
|
||||
// Uncomment the 2 following Trace statements to see the effects of sorting.
|
||||
// common.Log.Trace("ToText: Before sorting %s", tl)
|
||||
tl.sortPosition(tol)
|
||||
// common.Log.Trace("ToText: After sorting %s", tl)
|
||||
// common.Log.Trace("ToText: Before sorting %s", pt)
|
||||
pt.sortPosition(tol)
|
||||
// common.Log.Trace("ToText: After sorting %s", pt)
|
||||
|
||||
lines := tl.toLines(tol)
|
||||
lines := pt.toLines(tol)
|
||||
texts := make([]string, 0, len(lines))
|
||||
for _, l := range lines {
|
||||
texts = append(texts, l.text)
|
||||
@ -882,9 +882,9 @@ func (tl TextList) ToText() string {
|
||||
// sortPosition sorts a text list by its elements' position on a page.
|
||||
// Sorting is by orientation then top to bottom, left to right when page is orientated so that text
|
||||
// is horizontal.
|
||||
func (tl *TextList) sortPosition(tol float64) {
|
||||
sort.SliceStable((*tl).marks, func(i, j int) bool {
|
||||
ti, tj := (*tl).marks[i], (*tl).marks[j]
|
||||
func (pt *PageText) sortPosition(tol float64) {
|
||||
sort.SliceStable((*pt).marks, func(i, j int) bool {
|
||||
ti, tj := (*pt).marks[i], (*pt).marks[j]
|
||||
if ti.orient != tj.orient {
|
||||
return ti.orient < tj.orient
|
||||
}
|
||||
@ -903,44 +903,44 @@ type textLine struct {
|
||||
words []string // words in the line.
|
||||
}
|
||||
|
||||
// toLines returns the text and positions in `tl.marks` as a slice of textLine.
|
||||
// toLines returns the text and positions in `pt.marks` as a slice of textLine.
|
||||
// NOTE: Caller must sort the text list top-to-bottom, left-to-right (for orientation adjusted so
|
||||
// that text is horizontal) before calling this function.
|
||||
func (tl TextList) toLines(tol float64) []textLine {
|
||||
// We divide `tl.marks` into slices which contain texts with the same orientation, extract the lines
|
||||
func (pt PageText) toLines(tol float64) []textLine {
|
||||
// We divide `pt.marks` into slices which contain texts with the same orientation, extract the lines
|
||||
// for each orientation then return the concatenation of these lines sorted by orientation.
|
||||
tlOrient := make(map[int][]textMark, len(tl.marks))
|
||||
for _, t := range tl.marks {
|
||||
tlOrient := make(map[int][]textMark, len(pt.marks))
|
||||
for _, t := range pt.marks {
|
||||
tlOrient[t.orient] = append(tlOrient[t.orient], t)
|
||||
}
|
||||
var lines []textLine
|
||||
for _, o := range orientKeys(tlOrient) {
|
||||
lines = append(lines, TextList{tlOrient[o]}.toLinesOrient(tol)...)
|
||||
lines = append(lines, PageText{tlOrient[o]}.toLinesOrient(tol)...)
|
||||
}
|
||||
return lines
|
||||
}
|
||||
|
||||
// toLinesOrient returns the text and positions in `tl.marks` as a slice of textLine.
|
||||
// toLinesOrient returns the text and positions in `pt.marks` as a slice of textLine.
|
||||
// NOTE: This function only works on text lists where all text is the same orientation so it should
|
||||
// only be called from toLines.
|
||||
// Caller must sort the text list top-to-bottom, left-to-right (for orientation adjusted so
|
||||
// that text is horizontal) before calling this function.
|
||||
func (tl TextList) toLinesOrient(tol float64) []textLine {
|
||||
if len(tl.marks) == 0 {
|
||||
func (pt PageText) toLinesOrient(tol float64) []textLine {
|
||||
if len(pt.marks) == 0 {
|
||||
return []textLine{}
|
||||
}
|
||||
var lines []textLine
|
||||
var words []string
|
||||
var x []float64
|
||||
y := tl.marks[0].orientedStart.Y
|
||||
y := pt.marks[0].orientedStart.Y
|
||||
|
||||
scanning := false
|
||||
|
||||
averageCharWidth := exponAve{}
|
||||
wordSpacing := exponAve{}
|
||||
lastEndX := 0.0 // lastEndX is tl.marks[i-1].orientedEnd.X
|
||||
lastEndX := 0.0 // lastEndX is pt.marks[i-1].orientedEnd.X
|
||||
|
||||
for _, t := range tl.marks {
|
||||
for _, t := range pt.marks {
|
||||
if t.orientedStart.Y+tol < y {
|
||||
if len(words) > 0 {
|
||||
line := newLine(y, x, words)
|
||||
|
Loading…
x
Reference in New Issue
Block a user