mirror of
https://github.com/unidoc/unipdf.git
synced 2025-04-27 13:48:51 +08:00

* Fixed filename:page in logging * Got CMap working for multi-rune entries * Treat CMap entries as strings instead of runes to handle multi-byte encodings. * Added a test for multibyte encoding. * First version of text extraction that recognizes columns * Added an expanation of the text columns code to README.md. * fixed typos * Abstracted textWord depth calculation. This required change textMark to *textMark in a lot of code. * Added function comments. * Fixed text state save/restore. * Adjusted inter-word search distance to make paragrah division work for thanh.pdf * Got text_test.go passing. * Reinstated hyphen suppression * Handle more cases of fonts not being set in text extraction code. * Fixed typo * More verbose logging * Adding tables to text extractor. * Added tests for columns extraction. * Removed commented code * Check for textParas that are on the same line when writing out extracted text. * Absorb text to the left of paras into paras e.g. Footnote numbers * Removed funny character from text_test.go * Commented out a creator_test.go test that was broken by my text extraction changes. * Big changes to columns text extraction code for PR. Performance improvements in several places. Commented code. * Updated extractor/README * Cleaned up some comments and removed a panic * Increased threshold for truncating extracted text when there is no license 100 -> 102. This is a workaround to let a test in creator_test.go pass. With the old text extraction code the following extracted text was 100 chars. With the new code it is 102 chars which looks correct. "你好\n你好你好你好你好\n河上白云\n\nUnlicensed UniDoc - Get a license on https://unidoc.io\n\n" * Improved an error message. * Removed irrelevant spaces * Commented code and removed unused functions. * Reverted PdfRectangle changes * Added duplicate text detection. * Combine diacritic textMarks in text extraction * Reinstated a diacritic recombination test. 
* Small code reorganisation * Reinstated handling of rotated text * Addressed issues in PR review * Added color fields to TextMark * Updated README * Reinstated the disabled tests I missed before. * Tightened definition for tables to prevent detection of tables where there weren't any. * Compute line splitting search range based on fontsize of first word in word bag. * Use errors.Is(err, core.ErrNotSupported) to distinguish unsupported font errors. See https://blog.golang.org/go1.13-errors * Fixed some naming and added some comments. * errors.Is -> xerrors.Is and %w -> %v for go 1.12 compatibility * Removed code that doesn't ever get called. * Removed unused test
355 lines
11 KiB
Go
355 lines
11 KiB
Go
/*
|
|
* This file is subject to the terms and conditions defined in
|
|
* file 'LICENSE.md', which is part of this source code package.
|
|
*/
|
|
|
|
package extractor
|
|
|
|
import (
|
|
"bytes"
|
|
"fmt"
|
|
"io"
|
|
"math"
|
|
"sort"
|
|
|
|
"github.com/unidoc/unipdf/v3/common"
|
|
"github.com/unidoc/unipdf/v3/model"
|
|
)
|
|
|
|
// paraList is a sequence of textPara. We use it so often that it is convenient to have its own
|
|
// type so we can have methods on it.
|
|
type paraList []*textPara
|
|
|
|
// textPara is a group of words in a rectangular region of a page that get read together.
|
|
// A paragraph in a document might span multiple pages. This is a paragraph fragment on one page.
|
|
// textParas can be tables in which case the content is in `table`, otherwise the content is in `lines`.
|
|
// textTable cells are textParas so this gives one level of recursion
|
|
type textPara struct {
|
|
model.PdfRectangle // Bounding box.
|
|
eBBox model.PdfRectangle // Extended bounding box needed to compute reading order.
|
|
lines []*textLine // The lines in the paragraph. (nil for the table case)
|
|
table *textTable // The table contained in this region if there is one. nil otherwise
|
|
// The following fields are used for detecting and extracting tables.
|
|
isCell bool // Is this para a cell in a textTable?
|
|
// The unique highest para completely to the left of this that overlaps it in the y-direction, if one exists..
|
|
left *textPara
|
|
// The unique highest para completely to the right of this that overlaps it in the y-direction, if one exists.
|
|
right *textPara
|
|
// The unique highest para completely above this that overlaps it in the x-direction, if one exists.
|
|
above *textPara
|
|
// The unique highest para completely below this that overlaps it in the x-direction, if one exists.
|
|
below *textPara
|
|
}
|
|
|
|
// makeTextPara returns a textPara with bounding rectangle `bbox`.
|
|
func makeTextPara(bbox model.PdfRectangle, lines []*textLine) *textPara {
|
|
return &textPara{PdfRectangle: bbox, lines: lines}
|
|
}
|
|
|
|
// String returns a description of `p`.
|
|
func (p *textPara) String() string {
|
|
table := ""
|
|
if p.table != nil {
|
|
table = fmt.Sprintf("[%dx%d] ", p.table.w, p.table.h)
|
|
}
|
|
return fmt.Sprintf("%6.2f %s%d lines %q",
|
|
p.PdfRectangle, table, len(p.lines), truncate(p.text(), 50))
|
|
}
|
|
|
|
// depth returns the paragraph's depth. which is the depth of its top line.
|
|
// We return the top line depth because textPara depth is used to tell if 2 paras have the same
|
|
// depth. English readers compare paragraph depths by their top lines.
|
|
func (p *textPara) depth() float64 {
|
|
if len(p.lines) > 0 {
|
|
return p.lines[0].depth
|
|
}
|
|
// Use the top left cell of the table if there is one
|
|
return p.table.get(0, 0).depth()
|
|
}
|
|
|
|
// text is a convenience function that returns the text `p` including tables.
|
|
func (p *textPara) text() string {
|
|
w := new(bytes.Buffer)
|
|
p.writeText(w)
|
|
return w.String()
|
|
}
|
|
|
|
// writeText writes the text of `p` including tables to `w`.
|
|
func (p *textPara) writeText(w io.Writer) {
|
|
if p.table == nil {
|
|
p.writeCellText(w)
|
|
return
|
|
}
|
|
for y := 0; y < p.table.h; y++ {
|
|
for x := 0; x < p.table.w; x++ {
|
|
cell := p.table.get(x, y)
|
|
if cell == nil {
|
|
w.Write([]byte("\t"))
|
|
} else {
|
|
cell.writeCellText(w)
|
|
}
|
|
w.Write([]byte(" "))
|
|
}
|
|
if y < p.table.h-1 {
|
|
w.Write([]byte("\n"))
|
|
}
|
|
}
|
|
}
|
|
|
|
// toTextMarks creates the TextMarkArray corresponding to the extracted text created by
|
|
// paras `p`.writeText().
|
|
func (p *textPara) toTextMarks(offset *int) []TextMark {
|
|
if p.table == nil {
|
|
return p.toCellTextMarks(offset)
|
|
}
|
|
var marks []TextMark
|
|
for y := 0; y < p.table.h; y++ {
|
|
for x := 0; x < p.table.w; x++ {
|
|
cell := p.table.get(x, y)
|
|
if cell == nil {
|
|
marks = appendSpaceMark(marks, offset, "\t")
|
|
} else {
|
|
cellMarks := cell.toCellTextMarks(offset)
|
|
marks = append(marks, cellMarks...)
|
|
}
|
|
marks = appendSpaceMark(marks, offset, " ")
|
|
}
|
|
if y < p.table.h-1 {
|
|
marks = appendSpaceMark(marks, offset, "\n")
|
|
}
|
|
}
|
|
return marks
|
|
}
|
|
|
|
// writeCellText writes the text of `p` not including tables to `w`.
|
|
func (p *textPara) writeCellText(w io.Writer) {
|
|
for il, line := range p.lines {
|
|
lineText := line.text()
|
|
reduced := doHyphens && line.endsInHyphen() && il != len(p.lines)-1
|
|
if reduced { // Line ending with hyphen. Remove it.
|
|
lineText = removeLastRune(lineText)
|
|
}
|
|
w.Write([]byte(lineText))
|
|
if !(reduced || il == len(p.lines)-1) {
|
|
w.Write([]byte(getSpace(line.depth, p.lines[il+1].depth)))
|
|
}
|
|
}
|
|
}
|
|
|
|
// toCellTextMarks creates the TextMarkArray corresponding to the extracted text created by
|
|
// paras `p`.writeCellText().
|
|
func (p *textPara) toCellTextMarks(offset *int) []TextMark {
|
|
var marks []TextMark
|
|
for il, line := range p.lines {
|
|
lineMarks := line.toTextMarks(offset)
|
|
reduced := doHyphens && line.endsInHyphen() && il != len(p.lines)-1
|
|
if reduced { // Line ending with hyphen. Remove it.
|
|
lineMarks = removeLastTextMarkRune(lineMarks, offset)
|
|
}
|
|
marks = append(marks, lineMarks...)
|
|
if !(reduced || il == len(p.lines)-1) {
|
|
marks = appendSpaceMark(marks, offset, getSpace(line.depth, p.lines[il+1].depth))
|
|
}
|
|
}
|
|
return marks
|
|
}
|
|
|
|
// removeLastTextMarkRune removes the last rune from `marks`.
|
|
func removeLastTextMarkRune(marks []TextMark, offset *int) []TextMark {
|
|
tm := marks[len(marks)-1]
|
|
runes := []rune(tm.Text)
|
|
if len(runes) == 1 {
|
|
marks = marks[:len(marks)-1]
|
|
tm1 := marks[len(marks)-1]
|
|
*offset = tm1.Offset + len(tm1.Text)
|
|
} else {
|
|
text := removeLastRune(tm.Text)
|
|
*offset += len(text) - len(tm.Text)
|
|
tm.Text = text
|
|
}
|
|
return marks
|
|
}
|
|
|
|
// removeLastRune returns `text` with its last rune removed.
// The last rune may be several bytes long. An empty `text` is returned unchanged.
func removeLastRune(text string) string {
	runes := []rune(text)
	if len(runes) == 0 {
		return text
	}
	return string(runes[:len(runes)-1])
}
|
|
|
|
// getSpace returns the space to insert between lines of depth `depth1` and `depth2`.
|
|
// Next line is the same depth so it's the same line as this one in the extracted text
|
|
func getSpace(depth1, depth2 float64) string {
|
|
eol := !isZero(depth1 - depth2)
|
|
if eol {
|
|
return "\n"
|
|
}
|
|
return " "
|
|
}
|
|
|
|
// bbox makes textPara implement the `bounded` interface.
|
|
func (p *textPara) bbox() model.PdfRectangle {
|
|
return p.PdfRectangle
|
|
}
|
|
|
|
// fontsize return the para's fontsize which we take to be the first line's fontsize.
|
|
// Caller must check that `p` has at least one line.
|
|
func (p *textPara) fontsize() float64 {
|
|
return p.lines[0].fontsize
|
|
}
|
|
|
|
// removeDuplicates removes duplicate word fragments such as those used for bolding.
|
|
func (b *wordBag) removeDuplicates() {
|
|
for _, depthIdx := range b.depthIndexes() {
|
|
if len(b.bins[depthIdx]) == 0 {
|
|
continue
|
|
}
|
|
word := b.bins[depthIdx][0]
|
|
delta := maxDuplicateWordR * word.fontsize
|
|
minDepth := word.depth
|
|
for _, idx := range b.depthBand(minDepth, minDepth+delta) {
|
|
duplicates := map[*textWord]struct{}{}
|
|
words := b.bins[idx]
|
|
for _, w := range words {
|
|
if w != word && w.text == word.text &&
|
|
math.Abs(w.Llx-word.Llx) < delta &&
|
|
math.Abs(w.Urx-word.Urx) < delta &&
|
|
math.Abs(w.Lly-word.Lly) < delta &&
|
|
math.Abs(w.Ury-word.Ury) < delta {
|
|
duplicates[w] = struct{}{}
|
|
}
|
|
}
|
|
if len(duplicates) > 0 {
|
|
i := 0
|
|
for _, w := range words {
|
|
if _, ok := duplicates[w]; !ok {
|
|
words[i] = w
|
|
i++
|
|
}
|
|
}
|
|
b.bins[idx] = words[:len(words)-len(duplicates)]
|
|
if len(b.bins[idx]) == 0 {
|
|
delete(b.bins, idx)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// arrangeText arranges the word fragments (textWords) in `b` into lines and words.
|
|
// The lines are groups of textWords of similar depths.
|
|
// The textWords in each line are sorted in reading order and those that start whole words (as
|
|
// opposed to word fragments) have their `newWord` flag set to true.
|
|
func (b *wordBag) arrangeText() *textPara {
|
|
b.sort() // Sort the words in `b`'s bins in the reading direction.
|
|
|
|
if doRemoveDuplicates {
|
|
b.removeDuplicates()
|
|
}
|
|
|
|
var lines []*textLine
|
|
|
|
// Build the lines by iterating through the words from top to bottom.
|
|
// In the current implementation, we do this by emptying the word bins in increasing depth order.
|
|
for _, depthIdx := range b.depthIndexes() {
|
|
for !b.empty(depthIdx) {
|
|
|
|
// firstWord is the left-most word near the top of the bin with index `depthIdx`. As we
|
|
// are scanning down `b`, this is the left-most word near the top of the `b`
|
|
firstReadingIdx := b.firstReadingIndex(depthIdx)
|
|
firstWord := b.firstWord(firstReadingIdx)
|
|
// Create a new line.
|
|
line := newTextLine(b, firstReadingIdx)
|
|
|
|
// Compute the search range based on `b` first word fontsize.
|
|
fontsize := firstWord.fontsize
|
|
minDepth := firstWord.depth - lineDepthR*fontsize
|
|
maxDepth := firstWord.depth + lineDepthR*fontsize
|
|
maxIntraWordGap := maxIntraWordGapR * fontsize
|
|
maxIntraLineOverlap := maxIntraLineOverlapR * fontsize
|
|
|
|
// Find the rest of the words in the line that starts with `firstWord`
|
|
// Search down from `minDepth`, half a line above `firstWord` to `maxDepth`, half a line
|
|
// below `firstWord` for the leftmost word to the right of the last word in `line`.
|
|
remainingWords:
|
|
for {
|
|
var nextWord *textWord // The next word to add to `line` if there is one.
|
|
nextDepthIdx := 0 // nextWord's depthIndex
|
|
// We start with this highest remaining word
|
|
for _, depthIdx := range b.depthBand(minDepth, maxDepth) {
|
|
word := b.highestWord(depthIdx, minDepth, maxDepth)
|
|
if word == nil {
|
|
continue
|
|
}
|
|
gap := gapReading(word, line.words[len(line.words)-1])
|
|
if gap < -maxIntraLineOverlap { // Reverted too far to left. Can't be same line.
|
|
break remainingWords
|
|
}
|
|
if gap > maxIntraWordGap { // Advanced too far too right. Might not be same line.
|
|
continue
|
|
}
|
|
if nextWord != nil && diffReading(word, nextWord) >= 0 { // Not leftmost world
|
|
continue
|
|
}
|
|
nextWord = word
|
|
nextDepthIdx = depthIdx
|
|
}
|
|
if nextWord == nil { // No more words in this line.
|
|
break
|
|
}
|
|
// remove `nextWord` from `b` and append it to `line`.
|
|
line.pullWord(b, nextWord, nextDepthIdx)
|
|
}
|
|
|
|
line.markWordBoundaries()
|
|
lines = append(lines, line)
|
|
}
|
|
}
|
|
|
|
if len(lines) == 0 {
|
|
return nil
|
|
}
|
|
|
|
sort.Slice(lines, func(i, j int) bool {
|
|
return diffDepthReading(lines[i], lines[j]) < 0
|
|
})
|
|
|
|
para := makeTextPara(b.PdfRectangle, lines)
|
|
|
|
if verbosePara {
|
|
common.Log.Info("arrangeText !!! para=%s", para.String())
|
|
if verboseParaLine {
|
|
for i, line := range para.lines {
|
|
fmt.Printf("%4d: %s\n", i, line.String())
|
|
if verboseParaWord {
|
|
for j, word := range line.words {
|
|
fmt.Printf("%8d: %s\n", j, word.String())
|
|
for k, mark := range word.marks {
|
|
fmt.Printf("%12d: %s\n", k, mark.String())
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return para
|
|
}
|
|
|
|
// log logs the contents of `paras`.
|
|
func (paras paraList) log(title string) {
|
|
if !verbosePage {
|
|
return
|
|
}
|
|
common.Log.Info("%8s: %d paras =======-------=======", title, len(paras))
|
|
for i, para := range paras {
|
|
if para == nil {
|
|
continue
|
|
}
|
|
text := para.text()
|
|
tabl := " "
|
|
if para.table != nil {
|
|
tabl = fmt.Sprintf("[%dx%d]", para.table.w, para.table.h)
|
|
}
|
|
fmt.Printf("%4d: %6.2f %s %q\n", i, para.PdfRectangle, tabl, truncate(text, 50))
|
|
}
|
|
}
|