/*
 * This file is subject to the terms and conditions defined in
 * file 'LICENSE.md', which is part of this source code package.
 */

package extractor

import (
	"fmt"
	"math"
	"sort"
	"strings"

	"github.com/unidoc/unipdf/v3/common"
	"github.com/unidoc/unipdf/v3/model"
)

// wordBag is just a list of textWords in a rectangular region. It is needed for efficient
// comparison of the bounding boxes of the words to arrange them into paragraph regions.
// The implementation is not important as long as it implements the main function scanBand()
// efficiently.
// In the current implementation, wordBag is a list of word fragment bins arranged by their depth
// on a page, with the word fragments in each bin sorted in reading order.
type wordBag struct {
	model.PdfRectangle // Bounding box of all the textWords in the wordBag.
	fontsize float64   // The size of the largest font in the wordBag.
	// The following fields are for the current bin-based implementation.
	pageHeight float64             // Used to calculate depths.
	bins       map[int][]*textWord // bins[n] = w: n*depthBinPoints <= w.depth < (n+1)*depthBinPoints
}

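// Illustrative sketch (hypothetical, not part of the API): visiting every word in a
// wordBag in rough reading order, top to bottom by depth bin and left to right within
// each bin (the bins themselves are kept sorted by sort()):
//
//	for _, depthIdx := range bag.depthIndexes() {
//		for _, w := range bag.bins[depthIdx] {
//			fmt.Println(w.text)
//		}
//	}
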
// makeWordBag returns a wordBag containing `words`.
// In the current implementation, it does this by putting the words into the appropriate depth bins.
// Caller must check that `words` has at least one element.
func makeWordBag(words []*textWord, pageHeight float64) *wordBag {
	b := newWordBag(words[0], pageHeight)
	for _, w := range words[1:] {
		depthIdx := depthIndex(w.depth)
		b.bins[depthIdx] = append(b.bins[depthIdx], w)
	}
	b.sort()
	return b
}

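// Illustrative use (hypothetical caller, not part of this file): wrap the words extracted
// from a page into a single bag before dividing the page into paragraphs. `pageWords` and
// `pageSize` are assumed to be supplied by the surrounding extraction code.
//
//	if len(pageWords) > 0 {
//		bag := makeWordBag(pageWords, pageSize.Ury)
//		common.Log.Debug("page bag: %s", bag)
//	}
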
// newWordBag returns a wordBag with page height `pageHeight` containing the single word
// fragment `word`.
func newWordBag(word *textWord, pageHeight float64) *wordBag {
	depthIdx := depthIndex(word.depth)
	words := []*textWord{word}
	bag := wordBag{
		bins:         map[int][]*textWord{depthIdx: words},
		PdfRectangle: word.PdfRectangle,
		fontsize:     word.fontsize,
		pageHeight:   pageHeight,
	}
	return &bag
}

// String returns a description of `b`.
func (b *wordBag) String() string {
	var texts []string
	for _, depthIdx := range b.depthIndexes() {
		words := b.bins[depthIdx]
		for _, w := range words {
			texts = append(texts, w.text)
		}
	}
	return fmt.Sprintf("%.2f fontsize=%.2f %d %q", b.PdfRectangle, b.fontsize, len(texts), texts)
}

// scanBand scans the bins of `b` for words w that satisfy
//	`minDepth` <= w.depth <= `maxDepth` &&                          // in the depth direction
//	`readingOverlap`(`para`, w) &&                                  // in the reading direction
//	math.Abs(w.fontsize-para.fontsize) <= `fontTol`*para.fontsize   // in font size (only checked when fontTol > 0)
// (the depth test is applied with a margin of lineDepthR*para.fontsize on either side)
// and moves each matching word from `b` into `para` with para.pullWord.
// If `detectOnly` is true, matching words are counted but not moved.
// If `freezeDepth` is true, minDepth and maxDepth are not updated as words are added.
func (b *wordBag) scanBand(title string, para *wordBag,
	readingOverlap func(para *wordBag, word *textWord) bool,
	minDepth, maxDepth, fontTol float64,
	detectOnly, freezeDepth bool) int {
	fontsize := para.fontsize
	lineDepth := lineDepthR * fontsize
	n := 0
	minDepth0, maxDepth0 := minDepth, maxDepth
	var newWords []*textWord
	for _, depthIdx := range b.depthBand(minDepth-lineDepth, maxDepth+lineDepth) {
		for _, word := range b.bins[depthIdx] {
			if !(minDepth-lineDepth <= word.depth && word.depth <= maxDepth+lineDepth) {
				continue
			}
			if !readingOverlap(para, word) {
				continue
			}
			fontRatio1 := math.Abs(word.fontsize-fontsize) / fontsize
			fontRatio2 := word.fontsize / fontsize
			fontRatio := math.Min(fontRatio1, fontRatio2)
			if fontTol > 0 {
				if fontRatio > fontTol {
					continue
				}
			}

			if !detectOnly {
				para.pullWord(b, word, depthIdx)
			}
			newWords = append(newWords, word)
			n++
			if !freezeDepth {
				if word.depth < minDepth {
					minDepth = word.depth
				}
				if word.depth > maxDepth {
					maxDepth = word.depth
				}
			}
			// Has no effect on results.
			// fontsize = para.fontsize
			// lineDepth = lineDepthR * fontsize
			if detectOnly {
				break
			}
		}
	}
	if verbose {
		if len(title) > 0 {
			common.Log.Info("scanBand: %s [%.2f %.2f]->[%.2f %.2f] para=%.2f fontsize=%.2f %q",
				title,
				minDepth0, maxDepth0,
				minDepth, maxDepth,
				para.PdfRectangle, para.fontsize, truncate(para.text(), 20))
			for i, word := range newWords {
				fmt.Printf(" %q", word.text)
				if i >= 5 {
					break
				}
			}
			if len(newWords) > 0 {
				fmt.Println()
			}
		}
	}
	return n
}

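// Illustrative use (hypothetical caller, not part of this file): pull into `para` the words
// that start inside para's horizontal span and lie within one font size below it. The
// predicate and the numeric tolerances here are examples only.
//
//	overlap := func(para *wordBag, w *textWord) bool {
//		return para.Llx <= w.PdfRectangle.Llx && w.PdfRectangle.Llx <= para.Urx
//	}
//	n := bag.scanBand("down", para, overlap,
//		para.maxDepth(), para.maxDepth()+para.fontsize, // depth window: one font size below para
//		0.7,   // accept words whose font size is within 70% of para's
//		false, // move the words, don't just count them
//		false) // let the depth window grow as words are added
//	common.Log.Debug("pulled %d words", n)
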
// highestWord returns the highest word w in b.bins[depthIdx] with minDepth <= w.depth <= maxDepth,
// or nil if there is no such word.
func (b *wordBag) highestWord(depthIdx int, minDepth, maxDepth float64) *textWord {
	for _, word := range b.bins[depthIdx] {
		if minDepth <= word.depth && word.depth <= maxDepth {
			return word
		}
	}
	return nil
}

// depthBand returns the indexes of the bins with depth: `minDepth` <= depth <= `maxDepth`.
func (b *wordBag) depthBand(minDepth, maxDepth float64) []int {
	if len(b.bins) == 0 {
		return nil
	}
	return b.depthRange(b.getDepthIdx(minDepth), b.getDepthIdx(maxDepth))
}

// depthRange returns the sorted keys of b.bins with depth indexes in [`minDepthIdx`, `maxDepthIdx`].
func (b *wordBag) depthRange(minDepthIdx, maxDepthIdx int) []int {
	indexes := b.depthIndexes()
	var rangeIndexes []int
	for _, depthIdx := range indexes {
		if minDepthIdx <= depthIdx && depthIdx <= maxDepthIdx {
			rangeIndexes = append(rangeIndexes, depthIdx)
		}
	}
	return rangeIndexes
}

// firstReadingIndex returns the index of the bin containing the left-most word near the top of `b`.
// Precisely, this is the index of the depth bin that starts with the word with the smallest reading
// direction value in the depth window that runs from the bottom of bin `minDepthIdx` to
// topWordRangeR*fontsize below it, where fontsize is the font size of the first word in that bin.
// The point of this function is to find the top-most, left-most word in `b` that is not a superscript.
func (b *wordBag) firstReadingIndex(minDepthIdx int) int {
	fontsize := b.firstWord(minDepthIdx).fontsize
	minDepth := float64(minDepthIdx+1) * depthBinPoints
	maxDepth := minDepth + topWordRangeR*fontsize
	firstReadingIdx := minDepthIdx
	for _, depthIdx := range b.depthBand(minDepth, maxDepth) {
		if diffReading(b.firstWord(depthIdx), b.firstWord(firstReadingIdx)) < 0 {
			firstReadingIdx = depthIdx
		}
	}
	return firstReadingIdx
}

// getDepthIdx returns the index into `b.bins` for depth axis value `depth`.
// If `depth` lies outside the bag's existing bins, the nearest existing bin index is returned.
// Caller must check that len(b.bins) > 0.
func (b *wordBag) getDepthIdx(depth float64) int {
	indexes := b.depthIndexes()
	depthIdx := depthIndex(depth)
	if depthIdx < indexes[0] {
		return indexes[0]
	}
	if depthIdx > indexes[len(indexes)-1] {
		return indexes[len(indexes)-1]
	}
	return depthIdx
}

// empty returns true if the depth bin with index `depthIdx` is empty.
// NOTE: We delete bins as soon as they become empty, so we just have to check for the bin's existence.
func (b *wordBag) empty(depthIdx int) bool {
	_, ok := b.bins[depthIdx]
	return !ok
}

// firstWord returns the first word in reading order in bin `depthIdx`.
func (b *wordBag) firstWord(depthIdx int) *textWord {
	return b.bins[depthIdx][0]
}

// stratum returns a copy of `b`.bins[`depthIdx`].
// stratum is guaranteed to return a non-nil value. It must be called with a valid depth index.
// NOTE: We need to return a copy because remove() and other functions manipulate the array
// underlying the slice.
func (b *wordBag) stratum(depthIdx int) []*textWord {
	words := b.bins[depthIdx]
	dup := make([]*textWord, len(words))
	copy(dup, words)
	return dup
}

// pullWord adds `word` to `b` and removes it from `bag`.
// `depthIdx` is the depth index of `word` in all wordBags.
// TODO(peterwilliams97): Compute depthIdx from `word` instead of passing it around.
func (b *wordBag) pullWord(bag *wordBag, word *textWord, depthIdx int) {
	b.PdfRectangle = rectUnion(b.PdfRectangle, word.PdfRectangle)
	if word.fontsize > b.fontsize {
		b.fontsize = word.fontsize
	}
	b.bins[depthIdx] = append(b.bins[depthIdx], word)
	bag.removeWord(word, depthIdx)
}

// removeWord removes `word` from `b`.
// In the current implementation it removes `word` from `b`.bins[`depthIdx`].
// NOTE: We delete bins as soon as they become empty to save code that calls other wordBag
// functions from having to check for empty bins.
// TODO(peterwilliams97): Find a more efficient way of doing this.
func (b *wordBag) removeWord(word *textWord, depthIdx int) {
	words := removeWord(b.stratum(depthIdx), word)
	if len(words) == 0 {
		delete(b.bins, depthIdx)
	} else {
		b.bins[depthIdx] = words
	}
}

// mergeWordBags merges the word bags that fit inside another bag's bounding box, extended one
// font size to the left, into that bag. This absorbs, for example, footnote markers that sit
// just to the left of a paragraph.
func mergeWordBags(paraWords []*wordBag) []*wordBag {
	if len(paraWords) <= 1 {
		return paraWords
	}
	if verbose {
		common.Log.Info("mergeWordBags:")
	}
	sort.Slice(paraWords, func(i, j int) bool {
		pi, pj := paraWords[i], paraWords[j]
		ai := pi.Width() * pi.Height()
		aj := pj.Width() * pj.Height()
		if ai != aj {
			return ai > aj
		}
		if pi.Height() != pj.Height() {
			return pi.Height() > pj.Height()
		}
		return i < j
	})
	var merged []*wordBag
	absorbed := map[int]struct{}{}
	for i0 := 0; i0 < len(paraWords); i0++ {
		if _, ok := absorbed[i0]; ok {
			continue
		}
		para0 := paraWords[i0]
		for i1 := i0 + 1; i1 < len(paraWords); i1++ {
			if _, ok := absorbed[i1]; ok {
				continue
			}
			para1 := paraWords[i1]
			r := para0.PdfRectangle
			r.Llx -= para0.fontsize
			if rectContainsRect(r, para1.PdfRectangle) {
				para0.absorb(para1)
				absorbed[i1] = struct{}{}
			}
		}
		merged = append(merged, para0)
	}

	if len(paraWords) != len(merged)+len(absorbed) {
		common.Log.Error("mergeWordBags: %d->%d absorbed=%d",
			len(paraWords), len(merged), len(absorbed))
	}
	return merged
}

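// Illustrative use (hypothetical caller, not part of this file): after the page has been
// divided into candidate paragraph bags, fold small bags that sit just to the left of a
// larger bag back into that bag.
//
//	paraWords = mergeWordBags(paraWords)
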
// absorb combines the words from `bag` into `b`.
func (b *wordBag) absorb(bag *wordBag) {
	for depthIdx, words := range bag.bins {
		for _, word := range words {
			b.pullWord(bag, word, depthIdx)
		}
	}
}

// depthIndex returns the bin index for depth `depth`.
// The returned depthIdx obeys the following rule:
//	depthIdx*depthBinPoints <= depth <= (depthIdx+1)*depthBinPoints
func depthIndex(depth float64) int {
	var depthIdx int
	if depth >= 0 {
		depthIdx = int(depth / depthBinPoints)
	} else {
		depthIdx = int(depth/depthBinPoints) - 1
	}
	return depthIdx
}

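// Worked example (illustrative only; the real bin size is the package constant
// depthBinPoints, whose value is not assumed here). With a bin size of 6 points:
//
//	depthIndex(0)    // = 0:  0 <= depth < 6
//	depthIndex(13.5) // = 2: 12 <= depth < 18
//	depthIndex(-1)   // = -1: negative depths round away from zero
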
// depthIndexes returns the sorted keys of b.bins.
func (b *wordBag) depthIndexes() []int {
	if len(b.bins) == 0 {
		return nil
	}
	indexes := make([]int, len(b.bins))
	i := 0
	for idx := range b.bins {
		indexes[i] = idx
		i++
	}
	sort.Ints(indexes)
	return indexes
}

// sort sorts the word fragments in each bin in `b` in the reading direction.
func (b *wordBag) sort() {
	for _, bin := range b.bins {
		sort.Slice(bin, func(i, j int) bool { return diffReading(bin[i], bin[j]) < 0 })
	}
}

// minDepth returns the minimum depth that word fragments in `b` touch.
func (b *wordBag) minDepth() float64 {
	return b.pageHeight - (b.Ury - b.fontsize)
}

// maxDepth returns the maximum depth that word fragments in `b` touch.
func (b *wordBag) maxDepth() float64 {
	return b.pageHeight - b.Lly
}

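// Worked example (illustrative values): depth is measured downwards from the top of the
// page, i.e. depth = pageHeight - y. For a bag on a 792 point high page with Ury = 700,
// Lly = 650 and fontsize = 12:
//
//	b.minDepth() // = 792 - (700 - 12) = 104
//	b.maxDepth() // = 792 - 650 = 142
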
// The following functions are used only for logging.

func (b *wordBag) text() string {
	words := b.allWords()
	texts := make([]string, len(words))
	for i, w := range words {
		texts[i] = w.text
	}
	return strings.Join(texts, " ")
}

func (b *wordBag) allWords() []*textWord {
	var wordList []*textWord
	for _, words := range b.bins {
		wordList = append(wordList, words...)
	}
	return wordList
}