unipdf/extractor/text_utils.go

276 lines
6.6 KiB
Go
Raw Normal View History

Text extraction code for columns. (#366) * Fixed filename:page in logging * Got CMap working for multi-rune entries * Treat CMap entries as strings instead of runes to handle multi-byte encodings. * Added a test for multibyte encoding. * First version of text extraction that recognizes columns * Added an expanation of the text columns code to README.md. * fixed typos * Abstracted textWord depth calculation. This required change textMark to *textMark in a lot of code. * Added function comments. * Fixed text state save/restore. * Adjusted inter-word search distance to make paragrah division work for thanh.pdf * Got text_test.go passing. * Reinstated hyphen suppression * Handle more cases of fonts not being set in text extraction code. * Fixed typo * More verbose logging * Adding tables to text extractor. * Added tests for columns extraction. * Removed commented code * Check for textParas that are on the same line when writing out extracted text. * Absorb text to the left of paras into paras e.g. Footnote numbers * Removed funny character from text_test.go * Commented out a creator_test.go test that was broken by my text extraction changes. * Big changes to columns text extraction code for PR. Performance improvements in several places. Commented code. * Updated extractor/README * Cleaned up some comments and removed a panic * Increased threshold for truncating extracted text when there is no license 100 -> 102. This is a workaround to let a test in creator_test.go pass. With the old text extraction code the following extracted text was 100 chars. With the new code it is 102 chars which looks correct. "你好\n你好你好你好你好\n河上白云\n\nUnlicensed UniDoc - Get a license on https://unidoc.io\n\n" * Improved an error message. * Removed irrelevant spaces * Commented code and removed unused functions. * Reverted PdfRectangle changes * Added duplicate text detection. * Combine diacritic textMarks in text extraction * Reinstated a diacritic recombination test. 
* Small code reorganisation * Reinstated handling of rotated text * Addressed issues in PR review * Added color fields to TextMark * Updated README * Reinstated the disabled tests I missed before. * Tightened definition for tables to prevent detection of tables where there weren't any. * Compute line splitting search range based on fontsize of first word in word bag. * Use errors.Is(err, core.ErrNotSupported) to distinguish unsupported font errors. See https://blog.golang.org/go1.13-errors * Fixed some naming and added some comments. * errors.Is -> xerrors.Is and %w -> %v for go 1.12 compatibility * Removed code that doesn't ever get called. * Removed unused test
2020-07-01 05:33:10 +10:00
/*
* This file is subject to the terms and conditions defined in
* file 'LICENSE.md', which is part of this source code package.
*/
package extractor
import (
"math"
"sort"
"unicode"
)
// TOL is the tolerance for coordinates to be considered equal. It is big enough to cover all
// rounding errors and small enough that TOL point differences on a page aren't visible.
const TOL = 1.0e-6

// isZero returns true if `x` is within TOL of 0.0, i.e. if its magnitude is strictly less
// than TOL.
func isZero(x float64) bool {
	magnitude := math.Abs(x)
	return magnitude < TOL
}
// minInt returns the lesser of `a` and `b`.
func minInt(a, b int) int {
	if b < a {
		return b
	}
	return a
}
// maxInt returns the greater of `a` and `b`.
func maxInt(a, b int) int {
	if b > a {
		return b
	}
	return a
}
// addNeighbours fills out the left, right, above and below fields of the paras in `paras`.
// For each para `a`:
//   a.left  is the para with the greatest Llx among the paras that overlap `a` in the
//           y-direction and lie completely to its left (Urx <= a.Llx).
//   a.right is the para with the smallest Llx among the paras that overlap `a` in the
//           y-direction and lie completely to its right (Llx >= a.Urx).
//   a.above is the para with the smallest Ury among the paras that overlap `a` in the
//           x-direction and lie completely above it (Lly >= a.Ury).
//   a.below is the para with the greatest Ury among the paras that overlap `a` in the
//           x-direction and lie completely below it (Ury <= a.Lly).
// A field is only assigned when the best candidate is unique. If two or more candidates tie
// for the best position the field is left as it was. If there are no candidates the field is
// set to nil.
//
// NOTE(review): left is ranked by Llx (not Urx) and above is ranked by Ury (not Lly), which is
// not symmetric with right/below. Confirm this is intended before relying on "nearest".
func (paras paraList) addNeighbours() {
	// Horizontal neighbours are chosen from the paras that overlap in the y-direction.
	yOverlaps := paras.yNeighbours()
	for _, para := range paras {
		var left, right *textPara
		leftTied, rightTied := false, false
		for _, idx := range yOverlaps[para] {
			cand := paras[idx]
			if cand.Urx <= para.Llx {
				switch {
				case left == nil:
					left = cand
				case cand.Llx > left.Llx:
					// A strictly better candidate clears any earlier tie.
					left, leftTied = cand, false
				case cand.Llx == left.Llx:
					leftTied = true
				}
			}
			if cand.Llx >= para.Urx {
				switch {
				case right == nil:
					right = cand
				case cand.Llx < right.Llx:
					right, rightTied = cand, false
				case cand.Llx == right.Llx:
					rightTied = true
				}
			}
		}
		if !leftTied {
			para.left = left
		}
		if !rightTied {
			para.right = right
		}
	}

	// Vertical neighbours are chosen from the paras that overlap in the x-direction.
	xOverlaps := paras.xNeighbours()
	for _, para := range paras {
		var above, below *textPara
		aboveTied, belowTied := false, false
		for _, idx := range xOverlaps[para] {
			cand := paras[idx]
			if cand.Lly >= para.Ury {
				switch {
				case above == nil:
					above = cand
				case cand.Ury < above.Ury:
					above, aboveTied = cand, false
				case cand.Ury == above.Ury:
					aboveTied = true
				}
			}
			if cand.Ury <= para.Lly {
				switch {
				case below == nil:
					below = cand
				case cand.Ury > below.Ury:
					below, belowTied = cand, false
				case cand.Ury == below.Ury:
					belowTied = true
				}
			}
		}
		if !aboveTied {
			para.above = above
		}
		if !belowTied {
			para.below = below
		}
	}
}
// xNeighbours returns a map {para: indexes of paras that x-overlap para}.
// Every para contributes an enter event at its Llx and an exit event at its Urx.
func (paras paraList) xNeighbours() map[*textPara][]int {
	events := make([]event, 0, 2*len(paras))
	for idx, p := range paras {
		events = append(events,
			event{z: p.Llx, enter: true, i: idx},
			event{z: p.Urx, enter: false, i: idx},
		)
	}
	return paras.eventNeighbours(events)
}
// yNeighbours returns a map {para: indexes of paras that y-overlap para}.
// Every para contributes an enter event at its Lly and an exit event at its Ury.
func (paras paraList) yNeighbours() map[*textPara][]int {
	events := make([]event, 0, 2*len(paras))
	for idx, p := range paras {
		events = append(events,
			event{z: p.Lly, enter: true, i: idx},
			event{z: p.Ury, enter: false, i: idx},
		)
	}
	return paras.eventNeighbours(events)
}
// event is an entry to or an exit from an interval while scanning along one axis.
// xNeighbours and yNeighbours create one enter and one exit event per para.
type event struct {
z float64 // Coordinate in the scanning direction.
enter bool // True if entering the interval, false if leaving.
i int // Index of the interval, i.e. the index of its para in the paraList.
}
// eventNeighbours returns a map {para: indexes of paras that overlap para in `events`}.
//
// `events` holds the enter and exit events of the intervals being compared (see xNeighbours
// and yNeighbours). It is sorted in place. The events are then scanned in order while tracking
// the set of intervals that are currently open. Each interval that is entered overlaps every
// interval that is open at that moment.
//
// Only paras that have an enter event get an entry in the returned map. The index slices are
// sorted in increasing order so that the result is deterministic.
func (paras paraList) eventNeighbours(events []event) map[*textPara][]int {
	// Sort by coordinate. At equal coordinates enter events precede exit events, so intervals
	// that merely touch, and zero-length intervals, are still counted as overlapping.
	// Remaining ties are broken on the interval index. This must compare the elements
	// themselves, not the positions `i` and `j`, which change as the slice is sorted.
	sort.Slice(events, func(i, j int) bool {
		ei, ej := events[i], events[j]
		if ei.z != ej.z {
			return ei.z < ej.z
		}
		if ei.enter != ej.enter {
			return ei.enter
		}
		return ei.i < ej.i
	})

	overlaps := make(map[int]map[int]struct{}, len(paras))
	open := map[int]struct{}{} // Intervals entered but not yet exited.
	for _, e := range events {
		if !e.enter {
			delete(open, e.i)
			continue
		}
		current := map[int]struct{}{}
		overlaps[e.i] = current
		for i := range open {
			if i != e.i {
				// Overlap is symmetric so record it for both intervals.
				current[i] = struct{}{}
				overlaps[i][e.i] = struct{}{}
			}
		}
		open[e.i] = struct{}{}
	}

	paraNeighbours := make(map[*textPara][]int, len(overlaps))
	for i, set := range overlaps {
		neighbours := make([]int, 0, len(set))
		for j := range set {
			neighbours = append(neighbours, j)
		}
		// Map iteration order is random. Sort so that callers see a stable order.
		sort.Ints(neighbours)
		paraNeighbours[paras[i]] = neighbours
	}
	return paraNeighbours
}
// isTextSpace returns true if every code point in `text` is a space as defined by
// unicode.IsSpace. The empty string contains no non-space code points so it returns true.
func isTextSpace(text string) bool {
	for _, codePoint := range text {
		if unicode.IsSpace(codePoint) {
			continue
		}
		return false
	}
	return true
}
// combiningDiacritic returns the combining version of `text` and true if `text` consists of
// exactly one rune and that rune is a key of diacriticsToCombining. In every other case it
// returns "" and false.
func combiningDiacritic(text string) (string, bool) {
	var only rune
	count := 0
	for _, r := range text {
		count++
		if count > 1 {
			// More than one rune so this cannot be a lone diacritic.
			return "", false
		}
		only = r
	}
	if count != 1 {
		return "", false
	}
	combining, isDiacritic := diacriticsToCombining[only]
	return combining, isDiacritic
}
var (
// diacriticsToCombining is a map of diacritic runes to their combining diacritic equivalents.
// Each key is a stand-alone diacritic rune and each value is the matching combining mark as a
// string. The trailing comment on each entry shows the key followed by the combining mark
// applied to the letter o.
// These values were copied from (https://svn.apache.org/repos/asf/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java)
diacriticsToCombining = map[rune]string{
0x0060: "\u0300", // ` -> ò
0x02CB: "\u0300", // ˋ -> ò
0x0027: "\u0301", // ' -> ó
0x00B4: "\u0301", // ´ -> ó
0x02B9: "\u0301", // ʹ -> ó
0x02CA: "\u0301", // ˊ -> ó
0x005E: "\u0302", // ^ -> ô
0x02C6: "\u0302", // ˆ -> ô
0x007E: "\u0303", // ~ -> õ
0x02DC: "\u0303", // ˜ -> õ
0x00AF: "\u0304", // ¯ -> ō
0x02C9: "\u0304", // ˉ -> ō
0x02D8: "\u0306", // ˘ -> ŏ
0x02D9: "\u0307", // ˙ -> ȯ
0x00A8: "\u0308", // ¨ -> ö
0x00B0: "\u030A", // ° -> o̊
0x02DA: "\u030A", // ˚ -> o̊
0x02BA: "\u030B", // ʺ -> ő
0x02DD: "\u030B", // ˝ -> ő
0x02C7: "\u030C", // ˇ -> ǒ
0x02C8: "\u030D", // ˈ -> o̍
0x0022: "\u030E", // " -> o̎
0x02BB: "\u0312", // ʻ -> o̒
0x02BC: "\u0313", // ʼ -> o̓
0x0486: "\u0313", // ҆ -> o̓
0x055A: "\u0313", // ՚ -> o̓
0x02BD: "\u0314", // ʽ -> o̔
0x0485: "\u0314", // ҅ -> o̔
0x0559: "\u0314", // ՙ -> o̔
0x02D4: "\u031D", // ˔ -> o̝
0x02D5: "\u031E", // ˕ -> o̞
0x02D6: "\u031F", // ˖ -> o̟
0x02D7: "\u0320", // ˗ -> o̠
0x02B2: "\u0321", // ʲ -> o̡
0x00B8: "\u0327", // ¸ -> o̧
0x02CC: "\u0329", // ˌ -> o̩
0x02B7: "\u032B", // ʷ -> o̫
0x02CD: "\u0331", // ˍ -> o̱
0x005F: "\u0332", // _ -> o̲
0x204E: "\u0359", // ⁎ -> o͙
}
)