unipdf/extractor/text_utils.go
Peter Williams 88fda44e0a
Text extraction code for columns. (#366)
* Fixed filename:page in logging

* Got CMap working for multi-rune entries

* Treat CMap entries as strings instead of runes to handle multi-byte encodings.

* Added a test for multibyte encoding.

* First version of text extraction that recognizes columns

* Added an explanation of the text columns code to README.md.

* fixed typos

* Abstracted textWord depth calculation. This required changing textMark to *textMark in a lot of code.

* Added function comments.

* Fixed text state save/restore.

* Adjusted inter-word search distance to make paragraph division work for thanh.pdf

* Got text_test.go passing.

* Reinstated hyphen suppression

* Handle more cases of fonts not being set in text extraction code.

* Fixed typo

* More verbose logging

* Adding tables to text extractor.

* Added tests for columns extraction.

* Removed commented code

* Check for textParas that are on the same line when writing out extracted text.

* Absorb text to the left of paras into paras e.g. Footnote numbers

* Removed funny character from text_test.go

* Commented out a creator_test.go test that was broken by my text extraction changes.

* Big changes to columns text extraction code for PR.

Performance improvements in several places.
Commented code.

* Updated extractor/README

* Cleaned up some comments and removed a panic

* Increased threshold for truncating extracted text when there is no license 100 -> 102.

This is a workaround to let a test in creator_test.go pass.

With the old text extraction code the following extracted text was 100 chars. With the new code it
is 102 chars which looks correct.

"你好\n你好你好你好你好\n河上白云\n\nUnlicensed UniDoc - Get a license on https://unidoc.io\n\n"

* Improved an error message.

* Removed irrelevant spaces

* Commented code and removed unused functions.

* Reverted PdfRectangle changes

* Added duplicate text detection.

* Combine diacritic textMarks in text extraction

* Reinstated a diacritic recombination test.

* Small code reorganisation

* Reinstated handling of rotated text

* Addressed issues in PR review

* Added color fields to TextMark

* Updated README

* Reinstated the disabled tests I missed before.

* Tightened definition for tables to prevent detection of tables where there weren't any.

* Compute line splitting search range based on fontsize of first word in word bag.

* Use errors.Is(err, core.ErrNotSupported) to distinguish unsupported font errors.

See https://blog.golang.org/go1.13-errors

* Fixed some naming and added some comments.

* errors.Is -> xerrors.Is and %w -> %v for go 1.12 compatibility

* Removed code that doesn't ever get called.

* Removed unused test
2020-06-30 19:33:10 +00:00

276 lines
6.6 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*
* This file is subject to the terms and conditions defined in
* file 'LICENSE.md', which is part of this source code package.
*/
package extractor
import (
"math"
"sort"
"unicode"
)
// TOL is the tolerance for coordinates to be considered equal. It is big enough to cover all
// rounding errors and small enough that TOL point differences on a page aren't visible.
const TOL = 1.0e-6
// isZero reports whether `x` is within TOL of 0.0, i.e. whether its absolute value is strictly
// less than TOL.
func isZero(x float64) bool {
	magnitude := math.Abs(x)
	return magnitude < TOL
}
// minInt returns the lesser of `a` and `b`.
func minInt(a, b int) int {
	if b <= a {
		return b
	}
	return a
}
// maxInt returns the greater of `a` and `b`.
func maxInt(a, b int) int {
	if b >= a {
		return b
	}
	return a
}
// addNeighbours fills out the left, right, above and below fields of the paras in `paras`.
// Candidates for left and right are the paras reported by yNeighbours (overlap in the
// y-direction); candidates for above and below are the paras reported by xNeighbours (overlap in
// the x-direction).
// For each para `a`:
// a.left is the candidate `b` with b.Urx <= a.Llx that has the greatest Llx.
// a.right is the candidate `b` with b.Llx >= a.Urx that has the least Llx.
// a.above is the candidate `b` with b.Lly >= a.Ury that has the least Ury.
// a.below is the candidate `b` with b.Ury <= a.Lly that has the greatest Ury.
// A field is set to nil when there is no candidate. It is left untouched when the best candidate
// is not unique, i.e. when two candidates tie on the coordinate being compared.
func (paras paraList) addNeighbours() {
	// Horizontal neighbours are picked from the paras that overlap in the y-direction.
	yOverlaps := paras.yNeighbours()
	for _, para := range paras {
		var toLeft, toRight *textPara
		leftTied, rightTied := false, false
		for _, idx := range yOverlaps[para] {
			cand := paras[idx]
			if cand.Urx <= para.Llx {
				switch {
				case toLeft == nil:
					toLeft = cand
				case cand.Llx > toLeft.Llx:
					toLeft, leftTied = cand, false
				case cand.Llx == toLeft.Llx:
					leftTied = true
				}
			}
			if cand.Llx >= para.Urx {
				switch {
				case toRight == nil:
					toRight = cand
				case cand.Llx < toRight.Llx:
					toRight, rightTied = cand, false
				case cand.Llx == toRight.Llx:
					rightTied = true
				}
			}
		}
		if !leftTied {
			para.left = toLeft
		}
		if !rightTied {
			para.right = toRight
		}
	}

	// Vertical neighbours are picked from the paras that overlap in the x-direction.
	xOverlaps := paras.xNeighbours()
	for _, para := range paras {
		var over, under *textPara
		overTied, underTied := false, false
		for _, idx := range xOverlaps[para] {
			cand := paras[idx]
			if cand.Lly >= para.Ury {
				switch {
				case over == nil:
					over = cand
				case cand.Ury < over.Ury:
					over, overTied = cand, false
				case cand.Ury == over.Ury:
					overTied = true
				}
			}
			if cand.Ury <= para.Lly {
				switch {
				case under == nil:
					under = cand
				case cand.Ury > under.Ury:
					under, underTied = cand, false
				case cand.Ury == under.Ury:
					underTied = true
				}
			}
		}
		if !overTied {
			para.above = over
		}
		if !underTied {
			para.below = under
		}
	}
}
// xNeighbours returns a map {para: indexes of paras that x-overlap para}.
// Each para contributes an enter event at its Llx and an exit event at its Urx; the overlaps are
// then computed by eventNeighbours.
func (paras paraList) xNeighbours() map[*textPara][]int {
	events := make([]event, 0, 2*len(paras))
	for idx, p := range paras {
		events = append(events,
			event{z: p.Llx, enter: true, i: idx},
			event{z: p.Urx, enter: false, i: idx},
		)
	}
	return paras.eventNeighbours(events)
}
// yNeighbours returns a map {para: indexes of paras that y-overlap para}.
// Each para contributes an enter event at its Lly and an exit event at its Ury; the overlaps are
// then computed by eventNeighbours.
func (paras paraList) yNeighbours() map[*textPara][]int {
	events := make([]event, 0, 2*len(paras))
	for idx, p := range paras {
		events = append(events,
			event{z: p.Lly, enter: true, i: idx},
			event{z: p.Ury, enter: false, i: idx},
		)
	}
	return paras.eventNeighbours(events)
}
// event is an entry to or exit from an interval while scanning along one axis.
// xNeighbours and yNeighbours create one enter and one exit event per para.
type event struct {
z float64 // Coordinate in the scanning direction.
enter bool // True if entering the interval, false if leaving.
i int // Index of the interval (the para's index in the paraList).
}
// eventNeighbours returns a map {para: indexes of paras that overlap para in `events`}.
//
// `events` holds the enter and exit events of the intervals being scanned, where event.i is the
// index of the interval's para in `paras`. `events` is sorted in place.
// Every para that has an enter event gets an entry in the returned map, possibly with an empty
// slice. The order of the indexes within each slice is unspecified.
func (paras paraList) eventNeighbours(events []event) map[*textPara][]int {
	// Sort by coordinate. At equal coordinates enter events come before exit events, so
	// intervals that only touch at an end point are still recorded as overlapping.
	sort.Slice(events, func(i, j int) bool {
		ei, ej := events[i], events[j]
		if ei.z != ej.z {
			return ei.z < ej.z
		}
		if ei.enter != ej.enter {
			return ei.enter
		}
		// Break remaining ties on the interval index. This is a property of the events
		// themselves, so the ordering stays consistent while sort.Slice moves elements around
		// (comparing the slice positions i and j would not be).
		return ei.i < ej.i
	})

	// overlaps[i] is the set of interval indexes found to overlap interval i.
	overlaps := make(map[int]map[int]struct{}, len(events)/2)
	// active is the set of intervals that have been entered but not yet exited.
	active := map[int]struct{}{}
	for _, e := range events {
		if !e.enter {
			delete(active, e.i)
			continue
		}
		overlaps[e.i] = map[int]struct{}{}
		// The interval being entered overlaps every interval that is still active.
		for i := range active {
			if i != e.i {
				overlaps[e.i][i] = struct{}{}
				overlaps[i][e.i] = struct{}{}
			}
		}
		active[e.i] = struct{}{}
	}

	// Convert the index sets to slices keyed by para.
	paraNeighbours := make(map[*textPara][]int, len(overlaps))
	for i, set := range overlaps {
		neighbours := make([]int, 0, len(set))
		for j := range set {
			neighbours = append(neighbours, j)
		}
		paraNeighbours[paras[i]] = neighbours
	}
	return paraNeighbours
}
// isTextSpace reports whether every code point in `text` is a space as defined by
// unicode.IsSpace. It returns true for the empty string.
func isTextSpace(text string) bool {
	for _, ch := range text {
		if unicode.IsSpace(ch) {
			continue
		}
		return false
	}
	return true
}
// combiningDiacritic looks up the combining form of `text` in diacriticsToCombining.
// It returns the combining string and true if `text` is exactly one rune and that rune is a key of
// diacriticsToCombining. Otherwise it returns "" and false.
func combiningDiacritic(text string) (string, bool) {
	var only rune
	count := 0
	for _, r := range text {
		count++
		if count > 1 {
			// More than one rune, so not a lone diacritic.
			return "", false
		}
		only = r
	}
	if count == 0 {
		return "", false
	}
	combining, found := diacriticsToCombining[only]
	return combining, found
}
var (
// diacriticsToCombining is a map of diacritic runes to their combining diacritic equivalents.
// Each value is a string holding a single combining code point.
// The trailing comment on each entry shows the key rune and the combining value applied to the
// letter o.
// These values were copied from (https://svn.apache.org/repos/asf/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java)
diacriticsToCombining = map[rune]string{
0x0060: "\u0300", // ` -> ò
0x02CB: "\u0300", // ˋ -> ò
0x0027: "\u0301", // ' -> ó
0x00B4: "\u0301", // ´ -> ó
0x02B9: "\u0301", // ʹ -> ó
0x02CA: "\u0301", // ˊ -> ó
0x005E: "\u0302", // ^ -> ô
0x02C6: "\u0302", // ˆ -> ô
0x007E: "\u0303", // ~ -> õ
0x02DC: "\u0303", // ˜ -> õ
0x00AF: "\u0304", // ¯ -> ō
0x02C9: "\u0304", // ˉ -> ō
0x02D8: "\u0306", // ˘ -> ŏ
0x02D9: "\u0307", // ˙ -> ȯ
0x00A8: "\u0308", // ¨ -> ö
0x00B0: "\u030A", // ° -> o̊
0x02DA: "\u030A", // ˚ -> o̊
0x02BA: "\u030B", // ʺ -> ő
0x02DD: "\u030B", // ˝ -> ő
0x02C7: "\u030C", // ˇ -> ǒ
0x02C8: "\u030D", // ˈ -> o̍
0x0022: "\u030E", // " -> o̎
0x02BB: "\u0312", // ʻ -> o̒
0x02BC: "\u0313", // ʼ -> o̓
0x0486: "\u0313", // ҆ -> o̓
0x055A: "\u0313", // ՚ -> o̓
0x02BD: "\u0314", // ʽ -> o̔
0x0485: "\u0314", // ҅ -> o̔
0x0559: "\u0314", // ՙ -> o̔
0x02D4: "\u031D", // ˔ -> o̝
0x02D5: "\u031E", // ˕ -> o̞
0x02D6: "\u031F", // ˖ -> o̟
0x02D7: "\u0320", // ˗ -> o̠
0x02B2: "\u0321", // ʲ -> o̡
0x00B8: "\u0327", // ¸ -> o̧
0x02CC: "\u0329", // ˌ -> o̩
0x02B7: "\u032B", // ʷ -> o̫
0x02CD: "\u0331", // ˍ -> o̱
0x005F: "\u0332", // _ -> o̲
0x204E: "\u0359", // ⁎ -> o͙
}
)