mirror of
https://github.com/unidoc/unipdf.git
synced 2025-05-01 22:17:29 +08:00

* Fixed filename:page in logging * Got CMap working for multi-rune entries * Treat CMap entries as strings instead of runes to handle multi-byte encodings. * Added a test for multibyte encoding. * First version of text extraction that recognizes columns * Added an explanation of the text columns code to README.md. * fixed typos * Abstracted textWord depth calculation. This required changing textMark to *textMark in a lot of code. * Added function comments. * Fixed text state save/restore. * Adjusted inter-word search distance to make paragraph division work for thanh.pdf * Got text_test.go passing. * Reinstated hyphen suppression * Handle more cases of fonts not being set in text extraction code. * Fixed typo * More verbose logging * Adding tables to text extractor. * Added tests for columns extraction. * Removed commented code * Check for textParas that are on the same line when writing out extracted text. * Absorb text to the left of paras into paras e.g. Footnote numbers * Removed funny character from text_test.go * Commented out a creator_test.go test that was broken by my text extraction changes. * Big changes to columns text extraction code for PR. Performance improvements in several places. Commented code. * Updated extractor/README * Cleaned up some comments and removed a panic * Increased threshold for truncating extracted text when there is no license 100 -> 102. This is a workaround to let a test in creator_test.go pass. With the old text extraction code the following extracted text was 100 chars. With the new code it is 102 chars which looks correct. "你好\n你好你好你好你好\n河上白云\n\nUnlicensed UniDoc - Get a license on https://unidoc.io\n\n" * Improved an error message. * Removed irrelevant spaces * Commented code and removed unused functions. * Reverted PdfRectangle changes * Added duplicate text detection. * Combine diacritic textMarks in text extraction * Reinstated a diacritic recombination test. 
* Small code reorganisation * Reinstated handling of rotated text * Addressed issues in PR review * Added color fields to TextMark * Updated README * Reinstated the disabled tests I missed before. * Tightened definition for tables to prevent detection of tables where there weren't any. * Compute line splitting search range based on fontsize of first word in word bag. * Use errors.Is(err, core.ErrNotSupported) to distinguish unsupported font errors. See https://blog.golang.org/go1.13-errors * Fixed some naming and added some comments. * errors.Is -> xerrors.Is and %w -> %v for go 1.12 compatibility * Removed code that doesn't ever get called. * Removed unused test
276 lines
6.6 KiB
Go
276 lines
6.6 KiB
Go
/*
|
||
* This file is subject to the terms and conditions defined in
|
||
* file 'LICENSE.md', which is part of this source code package.
|
||
*/
|
||
|
||
package extractor
|
||
|
||
import (
|
||
"math"
|
||
"sort"
|
||
"unicode"
|
||
)
|
||
|
||
// TOL is the tolerance within which two coordinates are considered equal. It is large enough to
// absorb rounding errors and small enough that a difference of TOL points is not visible on a page.
const TOL = 1e-6
|
||
|
||
// isZero returns true if x is with TOL of 0.0
|
||
func isZero(x float64) bool {
|
||
return math.Abs(x) < TOL
|
||
}
|
||
|
||
// minInt returns the smaller of `a` and `b`.
func minInt(a, b int) int {
	smaller := b
	if a < b {
		smaller = a
	}
	return smaller
}
|
||
|
||
// maxInt returns the larger of `a` and `b`.
func maxInt(a, b int) int {
	larger := b
	if a > b {
		larger = a
	}
	return larger
}
|
||
|
||
// addNeighbours fills out the below and right fields of the paras in `paras`.
|
||
// For each para `a`:
|
||
// a.below is the unique highest para completely below `a` that overlaps it in the x-direction
|
||
// a.right is the unique leftmost para completely to the right of `a` that overlaps it in the y-direction
|
||
func (paras paraList) addNeighbours() {
|
||
paraNeighbours := paras.yNeighbours()
|
||
for _, para := range paras {
|
||
var left *textPara
|
||
dup := false
|
||
for _, k := range paraNeighbours[para] {
|
||
b := paras[k]
|
||
if b.Urx <= para.Llx {
|
||
if left == nil {
|
||
left = b
|
||
} else {
|
||
if b.Llx > left.Llx {
|
||
left = b
|
||
dup = false
|
||
} else if b.Llx == left.Llx {
|
||
dup = true
|
||
}
|
||
}
|
||
}
|
||
}
|
||
if !dup {
|
||
para.left = left
|
||
}
|
||
}
|
||
for _, para := range paras {
|
||
var right *textPara
|
||
dup := false
|
||
for _, k := range paraNeighbours[para] {
|
||
b := paras[k]
|
||
if b.Llx >= para.Urx {
|
||
if right == nil {
|
||
right = b
|
||
} else {
|
||
if b.Llx < right.Llx {
|
||
right = b
|
||
dup = false
|
||
} else if b.Llx == right.Llx {
|
||
dup = true
|
||
}
|
||
}
|
||
}
|
||
}
|
||
if !dup {
|
||
para.right = right
|
||
}
|
||
}
|
||
|
||
paraNeighbours = paras.xNeighbours()
|
||
for _, para := range paras {
|
||
var above *textPara
|
||
dup := false
|
||
for _, i := range paraNeighbours[para] {
|
||
b := paras[i]
|
||
if b.Lly >= para.Ury {
|
||
if above == nil {
|
||
above = b
|
||
} else {
|
||
if b.Ury < above.Ury {
|
||
above = b
|
||
dup = false
|
||
} else if b.Ury == above.Ury {
|
||
dup = true
|
||
}
|
||
}
|
||
}
|
||
}
|
||
if !dup {
|
||
para.above = above
|
||
}
|
||
}
|
||
for _, para := range paras {
|
||
var below *textPara
|
||
dup := false
|
||
for _, i := range paraNeighbours[para] {
|
||
b := paras[i]
|
||
if b.Ury <= para.Lly {
|
||
if below == nil {
|
||
below = b
|
||
} else {
|
||
if b.Ury > below.Ury {
|
||
below = b
|
||
dup = false
|
||
} else if b.Ury == below.Ury {
|
||
dup = true
|
||
}
|
||
}
|
||
}
|
||
}
|
||
if !dup {
|
||
para.below = below
|
||
}
|
||
}
|
||
}
|
||
|
||
// xNeighbours returns a map {para: indexes of paras that x-overlap para}.
|
||
func (paras paraList) xNeighbours() map[*textPara][]int {
|
||
events := make([]event, 2*len(paras))
|
||
for i, para := range paras {
|
||
events[2*i] = event{para.Llx, true, i}
|
||
events[2*i+1] = event{para.Urx, false, i}
|
||
}
|
||
return paras.eventNeighbours(events)
|
||
}
|
||
|
||
// yNeighbours returns a map {para: indexes of paras that y-overlap para}.
|
||
func (paras paraList) yNeighbours() map[*textPara][]int {
|
||
events := make([]event, 2*len(paras))
|
||
for i, para := range paras {
|
||
events[2*i] = event{para.Lly, true, i}
|
||
events[2*i+1] = event{para.Ury, false, i}
|
||
}
|
||
return paras.eventNeighbours(events)
|
||
}
|
||
|
||
// event is the entry into or exit from an interval, met while scanning along an axis.
type event struct {
	// z is the coordinate in the scanning direction.
	z float64
	// enter is true if the scan is entering the interval and false if it is leaving it.
	enter bool
	// i is the index of the interval.
	i int
}
|
||
|
||
// eventNeighbours returns a map {para: indexes of paras that overlap para in `events`}.
|
||
func (paras paraList) eventNeighbours(events []event) map[*textPara][]int {
|
||
sort.Slice(events, func(i, j int) bool {
|
||
ei, ej := events[i], events[j]
|
||
zi, zj := ei.z, ej.z
|
||
if zi != zj {
|
||
return zi < zj
|
||
}
|
||
if ei.enter != ej.enter {
|
||
return ei.enter
|
||
}
|
||
return i < j
|
||
})
|
||
|
||
overlaps := map[int]map[int]struct{}{}
|
||
olap := map[int]struct{}{}
|
||
for _, e := range events {
|
||
if e.enter {
|
||
overlaps[e.i] = map[int]struct{}{}
|
||
for i := range olap {
|
||
if i != e.i {
|
||
overlaps[e.i][i] = struct{}{}
|
||
overlaps[i][e.i] = struct{}{}
|
||
}
|
||
}
|
||
olap[e.i] = struct{}{}
|
||
} else {
|
||
delete(olap, e.i)
|
||
}
|
||
}
|
||
|
||
paraNeighbors := map[*textPara][]int{}
|
||
for i, olap := range overlaps {
|
||
para := paras[i]
|
||
neighbours := make([]int, len(olap))
|
||
k := 0
|
||
for j := range olap {
|
||
neighbours[k] = j
|
||
k++
|
||
}
|
||
paraNeighbors[para] = neighbours
|
||
}
|
||
return paraNeighbors
|
||
}
|
||
|
||
// isTextSpace reports whether every code point in `text` is a space according to unicode.IsSpace.
// It returns true for the empty string.
func isTextSpace(text string) bool {
	allSpace := true
	for _, r := range text {
		if allSpace = unicode.IsSpace(r); !allSpace {
			break
		}
	}
	return allSpace
}
|
||
|
||
// combiningDiacritic returns the combining version of `text` if text contains a single uncombined
|
||
// diacritic rune.
|
||
func combiningDiacritic(text string) (string, bool) {
|
||
runes := []rune(text)
|
||
if len(runes) != 1 {
|
||
return "", false
|
||
}
|
||
combining, isDiacritic := diacriticsToCombining[runes[0]]
|
||
return combining, isDiacritic
|
||
}
|
||
|
||
// diacriticsToCombining is a map of diacritic runes to their combining diacritic equivalents.
// The keys are grouped by the combining mark they map to.
// These values were copied from Apache PDFBox's TextPosition.java
// (https://svn.apache.org/repos/asf/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java)
var diacriticsToCombining = map[rune]string{
	// U+0300 combining grave accent.
	0x0060: "\u0300",
	0x02CB: "\u0300",
	// U+0301 combining acute accent.
	0x0027: "\u0301",
	0x00B4: "\u0301",
	0x02B9: "\u0301",
	0x02CA: "\u0301",
	// U+0302 combining circumflex accent.
	0x005E: "\u0302",
	0x02C6: "\u0302",
	// U+0303 combining tilde.
	0x007E: "\u0303",
	0x02DC: "\u0303",
	// U+0304 combining macron.
	0x00AF: "\u0304",
	0x02C9: "\u0304",
	// U+0306 combining breve.
	0x02D8: "\u0306",
	// U+0307 combining dot above.
	0x02D9: "\u0307",
	// U+0308 combining diaeresis.
	0x00A8: "\u0308",
	// U+030A combining ring above.
	0x00B0: "\u030A",
	0x02DA: "\u030A",
	// U+030B combining double acute accent.
	0x02BA: "\u030B",
	0x02DD: "\u030B",
	// U+030C combining caron.
	0x02C7: "\u030C",
	// U+030D combining vertical line above.
	0x02C8: "\u030D",
	// U+030E combining double vertical line above.
	0x0022: "\u030E",
	// U+0312 combining turned comma above.
	0x02BB: "\u0312",
	// U+0313 combining comma above.
	0x02BC: "\u0313",
	0x0486: "\u0313",
	0x055A: "\u0313",
	// U+0314 combining reversed comma above.
	0x02BD: "\u0314",
	0x0485: "\u0314",
	0x0559: "\u0314",
	// U+031D combining up tack below.
	0x02D4: "\u031D",
	// U+031E combining down tack below.
	0x02D5: "\u031E",
	// U+031F combining plus sign below.
	0x02D6: "\u031F",
	// U+0320 combining minus sign below.
	0x02D7: "\u0320",
	// U+0321 combining palatalized hook below.
	0x02B2: "\u0321",
	// U+0327 combining cedilla.
	0x00B8: "\u0327",
	// U+0329 combining vertical line below.
	0x02CC: "\u0329",
	// U+032B combining inverted double arch below.
	0x02B7: "\u032B",
	// U+0331 combining macron below.
	0x02CD: "\u0331",
	// U+0332 combining low line.
	0x005F: "\u0332",
	// U+0359 combining asterisk below.
	0x204E: "\u0359",
}
|