unipdf/extractor/text_utils.go

/*
 * This file is subject to the terms and conditions defined in
 * file 'LICENSE.md', which is part of this source code package.
 */

package extractor

import (
	"math"
	"sort"
	"unicode"
)

// TOL is the tolerance within which two coordinates are considered equal. It is large enough to
// absorb rounding errors and small enough that a difference of TOL points on a page isn't visible.
const TOL = 1.0e-6

// isZero reports whether `x` is within TOL of 0.0.
func isZero(x float64) bool {
	magnitude := math.Abs(x)
	return TOL > magnitude
}

// minInt returns the smaller of `a` and `b`.
func minInt(a, b int) int {
	if b <= a {
		return b
	}
	return a
}

// maxInt returns the larger of `a` and `b`.
func maxInt(a, b int) int {
	if b >= a {
		return b
	}
	return a
}

// addNeighbours fills out the left, right, above and below fields of the paras in `paras`.
// For each para `a` the candidates, and the winner chosen among them, are:
//    a.left:  paras that y-overlap `a` with Urx <= a.Llx; the one with the greatest Llx.
//    a.right: paras that y-overlap `a` with Llx >= a.Urx; the one with the least Llx.
//    a.above: paras that x-overlap `a` with Lly >= a.Ury; the one with the least Ury.
//    a.below: paras that x-overlap `a` with Ury <= a.Lly; the one with the greatest Ury.
// A field is only assigned when its winner is unique (it is assigned nil when there are no
// candidates). When several candidates share the winning value the field is not touched.
func (paras paraList) addNeighbours() {
	// Horizontal neighbours are picked from the paras that overlap in the y-direction.
	yOverlaps := paras.yNeighbours()
	for _, para := range paras {
		var left, right *textPara
		leftTied, rightTied := false, false
		for _, idx := range yOverlaps[para] {
			other := paras[idx]
			if other.Urx <= para.Llx {
				switch {
				case left == nil || other.Llx > left.Llx:
					// First candidate, or a strictly better one: any earlier tie is void.
					left, leftTied = other, false
				case other.Llx == left.Llx:
					leftTied = true
				}
			}
			if other.Llx >= para.Urx {
				switch {
				case right == nil || other.Llx < right.Llx:
					right, rightTied = other, false
				case other.Llx == right.Llx:
					rightTied = true
				}
			}
		}
		if !leftTied {
			para.left = left
		}
		if !rightTied {
			para.right = right
		}
	}

	// Vertical neighbours are picked from the paras that overlap in the x-direction.
	xOverlaps := paras.xNeighbours()
	for _, para := range paras {
		var above, below *textPara
		aboveTied, belowTied := false, false
		for _, idx := range xOverlaps[para] {
			other := paras[idx]
			if other.Lly >= para.Ury {
				switch {
				case above == nil || other.Ury < above.Ury:
					above, aboveTied = other, false
				case other.Ury == above.Ury:
					aboveTied = true
				}
			}
			if other.Ury <= para.Lly {
				switch {
				case below == nil || other.Ury > below.Ury:
					below, belowTied = other, false
				case other.Ury == below.Ury:
					belowTied = true
				}
			}
		}
		if !aboveTied {
			para.above = above
		}
		if !belowTied {
			para.below = below
		}
	}
}

// xNeighbours returns a map {para: indexes of paras that x-overlap para}.
// Each para contributes an enter event at its Llx and an exit event at its Urx.
func (paras paraList) xNeighbours() map[*textPara][]int {
	events := make([]event, 0, 2*len(paras))
	for idx, para := range paras {
		events = append(events,
			event{z: para.Llx, enter: true, i: idx},
			event{z: para.Urx, enter: false, i: idx},
		)
	}
	return paras.eventNeighbours(events)
}

// yNeighbours returns a map {para: indexes of paras that y-overlap para}.
// Each para contributes an enter event at its Lly and an exit event at its Ury.
func (paras paraList) yNeighbours() map[*textPara][]int {
	events := make([]event, 0, 2*len(paras))
	for idx, para := range paras {
		events = append(events,
			event{z: para.Lly, enter: true, i: idx},
			event{z: para.Ury, enter: false, i: idx},
		)
	}
	return paras.eventNeighbours(events)
}

// event is an entry to or an exit from an interval while scanning along one axis.
// xNeighbours and yNeighbours create one enter and one exit event per para.
type event struct {
	z     float64 // Coordinate in the scanning direction.
	enter bool    // True if entering the interval, false if leaving.
	i     int     // Index of the interval.
}

// eventNeighbours returns a map {para: indexes of paras that overlap para in `events`}.
//
// `events` holds the interval boundaries of the paras along one axis; event.i is the index of the
// para in `paras`. `events` is sorted in place and then swept in increasing coordinate order while
// tracking the set of intervals that are currently open. Every interval that is open when another
// one is entered overlaps it, and the relation is recorded in both directions.
// The order of the indexes within each returned slice is unspecified.
func (paras paraList) eventNeighbours(events []event) map[*textPara][]int {
	sort.Slice(events, func(i, j int) bool {
		ei, ej := events[i], events[j]
		if ei.z != ej.z {
			return ei.z < ej.z
		}
		if ei.enter != ej.enter {
			// Enter sorts before exit at the same coordinate, so intervals that merely touch are
			// still open together and count as overlapping.
			return ei.enter
		}
		// Tie-break on the interval index. This is a property of the events themselves, unlike
		// their current positions in the slice, so the ordering is consistent and deterministic.
		return ei.i < ej.i
	})

	// overlaps[i] is the set of interval indexes that overlap interval i.
	overlaps := map[int]map[int]struct{}{}
	// open is the set of intervals that have been entered but not yet exited.
	open := map[int]struct{}{}
	for _, e := range events {
		if !e.enter {
			delete(open, e.i)
			continue
		}
		overlaps[e.i] = map[int]struct{}{}
		for i := range open {
			if i != e.i {
				overlaps[e.i][i] = struct{}{}
				overlaps[i][e.i] = struct{}{}
			}
		}
		open[e.i] = struct{}{}
	}

	// Convert the per-interval sets to slices keyed by para.
	paraNeighbors := make(map[*textPara][]int, len(overlaps))
	for i, olap := range overlaps {
		neighbours := make([]int, 0, len(olap))
		for j := range olap {
			neighbours = append(neighbours, j)
		}
		paraNeighbors[paras[i]] = neighbours
	}
	return paraNeighbors
}

// isTextSpace reports whether every code point in `text` is a space, as defined by
// unicode.IsSpace. It returns true for an empty string.
func isTextSpace(text string) bool {
	onlySpaces := true
	for _, r := range text {
		if unicode.IsSpace(r) {
			continue
		}
		onlySpaces = false
		break
	}
	return onlySpaces
}

// combiningDiacritic looks up the combining form of `text`.
// If `text` consists of exactly one rune and that rune is a key of diacriticsToCombining, it
// returns the corresponding combining diacritic and true. Otherwise it returns "" and false.
func combiningDiacritic(text string) (string, bool) {
	var only rune
	count := 0
	for _, r := range text {
		count++
		if count > 1 {
			return "", false
		}
		only = r
	}
	if count != 1 {
		return "", false
	}
	combining, isDiacritic := diacriticsToCombining[only]
	return combining, isDiacritic
}

// diacriticsToCombining maps standalone (spacing) diacritic runes to their combining diacritic
// equivalents. Each entry's comment shows the standalone rune and the combining mark applied to 'o'.
// These values were copied from
// https://svn.apache.org/repos/asf/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java
var diacriticsToCombining = map[rune]string{
	0x0060: "\u0300", //   ` -> ò
	0x02CB: "\u0300", //   ˋ -> ò
	0x0027: "\u0301", //   ' -> ó
	0x00B4: "\u0301", //   ´ -> ó
	0x02B9: "\u0301", //   ʹ -> ó
	0x02CA: "\u0301", //   ˊ -> ó
	0x005E: "\u0302", //   ^ -> ô
	0x02C6: "\u0302", //   ˆ -> ô
	0x007E: "\u0303", //   ~ -> õ
	0x02DC: "\u0303", //   ˜ -> õ
	0x00AF: "\u0304", //   ¯ -> ō
	0x02C9: "\u0304", //   ˉ -> ō
	0x02D8: "\u0306", //   ˘ -> ŏ
	0x02D9: "\u0307", //   ˙ -> ȯ
	0x00A8: "\u0308", //   ¨ -> ö
	0x00B0: "\u030A", //   ° -> o̊
	0x02DA: "\u030A", //   ˚ -> o̊
	0x02BA: "\u030B", //   ʺ -> ő
	0x02DD: "\u030B", //   ˝ -> ő
	0x02C7: "\u030C", //   ˇ -> ǒ
	0x02C8: "\u030D", //   ˈ -> o̍
	0x0022: "\u030E", //   " -> o̎
	0x02BB: "\u0312", //   ʻ -> o̒
	0x02BC: "\u0313", //   ʼ -> o̓
	0x0486: "\u0313", //   ҆ -> o̓
	0x055A: "\u0313", //   ՚ -> o̓
	0x02BD: "\u0314", //   ʽ -> o̔
	0x0485: "\u0314", //   ҅ -> o̔
	0x0559: "\u0314", //   ՙ -> o̔
	0x02D4: "\u031D", //   ˔ -> o̝
	0x02D5: "\u031E", //   ˕ -> o̞
	0x02D6: "\u031F", //   ˖ -> o̟
	0x02D7: "\u0320", //   ˗ -> o̠
	0x02B2: "\u0321", //   ʲ -> o̡
	0x00B8: "\u0327", //   ¸ -> o̧
	0x02CC: "\u0329", //   ˌ -> o̩
	0x02B7: "\u032B", //   ʷ -> o̫
	0x02CD: "\u0331", //   ˍ -> o̱
	0x005F: "\u0332", //   _ -> o̲
	0x204E: "\u0359", //   ⁎ -> o͙
}
-												Text extraction code for columns. (#366)

* Fixed filename:page in logging

* Got CMap working for multi-rune entries

* Treat CMap entries as strings instead of runes to handle multi-byte encodings.

* Added a test for multibyte encoding.

* First version of text extraction that recognizes columns

* Added an explanation of the text columns code to README.md.

* fixed typos

* Abstracted textWord depth calculation. This required changing textMark to *textMark in a lot of code.

* Added function comments.

* Fixed text state save/restore.

* Adjusted inter-word search distance to make paragraph division work for thanh.pdf

* Got text_test.go passing.

* Reinstated hyphen suppression

* Handle more cases of fonts not being set in text extraction code.

* Fixed typo

* More verbose logging

* Adding tables to text extractor.

* Added tests for columns extraction.

* Removed commented code

* Check for textParas that are on the same line when writing out extracted text.

* Absorb text to the left of paras into paras e.g. Footnote numbers

* Removed funny character from text_test.go

* Commented out a creator_test.go test that was broken by my text extraction changes.

* Big changes to columns text extraction code for PR.

Performance improvements in several places.
Commented code.

* Updated extractor/README

* Cleaned up some comments and removed a panic

* Increased threshold for truncating extracted text when there is no license 100 -> 102.

This is a workaround to let a test in creator_test.go pass.

With the old text extraction code the following extracted text was 100 chars. With the new code it
is 102 chars which looks correct.

"你好\n你好你好你好你好\n河上白云\n\nUnlicensed UniDoc - Get a license on https://unidoc.io\n\n"

* Improved an error message.

* Removed irrelevant spaces

* Commented code and removed unused functions.

* Reverted PdfRectangle changes

* Added duplicate text detection.

* Combine diacritic textMarks in text extraction

* Reinstated a diacritic recombination test.

* Small code reorganisation

* Reinstated handling of rotated text

* Addressed issues in PR review

* Added color fields to TextMark

* Updated README

* Reinstated the disabled tests I missed before.

* Tightened definition for tables to prevent detection of tables where there weren't any.

* Compute line splitting search range based on fontsize of first word in word bag.

* Use errors.Is(err, core.ErrNotSupported) to distinguish unsupported font errors.

See https://blog.golang.org/go1.13-errors

* Fixed some naming and added some comments.

* errors.Is -> xerrors.Is and %w -> %v for go 1.12 compatibility

* Removed code that doesn't ever get called.

* Removed unused test
											
										
										
											2020-07-01 05:33:10 +10:00
+								/*
 								 * This file is subject to the terms and conditions defined in
 								 * file 'LICENSE.md', which is part of this source code package.
 								 */
 								package extractor
 								import (
 									"math"
 									"sort"
 									"unicode"
 								)
 								// TOL is the tolerance for coordinates to be consideted equal. It is big enough to cover all
 								// rounding errors and small enough that TOL point differences on a page aren't visible.
 								const TOL = 1.0e-6
 								// isZero returns true if x is with TOL of 0.0
 								func isZero(x float64) bool {
 									return math.Abs(x) < TOL
 								}
 								// minInt return the lesser of `a` and `b`.
 								func minInt(a, b int) int {
 									if a < b {
 										return a
 									}
 									return b
 								}
 								// maxInt return the greater of `a` and `b`.
 								func maxInt(a, b int) int {
 									if a > b {
 										return a
 									}
 									return b
 								}
 								// addNeighbours fills out the below and right fields of the paras in `paras`.
 								// For each para `a`:
 								//    a.below is the unique highest para completely below `a` that overlaps it in the x-direction
 								//    a.right is the unique leftmost para completely to the right of `a` that overlaps it in the y-direction
 								func (paras paraList) addNeighbours() {
 									paraNeighbours := paras.yNeighbours()
 									for _, para := range paras {
 										var left *textPara
 										dup := false
 										for _, k := range paraNeighbours[para] {
 											b := paras[k]
 											if b.Urx <= para.Llx {
 												if left == nil {
 													left = b
 												} else {
 													if b.Llx > left.Llx {
 														left = b
 														dup = false
 													} else if b.Llx == left.Llx {
 														dup = true
 													}
 												}
 											}
 										}
 										if !dup {
 											para.left = left
 										}
 									}
 									for _, para := range paras {
 										var right *textPara
 										dup := false
 										for _, k := range paraNeighbours[para] {
 											b := paras[k]
 											if b.Llx >= para.Urx {
 												if right == nil {
 													right = b
 												} else {
 													if b.Llx < right.Llx {
 														right = b
 														dup = false
 													} else if b.Llx == right.Llx {
 														dup = true
 													}
 												}
 											}
 										}
 										if !dup {
 											para.right = right
 										}
 									}
 									paraNeighbours = paras.xNeighbours()
 									for _, para := range paras {
 										var above *textPara
 										dup := false
 										for _, i := range paraNeighbours[para] {
 											b := paras[i]
 											if b.Lly >= para.Ury {
 												if above == nil {
 													above = b
 												} else {
 													if b.Ury < above.Ury {
 														above = b
 														dup = false
 													} else if b.Ury == above.Ury {
 														dup = true
 													}
 												}
 											}
 										}
 										if !dup {
 											para.above = above
 										}
 									}
 									for _, para := range paras {
 										var below *textPara
 										dup := false
 										for _, i := range paraNeighbours[para] {
 											b := paras[i]
 											if b.Ury <= para.Lly {
 												if below == nil {
 													below = b
 												} else {
 													if b.Ury > below.Ury {
 														below = b
 														dup = false
 													} else if b.Ury == below.Ury {
 														dup = true
 													}
 												}
 											}
 										}
 										if !dup {
 											para.below = below
 										}
 									}
 								}
 								// xNeighbours returns a map {para: indexes of paras that x-overlap para}.
 								func (paras paraList) xNeighbours() map[*textPara][]int {
 									events := make([]event, 2*len(paras))
 									for i, para := range paras {
 										events[2*i] = event{para.Llx, true, i}
 										events[2*i+1] = event{para.Urx, false, i}
 									}
 									return paras.eventNeighbours(events)
 								}
 								// yNeighbours returns a map {para: indexes of paras that y-overlap para}.
 								func (paras paraList) yNeighbours() map[*textPara][]int {
 									events := make([]event, 2*len(paras))
 									for i, para := range paras {
 										events[2*i] = event{para.Lly, true, i}
 										events[2*i+1] = event{para.Ury, false, i}
 									}
 									return paras.eventNeighbours(events)
 								}
 								// event is an entry or exit from an interval while scanning.
 								type event struct {
 									z     float64 // Coordinate in the scanning direction.
 									enter bool    // True if entering the interval, false it leaving.
 									i     int     // Index of the interval
 								}
 								// eventNeighbours returns a map {para: indexes of paras that overlap para in `events`}.
 								func (paras paraList) eventNeighbours(events []event) map[*textPara][]int {
 									sort.Slice(events, func(i, j int) bool {
 										ei, ej := events[i], events[j]
 										zi, zj := ei.z, ej.z
 										if zi != zj {
 											return zi < zj
 										}
 										if ei.enter != ej.enter {
 											return ei.enter
 										}
 										return i < j
 									})
 									overlaps := map[int]map[int]struct{}{}
 									olap := map[int]struct{}{}
 									for _, e := range events {
 										if e.enter {
 											overlaps[e.i] = map[int]struct{}{}
 											for i := range olap {
 												if i != e.i {
 													overlaps[e.i][i] = struct{}{}
 													overlaps[i][e.i] = struct{}{}
 												}
 											}
 											olap[e.i] = struct{}{}
 										} else {
 											delete(olap, e.i)
 										}
 									}
 									paraNeighbors := map[*textPara][]int{}
 									for i, olap := range overlaps {
 										para := paras[i]
 										neighbours := make([]int, len(olap))
 										k := 0
 										for j := range olap {
 											neighbours[k] = j
 											k++
 										}
 										paraNeighbors[para] = neighbours
 									}
 									return paraNeighbors
 								}
 								// isTextSpace returns true if `text` contains nothing but space code points.
 								func isTextSpace(text string) bool {
 									for _, r := range text {
 										if !unicode.IsSpace(r) {
 											return false
 										}
 									}
 									return true
 								}
 								// combiningDiacritic returns the combining version of `text` if text contains a single uncombined
 								// diacritic rune.
 								func combiningDiacritic(text string) (string, bool) {
 									runes := []rune(text)
 									if len(runes) != 1 {
 										return "", false
 									}
 									combining, isDiacritic := diacriticsToCombining[runes[0]]
 									return combining, isDiacritic
 								}
 								var (
 									// diacriticsToCombining is a map of diacritic runes to their combining diacritic equivalents.
 									// These values were  copied from  (https://svn.apache.org/repos/asf/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java)
 									diacriticsToCombining = map[rune]string{
 x0060: "\u0300", //   ` -> ò
 x02CB: "\u0300", //   ˋ -> ò
 x0027: "\u0301", //   ' -> ó
 x00B4: "\u0301", //   ´ -> ó
 x02B9: "\u0301", //   ʹ -> ó
 x02CA: "\u0301", //   ˊ -> ó
 x005E: "\u0302", //   ^ -> ô
 x02C6: "\u0302", //   ˆ -> ô
 x007E: "\u0303", //   ~ -> õ
 x02DC: "\u0303", //   ˜ -> õ
 x00AF: "\u0304", //   ¯ -> ō
 x02C9: "\u0304", //   ˉ -> ō
 x02D8: "\u0306", //   ˘ -> ŏ
 x02D9: "\u0307", //   ˙ -> ȯ
 x00A8: "\u0308", //   ¨ -> ö
 x00B0: "\u030A", //   ° -> o̊
 x02DA: "\u030A", //   ˚ -> o̊
 x02BA: "\u030B", //   ʺ -> ő
 x02DD: "\u030B", //   ˝ -> ő
 x02C7: "\u030C", //   ˇ -> ǒ
 x02C8: "\u030D", //   ˈ -> o̍
 x0022: "\u030E", //   " -> o̎
 x02BB: "\u0312", //   ʻ -> o̒
 x02BC: "\u0313", //   ʼ -> o̓
 x0486: "\u0313", //   ҆ -> o̓
 x055A: "\u0313", //   ՚ -> o̓
 x02BD: "\u0314", //   ʽ -> o̔
 x0485: "\u0314", //   ҅ -> o̔
 x0559: "\u0314", //   ՙ -> o̔
 x02D4: "\u031D", //   ˔ -> o̝
 x02D5: "\u031E", //   ˕ -> o̞
 x02D6: "\u031F", //   ˖ -> o̟
 x02D7: "\u0320", //   ˗ -> o̠
 x02B2: "\u0321", //   ʲ -> o̡
 x00B8: "\u0327", //   ¸ -> o̧
 x02CC: "\u0329", //   ˌ -> o̩
 x02B7: "\u032B", //   ʷ -> o̫
 x02CD: "\u0331", //   ˍ -> o̱
 x005F: "\u0332", //   _ -> o̲
 x204E: "\u0359", //   ⁎ -> o͙
 									}
 								)