unipdf/extractor/text_bound.go

/*
 * This file is subject to the terms and conditions defined in
 * file 'LICENSE.md', which is part of this source code package.
 */

package extractor

import (
	"math"

	"github.com/unidoc/unipdf/v3/model"
)

/*
 * Sorting functions.
 *
 * There are two directions:
 *  - reading. Left to right in English
 *  - depth (aka non-reading).  Top to bottom in English.
 *
 * Text is read in reading then depth order.
 *
 * TODO(peterwilliams97): Add support for other reading orders and page rotations
 */

// bounded is implemented by any object that has a bounding box, e.g. a mark, word, line or para.
// The helpers in this file only ever access such objects through bbox().
type bounded interface {
	// bbox returns the bounding rectangle of the object.
	bbox() model.PdfRectangle
}

// getDepth returns the depth of `a` on a page whose extent is `pageSize`.
// The depth is computed as `pageSize.Ury` minus the `Lly` of the bounding box of `a`.
func getDepth(pageSize model.PdfRectangle, a bounded) float64 {
	pageTop := pageSize.Ury
	box := a.bbox()
	return pageTop - box.Lly
}

// diffReading returns the signed offset of `a` relative to `b` in the reading direction.
// It is the `Llx` of the bounding box of `a` minus the `Llx` of the bounding box of `b`.
func diffReading(a, b bounded) float64 {
	boxA, boxB := a.bbox(), b.bbox()
	return boxA.Llx - boxB.Llx
}

// rectContainsRect reports whether `b` lies entirely within `a`.
// All comparisons are non-strict, so `b` may share edges with `a`.
func rectContainsRect(a, b model.PdfRectangle) bool {
	// Containment is checked independently on each axis.
	withinX := a.Llx <= b.Llx && b.Urx <= a.Urx
	withinY := a.Lly <= b.Lly && b.Ury <= a.Ury
	return withinX && withinY
}

// diffDepth returns the signed offset of `a` relative to `b` in the depth direction.
// It is the difference of the values bboxDepth gives for `a` and `b`.
func diffDepth(a, b bounded) float64 {
	depthA := bboxDepth(a)
	depthB := bboxDepth(b)
	return depthA - depthB
}

// diffReadingDepth returns `a` - `b` ordered by the reading direction first and the depth
// direction second. The reading difference is returned unless isZero reports it as zero, in
// which case the depth difference is returned instead.
func diffReadingDepth(a, b bounded) float64 {
	if reading := diffReading(a, b); !isZero(reading) {
		return reading
	}
	// Reading positions tie, so fall back to the depth direction.
	return diffDepth(a, b)
}

// diffDepthReading returns `a` - `b` ordered by the depth direction first and the reading
// direction second. The depth difference is returned unless isZero reports it as zero, in
// which case the reading difference is returned instead.
func diffDepthReading(a, b bounded) float64 {
	depth := diffDepth(a, b)
	if isZero(depth) {
		// Depths tie, so fall back to the reading direction.
		return diffReading(a, b)
	}
	return depth
}

// gapReading returns the signed distance in the reading direction from the `Urx` edge of `b`
// to the `Llx` edge of `a`, i.e. `a.bbox().Llx - b.bbox().Urx`. The result is positive when
// `a` starts beyond the end of `b` and negative when `a` starts before `b` ends.
// NOTE(review): an earlier comment described `b` as the object following `a`, which does not
// match this arithmetic; confirm the intended argument order against the callers.
func gapReading(a, b bounded) float64 {
	startOfA := a.bbox().Llx
	endOfB := b.bbox().Urx
	return startOfA - endOfB
}

// bboxDepth returns a relative depth for `b`: the negated `Lly` of its bounding box.
// Depth values are only compared with each other, so the absolute value is irrelevant.
func bboxDepth(b bounded) float64 {
	box := b.bbox()
	return -box.Lly
}

// readingOverlapLeft reports whether the left edge of `word` lies in the half-open interval
// [para.Urx, para.Urx+delta), i.e. at or past the right edge of `para` but less than `delta`
// beyond it.
func readingOverlapLeft(para *wordBag, word *textWord, delta float64) bool {
	wordLeft, paraRight := word.Llx, para.Urx
	return paraRight <= wordLeft && wordLeft < paraRight+delta
}

// readingOverlapPlusGap reports whether `word` overlaps, in the reading direction, the extent
// of `para` widened by `maxIntraReadingGap` on both sides. Both comparisons are strict, so a
// word that only touches the end of the widened interval does not count.
func readingOverlapPlusGap(para *wordBag, word *textWord, maxIntraReadingGap float64) bool {
	lower := para.Llx - maxIntraReadingGap
	upper := para.Urx + maxIntraReadingGap
	return lower < word.Urx && word.Llx < upper
}

// partial binds `param` as the third argument of `overlap` and returns the resulting
// two-argument function `func(*wordBag, *textWord) bool`.
func partial(overlap func(*wordBag, *textWord, float64) bool,
	param float64) func(*wordBag, *textWord) bool {
	bound := func(bag *wordBag, w *textWord) bool {
		return overlap(bag, w, param)
	}
	return bound
}

// rectUnion returns the smallest axis-aligned rectangle that encloses both `b1` and `b2`.
// Each lower bound is the minimum of the inputs and each upper bound is the maximum.
func rectUnion(b1, b2 model.PdfRectangle) model.PdfRectangle {
	var union model.PdfRectangle
	union.Llx = math.Min(b1.Llx, b2.Llx)
	union.Lly = math.Min(b1.Lly, b2.Lly)
	union.Urx = math.Max(b1.Urx, b2.Urx)
	union.Ury = math.Max(b1.Ury, b2.Ury)
	return union
}

// rectIntersection returns the largest axis-aligned rectangle contained in both `b1` and `b2`,
// together with true. If intersects reports that the rectangles do not overlap, it returns
// the zero rectangle and false.
func rectIntersection(b1, b2 model.PdfRectangle) (model.PdfRectangle, bool) {
	var common model.PdfRectangle
	if !intersects(b1, b2) {
		return common, false
	}
	// The shared region starts at the larger lower bound and ends at the smaller upper bound.
	common.Llx = math.Max(b1.Llx, b2.Llx)
	common.Urx = math.Min(b1.Urx, b2.Urx)
	common.Lly = math.Max(b1.Lly, b2.Lly)
	common.Ury = math.Min(b1.Ury, b2.Ury)
	return common, true
}

// intersects reports whether `b1` and `b2` overlap on both the x and the y axis.
func intersects(b1, b2 model.PdfRectangle) bool {
	if !intersectsX(b1, b2) {
		return false
	}
	return intersectsY(b1, b2)
}

// intersectsX reports whether the x extents of `r0` and `r1` overlap.
// The comparisons are non-strict, so extents that merely touch count as overlapping.
func intersectsX(r0, r1 model.PdfRectangle) bool {
	r1StartsBeforeR0Ends := r1.Llx <= r0.Urx
	r0StartsBeforeR1Ends := r0.Llx <= r1.Urx
	return r1StartsBeforeR0Ends && r0StartsBeforeR1Ends
}

// intersectsY reports whether the y extents of `r0` and `r1` overlap.
// The comparisons are non-strict, so extents that merely touch count as overlapping.
func intersectsY(r0, r1 model.PdfRectangle) bool {
	r0StartsBeforeR1Ends := r0.Lly <= r1.Ury
	r1StartsBeforeR0Ends := r1.Lly <= r0.Ury
	return r0StartsBeforeR1Ends && r1StartsBeforeR0Ends
}
Text extraction code for columns. (#366) * Fixed filename:page in logging * Got CMap working for multi-rune entries * Treat CMap entries as strings instead of runes to handle multi-byte encodings. * Added a test for multibyte encoding. * First version of text extraction that recognizes columns * Added an expanation of the text columns code to README.md. * fixed typos * Abstracted textWord depth calculation. This required change textMark to textMark in a lot of code. Added function comments. * Fixed text state save/restore. * Adjusted inter-word search distance to make paragrah division work for thanh.pdf * Got text_test.go passing. * Reinstated hyphen suppression * Handle more cases of fonts not being set in text extraction code. * Fixed typo * More verbose logging * Adding tables to text extractor. * Added tests for columns extraction. * Removed commented code * Check for textParas that are on the same line when writing out extracted text. * Absorb text to the left of paras into paras e.g. Footnote numbers * Removed funny character from text_test.go * Commented out a creator_test.go test that was broken by my text extraction changes. * Big changes to columns text extraction code for PR. Performance improvements in several places. Commented code. * Updated extractor/README * Cleaned up some comments and removed a panic * Increased threshold for truncating extracted text when there is no license 100 -> 102. This is a workaround to let a test in creator_test.go pass. With the old text extraction code the following extracted text was 100 chars. With the new code it is 102 chars which looks correct. "你好\n你好你好你好你好\n河上白云\n\nUnlicensed UniDoc - Get a license on https://unidoc.io\n\n" * Improved an error message. * Removed irrelevant spaces * Commented code and removed unused functions. * Reverted PdfRectangle changes * Added duplicate text detection. * Combine diacritic textMarks in text extraction * Reinstated a diacritic recombination test. 
* Small code reorganisation * Reinstated handling of rotated text * Addressed issues in PR review * Added color fields to TextMark * Updated README * Reinstated the disabled tests I missed before. * Tightened definition for tables to prevent detection of tables where there weren't any. * Compute line splitting search range based on fontsize of first word in word bag. * Use errors.Is(err, core.ErrNotSupported) to distinguish unsupported font errorrs. See https://blog.golang.org/go1.13-errors * Fixed some naming and added some comments. * errors.Is -> xerrors.Is and %w -> %v for go 1.12 compatibility * Removed code that doesn't ever get called. * Removed unused test 2020-07-01 05:33:10 +10:00			`/*`
			`* This file is subject to the terms and conditions defined in`
			`* file 'LICENSE.md', which is part of this source code package.`
			`*/`

			`package extractor`

			`import (`
			`"math"`

			`"github.com/unidoc/unipdf/v3/model"`
			`)`

			`/*`
			`* Sorting functions.`
			`*`
			`* There are two directions:`
			`* - reading. Left to right in English`
			`* - depth (aka non-reading). Top to botttom in English.`
			`*`
			`* Text is read in reading then depth order.`
			`*`
			`* TODO(peterwilliams97): Add support for other reading orders and page rotations`
			`*/`

			`// bounded is an object with a bounding box. A mark, word, line or para.`
			`type bounded interface {`
			`bbox() model.PdfRectangle`
			`}`

			// getDepth returns the depth of `a` on a page of size `pageSize`.
			`func getDepth(pageSize model.PdfRectangle, a bounded) float64 {`
			`return pageSize.Ury - a.bbox().Lly`
			`}`

			// diffReading returns `a` - `b` in the reading direction.
			`func diffReading(a, b bounded) float64 {`
			`return a.bbox().Llx - b.bbox().Llx`
			`}`

			// rectContainsRect returns true if `a` contains `b`.
			`func rectContainsRect(a, b model.PdfRectangle) bool {`
			`return a.Llx <= b.Llx && b.Urx <= a.Urx && a.Lly <= b.Lly && b.Ury <= a.Ury`
			`}`

			// diffDepth returns `a` - `b` in the depth direction.
			`func diffDepth(a, b bounded) float64 {`
			`return bboxDepth(a) - bboxDepth(b)`
			`}`

			// diffReadingDepth returns `a` - `b` in the reading then depth direction..
			`func diffReadingDepth(a, b bounded) float64 {`
			`diff := diffReading(a, b)`
			`if !isZero(diff) {`
			`return diff`
			`}`
			`return diffDepth(a, b)`
			`}`

			// diffDepthReading returns `a` - `b` in the depth then reading directions
			`func diffDepthReading(a, b bounded) float64 {`
			`cmp := diffDepth(a, b)`
			`if !isZero(cmp) {`
			`return cmp`
			`}`
			`return diffReading(a, b)`
			`}`

			// gapReading returns the reading direction gap between `a` and the following object `b` in the
			`// reading direction.`
			`func gapReading(a, b bounded) float64 {`
			`return a.bbox().Llx - b.bbox().Urx`
			`}`

			// bboxDepth returns the relative depth of `b`. Depth is only used for comparison so we don't care
			`// about its absolute value`
			`func bboxDepth(b bounded) float64 {`
			`return -b.bbox().Lly`
			`}`

			// readingOverlapLeft returns true is the left of `word` is in within `para` or delta to its right
			`func readingOverlapLeft(para wordBag, word textWord, delta float64) bool {`
			`return para.Urx <= word.Llx && word.Llx < para.Urx+delta`
			`}`

			// readingOverlapPlusGap returns true if `word` overlaps [para.Llx-maxIntraReadingGap, para.Urx+maxIntraReadingGap]
			`// in the reading direction.`
			`func readingOverlapPlusGap(para wordBag, word textWord, maxIntraReadingGap float64) bool {`
			`return word.Llx < para.Urx+maxIntraReadingGap && para.Llx-maxIntraReadingGap < word.Urx`
			`}`

			// partial return 'overlap`(wordBag, textWord, `param`) bool.
			`func partial(overlap func(wordBag, textWord, float64) bool,`
			`param float64) func(wordBag, textWord) bool {`
			`return func(para wordBag, word textWord) bool {`
			`return overlap(para, word, param)`
			`}`
			`}`

			// rectUnion returns the smallest axis-aligned rectangle that contains `b1` and `b2`.
			`func rectUnion(b1, b2 model.PdfRectangle) model.PdfRectangle {`
			`return model.PdfRectangle{`
			`Llx: math.Min(b1.Llx, b2.Llx),`
			`Lly: math.Min(b1.Lly, b2.Lly),`
			`Urx: math.Max(b1.Urx, b2.Urx),`
			`Ury: math.Max(b1.Ury, b2.Ury),`
			`}`
			`}`

			// rectIntersection returns the largest axis-aligned rectangle that is contained by `b1` and `b2`.
			`func rectIntersection(b1, b2 model.PdfRectangle) (model.PdfRectangle, bool) {`
			`if !intersects(b1, b2) {`
			`return model.PdfRectangle{}, false`
			`}`
			`return model.PdfRectangle{`
			`Llx: math.Max(b1.Llx, b2.Llx),`
			`Urx: math.Min(b1.Urx, b2.Urx),`
			`Lly: math.Max(b1.Lly, b2.Lly),`
			`Ury: math.Min(b1.Ury, b2.Ury),`
			`}, true`
			`}`

			// intersects returns true if `r0` and `r1` overlap in the x and y axes.
			`func intersects(b1, b2 model.PdfRectangle) bool {`
			`return intersectsX(b1, b2) && intersectsY(b1, b2)`
			`}`

			// intersectsX returns true if `r0` and `r1` overlap in the x axis.
			`func intersectsX(r0, r1 model.PdfRectangle) bool {`
			`return r1.Llx <= r0.Urx && r0.Llx <= r1.Urx`
			`}`

			// intersectsY returns true if `r0` and `r1` overlap in the y axis.
			`func intersectsY(r0, r1 model.PdfRectangle) bool {`
			`return r0.Lly <= r1.Ury && r1.Lly <= r0.Ury`
			`}`