unipdf/extractor/text_bound.go

137 lines
4.0 KiB
Go
Raw Normal View History

Text extraction code for columns. (#366) * Fixed filename:page in logging * Got CMap working for multi-rune entries * Treat CMap entries as strings instead of runes to handle multi-byte encodings. * Added a test for multibyte encoding. * First version of text extraction that recognizes columns * Added an expanation of the text columns code to README.md. * fixed typos * Abstracted textWord depth calculation. This required change textMark to *textMark in a lot of code. * Added function comments. * Fixed text state save/restore. * Adjusted inter-word search distance to make paragrah division work for thanh.pdf * Got text_test.go passing. * Reinstated hyphen suppression * Handle more cases of fonts not being set in text extraction code. * Fixed typo * More verbose logging * Adding tables to text extractor. * Added tests for columns extraction. * Removed commented code * Check for textParas that are on the same line when writing out extracted text. * Absorb text to the left of paras into paras e.g. Footnote numbers * Removed funny character from text_test.go * Commented out a creator_test.go test that was broken by my text extraction changes. * Big changes to columns text extraction code for PR. Performance improvements in several places. Commented code. * Updated extractor/README * Cleaned up some comments and removed a panic * Increased threshold for truncating extracted text when there is no license 100 -> 102. This is a workaround to let a test in creator_test.go pass. With the old text extraction code the following extracted text was 100 chars. With the new code it is 102 chars which looks correct. "你好\n你好你好你好你好\n河上白云\n\nUnlicensed UniDoc - Get a license on https://unidoc.io\n\n" * Improved an error message. * Removed irrelevant spaces * Commented code and removed unused functions. * Reverted PdfRectangle changes * Added duplicate text detection. * Combine diacritic textMarks in text extraction * Reinstated a diacritic recombination test. 
* Small code reorganisation * Reinstated handling of rotated text * Addressed issues in PR review * Added color fields to TextMark * Updated README * Reinstated the disabled tests I missed before. * Tightened definition for tables to prevent detection of tables where there weren't any. * Compute line splitting search range based on fontsize of first word in word bag. * Use errors.Is(err, core.ErrNotSupported) to distinguish unsupported font errorrs. See https://blog.golang.org/go1.13-errors * Fixed some naming and added some comments. * errors.Is -> xerrors.Is and %w -> %v for go 1.12 compatibility * Removed code that doesn't ever get called. * Removed unused test
2020-07-01 05:33:10 +10:00
/*
* This file is subject to the terms and conditions defined in
* file 'LICENSE.md', which is part of this source code package.
*/
package extractor
import (
"math"
"github.com/unidoc/unipdf/v3/model"
)
/*
* Sorting functions.
*
* There are two directions:
* - reading. Left to right in English
* - depth (aka non-reading). Top to bottom in English.
*
* Text is read in reading then depth order.
*
* TODO(peterwilliams97): Add support for other reading orders and page rotations
*/
// bounded is an object with a bounding box. A mark, word, line or para.
// The sorting and comparison helpers in this file accept any bounded value and
// only ever look at the rectangle returned by bbox().
type bounded interface {
// bbox returns the bounding box of the object.
bbox() model.PdfRectangle
}
// getDepth returns the depth of `a` on a page of size `pageSize`.
// It is computed as the upper y coordinate of the page (`pageSize.Ury`) minus the lower y
// coordinate (`Lly`) of the bounding box of `a`.
func getDepth(pageSize model.PdfRectangle, a bounded) float64 {
	bottom := a.bbox().Lly
	return pageSize.Ury - bottom
}
// diffReading returns `a` - `b` in the reading direction.
// The comparison uses the left edges (`Llx`) of the two bounding boxes, so the result is
// negative when `a` starts to the left of `b`.
func diffReading(a, b bounded) float64 {
	aLeft, bLeft := a.bbox().Llx, b.bbox().Llx
	return aLeft - bLeft
}
// rectContainsRect returns true if `a` contains `b`.
// Containment is inclusive: `b` may share edges with `a`.
func rectContainsRect(a, b model.PdfRectangle) bool {
	insideX := a.Llx <= b.Llx && b.Urx <= a.Urx
	insideY := a.Lly <= b.Lly && b.Ury <= a.Ury
	return insideX && insideY
}
// diffDepth returns `a` - `b` in the depth direction.
// Both depths are obtained from bboxDepth, so only their difference is meaningful.
func diffDepth(a, b bounded) float64 {
	depthA := bboxDepth(a)
	depthB := bboxDepth(b)
	return depthA - depthB
}
// diffReadingDepth returns `a` - `b` in the reading then depth direction.
// The reading difference is returned unless isZero reports it as zero, in which case the
// depth difference is used as the tie-breaker.
func diffReadingDepth(a, b bounded) float64 {
	if readingDiff := diffReading(a, b); !isZero(readingDiff) {
		return readingDiff
	}
	return diffDepth(a, b)
}
// diffDepthReading returns `a` - `b` in the depth then reading directions.
// The depth difference is returned unless isZero reports it as zero, in which case the
// reading difference is used as the tie-breaker.
func diffDepthReading(a, b bounded) float64 {
	depthDiff := diffDepth(a, b)
	if isZero(depthDiff) {
		return diffReading(a, b)
	}
	return depthDiff
}
// gapReading returns the signed gap in the reading direction from the right edge of `b` to the
// left edge of `a`, i.e. `a.Llx - b.Urx`. The result is positive when `a` starts to the right of
// where `b` ends and negative when the two overlap in the reading direction.
// NOTE(review): this means `a` is expected to be the later object and `b` the earlier one in
// reading order - confirm against callers.
func gapReading(a, b bounded) float64 {
	start := a.bbox().Llx
	end := b.bbox().Urx
	return start - end
}
// bboxDepth returns the relative depth of `b`, defined as the negated lower y coordinate of its
// bounding box. Depth is only used for comparison so its absolute value does not matter.
func bboxDepth(b bounded) float64 {
	box := b.bbox()
	return -box.Lly
}
// readingOverlapLeft returns true if the left edge of `word` lies at or to the right of the right
// edge of `para` and less than `delta` beyond it, i.e. `word.Llx` is in the half-open interval
// [para.Urx, para.Urx+delta).
func readingOverlapLeft(para *wordBag, word *textWord, delta float64) bool {
	left := word.Llx
	return left >= para.Urx && left < para.Urx+delta
}
// readingOverlapPlusGap returns true if `word` overlaps
// [para.Llx-maxIntraReadingGap, para.Urx+maxIntraReadingGap] in the reading direction.
// Both comparisons are strict, so a word that only touches an end of the widened interval does
// not count as overlapping.
func readingOverlapPlusGap(para *wordBag, word *textWord, maxIntraReadingGap float64) bool {
	lo := para.Llx - maxIntraReadingGap
	hi := para.Urx + maxIntraReadingGap
	return word.Llx < hi && lo < word.Urx
}
// partial binds `param` as the third argument of `overlap` and returns the resulting
// two-argument function `func(*wordBag, *textWord) bool`.
func partial(overlap func(*wordBag, *textWord, float64) bool,
	param float64) func(*wordBag, *textWord) bool {
	bound := func(para *wordBag, word *textWord) bool {
		return overlap(para, word, param)
	}
	return bound
}
// rectUnion returns the smallest axis-aligned rectangle that contains `b1` and `b2`.
// Each lower coordinate is the minimum of the two inputs and each upper coordinate the maximum.
func rectUnion(b1, b2 model.PdfRectangle) model.PdfRectangle {
	var union model.PdfRectangle
	union.Llx = math.Min(b1.Llx, b2.Llx)
	union.Lly = math.Min(b1.Lly, b2.Lly)
	union.Urx = math.Max(b1.Urx, b2.Urx)
	union.Ury = math.Max(b1.Ury, b2.Ury)
	return union
}
// rectIntersection returns the largest axis-aligned rectangle that is contained by `b1` and `b2`.
// The boolean result is false, and the rectangle is the zero value, when intersects reports that
// `b1` and `b2` do not overlap.
func rectIntersection(b1, b2 model.PdfRectangle) (model.PdfRectangle, bool) {
	var common model.PdfRectangle
	if !intersects(b1, b2) {
		return common, false
	}
	common.Llx = math.Max(b1.Llx, b2.Llx)
	common.Urx = math.Min(b1.Urx, b2.Urx)
	common.Lly = math.Max(b1.Lly, b2.Lly)
	common.Ury = math.Min(b1.Ury, b2.Ury)
	return common, true
}
// intersects returns true if `b1` and `b2` overlap in both the x and y axes.
func intersects(b1, b2 model.PdfRectangle) bool {
	if !intersectsX(b1, b2) {
		return false
	}
	return intersectsY(b1, b2)
}
// intersectsX returns true if `r0` and `r1` overlap in the x axis.
// The test is inclusive: rectangles that merely share a vertical edge count as overlapping.
func intersectsX(r0, r1 model.PdfRectangle) bool {
	r1StartsBeforeR0Ends := r0.Urx >= r1.Llx
	r0StartsBeforeR1Ends := r1.Urx >= r0.Llx
	return r1StartsBeforeR0Ends && r0StartsBeforeR1Ends
}
// intersectsY returns true if `r0` and `r1` overlap in the y axis.
// The test is inclusive: rectangles that merely share a horizontal edge count as overlapping.
func intersectsY(r0, r1 model.PdfRectangle) bool {
	r0StartsBeforeR1Ends := r1.Ury >= r0.Lly
	r1StartsBeforeR0Ends := r0.Ury >= r1.Lly
	return r0StartsBeforeR1Ends && r1StartsBeforeR0Ends
}