unipdf/extractor/text_line.go

/*
 * This file is subject to the terms and conditions defined in
 * file 'LICENSE.md', which is part of this source code package.
 */

package extractor

import (
	"fmt"
	"strings"
	"unicode"

	"github.com/unidoc/unipdf/v3/model"
)

// textLine represents words on the same line within a textPara.
// A line is grown one word at a time by appendWord, which extends the bounding box and raises
// `fontsize` and `depth` to cover each added word.
type textLine struct {
	model.PdfRectangle             // Bounding box (union of the bounding boxes of `words`).
	depth              float64     // Distance from bottom of line to top of page (largest depth of `words`).
	words              []*textWord // Words in this line.
	fontsize           float64     // Largest word font size.
}

// newTextLine builds a textLine seeded from the word returned by `b.firstWord(depthIdx)`.
// The new line starts out with that word's bounding box, font size and depth. The word is then
// moved out of `b` and into the line with pullWord.
func newTextLine(b *wordBag, depthIdx int) *textLine {
	first := b.firstWord(depthIdx)
	line := &textLine{
		PdfRectangle: first.PdfRectangle,
		fontsize:     first.fontsize,
		depth:        first.depth,
	}
	line.pullWord(b, first, depthIdx)
	return line
}

// String returns a one-line description of `l`: its depth, bounding box, font size and
// quoted text, in that order.
func (l *textLine) String() string {
	const format = "%.2f %6.2f fontsize=%.2f \"%s\""
	return fmt.Sprintf(format, l.depth, l.PdfRectangle, l.fontsize, l.text())
}

// bbox returns the bounding box of `l`.
// It makes textLine implement the `bounded` interface.
func (l *textLine) bbox() model.PdfRectangle {
	return l.PdfRectangle
}

// text returns the extracted text contained in the line.
// The text of every word is concatenated in order, and a single space is written before each
// word whose `newWord` flag is set. A line with no words yields the empty string.
func (l *textLine) text() string {
	var sb strings.Builder
	for _, word := range l.words {
		if word.newWord {
			sb.WriteByte(' ')
		}
		sb.WriteString(word.text)
	}
	return sb.String()
}

// toTextMarks returns the TextMarks contained in `l`.text().
// The marks of each word are appended in order, preceded by a " " space mark for every word
// whose `newWord` flag is set.
// `offset` is handed to the mark builders and is used to give the TextMarks the correct Offset
// values.
func (l *textLine) toTextMarks(offset *int) []TextMark {
	var marks []TextMark
	for i := range l.words {
		word := l.words[i]
		if word.newWord {
			marks = appendSpaceMark(marks, offset, " ")
		}
		marks = append(marks, word.toTextMarks(offset)...)
	}
	return marks
}

// pullWord moves `word` from `bag` to `l`.
// The word is first appended to the line with appendWord and then removed from the bag with
// `bag.removeWord(word, depthIdx)`.
func (l *textLine) pullWord(bag *wordBag, word *textWord, depthIdx int) {
	l.appendWord(word)
	bag.removeWord(word, depthIdx)
}

// appendWord adds `word` to the end of `l.words` and updates the line's summary fields:
//   - `l.PdfRectangle` becomes the union of its previous value and the word's bounding box.
//   - `l.fontsize` is raised to the word's font size if that is larger.
//   - `l.depth` is raised to the word's depth if that is larger.
func (l *textLine) appendWord(word *textWord) {
	l.words = append(l.words, word)
	l.PdfRectangle = rectUnion(l.PdfRectangle, word.PdfRectangle)
	if l.fontsize < word.fontsize {
		l.fontsize = word.fontsize
	}
	if l.depth < word.depth {
		l.depth = word.depth
	}
}

// markWordBoundaries marks the word fragments that are the first fragments in whole words.
// A fragment gets `newWord = true` when gapReading between it and the preceding fragment is at
// least `maxIntraLineGapR` times the line's font size. The first fragment is never marked here
// because it has no predecessor. A line with fewer than two fragments is left unchanged.
func (l *textLine) markWordBoundaries() {
	maxGap := maxIntraLineGapR * l.fontsize
	// Iterate by index starting at the second fragment. Ranging over l.words[1:] would panic
	// with a slice-bounds error when the line has no words at all.
	for i := 1; i < len(l.words); i++ {
		prev, w := l.words[i-1], l.words[i]
		if gapReading(w, prev) >= maxGap {
			w.newWord = true
		}
	}
}

// endsInHyphen attempts to detect words that are split between lines.
// It currently returns true if the text of `l` passes the package-level endsInHyphen check, i.e.
// it is at least minHyphenation runes long, ends in a hyphen, and the rune before the hyphen is
// not a space. A line with no words never ends in a hyphen.
// TODO(peterwilliams97): Figure out a better heuristic
func (l *textLine) endsInHyphen() bool {
	if len(l.words) == 0 {
		return false
	}
	// Computing l.text() is a little expensive so we filter out simple cases first.
	lastWord := l.words[len(l.words)-1]
	runes := []rune(lastWord.text)
	// The shortcuts below need a final rune to inspect. If the last fragment has no text we
	// skip them and fall through to the full-text check instead of indexing an empty slice.
	if len(runes) > 0 {
		if !unicode.Is(unicode.Hyphen, runes[len(runes)-1]) {
			return false
		}
		// A fragment that starts a new word can be judged on its own text.
		if lastWord.newWord && endsInHyphen(runes) {
			return true
		}
	}
	return endsInHyphen([]rune(l.text()))
}

// endsInHyphen returns true if `runes` ends with a hyphenated word.
// That is the case when `runes` has at least minHyphenation runes, its last rune is a hyphen
// (unicode.Hyphen) and the rune before the hyphen is not white space.
func endsInHyphen(runes []rune) bool {
	n := len(runes)
	// Two runes are needed to look at the hyphen and the rune before it. Checking this here
	// keeps the indexing below safe whatever value minHyphenation has.
	if n < 2 || n < minHyphenation {
		return false
	}
	return unicode.Is(unicode.Hyphen, runes[n-1]) && !unicode.IsSpace(runes[n-2])
}
Text extraction code for columns. (#366) * Fixed filename:page in logging * Got CMap working for multi-rune entries * Treat CMap entries as strings instead of runes to handle multi-byte encodings. * Added a test for multibyte encoding. * First version of text extraction that recognizes columns * Added an expanation of the text columns code to README.md. * fixed typos * Abstracted textWord depth calculation. This required change textMark to textMark in a lot of code. Added function comments. * Fixed text state save/restore. * Adjusted inter-word search distance to make paragrah division work for thanh.pdf * Got text_test.go passing. * Reinstated hyphen suppression * Handle more cases of fonts not being set in text extraction code. * Fixed typo * More verbose logging * Adding tables to text extractor. * Added tests for columns extraction. * Removed commented code * Check for textParas that are on the same line when writing out extracted text. * Absorb text to the left of paras into paras e.g. Footnote numbers * Removed funny character from text_test.go * Commented out a creator_test.go test that was broken by my text extraction changes. * Big changes to columns text extraction code for PR. Performance improvements in several places. Commented code. * Updated extractor/README * Cleaned up some comments and removed a panic * Increased threshold for truncating extracted text when there is no license 100 -> 102. This is a workaround to let a test in creator_test.go pass. With the old text extraction code the following extracted text was 100 chars. With the new code it is 102 chars which looks correct. "你好\n你好你好你好你好\n河上白云\n\nUnlicensed UniDoc - Get a license on https://unidoc.io\n\n" * Improved an error message. * Removed irrelevant spaces * Commented code and removed unused functions. * Reverted PdfRectangle changes * Added duplicate text detection. * Combine diacritic textMarks in text extraction * Reinstated a diacritic recombination test. 
* Small code reorganisation * Reinstated handling of rotated text * Addressed issues in PR review * Added color fields to TextMark * Updated README * Reinstated the disabled tests I missed before. * Tightened definition for tables to prevent detection of tables where there weren't any. * Compute line splitting search range based on fontsize of first word in word bag. * Use errors.Is(err, core.ErrNotSupported) to distinguish unsupported font errorrs. See https://blog.golang.org/go1.13-errors * Fixed some naming and added some comments. * errors.Is -> xerrors.Is and %w -> %v for go 1.12 compatibility * Removed code that doesn't ever get called. * Removed unused test 2020-07-01 05:33:10 +10:00			`/*`
			`* This file is subject to the terms and conditions defined in`
			`* file 'LICENSE.md', which is part of this source code package.`
			`*/`

			`package extractor`

			`import (`
			`"fmt"`
			`"strings"`
			`"unicode"`

			`"github.com/unidoc/unipdf/v3/model"`
			`)`

			`// textLine repesents words on the same line within a textPara.`
			`type textLine struct {`
			model.PdfRectangle // Bounding box (union of `marks` bounding boxes).
			`depth float64 // Distance from bottom of line to top of page.`
			`words []*textWord // Words in this line.`
			`fontsize float64 // Largest word font size.`
			`}`

			// newTextLine creates a line with the font and bbox size of the first word in `b`, removes the word
			// from `b` and adds it to the line.
			`func newTextLine(b wordBag, depthIdx int) textLine {`
			`word := b.firstWord(depthIdx)`
			`line := textLine{`
			`PdfRectangle: word.PdfRectangle,`
			`fontsize: word.fontsize,`
			`depth: word.depth,`
			`}`
			`line.pullWord(b, word, depthIdx)`
			`return &line`
			`}`

			// String returns a description of `l`.
			`func (l *textLine) String() string {`
			`return fmt.Sprintf("%.2f %6.2f fontsize=%.2f \"%s\"",`
			`l.depth, l.PdfRectangle, l.fontsize, l.text())`
			`}`

			// bbox makes textLine implement the `bounded` interface.
			`func (l *textLine) bbox() model.PdfRectangle {`
			`return l.PdfRectangle`
			`}`

			`// text returns the extracted text contained in line.`
			`func (l *textLine) text() string {`
			`var words []string`
			`for _, w := range l.words {`
			`if w.newWord {`
			`words = append(words, " ")`
			`}`
			`words = append(words, w.text)`
			`}`
			`return strings.Join(words, "")`
			`}`

			// toTextMarks returns the TextMarks contained in `l`.text().
			// `offset` is used to give the TextMarks the correct Offset values.
			`func (l textLine) toTextMarks(offset int) []TextMark {`
			`var marks []TextMark`
			`for _, w := range l.words {`
			`if w.newWord {`
			`marks = appendSpaceMark(marks, offset, " ")`
			`}`
			`wordMarks := w.toTextMarks(offset)`
			`marks = append(marks, wordMarks...)`
			`}`
			`return marks`
			`}`

			// pullWord removes `word` from bag and appends it to `l`.
			`func (l textLine) pullWord(bag wordBag, word *textWord, depthIdx int) {`
			`l.appendWord(word)`
			`bag.removeWord(word, depthIdx)`
			`}`

			// appendWord appends `word` to `l`.
			// `l.PdfRectangle` is increased to bound the new word.
			// `l.fontsize` is the largest of the fontsizes of the words in line.
			`func (l textLine) appendWord(word textWord) {`
			`l.words = append(l.words, word)`
			`l.PdfRectangle = rectUnion(l.PdfRectangle, word.PdfRectangle)`
			`if word.fontsize > l.fontsize {`
			`l.fontsize = word.fontsize`
			`}`
			`if word.depth > l.depth {`
			`l.depth = word.depth`
			`}`
			`}`

			`// markWordBoundaries marks the word fragments that are the first fragments in whole words.`
			`func (l *textLine) markWordBoundaries() {`
			`maxGap := maxIntraLineGapR * l.fontsize`
			`for i, w := range l.words[1:] {`
			`if gapReading(w, l.words[i]) >= maxGap {`
			`w.newWord = true`
			`}`
			`}`
			`}`

			`// endsInHyphen attempts to detect words that are split between lines`
			// IT currently returns true if `l` ends in a hyphen and its last minHyphenation runes don't coataib
			`// a space.`
			`// TODO(peterwilliams97): Figure out a better heuristic`
			`func (l *textLine) endsInHyphen() bool {`
			`// Computing l.text() is a little expensive so we filter out simple cases first.`
			`lastWord := l.words[len(l.words)-1]`
			`runes := []rune(lastWord.text)`
			`if !unicode.Is(unicode.Hyphen, runes[len(runes)-1]) {`
			`return false`
			`}`
			`if lastWord.newWord && endsInHyphen(runes) {`
			`return true`
			`}`
			`return endsInHyphen([]rune(l.text()))`
			`}`

			// endsInHyphen returns true if `runes` ends with a hyphenated word.
			`func endsInHyphen(runes []rune) bool {`
			`return len(runes) >= minHyphenation &&`
			`unicode.Is(unicode.Hyphen, runes[len(runes)-1]) &&`
			`!unicode.IsSpace(runes[len(runes)-2])`
			`}`