unipdf/extractor/text_line.go
Peter Williams 88fda44e0a
Text extraction code for columns. (#366)
* Fixed filename:page in logging

* Got CMap working for multi-rune entries

* Treat CMap entries as strings instead of runes to handle multi-byte encodings.

* Added a test for multibyte encoding.

* First version of text extraction that recognizes columns

* Added an explanation of the text columns code to README.md.

* fixed typos

* Abstracted textWord depth calculation. This required changing textMark to *textMark in a lot of code.

* Added function comments.

* Fixed text state save/restore.

* Adjusted inter-word search distance to make paragraph division work for thanh.pdf

* Got text_test.go passing.

* Reinstated hyphen suppression

* Handle more cases of fonts not being set in text extraction code.

* Fixed typo

* More verbose logging

* Adding tables to text extractor.

* Added tests for columns extraction.

* Removed commented code

* Check for textParas that are on the same line when writing out extracted text.

* Absorb text to the left of paras into paras e.g. Footnote numbers

* Removed funny character from text_test.go

* Commented out a creator_test.go test that was broken by my text extraction changes.

* Big changes to columns text extraction code for PR.

Performance improvements in several places.
Commented code.

* Updated extractor/README

* Cleaned up some comments and removed a panic

* Increased threshold for truncating extracted text when there is no license 100 -> 102.

This is a workaround to let a test in creator_test.go pass.

With the old text extraction code the following extracted text was 100 chars. With the new code it
is 102 chars which looks correct.

"你好\n你好你好你好你好\n河上白云\n\nUnlicensed UniDoc - Get a license on https://unidoc.io\n\n"

* Improved an error message.

* Removed irrelevant spaces

* Commented code and removed unused functions.

* Reverted PdfRectangle changes

* Added duplicate text detection.

* Combine diacritic textMarks in text extraction

* Reinstated a diacritic recombination test.

* Small code reorganisation

* Reinstated handling of rotated text

* Addressed issues in PR review

* Added color fields to TextMark

* Updated README

* Reinstated the disabled tests I missed before.

* Tightened definition for tables to prevent detection of tables where there weren't any.

* Compute line splitting search range based on fontsize of first word in word bag.

* Use errors.Is(err, core.ErrNotSupported) to distinguish unsupported font errors.

See https://blog.golang.org/go1.13-errors

* Fixed some naming and added some comments.

* errors.Is -> xerrors.Is and %w -> %v for go 1.12 compatibility

* Removed code that doesn't ever get called.

* Removed unused test
2020-06-30 19:33:10 +00:00

127 lines
3.6 KiB
Go

/*
* This file is subject to the terms and conditions defined in
* file 'LICENSE.md', which is part of this source code package.
*/
package extractor
import (
"fmt"
"strings"
"unicode"
"github.com/unidoc/unipdf/v3/model"
)
// textLine represents words on the same line within a textPara.
// Words are added with appendWord, which keeps the bounding box, depth and fontsize up to date.
type textLine struct {
model.PdfRectangle // Bounding box (union of the bounding boxes of `words`).
depth float64 // Largest `depth` of the words in this line (distance from bottom of line to top of page).
words []*textWord // Words in this line, in the order they were appended.
fontsize float64 // Largest word font size.
}
// newTextLine starts a new line from the first word of `b` at depth index `depthIdx`.
// The line takes its initial bounding box, fontsize and depth from that word, and the word is moved
// out of `b` and into the line.
func newTextLine(b *wordBag, depthIdx int) *textLine {
	first := b.firstWord(depthIdx)
	l := &textLine{
		PdfRectangle: first.PdfRectangle,
		depth:        first.depth,
		fontsize:     first.fontsize,
	}
	// pullWord appends `first` to the line and removes it from the bag.
	l.pullWord(b, first, depthIdx)
	return l
}
// String returns a one-line description of `l`: its depth, bounding box, fontsize and quoted text.
func (l *textLine) String() string {
	contents := l.text()
	return fmt.Sprintf("%.2f %6.2f fontsize=%.2f \"%s\"", l.depth, l.PdfRectangle, l.fontsize, contents)
}
// bbox returns the bounding box of `l`.
// It makes textLine implement the `bounded` interface.
func (l *textLine) bbox() model.PdfRectangle {
	box := l.PdfRectangle
	return box
}
// text returns the extracted text contained in `l`.
// The texts of the words are concatenated in order, with a single space inserted before every word
// that is flagged as the start of a new word (`newWord`).
func (l *textLine) text() string {
	var sb strings.Builder
	for _, w := range l.words {
		if w.newWord {
			sb.WriteString(" ")
		}
		sb.WriteString(w.text)
	}
	return sb.String()
}
// toTextMarks returns the TextMarks for the text returned by `l`.text().
// A space mark is emitted before every word flagged `newWord`, mirroring the spaces text() inserts.
// `offset` is handed to the helpers that build the marks so they can assign Offset values
// (presumably they advance it as they go; see appendSpaceMark and textWord.toTextMarks).
func (l *textLine) toTextMarks(offset *int) []TextMark {
	var out []TextMark
	for i := range l.words {
		word := l.words[i]
		if word.newWord {
			out = appendSpaceMark(out, offset, " ")
		}
		out = append(out, word.toTextMarks(offset)...)
	}
	return out
}
// pullWord moves `word` from `bag` to `l`: it appends `word` to `l` (appendWord updates the line's
// bounding box, fontsize and depth) and then removes it from `bag`.
// `depthIdx` is passed through to bag.removeWord.
func (l *textLine) pullWord(bag *wordBag, word *textWord, depthIdx int) {
l.appendWord(word)
bag.removeWord(word, depthIdx)
}
// appendWord adds `word` to the end of `l` and updates the line's summary fields:
// `l.fontsize` and `l.depth` become the largest fontsize and depth seen so far, and
// `l.PdfRectangle` grows to the union of itself and the word's bounding box.
func (l *textLine) appendWord(word *textWord) {
	l.words = append(l.words, word)
	if l.fontsize < word.fontsize {
		l.fontsize = word.fontsize
	}
	if l.depth < word.depth {
		l.depth = word.depth
	}
	l.PdfRectangle = rectUnion(l.PdfRectangle, word.PdfRectangle)
}
// markWordBoundaries marks the word fragments that are the first fragments in whole words.
// A fragment is flagged `newWord` when gapReading between it and the fragment before it is at
// least maxIntraLineGapR * `l.fontsize`. The first fragment in the line has no predecessor and is
// never flagged here.
// A line with fewer than two fragments is left unchanged (an empty line no longer panics).
func (l *textLine) markWordBoundaries() {
	maxGap := maxIntraLineGapR * l.fontsize
	// Start at 1 so that every fragment examined has a predecessor to measure the gap against.
	for i := 1; i < len(l.words); i++ {
		prev, cur := l.words[i-1], l.words[i]
		if gapReading(cur, prev) >= maxGap {
			cur.newWord = true
		}
	}
}
// endsInHyphen attempts to detect words that are split between lines.
// It currently returns true if the text of `l` ends in a hyphen, is at least minHyphenation runes
// long and the rune before the hyphen is not a space (see the endsInHyphen function).
// It returns false for a line that contains no words.
// TODO(peterwilliams97): Figure out a better heuristic
func (l *textLine) endsInHyphen() bool {
	if len(l.words) == 0 {
		return false
	}
	// Computing l.text() is a little expensive so we filter out simple cases first.
	lastWord := l.words[len(l.words)-1]
	runes := []rune(lastWord.text)
	// The shortcuts below only apply when the last fragment has text. An empty fragment
	// contributes nothing to the end of the line, so the whole line text has to be checked.
	if n := len(runes); n > 0 {
		if !unicode.Is(unicode.Hyphen, runes[n-1]) {
			return false
		}
		// A fragment that starts a new word is preceded by a space in l.text(), so checking the
		// fragment on its own is sufficient when it already satisfies the test.
		if lastWord.newWord && endsInHyphen(runes) {
			return true
		}
	}
	return endsInHyphen([]rune(l.text()))
}
// endsInHyphen returns true if `runes` ends with a hyphenated word: it must contain at least
// minHyphenation runes, its last rune must be a hyphen and the rune before that must not be
// whitespace.
// NOTE(review): indexing the second-to-last rune assumes minHyphenation >= 2 — confirm where the
// constant is defined.
func endsInHyphen(runes []rune) bool {
	n := len(runes)
	if n < minHyphenation {
		return false
	}
	if !unicode.Is(unicode.Hyphen, runes[n-1]) {
		return false
	}
	return !unicode.IsSpace(runes[n-2])
}