unipdf/extractor/text_word.go
Peter Williams 88fda44e0a
Text extraction code for columns. (#366)
* Fixed filename:page in logging

* Got CMap working for multi-rune entries

* Treat CMap entries as strings instead of runes to handle multi-byte encodings.

* Added a test for multibyte encoding.

* First version of text extraction that recognizes columns

* Added an explanation of the text columns code to README.md.

* fixed typos

* Abstracted textWord depth calculation. This required changing textMark to *textMark in a lot of code.

* Added function comments.

* Fixed text state save/restore.

* Adjusted inter-word search distance to make paragraph division work for thanh.pdf

* Got text_test.go passing.

* Reinstated hyphen suppression

* Handle more cases of fonts not being set in text extraction code.

* Fixed typo

* More verbose logging

* Adding tables to text extractor.

* Added tests for columns extraction.

* Removed commented code

* Check for textParas that are on the same line when writing out extracted text.

* Absorb text to the left of paras into paras, e.g. footnote numbers

* Removed funny character from text_test.go

* Commented out a creator_test.go test that was broken by my text extraction changes.

* Big changes to columns text extraction code for PR.

Performance improvements in several places.
Commented code.

* Updated extractor/README

* Cleaned up some comments and removed a panic

* Increased the threshold for truncating extracted text when there is no license from 100 to 102 characters.

This is a workaround to let a test in creator_test.go pass.

With the old text extraction code the following extracted text was 100 chars. With the new code it
is 102 chars, which looks correct.

"你好\n你好你好你好你好\n河上白云\n\nUnlicensed UniDoc - Get a license on https://unidoc.io\n\n"

* Improved an error message.

* Removed irrelevant spaces

* Commented code and removed unused functions.

* Reverted PdfRectangle changes

* Added duplicate text detection.

* Combine diacritic textMarks in text extraction

* Reinstated a diacritic recombination test.

* Small code reorganisation

* Reinstated handling of rotated text

* Addressed issues in PR review

* Added color fields to TextMark

* Updated README

* Reinstated the disabled tests I missed before.

* Tightened definition for tables to prevent detection of tables where there weren't any.

* Compute line splitting search range based on fontsize of first word in word bag.

* Use errors.Is(err, core.ErrNotSupported) to distinguish unsupported font errors.

See https://blog.golang.org/go1.13-errors

* Fixed some naming and added some comments.

* errors.Is -> xerrors.Is and %w -> %v for go 1.12 compatibility

* Removed code that doesn't ever get called.

* Removed unused test
2020-06-30 19:33:10 +00:00


/*
* This file is subject to the terms and conditions defined in
* file 'LICENSE.md', which is part of this source code package.
*/
package extractor
import (
"fmt"
"math"
"strings"
"github.com/unidoc/unipdf/v3/common"
"github.com/unidoc/unipdf/v3/model"
"golang.org/x/text/unicode/norm"
)
// textWord represents a word fragment.
// makeTextWords() shows how textWords are created.
// Whole words are not seen until textWords are eventually sorted into textLines in
// wordBag.arrangeText(). textLines are slices of textWord in which whole words are delimited by
// the newWord marker on the fragments that start them.
// - A textLine is the set of textWords at similar depths, sorted in reading order.
// - Every textWord w in a textLine that starts a whole word has w.newWord = true.
// (An illustrative sketch of reassembling whole words from the newWord markers follows the struct.)
type textWord struct {
model.PdfRectangle // Bounding box (union of `marks` bounding boxes).
depth float64 // Distance from bottom of this word to the top of the page.
text string // The word fragment text.
marks []*textMark // Marks in this word.
fontsize float64 // Largest fontsize in the word.
newWord bool // Is this word fragment the start of a new word?
}
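// exampleLineText is an illustrative sketch only, not part of the original file. It shows how
// the newWord markers could be used to reassemble whole words from a reading-ordered slice of
// word fragments, such as one textLine produced by wordBag.arrangeText(). The function name is
// hypothetical.
func exampleLineText(line []*textWord) string {
	var sb strings.Builder
	for i, w := range line {
		// A fragment flagged as newWord starts a whole word, so precede it with a space.
		if i > 0 && w.newWord {
			sb.WriteByte(' ')
		}
		sb.WriteString(w.text)
	}
	return sb.String()
}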
// makeTextWords combines `marks`, the textMarks on a page, into word fragments.
// `pageSize` is used to calculate the words' depths on the page.
// Algorithm:
// 1. `marks` are in the order they were rendered in the PDF.
// 2. Successive marks are combined into a word fragment unless:
//    - one of the marks is a space character,
//    - they are separated by more than maxWordAdvanceR*fontsize in the reading direction, or
//    - they fall outside the horizontal and vertical variation allowed by reasonable kerning
//      and leading.
//    (An illustrative restatement of this boundary test follows the function.)
// TODO(peterwilliams97): Check for overlapping textWords for cases such as diacritics, bolding by
// repeating and others.
func makeTextWords(marks []*textMark, pageSize model.PdfRectangle) []*textWord {
var words []*textWord // The words.
var newWord *textWord // The word being built.
// addNewWord adds `newWord` to `words` and resets `newWord` to nil.
addNewWord := func() {
if newWord != nil {
text := newWord.computeText()
if !isTextSpace(text) {
newWord.text = text
words = append(words, newWord)
}
newWord = nil
}
}
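	// Walk the marks in rendering order, growing newWord until a word boundary is found.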
for _, tm := range marks {
if doCombineDiacritics && newWord != nil && len(newWord.marks) > 0 {
// Combine diacritic marks into neighbouring non-diacritic marks.
prev := newWord.marks[len(newWord.marks)-1]
text, isDiacritic := combiningDiacritic(tm.text)
prevText, prevDiacritic := combiningDiacritic(prev.text)
if isDiacritic && !prevDiacritic && prev.inDiacriticArea(tm) {
newWord.addDiacritic(text)
continue
}
if prevDiacritic && !isDiacritic && tm.inDiacriticArea(prev) {
// If the previous mark was a diacritic, merge it into this mark and re-append it.
newWord.marks = newWord.marks[:len(newWord.marks)-1]
newWord.appendMark(tm, pageSize)
newWord.addDiacritic(prevText)
continue
}
}
// Check for spaces between words.
isSpace := isTextSpace(tm.text)
if isSpace {
addNewWord()
continue
}
if newWord == nil && !isSpace {
newWord = newTextWord([]*textMark{tm}, pageSize)
continue
}
fontsize := newWord.fontsize
depthGap := math.Abs(getDepth(pageSize, tm)-newWord.depth) / fontsize
readingGap := gapReading(tm, newWord) / fontsize
// These are the conditions for `tm` to be from a new word.
// - Gap between words in reading position is larger than a space.
// - Change in reading position is too negative to be just a kerning adjustment.
// - Change in depth is too large to be just a leading adjustment.
if readingGap >= maxWordAdvanceR || !(-maxKerningR <= readingGap && depthGap <= maxLeadingR) {
addNewWord()
newWord = newTextWord([]*textMark{tm}, pageSize)
continue
}
newWord.appendMark(tm, pageSize)
}
addNewWord()
return words
}
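// isNewWordGap is an illustrative sketch, not part of the original file. It restates the inline
// word-boundary test from makeTextWords above as a standalone predicate. `readingGap` and
// `depthGap` are the fontsize-normalised gaps computed in that loop; the threshold constants are
// those referenced above.
func isNewWordGap(readingGap, depthGap float64) bool {
	// A new word starts when the reading-direction gap is at least a space width, when the gap
	// is too negative to be a kerning adjustment, or when the change in depth is too large to
	// be a leading adjustment within the same line.
	return readingGap >= maxWordAdvanceR ||
		readingGap < -maxKerningR ||
		depthGap > maxLeadingR
}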
// newTextWord creates a textWord containing `marks`.
// `pageSize` is used to calculate the word's depth on the page.
func newTextWord(marks []*textMark, pageSize model.PdfRectangle) *textWord {
r := marks[0].PdfRectangle
fontsize := marks[0].fontsize
for _, tm := range marks[1:] {
r = rectUnion(r, tm.PdfRectangle)
if tm.fontsize > fontsize {
fontsize = tm.fontsize
}
}
return &textWord{
PdfRectangle: r,
marks: marks,
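		// depth is measured downwards from the top of the page (Ury) to the bottom of the word's bounding box.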
depth: pageSize.Ury - r.Lly,
fontsize: fontsize,
}
}
// String returns a description of `w`.
func (w *textWord) String() string {
return fmt.Sprintf("%.2f %6.2f fontsize=%.2f \"%s\"",
w.depth, w.PdfRectangle, w.fontsize, w.text)
}
// bbox makes textWord implement the `bounded` interface.
func (w *textWord) bbox() model.PdfRectangle {
return w.PdfRectangle
}
// appendMark adds textMark `tm` to `w`.
// `pageSize` is used to calculate the word's depth on the page.
func (w *textWord) appendMark(tm *textMark, pageSize model.PdfRectangle) {
w.marks = append(w.marks, tm)
w.PdfRectangle = rectUnion(w.PdfRectangle, tm.PdfRectangle)
if tm.fontsize > w.fontsize {
w.fontsize = tm.fontsize
}
w.depth = pageSize.Ury - w.PdfRectangle.Lly
}
// addDiacritic appends the combining diacritic `text` to the last mark in `w`.
// It doesn't update the word's bounding box or font size.
func (w *textWord) addDiacritic(text string) {
lastMark := w.marks[len(w.marks)-1]
lastMark.text = lastMark.text + text
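	// NFKC normalization composes the base character and combining diacritic into a single
	// precomposed character where one exists.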
lastMark.text = norm.NFKC.String(lastMark.text)
}
// absorb combines `word` into `w`.
func (w *textWord) absorb(word *textWord) {
w.PdfRectangle = rectUnion(w.PdfRectangle, word.PdfRectangle)
w.marks = append(w.marks, word.marks...)
}
// computeText returns the text in `w`, the concatenation of the texts of its marks.
func (w *textWord) computeText() string {
texts := make([]string, len(w.marks))
for i, tm := range w.marks {
texts[i] = tm.text
}
return strings.Join(texts, "")
}
// toTextMarks returns the TextMarks contained in the text of `w`.
// `offset` is used to give the TextMarks the correct Offset values.
func (w *textWord) toTextMarks(offset *int) []TextMark {
var marks []TextMark
for _, tm := range w.marks {
marks = appendTextMark(marks, offset, tm.ToTextMark())
}
return marks
}
// removeWord returns `words` with `word` removed.
// The caller must check that `words` contains `word`.
// TODO(peterwilliams97): Optimize
func removeWord(words []*textWord, word *textWord) []*textWord {
for i, w := range words {
if w == word {
return removeWordAt(words, i)
}
}
common.Log.Error("removeWord: words doesn't contain word=%s", word)
return nil
}
// removeWordAt returns `words` with `words[idx]` removed.
func removeWordAt(words []*textWord, idx int) []*textWord {
n := len(words)
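	// Shift the tail left by one and shrink the slice; the underlying array is reused.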
copy(words[idx:], words[idx+1:])
return words[:n-1]
}