unipdf/extractor/text_word.go

/*
 * This file is subject to the terms and conditions defined in
 * file 'LICENSE.md', which is part of this source code package.
 */

package extractor

import (
	"fmt"
	"math"
	"strings"

	"github.com/unidoc/unipdf/v3/common"
	"github.com/unidoc/unipdf/v3/model"
	"golang.org/x/text/unicode/norm"
)

// textWord represents a word fragment.
// makeTextWords() shows how textWords are created.
// We don't see whole words until textWords are eventually sorted into textLines  in
// wordBag.arrangeText(). textLines are slices of textWord that define whole words by the
//  newWord marker on those fragments that start whole words.
//  - A textLine is the textWords at similar depths sorted in reading order.
//  - All textWords, w, in the textLine that start whole words have w.newWord = true
type textWord struct {
	model.PdfRectangle             // Bounding box (union of `marks` bounding boxes).
	depth              float64     // Distance from bottom of this word to the top of the page.
	text               string      // The word fragment text.
	marks              []*textMark // Marks in this word.
	fontsize           float64     // Largest fontsize in the word.
	newWord            bool        // Is this word fragment the start of  a new word?
}

// makeTextPage combines `marks`, the textMarks on a page, into word fragments.
// `pageSize` is used to calculate the words` depths depth on the page.
// Algorithm:
//  1. `marks` are in the order they were rendered in the PDF.
//  2. Successive marks are combined into a word fragment unless
//      One mark is a space character.
//      They are separated by more than maxWordAdvanceR*fontsize in the reading direction
//      They are not within the location allowed by horizontal and vertical variations allowed by
//       reasonable kerning and leading.
// TODO(peterwilliams97): Check for overlapping textWords for cases such as diacritics, bolding by
//                       repeating and others.
func makeTextWords(marks []*textMark, pageSize model.PdfRectangle) []*textWord {
	var words []*textWord // The words.
	var newWord *textWord // The word being built.

	// addNewWord adds `newWord` to `words` and resets `newWord` to nil.
	addNewWord := func() {
		if newWord != nil {
			text := newWord.computeText()
			if !isTextSpace(text) {
				newWord.text = text
				words = append(words, newWord)
			}
			newWord = nil
		}
	}

	for _, tm := range marks {
		if doCombineDiacritics && newWord != nil && len(newWord.marks) > 0 {
			// Combine diacritic marks into neighbourimg non-diacritics marks.
			prev := newWord.marks[len(newWord.marks)-1]
			text, isDiacritic := combiningDiacritic(tm.text)
			prevText, prevDiacritic := combiningDiacritic(prev.text)
			if isDiacritic && !prevDiacritic && prev.inDiacriticArea(tm) {
				newWord.addDiacritic(text)
				continue
			}
			if prevDiacritic && !isDiacritic && tm.inDiacriticArea(prev) {
				// If the previous mark was the diacritic, merge it into this mark and re-append it
				newWord.marks = newWord.marks[:len(newWord.marks)-1]
				newWord.appendMark(tm, pageSize)
				newWord.addDiacritic(prevText)
				continue
			}
		}

		// Check for spaces between words.
		isSpace := isTextSpace(tm.text)
		if isSpace {
			addNewWord()
			continue
		}

		if newWord == nil && !isSpace {
			newWord = newTextWord([]*textMark{tm}, pageSize)
			continue
		}

		fontsize := newWord.fontsize
		depthGap := math.Abs(getDepth(pageSize, tm)-newWord.depth) / fontsize
		readingGap := gapReading(tm, newWord) / fontsize

		// These are the conditions for `tm` to be from a new word.
		// - Gap between words in reading position is larger than a space.
		// - Change in reading position is too negative to be just a kerning adjustment.
		// - Change in depth is too large to be just a leading adjustment.
		if readingGap >= maxWordAdvanceR || !(-maxKerningR <= readingGap && depthGap <= maxLeadingR) {
			addNewWord()
			newWord = newTextWord([]*textMark{tm}, pageSize)
			continue
		}
		newWord.appendMark(tm, pageSize)
	}
	addNewWord()

	return words
}

// newTextWord creates a textWords containing `marks`.
// `pageSize` is used to calculate the word's depth on the page.
func newTextWord(marks []*textMark, pageSize model.PdfRectangle) *textWord {
	r := marks[0].PdfRectangle
	fontsize := marks[0].fontsize
	for _, tm := range marks[1:] {
		r = rectUnion(r, tm.PdfRectangle)
		if tm.fontsize > fontsize {
			fontsize = tm.fontsize
		}
	}

	return &textWord{
		PdfRectangle: r,
		marks:        marks,
		depth:        pageSize.Ury - r.Lly,
		fontsize:     fontsize,
	}
}

// String returns a description of `w`.
func (w *textWord) String() string {
	return fmt.Sprintf("%.2f %6.2f fontsize=%.2f \"%s\"",
		w.depth, w.PdfRectangle, w.fontsize, w.text)
}

// bbox makes textWord implement the `bounded` interface.
func (w *textWord) bbox() model.PdfRectangle {
	return w.PdfRectangle
}

// appendMark adds textMark `tm` to  `w`.
// `pageSize` is used to calculate the word's depth on the page.
func (w *textWord) appendMark(tm *textMark, pageSize model.PdfRectangle) {
	w.marks = append(w.marks, tm)
	w.PdfRectangle = rectUnion(w.PdfRectangle, tm.PdfRectangle)
	if tm.fontsize > w.fontsize {
		w.fontsize = tm.fontsize
	}
	w.depth = pageSize.Ury - w.PdfRectangle.Lly
}

// addDiacritic adds combining diacritic `text` `tm` to `w`.
// It adds the diacritic to the last mark and doesn't update the size
func (w *textWord) addDiacritic(text string) {
	lastMark := w.marks[len(w.marks)-1]
	lastMark.text = lastMark.text + text
	lastMark.text = norm.NFKC.String(lastMark.text)
}

// absorb combines `word` into `w`.
func (w *textWord) absorb(word *textWord) {
	w.PdfRectangle = rectUnion(w.PdfRectangle, word.PdfRectangle)
	w.marks = append(w.marks, word.marks...)
}

// text returns the text in `w`.
func (w *textWord) computeText() string {
	texts := make([]string, len(w.marks))
	for i, tm := range w.marks {
		texts[i] = tm.text
	}
	return strings.Join(texts, "")
}

// toTextMarks returns the TextMarks contained in `w`.text().
// `offset` is used to give the TextMarks the correct Offset values.
func (w *textWord) toTextMarks(offset *int) []TextMark {
	var marks []TextMark
	for _, tm := range w.marks {
		marks = appendTextMark(marks, offset, tm.ToTextMark())
	}
	return marks
}

// removeWord returns `words` with `word` removed.
// Caller must check that `words` contains `word`,
// TODO(peterwilliams97): Optimize
func removeWord(words []*textWord, word *textWord) []*textWord {
	for i, w := range words {
		if w == word {
			return removeWordAt(words, i)
		}
	}
	common.Log.Error("removeWord: words doesn't contain word=%s", word)
	return nil
}

// removeWord returns `words` with `words[idx]` removed.
func removeWordAt(words []*textWord, idx int) []*textWord {
	n := len(words)
	copy(words[idx:], words[idx+1:])
	return words[:n-1]
}
Text extraction code for columns. (#366) * Fixed filename:page in logging * Got CMap working for multi-rune entries * Treat CMap entries as strings instead of runes to handle multi-byte encodings. * Added a test for multibyte encoding. * First version of text extraction that recognizes columns * Added an expanation of the text columns code to README.md. * fixed typos * Abstracted textWord depth calculation. This required change textMark to textMark in a lot of code. Added function comments. * Fixed text state save/restore. * Adjusted inter-word search distance to make paragrah division work for thanh.pdf * Got text_test.go passing. * Reinstated hyphen suppression * Handle more cases of fonts not being set in text extraction code. * Fixed typo * More verbose logging * Adding tables to text extractor. * Added tests for columns extraction. * Removed commented code * Check for textParas that are on the same line when writing out extracted text. * Absorb text to the left of paras into paras e.g. Footnote numbers * Removed funny character from text_test.go * Commented out a creator_test.go test that was broken by my text extraction changes. * Big changes to columns text extraction code for PR. Performance improvements in several places. Commented code. * Updated extractor/README * Cleaned up some comments and removed a panic * Increased threshold for truncating extracted text when there is no license 100 -> 102. This is a workaround to let a test in creator_test.go pass. With the old text extraction code the following extracted text was 100 chars. With the new code it is 102 chars which looks correct. "你好\n你好你好你好你好\n河上白云\n\nUnlicensed UniDoc - Get a license on https://unidoc.io\n\n" * Improved an error message. * Removed irrelevant spaces * Commented code and removed unused functions. * Reverted PdfRectangle changes * Added duplicate text detection. * Combine diacritic textMarks in text extraction * Reinstated a diacritic recombination test. * Small code reorganisation * Reinstated handling of rotated text * Addressed issues in PR review * Added color fields to TextMark * Updated README * Reinstated the disabled tests I missed before. * Tightened definition for tables to prevent detection of tables where there weren't any. * Compute line splitting search range based on fontsize of first word in word bag. * Use errors.Is(err, core.ErrNotSupported) to distinguish unsupported font errorrs. See https://blog.golang.org/go1.13-errors * Fixed some naming and added some comments. * errors.Is -> xerrors.Is and %w -> %v for go 1.12 compatibility * Removed code that doesn't ever get called. * Removed unused test 2020-07-01 05:33:10 +10:00			`/*`
			`* This file is subject to the terms and conditions defined in`
			`* file 'LICENSE.md', which is part of this source code package.`
			`*/`

			`package extractor`

			`import (`
			`"fmt"`
			`"math"`
			`"strings"`

			`"github.com/unidoc/unipdf/v3/common"`
			`"github.com/unidoc/unipdf/v3/model"`
			`"golang.org/x/text/unicode/norm"`
			`)`

			`// textWord represents a word fragment.`
			`// makeTextWords() shows how textWords are created.`
			`// We don't see whole words until textWords are eventually sorted into textLines in`
			`// wordBag.arrangeText(). textLines are slices of textWord that define whole words by the`
			`// newWord marker on those fragments that start whole words.`
			`// - A textLine is the textWords at similar depths sorted in reading order.`
			`// - All textWords, w, in the textLine that start whole words have w.newWord = true`
			`type textWord struct {`
			model.PdfRectangle // Bounding box (union of `marks` bounding boxes).
			`depth float64 // Distance from bottom of this word to the top of the page.`
			`text string // The word fragment text.`
			`marks []*textMark // Marks in this word.`
			`fontsize float64 // Largest fontsize in the word.`
			`newWord bool // Is this word fragment the start of a new word?`
			`}`

			// makeTextPage combines `marks`, the textMarks on a page, into word fragments.
			// `pageSize` is used to calculate the words` depths depth on the page.
			`// Algorithm:`
			// 1. `marks` are in the order they were rendered in the PDF.
			`// 2. Successive marks are combined into a word fragment unless`
			`// One mark is a space character.`
			`// They are separated by more than maxWordAdvanceR*fontsize in the reading direction`
			`// They are not within the location allowed by horizontal and vertical variations allowed by`
			`// reasonable kerning and leading.`
			`// TODO(peterwilliams97): Check for overlapping textWords for cases such as diacritics, bolding by`
			`// repeating and others.`
			`func makeTextWords(marks []textMark, pageSize model.PdfRectangle) []textWord {`
			`var words []*textWord // The words.`
			`var newWord *textWord // The word being built.`

			// addNewWord adds `newWord` to `words` and resets `newWord` to nil.
			`addNewWord := func() {`
			`if newWord != nil {`
			`text := newWord.computeText()`
			`if !isTextSpace(text) {`
			`newWord.text = text`
			`words = append(words, newWord)`
			`}`
			`newWord = nil`
			`}`
			`}`

			`for _, tm := range marks {`
			`if doCombineDiacritics && newWord != nil && len(newWord.marks) > 0 {`
			`// Combine diacritic marks into neighbourimg non-diacritics marks.`
			`prev := newWord.marks[len(newWord.marks)-1]`
			`text, isDiacritic := combiningDiacritic(tm.text)`
			`prevText, prevDiacritic := combiningDiacritic(prev.text)`
			`if isDiacritic && !prevDiacritic && prev.inDiacriticArea(tm) {`
			`newWord.addDiacritic(text)`
			`continue`
			`}`
			`if prevDiacritic && !isDiacritic && tm.inDiacriticArea(prev) {`
			`// If the previous mark was the diacritic, merge it into this mark and re-append it`
			`newWord.marks = newWord.marks[:len(newWord.marks)-1]`
			`newWord.appendMark(tm, pageSize)`
			`newWord.addDiacritic(prevText)`
			`continue`
			`}`
			`}`

			`// Check for spaces between words.`
			`isSpace := isTextSpace(tm.text)`
			`if isSpace {`
			`addNewWord()`
			`continue`
			`}`

			`if newWord == nil && !isSpace {`
			`newWord = newTextWord([]*textMark{tm}, pageSize)`
			`continue`
			`}`

			`fontsize := newWord.fontsize`
			`depthGap := math.Abs(getDepth(pageSize, tm)-newWord.depth) / fontsize`
			`readingGap := gapReading(tm, newWord) / fontsize`

			// These are the conditions for `tm` to be from a new word.
			`// - Gap between words in reading position is larger than a space.`
			`// - Change in reading position is too negative to be just a kerning adjustment.`
			`// - Change in depth is too large to be just a leading adjustment.`
			`if readingGap >= maxWordAdvanceR \|\| !(-maxKerningR <= readingGap && depthGap <= maxLeadingR) {`
			`addNewWord()`
			`newWord = newTextWord([]*textMark{tm}, pageSize)`
			`continue`
			`}`
			`newWord.appendMark(tm, pageSize)`
			`}`
			`addNewWord()`

			`return words`
			`}`

			// newTextWord creates a textWords containing `marks`.
			// `pageSize` is used to calculate the word's depth on the page.
			`func newTextWord(marks []textMark, pageSize model.PdfRectangle) textWord {`
			`r := marks[0].PdfRectangle`
			`fontsize := marks[0].fontsize`
			`for _, tm := range marks[1:] {`
			`r = rectUnion(r, tm.PdfRectangle)`
			`if tm.fontsize > fontsize {`
			`fontsize = tm.fontsize`
			`}`
			`}`

			`return &textWord{`
			`PdfRectangle: r,`
			`marks: marks,`
			`depth: pageSize.Ury - r.Lly,`
			`fontsize: fontsize,`
			`}`
			`}`

			// String returns a description of `w`.
			`func (w *textWord) String() string {`
			`return fmt.Sprintf("%.2f %6.2f fontsize=%.2f \"%s\"",`
			`w.depth, w.PdfRectangle, w.fontsize, w.text)`
			`}`

			// bbox makes textWord implement the `bounded` interface.
			`func (w *textWord) bbox() model.PdfRectangle {`
			`return w.PdfRectangle`
			`}`

			// appendMark adds textMark `tm` to `w`.
			// `pageSize` is used to calculate the word's depth on the page.
			`func (w textWord) appendMark(tm textMark, pageSize model.PdfRectangle) {`
			`w.marks = append(w.marks, tm)`
			`w.PdfRectangle = rectUnion(w.PdfRectangle, tm.PdfRectangle)`
			`if tm.fontsize > w.fontsize {`
			`w.fontsize = tm.fontsize`
			`}`
			`w.depth = pageSize.Ury - w.PdfRectangle.Lly`
			`}`

			// addDiacritic adds combining diacritic `text` `tm` to `w`.
			`// It adds the diacritic to the last mark and doesn't update the size`
			`func (w *textWord) addDiacritic(text string) {`
			`lastMark := w.marks[len(w.marks)-1]`
			`lastMark.text = lastMark.text + text`
			`lastMark.text = norm.NFKC.String(lastMark.text)`
			`}`

			// absorb combines `word` into `w`.
			`func (w textWord) absorb(word textWord) {`
			`w.PdfRectangle = rectUnion(w.PdfRectangle, word.PdfRectangle)`
			`w.marks = append(w.marks, word.marks...)`
			`}`

			// text returns the text in `w`.
			`func (w *textWord) computeText() string {`
			`texts := make([]string, len(w.marks))`
			`for i, tm := range w.marks {`
			`texts[i] = tm.text`
			`}`
			`return strings.Join(texts, "")`
			`}`

			// toTextMarks returns the TextMarks contained in `w`.text().
			// `offset` is used to give the TextMarks the correct Offset values.
			`func (w textWord) toTextMarks(offset int) []TextMark {`
			`var marks []TextMark`
			`for _, tm := range w.marks {`
			`marks = appendTextMark(marks, offset, tm.ToTextMark())`
			`}`
			`return marks`
			`}`

			// removeWord returns `words` with `word` removed.
			// Caller must check that `words` contains `word`,
			`// TODO(peterwilliams97): Optimize`
			`func removeWord(words []textWord, word textWord) []*textWord {`
			`for i, w := range words {`
			`if w == word {`
			`return removeWordAt(words, i)`
			`}`
			`}`
			`common.Log.Error("removeWord: words doesn't contain word=%s", word)`
			`return nil`
			`}`

			// removeWord returns `words` with `words[idx]` removed.
			`func removeWordAt(words []textWord, idx int) []textWord {`
			`n := len(words)`
			`copy(words[idx:], words[idx+1:])`
			`return words[:n-1]`
			`}`