/*
 * This file is subject to the terms and conditions defined in
 * file 'LICENSE.md', which is part of this source code package.
 */

package extractor

import (
	"fmt"
	"math"
	"strings"

	"github.com/unidoc/unipdf/v3/common"
	"github.com/unidoc/unipdf/v3/model"
	"golang.org/x/text/unicode/norm"
)

// textWord represents a word fragment.
// makeTextWords() shows how textWords are created.
// We don't see whole words until textWords are eventually sorted into textLines in
// wordBag.arrangeText(). textLines are slices of textWord that define whole words by the
// newWord marker on those fragments that start whole words.
//  - A textLine is the textWords at similar depths sorted in reading order.
//  - All textWords, w, in the textLine that start whole words have w.newWord = true.
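// For example, if the word "detection" is rendered as the fragments "detec" and "tion", only
// the "detec" fragment has newWord = true.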
type textWord struct {
	model.PdfRectangle             // Bounding box (union of `marks` bounding boxes).
	depth              float64     // Distance from the bottom of this word to the top of the page.
	text               string      // The word fragment text.
	marks              []*textMark // Marks in this word.
	fontsize           float64     // Largest fontsize in the word.
	newWord            bool        // Is this word fragment the start of a new word?
}

// makeTextWords combines `marks`, the textMarks on a page, into word fragments.
// `pageSize` is used to calculate the words' depths on the page.
// Algorithm:
//  1. `marks` are in the order they were rendered in the PDF.
//  2. Successive marks are combined into a word fragment unless
//     - one mark is a space character,
//     - they are separated by more than maxWordAdvanceR*fontsize in the reading direction, or
//     - they are not within the horizontal and vertical variations allowed by reasonable
//       kerning and leading.
// TODO(peterwilliams97): Check for overlapping textWords for cases such as diacritics, bolding by
//                        repeating and others.
func makeTextWords(marks []*textMark, pageSize model.PdfRectangle) []*textWord {
	var words []*textWord // The words.
	var newWord *textWord // The word being built.

	// addNewWord adds `newWord` to `words` and resets `newWord` to nil.
	addNewWord := func() {
		if newWord != nil {
			text := newWord.computeText()
			if !isTextSpace(text) {
				newWord.text = text
				words = append(words, newWord)
			}
			newWord = nil
		}
	}

	for _, tm := range marks {
		if doCombineDiacritics && newWord != nil && len(newWord.marks) > 0 {
			// Combine diacritic marks into neighbouring non-diacritic marks.
			prev := newWord.marks[len(newWord.marks)-1]
			text, isDiacritic := combiningDiacritic(tm.text)
			prevText, prevDiacritic := combiningDiacritic(prev.text)
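			// If `tm` is a combining diacritic rendered over the previous mark, fold it into that
			// mark instead of treating it as a separate mark.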
			if isDiacritic && !prevDiacritic && prev.inDiacriticArea(tm) {
				newWord.addDiacritic(text)
				continue
			}
			if prevDiacritic && !isDiacritic && tm.inDiacriticArea(prev) {
				// If the previous mark was the diacritic, merge it into this mark and re-append it.
				newWord.marks = newWord.marks[:len(newWord.marks)-1]
				newWord.appendMark(tm, pageSize)
				newWord.addDiacritic(prevText)
				continue
			}
		}

		// Check for spaces between words.
		isSpace := isTextSpace(tm.text)
		if isSpace {
			addNewWord()
			continue
		}

		if newWord == nil && !isSpace {
			newWord = newTextWord([]*textMark{tm}, pageSize)
			continue
		}

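		// Gaps are divided by the current fragment's font size so that the maxWordAdvanceR,
		// maxKerningR and maxLeadingR thresholds below are in units of font size.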
		fontsize := newWord.fontsize
		depthGap := math.Abs(getDepth(pageSize, tm)-newWord.depth) / fontsize
		readingGap := gapReading(tm, newWord) / fontsize

		// These are the conditions for `tm` to be from a new word.
		// - Gap between words in reading position is larger than a space.
		// - Change in reading position is too negative to be just a kerning adjustment.
		// - Change in depth is too large to be just a leading adjustment.
		if readingGap >= maxWordAdvanceR || !(-maxKerningR <= readingGap && depthGap <= maxLeadingR) {
			addNewWord()
			newWord = newTextWord([]*textMark{tm}, pageSize)
			continue
		}
		newWord.appendMark(tm, pageSize)
	}
	addNewWord()

	return words
}

// newTextWord creates a textWord containing `marks`.
// `pageSize` is used to calculate the word's depth on the page.
func newTextWord(marks []*textMark, pageSize model.PdfRectangle) *textWord {
	r := marks[0].PdfRectangle
	fontsize := marks[0].fontsize
	for _, tm := range marks[1:] {
		r = rectUnion(r, tm.PdfRectangle)
		if tm.fontsize > fontsize {
			fontsize = tm.fontsize
		}
	}

	return &textWord{
		PdfRectangle: r,
		marks:        marks,
		depth:        pageSize.Ury - r.Lly,
		fontsize:     fontsize,
	}
}

// String returns a description of `w`.
func (w *textWord) String() string {
	return fmt.Sprintf("%.2f %6.2f fontsize=%.2f \"%s\"",
		w.depth, w.PdfRectangle, w.fontsize, w.text)
}

// bbox makes textWord implement the `bounded` interface.
func (w *textWord) bbox() model.PdfRectangle {
	return w.PdfRectangle
}

// appendMark adds textMark `tm` to `w`.
// `pageSize` is used to calculate the word's depth on the page.
func (w *textWord) appendMark(tm *textMark, pageSize model.PdfRectangle) {
	w.marks = append(w.marks, tm)
	w.PdfRectangle = rectUnion(w.PdfRectangle, tm.PdfRectangle)
	if tm.fontsize > w.fontsize {
		w.fontsize = tm.fontsize
	}
	w.depth = pageSize.Ury - w.PdfRectangle.Lly
}

// addDiacritic adds the combining diacritic `text` to `w`.
// It adds the diacritic to the last mark and doesn't update the word's size.
func (w *textWord) addDiacritic(text string) {
	lastMark := w.marks[len(w.marks)-1]
	lastMark.text = lastMark.text + text
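	// NFKC normalization composes the base character and the combining diacritic into a single
	// precomposed rune where one exists.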
	lastMark.text = norm.NFKC.String(lastMark.text)
}

// absorb combines `word` into `w`.
func (w *textWord) absorb(word *textWord) {
	w.PdfRectangle = rectUnion(w.PdfRectangle, word.PdfRectangle)
	w.marks = append(w.marks, word.marks...)
}

// computeText returns the text in `w`.
func (w *textWord) computeText() string {
	texts := make([]string, len(w.marks))
	for i, tm := range w.marks {
		texts[i] = tm.text
	}
	return strings.Join(texts, "")
}

// toTextMarks returns the TextMarks contained in `w`.text.
// `offset` is used to give the TextMarks the correct Offset values.
func (w *textWord) toTextMarks(offset *int) []TextMark {
	var marks []TextMark
	for _, tm := range w.marks {
		marks = appendTextMark(marks, offset, tm.ToTextMark())
	}
	return marks
}

// removeWord returns `words` with `word` removed.
// The caller must check that `words` contains `word`.
// TODO(peterwilliams97): Optimize.
func removeWord(words []*textWord, word *textWord) []*textWord {
	for i, w := range words {
		if w == word {
			return removeWordAt(words, i)
		}
	}
	common.Log.Error("removeWord: words doesn't contain word=%s", word)
	return nil
}

// removeWordAt returns `words` with `words[idx]` removed.
func removeWordAt(words []*textWord, idx int) []*textWord {
	n := len(words)
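	// Shift the elements after `idx` left by one and drop the last slot. Note that this
	// overwrites the backing array of `words`.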
	copy(words[idx:], words[idx+1:])
	return words[:n-1]
}