mirror of
https://github.com/unidoc/unipdf.git
synced 2025-05-05 19:30:30 +08:00
206 lines
6.7 KiB
Go
206 lines
6.7 KiB
Go
![]() |
/*
|
||
|
* This file is subject to the terms and conditions defined in
|
||
|
* file 'LICENSE.md', which is part of this source code package.
|
||
|
*/
|
||
|
|
||
|
package extractor
|
||
|
|
||
|
import (
|
||
|
"fmt"
|
||
|
"math"
|
||
|
"strings"
|
||
|
|
||
|
"github.com/unidoc/unipdf/v3/common"
|
||
|
"github.com/unidoc/unipdf/v3/model"
|
||
|
"golang.org/x/text/unicode/norm"
|
||
|
)
|
||
|
|
||
|
// textWord represents a word fragment.
// makeTextWords() shows how textWords are created.
// We don't see whole words until textWords are eventually sorted into textLines in
// wordBag.arrangeText(). textLines are slices of textWord that define whole words by the
// newWord marker on those fragments that start whole words.
//   - A textLine is the textWords at similar depths sorted in reading order.
//   - All textWords, w, in the textLine that start whole words have w.newWord = true
type textWord struct {
	model.PdfRectangle             // Bounding box (union of `marks` bounding boxes).
	depth              float64     // Distance from bottom of this word to the top of the page (pageSize.Ury - Lly).
	text               string      // The word fragment text. makeTextWords() sets it from computeText().
	marks              []*textMark // Marks in this word.
	fontsize           float64     // Largest fontsize of the marks given to newTextWord() and appendMark().
	newWord            bool        // Is this word fragment the start of a new word?
}
|
||
|
|
||
|
// makeTextPage combines `marks`, the textMarks on a page, into word fragments.
|
||
|
// `pageSize` is used to calculate the words` depths depth on the page.
|
||
|
// Algorithm:
|
||
|
// 1. `marks` are in the order they were rendered in the PDF.
|
||
|
// 2. Successive marks are combined into a word fragment unless
|
||
|
// One mark is a space character.
|
||
|
// They are separated by more than maxWordAdvanceR*fontsize in the reading direction
|
||
|
// They are not within the location allowed by horizontal and vertical variations allowed by
|
||
|
// reasonable kerning and leading.
|
||
|
// TODO(peterwilliams97): Check for overlapping textWords for cases such as diacritics, bolding by
|
||
|
// repeating and others.
|
||
|
func makeTextWords(marks []*textMark, pageSize model.PdfRectangle) []*textWord {
|
||
|
var words []*textWord // The words.
|
||
|
var newWord *textWord // The word being built.
|
||
|
|
||
|
// addNewWord adds `newWord` to `words` and resets `newWord` to nil.
|
||
|
addNewWord := func() {
|
||
|
if newWord != nil {
|
||
|
text := newWord.computeText()
|
||
|
if !isTextSpace(text) {
|
||
|
newWord.text = text
|
||
|
words = append(words, newWord)
|
||
|
}
|
||
|
newWord = nil
|
||
|
}
|
||
|
}
|
||
|
|
||
|
for _, tm := range marks {
|
||
|
if doCombineDiacritics && newWord != nil && len(newWord.marks) > 0 {
|
||
|
// Combine diacritic marks into neighbourimg non-diacritics marks.
|
||
|
prev := newWord.marks[len(newWord.marks)-1]
|
||
|
text, isDiacritic := combiningDiacritic(tm.text)
|
||
|
prevText, prevDiacritic := combiningDiacritic(prev.text)
|
||
|
if isDiacritic && !prevDiacritic && prev.inDiacriticArea(tm) {
|
||
|
newWord.addDiacritic(text)
|
||
|
continue
|
||
|
}
|
||
|
if prevDiacritic && !isDiacritic && tm.inDiacriticArea(prev) {
|
||
|
// If the previous mark was the diacritic, merge it into this mark and re-append it
|
||
|
newWord.marks = newWord.marks[:len(newWord.marks)-1]
|
||
|
newWord.appendMark(tm, pageSize)
|
||
|
newWord.addDiacritic(prevText)
|
||
|
continue
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Check for spaces between words.
|
||
|
isSpace := isTextSpace(tm.text)
|
||
|
if isSpace {
|
||
|
addNewWord()
|
||
|
continue
|
||
|
}
|
||
|
|
||
|
if newWord == nil && !isSpace {
|
||
|
newWord = newTextWord([]*textMark{tm}, pageSize)
|
||
|
continue
|
||
|
}
|
||
|
|
||
|
fontsize := newWord.fontsize
|
||
|
depthGap := math.Abs(getDepth(pageSize, tm)-newWord.depth) / fontsize
|
||
|
readingGap := gapReading(tm, newWord) / fontsize
|
||
|
|
||
|
// These are the conditions for `tm` to be from a new word.
|
||
|
// - Gap between words in reading position is larger than a space.
|
||
|
// - Change in reading position is too negative to be just a kerning adjustment.
|
||
|
// - Change in depth is too large to be just a leading adjustment.
|
||
|
if readingGap >= maxWordAdvanceR || !(-maxKerningR <= readingGap && depthGap <= maxLeadingR) {
|
||
|
addNewWord()
|
||
|
newWord = newTextWord([]*textMark{tm}, pageSize)
|
||
|
continue
|
||
|
}
|
||
|
newWord.appendMark(tm, pageSize)
|
||
|
}
|
||
|
addNewWord()
|
||
|
|
||
|
return words
|
||
|
}
|
||
|
|
||
|
// newTextWord creates a textWords containing `marks`.
|
||
|
// `pageSize` is used to calculate the word's depth on the page.
|
||
|
func newTextWord(marks []*textMark, pageSize model.PdfRectangle) *textWord {
|
||
|
r := marks[0].PdfRectangle
|
||
|
fontsize := marks[0].fontsize
|
||
|
for _, tm := range marks[1:] {
|
||
|
r = rectUnion(r, tm.PdfRectangle)
|
||
|
if tm.fontsize > fontsize {
|
||
|
fontsize = tm.fontsize
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return &textWord{
|
||
|
PdfRectangle: r,
|
||
|
marks: marks,
|
||
|
depth: pageSize.Ury - r.Lly,
|
||
|
fontsize: fontsize,
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// String returns a description of `w`.
|
||
|
func (w *textWord) String() string {
|
||
|
return fmt.Sprintf("%.2f %6.2f fontsize=%.2f \"%s\"",
|
||
|
w.depth, w.PdfRectangle, w.fontsize, w.text)
|
||
|
}
|
||
|
|
||
|
// bbox makes textWord implement the `bounded` interface.
|
||
|
func (w *textWord) bbox() model.PdfRectangle {
|
||
|
return w.PdfRectangle
|
||
|
}
|
||
|
|
||
|
// appendMark adds textMark `tm` to `w`.
|
||
|
// `pageSize` is used to calculate the word's depth on the page.
|
||
|
func (w *textWord) appendMark(tm *textMark, pageSize model.PdfRectangle) {
|
||
|
w.marks = append(w.marks, tm)
|
||
|
w.PdfRectangle = rectUnion(w.PdfRectangle, tm.PdfRectangle)
|
||
|
if tm.fontsize > w.fontsize {
|
||
|
w.fontsize = tm.fontsize
|
||
|
}
|
||
|
w.depth = pageSize.Ury - w.PdfRectangle.Lly
|
||
|
}
|
||
|
|
||
|
// addDiacritic adds combining diacritic `text` `tm` to `w`.
|
||
|
// It adds the diacritic to the last mark and doesn't update the size
|
||
|
func (w *textWord) addDiacritic(text string) {
|
||
|
lastMark := w.marks[len(w.marks)-1]
|
||
|
lastMark.text = lastMark.text + text
|
||
|
lastMark.text = norm.NFKC.String(lastMark.text)
|
||
|
}
|
||
|
|
||
|
// absorb combines `word` into `w`.
|
||
|
func (w *textWord) absorb(word *textWord) {
|
||
|
w.PdfRectangle = rectUnion(w.PdfRectangle, word.PdfRectangle)
|
||
|
w.marks = append(w.marks, word.marks...)
|
||
|
}
|
||
|
|
||
|
// text returns the text in `w`.
|
||
|
func (w *textWord) computeText() string {
|
||
|
texts := make([]string, len(w.marks))
|
||
|
for i, tm := range w.marks {
|
||
|
texts[i] = tm.text
|
||
|
}
|
||
|
return strings.Join(texts, "")
|
||
|
}
|
||
|
|
||
|
// toTextMarks returns the TextMarks contained in `w`.text().
|
||
|
// `offset` is used to give the TextMarks the correct Offset values.
|
||
|
func (w *textWord) toTextMarks(offset *int) []TextMark {
|
||
|
var marks []TextMark
|
||
|
for _, tm := range w.marks {
|
||
|
marks = appendTextMark(marks, offset, tm.ToTextMark())
|
||
|
}
|
||
|
return marks
|
||
|
}
|
||
|
|
||
|
// removeWord returns `words` with `word` removed.
|
||
|
// Caller must check that `words` contains `word`,
|
||
|
// TODO(peterwilliams97): Optimize
|
||
|
func removeWord(words []*textWord, word *textWord) []*textWord {
|
||
|
for i, w := range words {
|
||
|
if w == word {
|
||
|
return removeWordAt(words, i)
|
||
|
}
|
||
|
}
|
||
|
common.Log.Error("removeWord: words doesn't contain word=%s", word)
|
||
|
return nil
|
||
|
}
|
||
|
|
||
|
// removeWord returns `words` with `words[idx]` removed.
|
||
|
func removeWordAt(words []*textWord, idx int) []*textWord {
|
||
|
n := len(words)
|
||
|
copy(words[idx:], words[idx+1:])
|
||
|
return words[:n-1]
|
||
|
}
|