unipdf/extractor/text_word.go

190 lines
4.8 KiB
Go

/*
* This file is subject to the terms and conditions defined in
* file 'LICENSE.md', which is part of this source code package.
*/
package extractor
import (
"fmt"
"math"
"strings"
"unicode/utf8"
"github.com/unidoc/unipdf/v3/internal/textencoding"
"github.com/unidoc/unipdf/v3/model"
)
// textWord represents a word. It is a sequence of textMarks that are close enough together in the
// reading direction and that contains no space textMarks.
type textWord struct {
serial int // Sequence number for debugging.
model.PdfRectangle // Bounding box (union of `marks` bounding boxes).
depth float64 // Distance from bottom of word to top of page.
marks []textMark // Marks in this word.
fontsize float64 // Largest fontsize in `marks`.
spaceAfter bool // NOTE(review): not referenced in the code visible here; presumably flags a space after the word. Confirm.
}
// makeTextWords builds a word list from `marks`, the textMarks on a page.
// `pageSize` is used to calculate the words' depths on the page.
//
// Marks are consumed in order. A space mark ends the word being built and is itself discarded.
// A non-space mark either extends the word being built or, if it is not close enough to it, ends
// that word and starts a new one.
func makeTextWords(marks []textMark, pageSize model.PdfRectangle) []*textWord {
	var (
		words   []*textWord
		pending *textWord // Word currently being assembled; nil when there is none.
	)

	// flush appends `pending` to `words`, unless isTextSpace reports its text as space, and
	// resets `pending` to nil.
	flush := func() {
		if pending == nil {
			return
		}
		if !isTextSpace(pending.text()) {
			words = append(words, pending)
		}
		pending = nil
	}

	for _, tm := range marks {
		switch {
		case isTextSpace(tm.text):
			// A space always terminates the word in progress.
			flush()
		case pending == nil:
			pending = newTextWord([]textMark{tm}, pageSize)
		default:
			size := pending.fontsize
			readingGap := tm.Llx - pending.Urx
			depthGap := pageSize.Ury - tm.Lly - pending.depth

			// `tm` belongs to the word in progress only when all of these hold.
			// - The change in reading position is no larger than a space, which we guess to
			//   be 0.11*fontsize.
			// - The change in reading position is not too negative to be a kerning adjustment.
			// - The change in depth is small enough to be just a leading adjustment.
			closeInReading := -0.19*size <= readingGap && readingGap <= 0.11*size
			closeInDepth := math.Abs(depthGap) <= 0.04*size
			if closeInReading && closeInDepth {
				pending.addMark(tm, pageSize)
			} else {
				flush()
				pending = newTextWord([]textMark{tm}, pageSize)
			}
		}
	}
	flush()
	return words
}
// newTextWord creates a textWord containing `marks`, which must not be empty.
// `pageSize` is used to calculate the word's depth on the page.
// The word's bounding box is the union of the marks' bounding boxes and its fontsize is the
// largest fontsize among them. Its serial number is the current value of `serial.word`, which is
// incremented afterwards.
func newTextWord(marks []textMark, pageSize model.PdfRectangle) *textWord {
	bounds, largest := marks[0].PdfRectangle, marks[0].fontsize
	for i := 1; i < len(marks); i++ {
		bounds = rectUnion(bounds, marks[i].PdfRectangle)
		if size := marks[i].fontsize; size > largest {
			largest = size
		}
	}

	word := &textWord{
		serial:       serial.word,
		PdfRectangle: bounds,
		marks:        marks,
		depth:        pageSize.Ury - bounds.Lly,
		fontsize:     largest,
	}
	serial.word++
	return word
}
// String returns a description of `w`: its serial number, depth, bounding box, fontsize and text.
func (w *textWord) String() string {
	const format = "serial=%d base=%.2f %.2f fontsize=%.2f \"%s\""
	return fmt.Sprintf(format, w.serial, w.depth, w.PdfRectangle, w.fontsize, w.text())
}
// bbox returns the bounding box of `w`.
func (w *textWord) bbox() model.PdfRectangle {
return w.PdfRectangle
}
// addMark appends textMark `tm` to word `w`, growing the word's bounding box to include the mark
// and raising the word's fontsize if the mark's is larger.
// `pageSize` is used to recalculate the word's depth on the page.
// It panics with the depth value if the recalculated depth is negative.
func (w *textWord) addMark(tm textMark, pageSize model.PdfRectangle) {
	w.marks = append(w.marks, tm)
	w.PdfRectangle = rectUnion(w.PdfRectangle, tm.PdfRectangle)
	if w.fontsize < tm.fontsize {
		w.fontsize = tm.fontsize
	}
	// The depth is measured from the top of the page down to the bottom of the enlarged box.
	if w.depth = pageSize.Ury - w.PdfRectangle.Lly; w.depth < 0 {
		panic(w.depth)
	}
}
// len returns the number of runes in the text of `w`.
func (w *textWord) len() int {
	text := w.text()
	return utf8.RuneCountInString(text)
}
// merge absorbs `word` into `w`: the marks of `word` are appended to those of `w` and the
// bounding box of `w` is grown to cover both words.
// NOTE(review): unlike addMark, this does not refresh w.fontsize or w.depth, so they may no
// longer describe the merged marks and box. Confirm that callers expect this.
func (w *textWord) merge(word *textWord) {
	w.marks = append(w.marks, word.marks...)
	w.PdfRectangle = rectUnion(w.PdfRectangle, word.PdfRectangle)
}
// text returns the text of `w`: the concatenation, in mark order, of
// textencoding.RuneToString applied to every rune of every mark's text.
func (w *textWord) text() string {
	var b strings.Builder
	for _, tm := range w.marks {
		for _, r := range tm.text {
			b.WriteString(textencoding.RuneToString(r))
		}
	}
	return b.String()
}
// font returns the font ID of the `idx`th rune (zero-based) of w.text(). The ID is the font and
// fontsize of the mark that produced that rune, formatted as "%s:%.3f".
// It panics if `idx` is not less than the number of runes in the text, i.e. w.len().
// TODO: Consider computing this on creation.
func (w *textWord) font(idx int) string {
	numRunes := 0
	for _, tm := range w.marks {
		for _, r := range tm.text {
			// Count the runes, not the bytes, that this source rune contributes to w.text().
			// `idx` is a rune index, so counting bytes would resolve indexes into non-ASCII
			// text to an earlier mark than the one that holds the rune.
			numRunes += utf8.RuneCountInString(textencoding.RuneToString(r))
			if numRunes > idx {
				return fmt.Sprintf("%s:%.3f", tm.font, tm.fontsize)
			}
		}
	}
	panic("no match")
}
// baseRange returns the smallest and largest depths of the words in `words`.
// Both results are 0 when `words` is empty.
func baseRange(words []*textWord) (minDepth, maxDepth float64) {
	if len(words) == 0 {
		return 0, 0
	}
	// Seed both bounds from the first word, then widen them with the rest.
	minDepth, maxDepth = words[0].depth, words[0].depth
	for _, w := range words[1:] {
		switch d := w.depth; {
		case d < minDepth:
			minDepth = d
		case d > maxDepth:
			maxDepth = d
		}
	}
	return minDepth, maxDepth
}
// removeWord returns `words` with the first element that is the same pointer as `word` removed.
// It panics if `word` is not in `words`.
func removeWord(words []*textWord, word *textWord) []*textWord {
	idx := -1
	for i, w := range words {
		if w == word {
			idx = i
			break
		}
	}
	if idx < 0 {
		panic("word not in words")
	}
	return removeWordAt(words, idx)
}
// removeWordAt returns `words` with the element at index `idx` removed.
// The removal is done in place: later elements are shifted down one position in the backing
// array of `words`, and the result is one element shorter.
func removeWordAt(words []*textWord, idx int) []*textWord {
	return append(words[:idx], words[idx+1:]...)
}