mirror of
https://github.com/unidoc/unipdf.git
synced 2025-05-02 22:17:06 +08:00
127 lines
3.6 KiB
Go
127 lines
3.6 KiB
Go
![]() |
/*
|
||
|
* This file is subject to the terms and conditions defined in
|
||
|
* file 'LICENSE.md', which is part of this source code package.
|
||
|
*/
|
||
|
|
||
|
package extractor
|
||
|
|
||
|
import (
|
||
|
"fmt"
|
||
|
"strings"
|
||
|
"unicode"
|
||
|
|
||
|
"github.com/unidoc/unipdf/v3/model"
|
||
|
)
|
||
|
|
||
|
// textLine repesents words on the same line within a textPara.
|
||
|
type textLine struct {
|
||
|
model.PdfRectangle // Bounding box (union of `marks` bounding boxes).
|
||
|
depth float64 // Distance from bottom of line to top of page.
|
||
|
words []*textWord // Words in this line.
|
||
|
fontsize float64 // Largest word font size.
|
||
|
}
|
||
|
|
||
|
// newTextLine creates a line with the font and bbox size of the first word in `b`, removes the word
|
||
|
// from `b` and adds it to the line.
|
||
|
func newTextLine(b *wordBag, depthIdx int) *textLine {
|
||
|
word := b.firstWord(depthIdx)
|
||
|
line := textLine{
|
||
|
PdfRectangle: word.PdfRectangle,
|
||
|
fontsize: word.fontsize,
|
||
|
depth: word.depth,
|
||
|
}
|
||
|
line.pullWord(b, word, depthIdx)
|
||
|
return &line
|
||
|
}
|
||
|
|
||
|
// String returns a description of `l`.
|
||
|
func (l *textLine) String() string {
|
||
|
return fmt.Sprintf("%.2f %6.2f fontsize=%.2f \"%s\"",
|
||
|
l.depth, l.PdfRectangle, l.fontsize, l.text())
|
||
|
}
|
||
|
|
||
|
// bbox makes textLine implement the `bounded` interface.
|
||
|
func (l *textLine) bbox() model.PdfRectangle {
|
||
|
return l.PdfRectangle
|
||
|
}
|
||
|
|
||
|
// text returns the extracted text contained in line.
|
||
|
func (l *textLine) text() string {
|
||
|
var words []string
|
||
|
for _, w := range l.words {
|
||
|
if w.newWord {
|
||
|
words = append(words, " ")
|
||
|
}
|
||
|
words = append(words, w.text)
|
||
|
}
|
||
|
return strings.Join(words, "")
|
||
|
}
|
||
|
|
||
|
// toTextMarks returns the TextMarks contained in `l`.text().
|
||
|
// `offset` is used to give the TextMarks the correct Offset values.
|
||
|
func (l *textLine) toTextMarks(offset *int) []TextMark {
|
||
|
var marks []TextMark
|
||
|
for _, w := range l.words {
|
||
|
if w.newWord {
|
||
|
marks = appendSpaceMark(marks, offset, " ")
|
||
|
}
|
||
|
wordMarks := w.toTextMarks(offset)
|
||
|
marks = append(marks, wordMarks...)
|
||
|
}
|
||
|
return marks
|
||
|
}
|
||
|
|
||
|
// pullWord removes `word` from bag and appends it to `l`.
|
||
|
func (l *textLine) pullWord(bag *wordBag, word *textWord, depthIdx int) {
|
||
|
l.appendWord(word)
|
||
|
bag.removeWord(word, depthIdx)
|
||
|
}
|
||
|
|
||
|
// appendWord appends `word` to `l`.
|
||
|
// `l.PdfRectangle` is increased to bound the new word.
|
||
|
// `l.fontsize` is the largest of the fontsizes of the words in line.
|
||
|
func (l *textLine) appendWord(word *textWord) {
|
||
|
l.words = append(l.words, word)
|
||
|
l.PdfRectangle = rectUnion(l.PdfRectangle, word.PdfRectangle)
|
||
|
if word.fontsize > l.fontsize {
|
||
|
l.fontsize = word.fontsize
|
||
|
}
|
||
|
if word.depth > l.depth {
|
||
|
l.depth = word.depth
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// markWordBoundaries marks the word fragments that are the first fragments in whole words.
|
||
|
func (l *textLine) markWordBoundaries() {
|
||
|
maxGap := maxIntraLineGapR * l.fontsize
|
||
|
for i, w := range l.words[1:] {
|
||
|
if gapReading(w, l.words[i]) >= maxGap {
|
||
|
w.newWord = true
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// endsInHyphen attempts to detect words that are split between lines
|
||
|
// IT currently returns true if `l` ends in a hyphen and its last minHyphenation runes don't coataib
|
||
|
// a space.
|
||
|
// TODO(peterwilliams97): Figure out a better heuristic
|
||
|
func (l *textLine) endsInHyphen() bool {
|
||
|
// Computing l.text() is a little expensive so we filter out simple cases first.
|
||
|
lastWord := l.words[len(l.words)-1]
|
||
|
runes := []rune(lastWord.text)
|
||
|
if !unicode.Is(unicode.Hyphen, runes[len(runes)-1]) {
|
||
|
return false
|
||
|
}
|
||
|
if lastWord.newWord && endsInHyphen(runes) {
|
||
|
return true
|
||
|
}
|
||
|
return endsInHyphen([]rune(l.text()))
|
||
|
}
|
||
|
|
||
|
// endsInHyphen returns true if `runes` ends with a hyphenated word.
|
||
|
func endsInHyphen(runes []rune) bool {
|
||
|
return len(runes) >= minHyphenation &&
|
||
|
unicode.Is(unicode.Hyphen, runes[len(runes)-1]) &&
|
||
|
!unicode.IsSpace(runes[len(runes)-2])
|
||
|
}
|