unipdf/extractor/text_line.go
2020-05-27 18:15:18 +10:00

135 lines
3.6 KiB
Go

/*
* This file is subject to the terms and conditions defined in
* file 'LICENSE.md', which is part of this source code package.
*/
package extractor
import (
"fmt"
"math"
"strings"
"unicode/utf8"
"github.com/unidoc/unipdf/v3/model"
)
// textLine represents words on the same line within a textPara.
type textLine struct {
serial int // Sequence number for debugging.
model.PdfRectangle // Bounding box (union of the bounding boxes of `words`).
depth float64 // Distance from bottom of line to top of page. Largest depth of the words in this line.
words []*textWord // Words in this line.
fontsize float64 // Largest font size of the words in this line.
hyphenated bool // True if the text of the line ends in '-'. Set by mergeWordFragments.
}
// newTextLine creates a line seeded with the first word in stratum `depthIdx` of `p`.
// The new line starts with that word's bounding box, font size and depth, and the word is
// moved out of `p` and into the line.
// NOTE: This assumes the stratum at `depthIdx` holds at least one word.
func newTextLine(p *textStrata, depthIdx int) *textLine {
	first := p.getStratum(depthIdx)[0]
	line := &textLine{
		serial:       serial.line,
		PdfRectangle: first.PdfRectangle,
		fontsize:     first.fontsize,
		depth:        first.depth,
	}
	serial.line++
	line.moveWord(p, depthIdx, first)
	return line
}
// String returns a description of `l`: its serial number, depth, bounding box, font size and
// text.
func (l *textLine) String() string {
	const format = "serial=%d %.2f %.2f fontsize=%.2f \"%s\""
	return fmt.Sprintf(format, l.serial, l.depth, l.PdfRectangle, l.fontsize, l.text())
}
// bbox returns the bounding box of `l`. It makes textLine implement the `bounded` interface.
func (l *textLine) bbox() model.PdfRectangle {
	box := l.PdfRectangle
	return box
}
// text returns the extracted text contained in `l`: the text of each word in order, with a
// single space after every word that has `spaceAfter` set.
func (l *textLine) text() string {
	var sb strings.Builder
	for _, w := range l.words {
		sb.WriteString(w.text())
		if w.spaceAfter {
			sb.WriteString(" ")
		}
	}
	return sb.String()
}
// toTextMarks returns the TextMarks that make up `l`.text().
// `offset` is the running offset used to set the Offset of each TextMark. It is advanced by the
// byte length of each mark's Text, so the caller can carry it over to the next line.
func (l *textLine) toTextMarks(offset *int) []TextMark {
	var marks []TextMark

	// emit stamps `m` with the current offset, appends it and advances the offset.
	emit := func(m TextMark) {
		m.Offset = *offset
		*offset += len(m.Text)
		marks = append(marks, m)
	}

	for _, word := range l.words {
		for _, tm := range word.marks {
			emit(tm.ToTextMark())
		}
		if !word.spaceAfter {
			continue
		}
		// The space between words is a copy of `spaceMark` with its text set to one space.
		space := spaceMark
		space.Text = " "
		emit(space)
	}
	return marks
}
// moveWord appends `word` to `l` and then removes it from `s` with s.removeWord(depthIdx, word).
// `l.PdfRectangle` is grown to bound the new word.
// `l.fontsize` and `l.depth` are raised to the word's values if those are larger, so they stay
// the largest font size and depth of the words in the line.
func (l *textLine) moveWord(s *textStrata, depthIdx int, word *textWord) {
	if l.depth < word.depth {
		l.depth = word.depth
	}
	if l.fontsize < word.fontsize {
		l.fontsize = word.fontsize
	}
	l.PdfRectangle = rectUnion(l.PdfRectangle, word.PdfRectangle)
	l.words = append(l.words, word)

	s.removeWord(depthIdx, word)
}
// mergeWordFragments merges the word fragments in the words in `l` and then records whether
// the line ends in a hyphen.
//
// Each word is compared with the last word kept so far:
//   - If the reading gap between them is at least maxIntraLineGapR * line font size, they are
//     separate words and the earlier one is marked as followed by a space.
//   - Otherwise, if the last font of the earlier word equals the first font of this word and
//     their font sizes differ by less than maxIntraWordFontTolR * line font size, this word is
//     absorbed into the earlier one.
//   - Otherwise the word is kept as a separate word with no space added.
func (l *textLine) mergeWordFragments() {
	if len(l.words) > 1 {
		gapLimit := maxIntraLineGapR * l.fontsize
		sizeTol := maxIntraWordFontTolR * l.fontsize

		merged := []*textWord{l.words[0]}
		for _, word := range l.words[1:] {
			prev := merged[len(merged)-1]
			switch {
			case gapReading(word, prev) >= gapLimit:
				prev.spaceAfter = true
				merged = append(merged, word)
			case prev.font(prev.len()-1) == word.font(0) &&
				math.Abs(prev.fontsize-word.fontsize) < sizeTol:
				prev.absorb(word)
			default:
				merged = append(merged, word)
			}
		}
		l.words = merged
	}

	// The line is hyphenated if the last rune of its text is a '-'.
	lastRune, _ := utf8.DecodeLastRuneInString(l.text())
	l.hyphenated = lastRune == '-'
}