unipdf/extractor/text_line.go

/*
 * This file is subject to the terms and conditions defined in
 * file 'LICENSE.md', which is part of this source code package.
 */

package extractor

import (
	"fmt"
	"strings"
	"unicode"

	"github.com/unidoc/unipdf/v3/model"
)

// textLine represents words on the same line within a textPara.
// The bounding box, depth and fontsize are kept up to date by appendWord as words are added.
type textLine struct {
	model.PdfRectangle             // Bounding box (union of the bounding boxes of `words`).
	depth              float64     // Distance from bottom of line to top of page.
	words              []*textWord // Words in this line.
	fontsize           float64     // Largest word font size.
}

// newTextLine builds a line seeded from the first word of `b` for `depthIdx`. The line starts
// with that word's bounding box, font size and depth, and the word is then moved out of `b` and
// into the line.
func newTextLine(b *wordBag, depthIdx int) *textLine {
	first := b.firstWord(depthIdx)
	line := &textLine{
		depth:        first.depth,
		fontsize:     first.fontsize,
		PdfRectangle: first.PdfRectangle,
	}
	line.pullWord(b, first, depthIdx)
	return line
}

// String returns a one-line description of `l`: its depth, bounding box, font size and
// double-quoted text.
func (l *textLine) String() string {
	const format = `%.2f %6.2f fontsize=%.2f "%s"`
	return fmt.Sprintf(format, l.depth, l.PdfRectangle, l.fontsize, l.text())
}

// bbox makes textLine implement the `bounded` interface.
// It returns the line's bounding box, i.e. the embedded PdfRectangle.
func (l *textLine) bbox() model.PdfRectangle {
	return l.PdfRectangle
}

// text returns the extracted text contained in line.
// The texts of the words are concatenated in order, with a single space written before every
// word that is flagged as the start of a new word.
func (l *textLine) text() string {
	var sb strings.Builder
	for _, word := range l.words {
		if word.newWord {
			sb.WriteString(" ")
		}
		sb.WriteString(word.text)
	}
	return sb.String()
}

// toTextMarks returns the TextMarks contained in `l`.text().
// A space mark is emitted before each word flagged as a new word, mirroring text().
// `offset` is passed to the helpers that build the marks so they get the correct Offset values.
func (l *textLine) toTextMarks(offset *int) []TextMark {
	var result []TextMark
	for _, word := range l.words {
		if word.newWord {
			result = appendSpaceMark(result, offset, " ")
		}
		result = append(result, word.toTextMarks(offset)...)
	}
	return result
}

// pullWord moves `word` from `bag` into `l`.
// The word is first appended to the line (updating the line's bounding box, font size and depth)
// and then removed from `bag`; `depthIdx` is passed through to `bag.removeWord`.
func (l *textLine) pullWord(bag *wordBag, word *textWord, depthIdx int) {
	l.appendWord(word)
	bag.removeWord(word, depthIdx)
}

// appendWord adds `word` to the end of `l`.
// `l.PdfRectangle` grows to the union of itself and the word's bounding box.
// `l.fontsize` and `l.depth` are raised to the word's values when those are larger, so they
// always hold the largest font size and depth of the words in the line.
func (l *textLine) appendWord(word *textWord) {
	if l.depth < word.depth {
		l.depth = word.depth
	}
	if l.fontsize < word.fontsize {
		l.fontsize = word.fontsize
	}
	l.PdfRectangle = rectUnion(l.PdfRectangle, word.PdfRectangle)
	l.words = append(l.words, word)
}

// markWordBoundaries marks the word fragments that are the first fragments in whole words.
// A fragment is flagged (`newWord` set to true) when the gap between it and the preceding
// fragment, as measured by gapReading, is at least maxIntraLineGapR times the line's font size.
// The first fragment of the line is never flagged here, and a line with fewer than two fragments
// is left untouched.
func (l *textLine) markWordBoundaries() {
	maxGap := maxIntraLineGapR * l.fontsize
	// Start at 1 so every fragment is compared with its predecessor. Indexing, rather than
	// ranging over l.words[1:], avoids a slice-bounds panic on a line that has no words.
	for i := 1; i < len(l.words); i++ {
		w := l.words[i]
		if gapReading(w, l.words[i-1]) >= maxGap {
			w.newWord = true
		}
	}
}

// endsInHyphen attempts to detect words that are split between lines.
// It currently returns true if the text of `l` has at least minHyphenation runes, ends in a
// hyphen and the rune just before that hyphen is not a space.
// A line with no words never ends in a hyphen. If the last word has no text, the decision is made
// from the full line text.
// TODO(peterwilliams97): Figure out a better heuristic
func (l *textLine) endsInHyphen() bool {
	if len(l.words) == 0 {
		return false
	}
	// Computing l.text() is a little expensive so we filter out simple cases first.
	lastWord := l.words[len(l.words)-1]
	runes := []rune(lastWord.text)
	// The shortcuts below index the last rune, so they only apply to a non-empty last word.
	if n := len(runes); n > 0 {
		if !unicode.Is(unicode.Hyphen, runes[n-1]) {
			return false
		}
		// The last word starts a new word, so it alone decides the answer when it qualifies.
		if lastWord.newWord && endsInHyphen(runes) {
			return true
		}
	}
	return endsInHyphen([]rune(l.text()))
}

// endsInHyphen returns true if `runes` ends with a hyphenated word: it holds at least
// minHyphenation runes, its last rune is a hyphen and the rune before the hyphen is not a space.
func endsInHyphen(runes []rune) bool {
	n := len(runes)
	if n < minHyphenation {
		return false
	}
	if !unicode.Is(unicode.Hyphen, runes[n-1]) {
		return false
	}
	return !unicode.IsSpace(runes[n-2])
}