unipdf/extractor/text_para.go

305 lines
8.2 KiB
Go

/*
* This file is subject to the terms and conditions defined in
* file 'LICENSE.md', which is part of this source code package.
*/
package extractor
import (
"bytes"
"fmt"
"io"
"sort"
"unicode"
"github.com/unidoc/unipdf/v3/common"
"github.com/unidoc/unipdf/v3/model"
)
// paraList is a sequence of textPara pointers. It is used often enough that it is convenient to
// give it its own type so that methods can be defined on it.
type paraList []*textPara
// textPara is a group of words in a rectangular region of a page that get read together.
// A paragraph in a document might span multiple pages. This is the paragraph fragment on one page.
// We start by finding paragraph regions on a page, then we break the words in the textPara into
// textLines.
type textPara struct {
serial int // Sequence number for debugging.
model.PdfRectangle // Bounding box.
eBBox model.PdfRectangle // Extended bounding box needed to compute reading order.
lines []*textLine // Paragraph text gets broken into lines.
// table, when non-nil, makes writeText and toTextMarks emit the table's cells row by row
// instead of `lines`.
table *textTable
}
// newTextPara returns a textPara with the same bounding rectangle as `strata`.
// The para is stamped with the current value of the package-level para serial counter, which is
// then incremented.
func newTextPara(strata *textStrata) *textPara {
	para := &textPara{PdfRectangle: strata.PdfRectangle}
	para.serial = serial.para
	serial.para++
	return para
}
// String returns a description of `p`: its serial number, bounding box, number of lines and
// the first part of its text.
func (p *textPara) String() string {
	preview := truncate(p.text(), 50)
	numLines := len(p.lines)
	return fmt.Sprintf("serial=%d %.2f %d lines %q", p.serial, p.PdfRectangle, numLines, preview)
}
// text returns the text of `p`, as produced by writeText, as a string.
func (p *textPara) text() string {
	var buf bytes.Buffer
	p.writeText(&buf)
	return buf.String()
}
// writeText writes the text of `p` including tables to `w`.
// A para without a table is written with writeCellText. A para with a table is written one row
// at a time: every cell is followed by a space, a missing cell is written as a tab, and rows
// are separated by newlines.
func (p *textPara) writeText(w io.Writer) {
	table := p.table
	if table == nil {
		p.writeCellText(w)
		return
	}
	for row := 0; row < table.h; row++ {
		for col := 0; col < table.w; col++ {
			if cell := table.get(col, row); cell != nil {
				cell.writeCellText(w)
			} else {
				io.WriteString(w, "\t")
			}
			io.WriteString(w, " ")
		}
		// No newline after the final row.
		if row+1 < table.h {
			io.WriteString(w, "\n")
		}
	}
}
// toTextMarks creates the TextMarkArray corresponding to the extracted text created by
// `p`.writeText().
// It mirrors writeText: the same tab, space and newline separators are appended as space marks,
// and `offset` is threaded through every call.
func (p *textPara) toTextMarks(offset *int) []TextMark {
	table := p.table
	if table == nil {
		return p.toCellTextMarks(offset)
	}
	var marks []TextMark
	for row := 0; row < table.h; row++ {
		for col := 0; col < table.w; col++ {
			if cell := table.get(col, row); cell != nil {
				marks = append(marks, cell.toCellTextMarks(offset)...)
			} else {
				marks = appendSpaceMark(marks, offset, "\t")
			}
			marks = appendSpaceMark(marks, offset, " ")
		}
		// No newline after the final row.
		if row+1 < table.h {
			marks = appendSpaceMark(marks, offset, "\n")
		}
	}
	return marks
}
// writeCellText writes the text of `p` not including tables to `w`.
// When doHyphens is set, a hyphenated line that is not the last line has its final rune
// removed and is joined directly to the next line. Otherwise consecutive lines are separated
// by the string returned by getSpace.
func (p *textPara) writeCellText(w io.Writer) {
	last := len(p.lines) - 1
	for i, line := range p.lines {
		text := line.text()
		dehyphenate := doHyphens && line.hyphenated && i != last
		if dehyphenate {
			// Line ending with hyphen. Remove it.
			text = removeLastRune(text)
		}
		io.WriteString(w, text)
		if dehyphenate || i == last {
			continue
		}
		io.WriteString(w, getSpace(line.depth, p.lines[i+1].depth))
	}
}
// toCellTextMarks creates the TextMarkArray corresponding to the extracted text created by
// `p`.writeCellText().
// It panics if a line that is to be de-hyphenated has fewer than minHyphenation runes or has
// no text marks.
func (p *textPara) toCellTextMarks(offset *int) []TextMark {
	var marks []TextMark
	last := len(p.lines) - 1
	for i, line := range p.lines {
		lineMarks := line.toTextMarks(offset)
		dehyphenate := doHyphens && line.hyphenated && i != last
		if dehyphenate {
			// Line ending with hyphen. Remove it.
			if len([]rune(line.text())) < minHyphenation || len(lineMarks) < 1 {
				panic(line.text())
			}
			lineMarks = removeLastTextMarkRune(lineMarks, offset)
		}
		marks = append(marks, lineMarks...)
		if dehyphenate || i == last {
			continue
		}
		marks = appendSpaceMark(marks, offset, getSpace(line.depth, p.lines[i+1].depth))
	}
	return marks
}
// removeLastTextMarkRune removes the final rune from the text of the last mark in `marks` and
// moves `*offset` back to the end of the remaining text. It returns the updated marks.
//
// If the last mark holds a single rune, the whole mark is dropped. `*offset` is then set to the
// end of the preceding mark, or to the start of the dropped mark when there is no preceding
// mark. Otherwise the last mark's text is shortened in place and `*offset` is reduced by the
// number of bytes removed.
//
// `marks` must be non-empty and its last mark must have non-empty text.
// It panics if the rune to be removed is whitespace.
func removeLastTextMarkRune(marks []TextMark, offset *int) []TextMark {
	last := len(marks) - 1
	tm := marks[last]
	runes := []rune(tm.Text)
	if unicode.IsSpace(runes[len(runes)-1]) {
		panic(tm)
	}
	if len(runes) == 1 {
		// The mark consisted of only the removed rune, so drop the mark itself.
		marks = marks[:last]
		if last == 0 {
			// No preceding mark: the text now ends where the dropped mark started.
			*offset = tm.Offset
		} else {
			prev := marks[last-1]
			*offset = prev.Offset + len(prev.Text)
		}
		return marks
	}
	text := string(runes[:len(runes)-1])
	*offset += len(text) - len(tm.Text)
	tm.Text = text
	// `tm` is a copy of the slice element, so store it back for the shortened text to take effect.
	marks[last] = tm
	return marks
}
// removeLastRune returns `text` with its final rune removed.
// Text of zero or one runes yields the empty string, so a line consisting solely of a hyphen
// is reduced to nothing rather than causing a panic.
func removeLastRune(text string) string {
	runes := []rune(text)
	if len(runes) <= 1 {
		return ""
	}
	return string(runes[:len(runes)-1])
}
// getSpace returns the separator to insert between consecutive lines of depth `depth1` and
// `depth2`. If isZero reports the depths as equal, the two lines are on the same line of the
// extracted text and a space is returned; otherwise a newline is returned.
func getSpace(depth1, depth2 float64) string {
	if isZero(depth1 - depth2) {
		return " "
	}
	return "\n"
}
// bbox returns the bounding box of `p`. It makes textPara implement the `bounded` interface.
func (p *textPara) bbox() model.PdfRectangle {
	rect := p.PdfRectangle
	return rect
}
// fontsize returns the para's font size, which we take to be the font size of its first line.
// It panics if the para has no lines.
func (p *textPara) fontsize() float64 {
	if len(p.lines) > 0 {
		return p.lines[0].fontsize
	}
	panic(p)
}
// composePara builds a textPara from the words in `strata`.
// It does this by arranging the words in `strata` into lines: for every depth index returned by
// strata.depthIndexes() it keeps starting new lines, moving words out of `strata` into each
// line, until strata.empty() reports that index as empty. The lines are then sorted with
// diffDepthReading.
// It panics if no lines were built.
func composePara(strata *textStrata) *textPara {
para := newTextPara(strata)
// build the lines
for _, depthIdx := range strata.depthIndexes() {
// Each pass of this loop builds one line; it repeats until `depthIdx` has no words left.
for !strata.empty(depthIdx) {
// words[0] is the leftmost word from bins near `depthIdx`.
firstReadingIdx := strata.firstReadingIndex(depthIdx)
// create a new line
words := strata.getStratum(firstReadingIdx)
word0 := words[0]
line := newTextLine(strata, firstReadingIdx)
// `lastWord` tracks the word most recently placed on the line; it starts as word0.
lastWord := words[0]
// compute the search range
// this is based on word0, the first word in the `firstReadingIdx` bin.
// The depth window and gap limit are the lineDepthR and maxIntraWordGapR ratios scaled by the
// strata's font size.
fontSize := strata.fontsize
minDepth := word0.depth - lineDepthR*fontSize
maxDepth := word0.depth + lineDepthR*fontSize
maxIntraWordGap := maxIntraWordGapR * fontSize
remainingWords:
// find the rest of the words in this line
for {
// Search for `leftWord`, the left-most word w: minDepth <= w.depth <= maxDepth.
var leftWord *textWord
leftDepthIdx := 0
// NOTE: `depthIdx` and `words` below shadow the outer variables of the same names.
for _, depthIdx := range strata.depthBand(minDepth, maxDepth) {
words := strata.stratumBand(depthIdx, minDepth, maxDepth)
if len(words) == 0 {
continue
}
word := words[0]
// NOTE(review): gapReading presumably measures the reading-direction gap between `lastWord`
// and `word` — confirm against its definition.
gap := gapReading(word, lastWord)
// A gap more negative than the allowed overlap ends the line immediately.
if gap < -maxIntraLineOverlapR*fontSize {
break remainingWords
}
// No `leftWord` or `word` to the left of `leftWord`.
if gap < maxIntraWordGap {
if leftWord == nil || diffReading(word, leftWord) < 0 {
leftDepthIdx = depthIdx
leftWord = word
}
}
}
// No candidate word was found within the gap limit, so the line is complete.
if leftWord == nil {
break
}
// remove `leftWord` from `strata`[`leftDepthIdx`], and append it to `line`.
line.moveWord(strata, leftDepthIdx, leftWord)
lastWord = leftWord
// // TODO(peterwilliams97): Replace lastWord with line.words[len(line.words)-1] ???
// if lastWord != line.words[len(line.words)-1] {
// panic("ddd")
// }
}
line.mergeWordFragments()
// add the line
para.lines = append(para.lines, line)
}
}
// Order the lines using diffDepthReading.
sort.Slice(para.lines, func(i, j int) bool {
return diffDepthReading(para.lines[i], para.lines[j]) < 0
})
// A para without lines is treated as a broken invariant.
if len(para.lines) == 0 {
panic(para)
}
// Debug output: dump the para, then every line, word and mark in it.
if verbosePara {
common.Log.Info("!!! para=%s", para.String())
for i, line := range para.lines {
fmt.Printf("%4d: %s\n", i, line)
for j, word := range line.words {
fmt.Printf("%8d: %s\n", j, word)
for k, mark := range word.marks {
fmt.Printf("%12d: %s\n", k, mark)
}
}
}
}
return para
}
// log logs the contents of `paras`, prefixed by `title`. It does nothing unless verbosePage is
// set. Nil paras are skipped.
// It panics if a para has empty text or has a table with no cells.
func (paras paraList) log(title string) {
	if !verbosePage {
		return
	}
	common.Log.Info("%8s: %d paras =======-------=======", title, len(paras))
	for idx, p := range paras {
		if p == nil {
			continue
		}
		content := p.text()
		hasTable := p.table != nil
		tableDesc := " "
		if hasTable {
			tableDesc = fmt.Sprintf("[%dx%d]", p.table.w, p.table.h)
		}
		fmt.Printf("%4d: %6.2f %s %q\n", idx, p.PdfRectangle, tableDesc, truncate(content, 50))
		switch {
		case len(content) == 0:
			panic("empty")
		case hasTable && len(p.table.cells) == 0:
			panic(p)
		}
	}
}