unipdf/extractor/text_para.go

/*
 * This file is subject to the terms and conditions defined in
 * file 'LICENSE.md', which is part of this source code package.
 */

package extractor

import (
	"bytes"
	"fmt"
	"io"
	"sort"
	"unicode"

	"github.com/unidoc/unipdf/v3/model"
)

// textPara is a group of words in a rectangular region of a page that get read together.
// A paragraph in a document might span multiple pages. This is the paragraph fragment on one page.
// We start by finding paragraph regions on a page, then we break the words in the textPara into
// textLines.
type textPara struct {
	serial             int                // Sequence number for debugging.
	model.PdfRectangle                    // Bounding box.
	eBBox              model.PdfRectangle // Extended bounding box needed to compute reading order.
	lines              []*textLine        // Paragraph text gets broken into lines.
	table              *textTable         // If non-nil, the cells of this table are emitted in place of `lines`.
}

// newTextPara allocates a textPara whose bounding rectangle is copied from `strata`.
// The paragraph is given the current value of the package-level `serial.para` counter as its
// serial number, and the counter is then incremented.
func newTextPara(strata *textStrata) *textPara {
	p := &textPara{PdfRectangle: strata.PdfRectangle}
	p.serial = serial.para
	serial.para++
	return p
}

// String returns a debugging description of `p`: its serial number, bounding box, number of
// lines and text, terminated by a row of dashes.
func (p *textPara) String() string {
	const layout = "serial=%d %.2f %d lines\n%s\n-------------"
	numLines := len(p.lines)
	return fmt.Sprintf(layout, p.serial, p.PdfRectangle, numLines, p.text())
}

// text returns everything `p`.writeText() emits, collected into a string.
func (p *textPara) text() string {
	var buf bytes.Buffer
	p.writeText(&buf)
	return buf.String()
}

// writeText writes the text of `p` including tables to `w`.
// A paragraph without a table is written as its cell text followed by a newline.
// A paragraph with a table is written row by row: each cell's text is followed by a space and
// each row is terminated by a newline.
func (p *textPara) writeText(w io.Writer) {
	if p.table == nil {
		p.writeCellText(w)
		w.Write([]byte("\n"))
		return
	}

	tbl := p.table
	for row := 0; row < tbl.h; row++ {
		// Cells are stored row-major, so `first` is the index of the first cell in this row.
		first := row * tbl.w
		for col := 0; col < tbl.w; col++ {
			cell := tbl.cells[first+col]
			cell.writeCellText(w)
			w.Write([]byte(" "))
		}
		w.Write([]byte("\n"))
	}
}

// writeCellText writes the text of the lines of `p`, not including tables, to `w`.
// Consecutive lines are joined by a space when they have the same depth and by a newline
// otherwise. When `doHyphens` is set, a hyphenated line that is not the last line loses its final
// rune and is joined directly to the following line. Nothing is written after the last line.
func (p *textPara) writeCellText(w io.Writer) {
	final := len(p.lines) - 1
	for i, line := range p.lines {
		text := line.text()

		if doHyphens && line.hyphenated && i != final {
			// Line ending with hyphen. Write it without the hyphen and without a separator.
			chars := []rune(text)
			w.Write([]byte(string(chars[:len(chars)-1])))
			continue
		}

		w.Write([]byte(text))
		if i >= final {
			continue
		}

		// A following line at the same depth is on the same line of the extracted text.
		sep := "\n"
		if isZero(line.depth - p.lines[i+1].depth) {
			sep = " "
		}
		w.Write([]byte(sep))
	}
}

// toTextMarks returns the TextMarks for `p`, following the same layout as `p`.writeText().
// A paragraph without a table gives its cell marks followed by a newline mark. A paragraph with a
// table gives, row by row, each cell's marks followed by a space mark, with a newline mark ending
// every row.
// `offset` is used as the Offset of the next mark and is advanced past every mark that is added.
func (p *textPara) toTextMarks(offset *int) []TextMark {
	var marks []TextMark

	// appendSpace adds a copy of `spaceMark` with text `text` at the current offset and moves
	// the offset past it.
	appendSpace := func(text string) {
		m := spaceMark
		m.Text = text
		m.Offset = *offset
		marks = append(marks, m)
		*offset += len(text)
	}

	if p.table == nil {
		marks = p.toCellTextMarks(offset)
		appendSpace("\n")
		return marks
	}

	tbl := p.table
	for row := 0; row < tbl.h; row++ {
		for col := 0; col < tbl.w; col++ {
			cell := tbl.cells[row*tbl.w+col]
			marks = append(marks, cell.toCellTextMarks(offset)...)
			appendSpace(" ")
		}
		appendSpace("\n")
	}
	return marks
}

// toCellTextMarks creates the TextMarks corresponding to the text written by
// `p`.writeCellText(), i.e. the text of the lines of `p` not including tables.
// `offset` is used as the Offset of the next mark and is advanced past every mark that is added,
// so on return it has grown by the number of bytes writeCellText() writes.
// Consecutive lines are separated by a space mark when they have the same depth and by a newline
// mark otherwise. When `doHyphens` is set, a hyphenated line that is not the last line has its
// final rune removed and no separator mark. No mark is added after the last line.
// It panics if the rune to be removed from a hyphenated line is whitespace.
func (p *textPara) toCellTextMarks(offset *int) []TextMark {
	var marks []TextMark
	// addSpaceMark appends a copy of `spaceMark` with text `spaceChar` at the current offset.
	addSpaceMark := func(spaceChar string) {
		mark := spaceMark
		mark.Text = spaceChar
		mark.Offset = *offset
		marks = append(marks, mark)
		*offset += len(mark.Text)
	}

	lastLine := len(p.lines) - 1
	for il, line := range p.lines {
		lineMarks := line.toTextMarks(offset)
		marks = append(marks, lineMarks...)

		reduced := doHyphens && line.hyphenated && il != lastLine
		if reduced && len(lineMarks) > 0 {
			// Line ending with hyphen. Remove the last rune of the line's last mark.
			last := len(marks) - 1
			tm := marks[last]
			r := []rune(tm.Text)
			if len(r) > 0 && unicode.IsSpace(r[len(r)-1]) {
				panic(tm)
			}
			if len(r) <= 1 {
				// The mark holds nothing but the hyphen, so drop the whole mark and rewind
				// the offset to the end of the previous mark, or to the start of the dropped
				// mark if there is no previous one.
				marks = marks[:last]
				if last > 0 {
					prev := marks[last-1]
					*offset = prev.Offset + len(prev.Text)
				} else {
					*offset = tm.Offset
				}
			} else {
				s := string(r[:len(r)-1])
				*offset += len(s) - len(tm.Text)
				// `tm` is a copy, so the shortened text must be stored in the slice element.
				marks[last].Text = s
			}
		}
		if reduced {
			// We removed the hyphen from the end of the line so we don't need a line ending.
			continue
		}
		if il >= lastLine {
			// writeCellText() writes nothing after the last line, so neither do we.
			continue
		}
		if isZero(line.depth - p.lines[il+1].depth) {
			// Next line is the same depth so it's the same line as this one in the extracted text
			addSpaceMark(" ")
		} else {
			addSpaceMark("\n")
		}
	}

	return marks
}

// bbox returns the bounding box of `p`, which is its embedded PdfRectangle.
// It makes textPara implement the `bounded` interface.
func (p *textPara) bbox() model.PdfRectangle {
	return p.PdfRectangle
}

// fontsize returns the font size of `p`, which we take to be the font size of its first line.
// It panics with `p` if the paragraph has no lines.
func (p *textPara) fontsize() float64 {
	if len(p.lines) > 0 {
		return p.lines[0].fontsize
	}
	panic(p)
}

// composePara builds a textPara from the words in `strata`.
// It does this by arranging the words in `strata` into lines. For each depth index it repeatedly
// starts a line at the stratum chosen by strata.firstReadingIndex() and extends it with words
// taken from the depth band around the line's first word, until no candidate word is close enough
// to the last word added. The finished lines are sorted with diffDepthReading().
// It panics with the paragraph if no lines were created.
func composePara(strata *textStrata) *textPara {
	para := newTextPara(strata)

	// build the lines
	for _, depthIdx := range strata.depthIndexes() {
		// Keep building lines until the `depthIdx` stratum reports empty.
		// NOTE(review): presumably newTextLine() and moveWord() remove words from `strata`, which
		// is what lets this loop terminate -- confirm against their definitions.
		for !strata.empty(depthIdx) {

			// words[0] is the leftmost word from bins near `depthIdx`.
			firstReadingIdx := strata.firstReadingIndex(depthIdx)
			// create a new line
			words := strata.getStratum(firstReadingIdx)
			word0 := words[0]
			line := newTextLine(strata, firstReadingIdx)
			// `lastWord` is the word most recently added to the line. It starts as `word0`.
			lastWord := words[0]

			// compute the search range
			// this is based on word0, the first word in the `firstReadingIdx` bin.
			fontSize := strata.fontsize
			minDepth := word0.depth - lineDepthR*fontSize
			maxDepth := word0.depth + lineDepthR*fontSize
			maxIntraWordGap := maxIntraWordGapR * fontSize

		remainingWords:
			// find the rest of the words in this line
			for {
				// Search for `leftWord`, the left-most word w: minDepth <= w.depth <= maxDepth.
				var leftWord *textWord
				leftDepthIdx := 0
				// This `depthIdx` and `words` shadow the outer variables of the same names.
				for _, depthIdx := range strata.depthBand(minDepth, maxDepth) {
					words := strata.stratumBand(depthIdx, minDepth, maxDepth)
					if len(words) == 0 {
						continue
					}
					// Only the first word of each stratum in the band is a candidate.
					word := words[0]
					// NOTE(review): presumably the gap in the reading direction between
					// `lastWord` and `word` -- confirm against gapReading().
					gap := gapReading(word, lastWord)
					if gap < -maxIntraLineOverlapR*fontSize {
						// The gap is more negative than the allowed overlap. Stop extending
						// this line altogether.
						break remainingWords
					}
					// No `leftWord` or `word` to the left of `leftWord`.
					if gap < maxIntraWordGap {
						if leftWord == nil || diffReading(word, leftWord) < 0 {
							leftDepthIdx = depthIdx
							leftWord = word
						}
					}
				}
				if leftWord == nil {
					// No candidate was close enough to `lastWord`, so the line is complete.
					break
				}

				// remove `leftWord` from `strata`[`leftDepthIdx`], and append it to `line`.
				line.moveWord(strata, leftDepthIdx, leftWord)
				lastWord = leftWord
				// // TODO(peterwilliams97): Replace lastWord with line.words[len(line.words)-1] ???
				// if lastWord != line.words[len(line.words)-1] {
				// 	panic("ddd")
				// }
			}

			line.mergeWordFragments()
			// add the line
			para.lines = append(para.lines, line)
		}
	}

	// Order the lines using diffDepthReading().
	sort.Slice(para.lines, func(i, j int) bool {
		return diffDepthReading(para.lines[i], para.lines[j]) < 0
	})
	// A paragraph with no lines is treated as a broken invariant.
	if len(para.lines) == 0 {
		panic(para)
	}
	return para
}
First version of text extraction that recognizes columns 2020-05-24 21:00:37 +10:00			`/*`
			`* This file is subject to the terms and conditions defined in`
			`* file 'LICENSE.md', which is part of this source code package.`
			`*/`

			`package extractor`

			`import (`
Adding tables to text extractor. 2020-06-01 14:04:32 +10:00			`"bytes"`
First version of text extraction that recognizes columns 2020-05-24 21:00:37 +10:00			`"fmt"`
Adding tables to text extractor. 2020-06-01 14:04:32 +10:00			`"io"`
First version of text extraction that recognizes columns 2020-05-24 21:00:37 +10:00			`"sort"`
Adding tables to text extractor. 2020-06-01 14:04:32 +10:00			`"unicode"`
First version of text extraction that recognizes columns 2020-05-24 21:00:37 +10:00
			`"github.com/unidoc/unipdf/v3/model"`
			`)`

			`// textPara is a group of words in a rectangular region of a page that get read together.`
			`// An peragraph in a document might span multiple pages. This is the paragraph framgent on one page.`
			`// We start by finding paragraph regions on a page, then we break the words into the textPara into`
			`// textLines.`
			`type textPara struct {`
			`serial int // Sequence number for debugging.`
			`model.PdfRectangle // Bounding box.`
			`eBBox model.PdfRectangle // Extented ounding box needed to compute reading order.`
			`lines []*textLine // Paragraph text gets broken into lines.`
Adding tables to text extractor. 2020-06-01 14:04:32 +10:00			`table *textTable`
First version of text extraction that recognizes columns 2020-05-24 21:00:37 +10:00			`}`

			// newTextPara returns a textPara with the same bouding rectangle as `strata`.
			`func newTextPara(strata textStrata) textPara {`
			`para := textPara{`
			`serial: serial.para,`
			`PdfRectangle: strata.PdfRectangle,`
			`}`
			`serial.para++`
			`return &para`
			`}`

			// String returns a description of `p`.
			`func (p *textPara) String() string {`
Got text_test.go passing. 2020-05-27 18:15:18 +10:00			`return fmt.Sprintf("serial=%d %.2f %d lines\n%s\n-------------",`
			`p.serial, p.PdfRectangle, len(p.lines), p.text())`
			`}`

			// text returns the text of the lines in `p`.
			`func (p *textPara) text() string {`
Adding tables to text extractor. 2020-06-01 14:04:32 +10:00			`w := new(bytes.Buffer)`
			`p.writeText(w)`
			`return w.String()`
			`}`

			// writeText writes the text of `p` including tables to `w`.
			`func (p *textPara) writeText(w io.Writer) {`
			`if p.table != nil {`
			`for y := 0; y < p.table.h; y++ {`
			`for x := 0; x < p.table.w; x++ {`
			`cell := p.table.cells[y*p.table.w+x]`
			`cell.writeCellText(w)`
			`w.Write([]byte(" "))`
			`}`
			`w.Write([]byte("\n"))`
			`}`
			`} else {`
			`p.writeCellText(w)`
			`w.Write([]byte("\n"))`
			`}`
			`}`

			// writeCellText writes the text of `p` not including tables to `w`.
			`func (p *textPara) writeCellText(w io.Writer) {`
			`// w := new(bytes.Buffer)`
			`para := p`
			`for il, line := range para.lines {`
			`s := line.text()`
			`reduced := false`
			`if doHyphens {`
			`if line.hyphenated && il != len(para.lines)-1 {`
			`// Line ending with hyphen. Remove it.`
			`runes := []rune(s)`
			`s = string(runes[:len(runes)-1])`
			`reduced = true`
			`}`
			`}`
			`w.Write([]byte(s))`
			`if reduced {`
			`// We removed the hyphen from the end of the line so we don't need a line ending.`
			`continue`
			`}`
			`if il < len(para.lines)-1 && isZero(line.depth-para.lines[il+1].depth) {`
			`// Next line is the same depth so it's the same line as this one in the extracted text`
			`w.Write([]byte(" "))`
			`continue`
			`}`
			`if il < len(para.lines)-1 {`
			`w.Write([]byte("\n"))`
			`}`
			`}`
			`}`

			`// toTextMarks creates the TextMarkArray corresponding to the extracted text created by`
			// paras `p`.writeText().
			`func (p textPara) toTextMarks(offset int) []TextMark {`
			`var marks []TextMark`
			`addMark := func(mark TextMark) {`
			`mark.Offset = *offset`
			`marks = append(marks, mark)`
			`*offset += len(mark.Text)`
			`}`
			`addSpaceMark := func(spaceChar string) {`
			`mark := spaceMark`
			`mark.Text = spaceChar`
			`addMark(mark)`
			`}`
			`if p.table != nil {`
			`for y := 0; y < p.table.h; y++ {`
			`for x := 0; x < p.table.w; x++ {`
			`cell := p.table.cells[y*p.table.w+x]`
			`cellMarks := cell.toCellTextMarks(offset)`
			`marks = append(marks, cellMarks...)`
			`addSpaceMark(" ")`
			`}`
			`addSpaceMark("\n")`
			`}`
			`} else {`
			`marks = p.toCellTextMarks(offset)`
			`addSpaceMark("\n")`
Got text_test.go passing. 2020-05-27 18:15:18 +10:00			`}`
Adding tables to text extractor. 2020-06-01 14:04:32 +10:00			`return marks`
			`}`

			`// toTextMarks creates the TextMarkArray corresponding to the extracted text created by`
			// paras `paras`.writeCellText().
			`func (p textPara) toCellTextMarks(offset int) []TextMark {`
			`var marks []TextMark`
			`addMark := func(mark TextMark) {`
			`mark.Offset = *offset`
			`marks = append(marks, mark)`
			`*offset += len(mark.Text)`
			`}`
			`addSpaceMark := func(spaceChar string) {`
			`mark := spaceMark`
			`mark.Text = spaceChar`
			`addMark(mark)`
			`}`
			`para := p`

			`for il, line := range para.lines {`
			`lineMarks := line.toTextMarks(offset)`
			`marks = append(marks, lineMarks...)`
			`reduced := false`
			`if doHyphens {`
			`if line.hyphenated && il != len(para.lines)-1 {`
			`tm := marks[len(marks)-1]`
			`r := []rune(tm.Text)`
			`if unicode.IsSpace(r[len(r)-1]) {`
			`panic(tm)`
			`}`
			`if len(r) == 1 {`
			`marks = marks[:len(marks)-1]`
			`*offset = marks[len(marks)-1].Offset + len(marks[len(marks)-1].Text)`
			`} else {`
			`s := string(r[:len(r)-1])`
			`*offset += len(s) - len(tm.Text)`
			`tm.Text = s`
			`}`
			`reduced = true`
			`}`
			`}`
			`if reduced {`
			`continue`
			`}`
			`if il < len(para.lines)-1 && isZero(line.depth-para.lines[il+1].depth) {`
			`// Next line is the same depth so it's the same line as this one in the extracted text`
			`addSpaceMark(" ")`
			`continue`
			`}`
			`if il < len(para.lines)-1 {`
			`addSpaceMark("\n")`
			`}`
			`}`

			`addSpaceMark("\n")`

			`return marks`
First version of text extraction that recognizes columns 2020-05-24 21:00:37 +10:00			`}`

Added function comments. 2020-05-25 14:00:00 +10:00			// bbox makes textPara implement the `bounded` interface.
First version of text extraction that recognizes columns 2020-05-24 21:00:37 +10:00			`func (p *textPara) bbox() model.PdfRectangle {`
			`return p.PdfRectangle`
			`}`

Adding tables to text extractor. 2020-06-01 14:04:32 +10:00			`// fontsize return the para's fontsize which we take to be the first line's fontsize`
			`func (p *textPara) fontsize() float64 {`
			`if len(p.lines) == 0 {`
			`panic(p)`
			`}`
			`return p.lines[0].fontsize`
			`}`

First version of text extraction that recognizes columns 2020-05-24 21:00:37 +10:00			// composePara builds a textPara from the words in `strata`.
			// It does this by arranging the words in `strata` into lines.
			`func composePara(strata textStrata) textPara {`
			`para := newTextPara(strata)`

			`// build the lines`
			`for _, depthIdx := range strata.depthIndexes() {`
			`for !strata.empty(depthIdx) {`

			// words[0] is the leftmost word from bins near `depthIdx`.
			`firstReadingIdx := strata.firstReadingIndex(depthIdx)`
			`// create a new line`
			`words := strata.getStratum(firstReadingIdx)`
			`word0 := words[0]`
			`line := newTextLine(strata, firstReadingIdx)`
			`lastWord := words[0]`

			`// compute the search range`
			// this is based on word0, the first word in the `firstReadingIdx` bin.
			`fontSize := strata.fontsize`
			`minDepth := word0.depth - lineDepthR*fontSize`
			`maxDepth := word0.depth + lineDepthR*fontSize`
			`maxIntraWordGap := maxIntraWordGapR * fontSize`

			`remainingWords:`
			`// find the rest of the words in this line`
			`for {`
			// Search for `leftWord`, the left-most word w: minDepth <= w.depth <= maxDepth.
			`var leftWord *textWord`
			`leftDepthIdx := 0`
			`for _, depthIdx := range strata.depthBand(minDepth, maxDepth) {`
			`words := strata.stratumBand(depthIdx, minDepth, maxDepth)`
			`if len(words) == 0 {`
			`continue`
			`}`
			`word := words[0]`
			`gap := gapReading(word, lastWord)`
			`if gap < -maxIntraLineOverlapR*fontSize {`
			`break remainingWords`
			`}`
			// No `leftWord` or `word` to the left of `leftWord`.
			`if gap < maxIntraWordGap {`
			`if leftWord == nil \|\| diffReading(word, leftWord) < 0 {`
			`leftDepthIdx = depthIdx`
			`leftWord = word`
			`}`
			`}`
			`}`
			`if leftWord == nil {`
			`break`
			`}`

			// remove `leftWord` from `strata`[`leftDepthIdx`], and append it to `line`.
			`line.moveWord(strata, leftDepthIdx, leftWord)`
			`lastWord = leftWord`
Got text_test.go passing. 2020-05-27 18:15:18 +10:00			`// // TODO(peterwilliams97): Replace lastWord with line.words[len(line.words)-1] ???`
			`// if lastWord != line.words[len(line.words)-1] {`
			`// panic("ddd")`
			`// }`
First version of text extraction that recognizes columns 2020-05-24 21:00:37 +10:00			`}`

Got text_test.go passing. 2020-05-27 18:15:18 +10:00			`line.mergeWordFragments()`
First version of text extraction that recognizes columns 2020-05-24 21:00:37 +10:00			`// add the line`
			`para.lines = append(para.lines, line)`
			`}`
			`}`

			`sort.Slice(para.lines, func(i, j int) bool {`
			`return diffDepthReading(para.lines[i], para.lines[j]) < 0`
			`})`
Adding tables to text extractor. 2020-06-01 14:04:32 +10:00			`if len(para.lines) == 0 {`
			`panic(para)`
			`}`
First version of text extraction that recognizes columns 2020-05-24 21:00:37 +10:00			`return para`
			`}`