unipdf/extractor/text_page.go
Peter Williams 88fda44e0a
Text extraction code for columns. (#366)
* Fixed filename:page in logging

* Got CMap working for multi-rune entries

* Treat CMap entries as strings instead of runes to handle multi-byte encodings.

* Added a test for multibyte encoding.

* First version of text extraction that recognizes columns

* Added an explanation of the text columns code to README.md.

* fixed typos

* Abstracted textWord depth calculation. This required change textMark to *textMark in a lot of code.

* Added function comments.

* Fixed text state save/restore.

* Adjusted inter-word search distance to make paragraph division work for thanh.pdf

* Got text_test.go passing.

* Reinstated hyphen suppression

* Handle more cases of fonts not being set in text extraction code.

* Fixed typo

* More verbose logging

* Adding tables to text extractor.

* Added tests for columns extraction.

* Removed commented code

* Check for textParas that are on the same line when writing out extracted text.

* Absorb text to the left of paras into paras e.g. Footnote numbers

* Removed funny character from text_test.go

* Commented out a creator_test.go test that was broken by my text extraction changes.

* Big changes to columns text extraction code for PR.

Performance improvements in several places.
Commented code.

* Updated extractor/README

* Cleaned up some comments and removed a panic

* Increased threshold for truncating extracted text when there is no license 100 -> 102.

This is a workaround to let a test in creator_test.go pass.

With the old text extraction code the following extracted text was 100 chars. With the new code it
is 102 chars which looks correct.

"你好\n你好你好你好你好\n河上白云\n\nUnlicensed UniDoc - Get a license on https://unidoc.io\n\n"

* Improved an error message.

* Removed irrelevant spaces

* Commented code and removed unused functions.

* Reverted PdfRectangle changes

* Added duplicate text detection.

* Combine diacritic textMarks in text extraction

* Reinstated a diacritic recombination test.

* Small code reorganisation

* Reinstated handling of rotated text

* Addressed issues in PR review

* Added color fields to TextMark

* Updated README

* Reinstated the disabled tests I missed before.

* Tightened definition for tables to prevent detection of tables where there weren't any.

* Compute line splitting search range based on fontsize of first word in word bag.

* Use errors.Is(err, core.ErrNotSupported) to distinguish unsupported font errors.

See https://blog.golang.org/go1.13-errors

* Fixed some naming and added some comments.

* errors.Is -> xerrors.Is and %w -> %v for go 1.12 compatibility

* Removed code that doesn't ever get called.

* Removed unused test
2020-06-30 19:33:10 +00:00

431 lines
14 KiB
Go

/*
* This file is subject to the terms and conditions defined in
* file 'LICENSE.md', which is part of this source code package.
*/
package extractor
import (
"fmt"
"io"
"math"
"sort"
"github.com/unidoc/unipdf/v3/common"
"github.com/unidoc/unipdf/v3/model"
)
// makeTextPage arranges `marks`, the textMarks found on a page, into a paraList.
// The paraList holds the page contents as
//   - a list of textPara in reading order
//   - each textPara contains a list of textLine (text lines or parts of text lines) in reading order
//   - each textLine contains a list of textWord (words or parts of words) in reading order
//
// The paraList is therefore an ordering of the words on a page.
//   - Users of the paraList are expected to work with words. This should be adequate for most uses
//     as words are the basic unit of meaning in written language.
//   - Links back from the extracted text to the textMarks are provided as follows.
//   - paraList.writeText() writes the extracted text for a page.
//   - paraList.toTextMarks() returns the TextMarks for that extracted text.
//   - TextMarkArray.RangeOffset(lo, hi) returns the marks corresponding to offsets [lo:hi] in the
//     extracted text.
//
// NOTE: The "parts of words" occur because of hyphenation. Only weak, coordinate based
// dehyphenation is done here. Callers who need strong dehyphenation should use NLP libraries.
// The "parts of lines" are an implementation detail. Line fragments are combined in
// paraList.writeText().
//
// ALGORITHM:
//  1. Group the textMarks into textWords based on their bounding boxes.
//  2. Group the textWords into textParas based on their bounding boxes.
//  3. Detect textParas arranged as cells in a table and convert each one to a textPara containing a
//     textTable.
//  4. Sort the textParas in reading order.
//
// nil is returned when `marks` is empty or when no words can be built from it.
func makeTextPage(marks []*textMark, pageSize model.PdfRectangle) paraList {
	common.Log.Trace("makeTextPage: %d elements pageSize=%.2f", len(marks), pageSize)
	if len(marks) == 0 {
		return nil
	}

	// Step 1: marks -> word fragments.
	words := makeTextWords(marks, pageSize)
	if len(words) == 0 {
		return nil
	}

	// Step 2: collect the word fragments in a page-wide wordBag, split that bag into one wordBag
	// per rectangular paragraph region, then merge the resulting bags.
	pageHeight := pageSize.Ury
	pageBag := makeWordBag(words, pageHeight)
	paraBags := mergeWordBags(dividePage(pageBag, pageHeight))

	// Arrange the contents of each paragraph wordBag into lines and the lines into whole words.
	// Bags that produce no paragraph are dropped.
	paras := make(paraList, 0, len(paraBags))
	for _, bag := range paraBags {
		if para := bag.arrangeText(); para != nil {
			paras = append(paras, para)
		}
	}

	// Step 3: replace paras that are table cells by paras containing the tables. This is only
	// attempted when there are at least minTableParas paras.
	if len(paras) >= minTableParas {
		paras = paras.extractTables()
	}

	// Step 4: reading order.
	paras.sortReadingOrder()
	paras.log("sorted in reading order")
	return paras
}
// dividePage divides `pageWords`, the page wordBag, into a list of paragraph wordBags.
// Each returned wordBag is seeded with one word taken from `pageWords` and is then grown by
// repeated scanBand calls until a pass adds no more words. Seed words are removed from `pageWords`
// as they are used, and the loops below run until every depth bin of `pageWords` is empty.
// `pageHeight` is passed unchanged to newWordBag for each paragraph wordBag.
func dividePage(pageWords *wordBag, pageHeight float64) []*wordBag {
var paraWordBags []*wordBag
// We move words from `pageWords` to paragraph wordBags until there are no words left in
// `pageWords`.
// We do this by iterating through `pageWords` in depth bin order and, for each surviving bin (see
// below), creating a paragraph wordBag seeded with `firstWord` in the code below.
// We then move words from around the paragraph region from `pageWords` to the paragraph wordBag.
// This may empty some page bins before we iterate to them. Such bins produce no paragraphs of
// their own because the inner loop only runs while the bin is non-empty.
// If a bin survives until it is iterated to then at least one paragraph will be built around it.
for _, depthIdx := range pageWords.depthIndexes() {
// `changed` records whether the latest growth pass added any words to the current paragraph.
changed := false
for !pageWords.empty(depthIdx) {
// Start a new paragraph region `paraWords`.
// Build `paraWords` out from the left-most (lowest in reading direction) word, `firstWord`,
// in the bins in and below `depthIdx`.
// `firstWord` is the left-most word from the bins in and a few lines below `depthIdx`. We
// seed `paraWords` with this word and remove it from `pageWords`.
firstReadingIdx := pageWords.firstReadingIndex(depthIdx)
firstWord := pageWords.firstWord(firstReadingIdx)
paraWords := newWordBag(firstWord, pageHeight)
pageWords.removeWord(firstWord, firstReadingIdx)
if verbosePage {
common.Log.Info("words[0]=%s", firstWord.String())
}
// The following 3 numbers define whether words should be added to `paraWords`.
// Each is a package-level ratio scaled by the fontsize of `paraWords`.
minInterReadingGap := minInterReadingGapR * paraWords.fontsize
maxIntraReadingGap := maxIntraReadingGapR * paraWords.fontsize
maxIntraDepthGap := maxIntraDepthGapR * paraWords.fontsize
// Add words to `paraWords` until we pass through the following loop without adding a
// new word. The loop's post statement copies `changed` into `running`, and it also runs
// after the `continue` below, so the loop repeats for as long as a pass sets `changed`.
for running := true; running; running = changed {
changed = false
// Add words that are within maxIntraDepthGap of `paraWords` in the depth direction.
// i.e. Stretch paraWords in the depth direction, vertically for English text.
if verbosePage {
common.Log.Info("paraWords depth %.2f - %.2f maxIntraDepthGap=%.2f ",
paraWords.minDepth(), paraWords.maxDepth(), maxIntraDepthGap)
}
if pageWords.scanBand("vertical", paraWords, partial(readingOverlapPlusGap, 0),
paraWords.minDepth()-maxIntraDepthGap, paraWords.maxDepth()+maxIntraDepthGap,
maxIntraDepthFontTolR, false, false) > 0 {
changed = true
}
// Add words that are within maxIntraReadingGap of `paraWords` in the reading direction.
// i.e. Stretch paraWords in the reading direction, horizontally for English text.
if pageWords.scanBand("horizontal", paraWords, partial(readingOverlapPlusGap, maxIntraReadingGap),
paraWords.minDepth(), paraWords.maxDepth(),
maxIntraReadingFontTol, false, false) > 0 {
changed = true
}
// If the simple stretching above added words, repeat it before trying anything else.
// Only try to combine other words if we can't grow paraWords in the simple way above.
if changed {
continue
}
// In the following cases, we don't expand `paraWords` while scanning. We look for words
// around paraWords. If we find them, we add them then expand `paraWords` when we are done.
// This pulls the numbers to the left of paraWords into paraWords
// e.g. From
// Regulatory compliance
// Archiving
// Document search
// to
// 1. Regulatory compliance
// 2. Archiving
// 3. Document search
// If there are words to the left of `paraWords`, add them.
// We need to limit the number of words.
// NOTE(review): this first scan presumably only counts the candidate words (its
// second-to-last flag is true and its result is only used as `n`) - confirm against scanBand.
n := pageWords.scanBand("", paraWords, partial(readingOverlapLeft, minInterReadingGap),
paraWords.minDepth(), paraWords.maxDepth(),
minInterReadingFontTol, true, false)
if n > 0 {
// `r` is the depth extent of `paraWords` measured in multiples of its fontsize.
r := (paraWords.maxDepth() - paraWords.minDepth()) / paraWords.fontsize
// Scan again with the "other" settings when there is more than one candidate and more
// than 0.3 candidates per unit of `r`, or when there are at most 10 candidates.
if (n > 1 && float64(n) > 0.3*r) || n <= 10 {
if pageWords.scanBand("other", paraWords, partial(readingOverlapLeft, minInterReadingGap),
paraWords.minDepth(), paraWords.maxDepth(),
minInterReadingFontTol, false, true) > 0 {
changed = true
}
}
}
}
// `paraWords` can't be grown any further so it is complete.
paraWordBags = append(paraWordBags, paraWords)
}
}
return paraWordBags
}
// writeText writes the text of every para in `paras` to `w`.
// Consecutive paras are separated by a single space when sameLine() reports that they are on the
// same line and by two newlines otherwise. Two newlines are always written after the last para.
// Errors returned by `w` are ignored.
func (paras paraList) writeText(w io.Writer) {
	last := len(paras) - 1
	for i, para := range paras {
		para.writeText(w)
		if i == last {
			// No separator after the final para; the trailing newlines are written below.
			continue
		}
		if !sameLine(para, paras[i+1]) {
			w.Write([]byte("\n"))
			w.Write([]byte("\n"))
			continue
		}
		w.Write([]byte(" "))
	}
	w.Write([]byte("\n"))
	w.Write([]byte("\n"))
}
// toTextMarks returns the TextMarks corresponding to the extracted text that
// `paras`.writeText() writes.
// It mirrors writeText(): the marks of each para are followed by one " " space mark when the next
// para is on the same line, by two "\n" space marks otherwise, and two "\n" space marks are
// appended after the last para. A single running offset is threaded through all the calls.
func (paras paraList) toTextMarks() []TextMark {
	var (
		marks  []TextMark
		offset int
	)
	last := len(paras) - 1
	for i, para := range paras {
		marks = append(marks, para.toTextMarks(&offset)...)
		switch {
		case i == last:
			// The trailing newlines for the final para are appended after the loop.
		case sameLine(para, paras[i+1]):
			marks = appendSpaceMark(marks, &offset, " ")
		default:
			marks = appendSpaceMark(marks, &offset, "\n")
			marks = appendSpaceMark(marks, &offset, "\n")
		}
	}
	marks = appendSpaceMark(marks, &offset, "\n")
	marks = appendSpaceMark(marks, &offset, "\n")
	return marks
}
// sameLine returns true if `para1` and `para2` are on the same line.
// The test is whether isZero() accepts the difference between the two paras' depth() values.
func sameLine(para1, para2 *textPara) bool {
	depthDiff := para1.depth() - para2.depth()
	return isZero(depthDiff)
}
// tables returns the tables of all the paras in `paras` that contain one, in the order of `paras`.
// Paras whose `table` field is nil are skipped. The result is nil if no para contains a table.
func (paras paraList) tables() []TextTable {
	var found []TextTable
	for _, para := range paras {
		if para.table == nil {
			continue
		}
		found = append(found, para.table.toTextTable())
	}
	return found
}
// sortReadingOrder sorts `paras` in reading order, in place.
// Lists of zero or one paras are left untouched. Otherwise
//  1. the eBBox of each para is computed,
//  2. `paras` is sorted with diffDepthReading() to give the topological sort a starting order,
//  3. `paras` is rearranged to the order returned by topoOrder().
func (paras paraList) sortReadingOrder() {
	common.Log.Trace("sortReadingOrder: paras=%d ===========x=============", len(paras))
	if len(paras) <= 1 {
		return
	}
	paras.computeEBBoxes()
	// The less function must be a strict ordering: it has to return false for elements that
	// compare equal. Using `<= 0` would report "less" in both directions for such elements, which
	// breaks the contract of sort.Slice.
	sort.Slice(paras, func(i, j int) bool { return diffDepthReading(paras[i], paras[j]) < 0 })
	order := paras.topoOrder()
	paras.reorder(order)
}
// topoOrder returns the ordering of the topological sort of `paras`, as a list of indexes into
// `paras`. readBefore() decides which paras must come after a given para.
// The sort is a depth-first search that records each para after all the paras it is read before,
// then reverses the recorded order.
func (paras paraList) topoOrder() []int {
	if verbosePage {
		common.Log.Info("topoOrder:")
	}
	count := len(paras)
	seen := make([]bool, count)
	postOrder := make([]int, 0, count)
	byLly := paras.llyOrdering()

	// visit recursively visits every unseen para that para `node` is read before, then records
	// `node` itself.
	var visit func(node int)
	visit = func(node int) {
		seen[node] = true
		for next := 0; next < count; next++ {
			if seen[next] || !paras.readBefore(byLly, node, next) {
				continue
			}
			visit(next)
		}
		// Prepending is what is wanted, but appending and reversing at the end is cheaper.
		postOrder = append(postOrder, node)
	}

	for node := 0; node < count; node++ {
		if seen[node] {
			continue
		}
		visit(node)
	}
	return reversed(postOrder)
}
// readBefore returns true if paras[`i`] comes before paras[`j`] in reading order.
// readBefore defines an ordering over `paras`. With a = paras[i] and b = paras[j]:
//  1. Line segment `a` comes before line segment `b` if their ranges of x-coordinates overlap and
//     if line segment `a` is above line segment `b` on the page.
//  2. Line segment `a` comes before line segment `b` if `a` is entirely to the left of `b` and if
//     there does not exist a line segment `c` whose y-coordinates are between `a` and `b` and whose
//     range of x-coordinates overlaps both `a` and `b`.
//
// From Thomas M. Breuel "High Performance Document Layout Analysis".
// `ordering` is the list of para indexes sorted by Lly, as returned by llyOrdering(). The
// x-coordinate tests use the paras' eBBox fields and the y-coordinate tests use their Lly fields.
func (paras paraList) readBefore(ordering []int, i, j int) bool {
	a, b := paras[i], paras[j]

	// Breuel's rule 1
	if overlappedXPara(a, b) && a.Lly > b.Lly {
		return true
	}

	// Breuel's rule 2
	aLeftOfB := a.eBBox.Urx < b.eBBox.Llx
	if !aLeftOfB {
		return false
	}

	// [yMin, yMax] is the Lly interval spanned by `a` and `b`.
	yMin, yMax := a.Lly, b.Lly
	if yMax < yMin {
		yMin, yMax = yMax, yMin
	}
	// A para `c` is treated as overlapping both `a` and `b` when c.Llx <= xMax and xMin <= c.Urx.
	xMin := math.Max(a.eBBox.Llx, b.eBBox.Llx)
	xMax := math.Min(a.eBBox.Urx, b.eBBox.Urx)

	for _, k := range paras.llyRange(ordering, yMin, yMax) {
		if k != i && k != j {
			c := paras[k].eBBox
			if c.Llx <= xMax && xMin <= c.Urx {
				return false
			}
		}
	}
	return true
}
// overlappedXPara returns true if the eBBoxes of `r0` and `r1` overlap on the x-axis, as
// determined by intersectsX().
func overlappedXPara(r0, r1 *textPara) bool {
	box0, box1 := r0.eBBox, r1.eBBox
	return intersectsX(box0, box1)
}
// llyOrdering returns an ordering over the indexes of `paras`, sorted by Lly in increasing order.
// The sort is stable so paras with equal Lly keep their relative order in `paras`.
func (paras paraList) llyOrdering() []int {
	n := len(paras)
	ordering := make([]int, 0, n)
	for i := 0; i < n; i++ {
		ordering = append(ordering, i)
	}
	// lowerLly compares the paras referred to by two positions in `ordering`.
	lowerLly := func(x, y int) bool {
		return paras[ordering[x]].Lly < paras[ordering[y]].Lly
	}
	sort.SliceStable(ordering, lowerLly)
	return ordering
}
// llyRange returns the indexes in `paras` of the paras p with lo <= p.Lly <= hi.
// `ordering` must be the indexes of `paras` sorted by increasing Lly, as returned by
// llyOrdering(). The result is a sub-slice of `ordering`, so it is in increasing Lly order and
// shares storage with `ordering`.
// nil is returned if `paras` is empty, if [lo, hi] lies entirely outside the Lly values of
// `paras`, or if lo > hi selects an inverted range.
func (paras paraList) llyRange(ordering []int, lo, hi float64) []int {
	n := len(paras)
	if n == 0 {
		// Nothing to search. Without this guard ordering[0] below would panic.
		return nil
	}
	if hi < paras[ordering[0]].Lly || lo > paras[ordering[n-1]].Lly {
		return nil
	}
	// i0 is the lowest i: lly(i) >= lo
	// i1 is the lowest i: lly(i) > hi
	i0 := sort.Search(n, func(i int) bool { return paras[ordering[i]].Lly >= lo })
	i1 := sort.Search(n, func(i int) bool { return paras[ordering[i]].Lly > hi })
	if i0 > i1 {
		// Only possible when lo > hi. Slicing ordering[i0:i1] would panic.
		return nil
	}
	return ordering[i0:i1]
}
// computeEBBoxes computes the eBBox fields in the elements of `paras`.
// The eBBoxes are the regions around the paras that don't intersect paras in other columns.
// This is needed for sortReadingOrder to work with skinny paras in a column of fat paras. The
// sorting assumes the skinny para bounding box is as wide as the fat para bounding boxes.
// Every eBBox starts as a copy of the para's PdfRectangle and is then only ever widened in the
// x direction. If useEBBox is set, the paras' PdfRectangles are overwritten with the eBBoxes.
func (paras paraList) computeEBBoxes() {
	if verbose {
		common.Log.Info("computeEBBoxes:")
	}

	for _, para := range paras {
		para.eBBox = para.PdfRectangle
	}
	neighbours := paras.yNeighbours()

	for i, para := range paras {
		box := para.eBBox

		// [leftLimit, rightLimit] is the reading direction interval for which no neighbouring
		// paras overlap `box`.
		leftLimit, rightLimit := -1.0e9, +1.0e9
		for _, k := range neighbours[para] {
			other := paras[k].eBBox
			switch {
			case other.Urx < box.Llx: // `other` to left of `box`. no x overlap.
				leftLimit = math.Max(leftLimit, other.Urx)
			case box.Urx < other.Llx: // `other` to right of `box`. no x overlap.
				rightLimit = math.Min(rightLimit, other.Llx)
			}
		}

		// leftLimit extends left from `box` and overlaps no other paras.
		// rightLimit extends right from `box` and overlaps no other paras.
		// Go through all paras below `box` within interval [leftLimit, rightLimit] in the reading
		// direction and expand `box` as far as possible to left and right without overlapping any
		// of them. `box` is updated as the loop runs, so later tests see the widened box.
		for k, candidate := range paras {
			other := candidate.eBBox
			if k == i || other.Ury > box.Lly {
				continue
			}
			switch {
			case leftLimit <= other.Llx && other.Llx < box.Llx:
				// `other` is completely to right of `leftLimit`, so extend `box` left to `other`.
				box.Llx = other.Llx
			case other.Urx <= rightLimit && box.Urx < other.Urx:
				// `other` is completely to left of `rightLimit`, so extend `box` right to `other`.
				box.Urx = other.Urx
			}
		}

		if verbose {
			fmt.Printf("%4d: %6.2f->%6.2f %q\n", i, para.eBBox, box, truncate(para.text(), 50))
		}
		para.eBBox = box
	}

	if useEBBox {
		for _, para := range paras {
			para.PdfRectangle = para.eBBox
		}
	}
}
// reversed returns a copy of `order` with its elements in reverse order.
// `order` itself is not modified. The result is always a freshly allocated, non-nil slice.
func reversed(order []int) []int {
	n := len(order)
	rev := make([]int, n)
	// Walk `order` forwards while filling `rev` from the back.
	for src, dst := 0, n-1; src < n; src, dst = src+1, dst-1 {
		rev[dst] = order[src]
	}
	return rev
}
// reorder rearranges `paras` in place so that position i holds the para that was at position
// `order`[i].
// The new arrangement is assembled in a scratch list first so that no para is overwritten before
// it has been read.
func (paras paraList) reorder(order []int) {
	rearranged := make(paraList, len(paras))
	for dst := 0; dst < len(order); dst++ {
		rearranged[dst] = paras[order[dst]]
	}
	copy(paras, rearranged)
}