/*
 * This file is subject to the terms and conditions defined in
 * file 'LICENSE.md', which is part of this source code package.
 */

package extractor

import (
	"fmt"
	"io"
	"math"
	"sort"

	"github.com/unidoc/unipdf/v3/common"
	"github.com/unidoc/unipdf/v3/model"
)

// makeTextPage builds a paraList from `marks`, the textMarks on a page.
// The paraList contains the page arranged as
//  - a list of textPara in reading order
//  - each textPara contains a list of textLine (text lines or parts of text lines) in reading order
//  - each textLine contains a list of textWord (words or parts of words) in reading order
// The paraList is thus an ordering of words on a page.
//  - Users of the paraList are expected to work with words. This should be adequate for most uses
//    as words are the basic unit of meaning in written language.
//  - However, we provide links back from the extracted text to the textMarks as follows.
//    * paraList.writeText() returns the extracted text for a page.
//    * paraList.toTextMarks() returns a TextMarkArray containing the marks.
//    * TextMarkArray.RangeOffset(lo, hi) returns the marks corresponding to offsets [lo:hi] in the
//      extracted text.
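//
// A caller-side sketch of how these link together (a sketch only; see the public API of this
// package for the exact signatures):
//	pageText, _, _, _ := ex.ExtractPageText()         // ex is an *Extractor for the page
//	text := pageText.Text()                           // the text that writeText() produces
//	marks, _ := pageText.Marks().RangeOffset(lo, hi)  // the marks behind text[lo:hi]
//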
// NOTE: The "parts of words" occur because of hyphenation. We do some weak coordinate-based
//       dehyphenation. Callers who need strong dehyphenation should use an NLP library.
//       The "parts of lines" are an implementation detail. Line fragments are combined in
//       paraList.writeText().
// ALGORITHM:
// 1) Group the textMarks into textWords based on their bounding boxes.
// 2) Group the textWords into textParas based on their bounding boxes.
// 3) Detect textParas arranged as cells in a table and convert each one to a textPara containing a
//    textTable.
// 4) Sort the textParas in reading order.
func makeTextPage(marks []*textMark, pageSize model.PdfRectangle) paraList {
	common.Log.Trace("makeTextPage: %d elements pageSize=%.2f", len(marks), pageSize)
	if len(marks) == 0 {
		return nil
	}

	// Group the marks into word fragments.
	words := makeTextWords(marks, pageSize)
	if len(words) == 0 {
		return nil
	}

	// Put the word fragments into a container that facilitates the grouping of words into paragraphs.
	pageWords := makeWordBag(words, pageSize.Ury)

	// Divide the page into rectangular regions for each paragraph and create a wordBag for each one.
	paraWords := dividePage(pageWords, pageSize.Ury)
	paraWords = mergeWordBags(paraWords)

	// Arrange the contents of each paragraph wordBag into lines and the lines into whole words.
	paras := make(paraList, 0, len(paraWords))
	for _, bag := range paraWords {
		para := bag.arrangeText()
		if para != nil {
			paras = append(paras, para)
		}
	}

	// Find paras that are cells in tables, convert the tables to paras and remove the cell paras.
	if len(paras) >= minTableParas {
		paras = paras.extractTables()
	}

	// Sort the paras into reading order.
	paras.sortReadingOrder()
	paras.log("sorted in reading order")

	return paras
}

// dividePage divides `pageWords`, the page wordBag, into a list of paragraph wordBags.
func dividePage(pageWords *wordBag, pageHeight float64) []*wordBag {
	var paraWordBags []*wordBag

	// We move words from `pageWords` to paragraph wordBags until there are no words left in
	// `pageWords`. We do this by iterating through `pageWords` in depth bin order and, for each
	// surviving bin (see below), creating a paragraph wordBag seeded with `firstWord` in the code
	// below. We then move words from around the paragraph region from `pageWords` into the new
	// wordBag. This may empty some bins before we iterate to them; such bins do not survive
	// (hence "surviving bin" above). If a bin survives until it is iterated to, at least one
	// paragraph wordBag will be built around one of its words.
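	//
	// A rough sketch of the loop below (the helpers are assumed to behave as their names suggest):
	//   for each depth bin, top to bottom:
	//     while the bin still has words:
	//       1. seed a new wordBag `paraWords` with the left-most word in and just below the bin,
	//       2. repeatedly scanBand() in the depth and reading directions to absorb nearby words,
	//       3. when simple growth stops, try to absorb words just to the left (e.g. list numbers),
	//       4. append the finished `paraWords` to `paraWordBags`.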
	for _, depthIdx := range pageWords.depthIndexes() {
		changed := false
		for !pageWords.empty(depthIdx) {
			// Start a new paragraph region `paraWords`.
			// Build `paraWords` out from the left-most (lowest in reading direction) word,
			// `firstWord`, in the bins in and below `depthIdx`.

			// `firstWord` is the left-most word from the bins in and a few lines below `depthIdx`.
			// We seed `paraWords` with this word.
			firstReadingIdx := pageWords.firstReadingIndex(depthIdx)
			firstWord := pageWords.firstWord(firstReadingIdx)
			paraWords := newWordBag(firstWord, pageHeight)
			pageWords.removeWord(firstWord, firstReadingIdx)
			if verbosePage {
				common.Log.Info("words[0]=%s", firstWord.String())
			}

			// The following 3 numbers define whether words should be added to `paraWords`.
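			// All three scale with `paraWords.fontsize`:
			//   maxIntraDepthGap:   how far `paraWords` may be stretched in the depth direction,
			//   maxIntraReadingGap: how far it may be stretched in the reading direction,
			//   minInterReadingGap: the gap used when looking for words to its left.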
			minInterReadingGap := minInterReadingGapR * paraWords.fontsize
			maxIntraReadingGap := maxIntraReadingGapR * paraWords.fontsize
			maxIntraDepthGap := maxIntraDepthGapR * paraWords.fontsize

			// Add words to `paraWords` until we pass through the following loop without adding a
			// new word.
			for running := true; running; running = changed {
				changed = false

				// Add words that are within maxIntraDepthGap of `paraWords` in the depth direction.
				// i.e. Stretch paraWords in the depth direction, vertically for English text.
				if verbosePage {
					common.Log.Info("paraWords depth %.2f - %.2f maxIntraDepthGap=%.2f ",
						paraWords.minDepth(), paraWords.maxDepth(), maxIntraDepthGap)
				}
				if pageWords.scanBand("vertical", paraWords, partial(readingOverlapPlusGap, 0),
					paraWords.minDepth()-maxIntraDepthGap, paraWords.maxDepth()+maxIntraDepthGap,
					maxIntraDepthFontTolR, false, false) > 0 {
					changed = true
				}
				// Add words that are within maxIntraReadingGap of `paraWords` in the reading direction.
				// i.e. Stretch paraWords in the reading direction, horizontally for English text.
				if pageWords.scanBand("horizontal", paraWords, partial(readingOverlapPlusGap, maxIntraReadingGap),
					paraWords.minDepth(), paraWords.maxDepth(),
					maxIntraReadingFontTol, false, false) > 0 {
					changed = true
				}
				// The above stretching has got as far as it can go. Repeating it won't pull in more words.

				// Only try to combine other words if we can't grow paraWords in the simple way above.
				if changed {
					continue
				}

				// In the following cases, we don't expand `paraWords` while scanning. We look for words
				// around paraWords. If we find them, we add them, then expand `paraWords` when we are done.
				// This pulls the numbers to the left of paraWords into paraWords
				// e.g. From
				//     Regulatory compliance
				//     Archiving
				//     Document search
				// to
				//     1. Regulatory compliance
				//     2. Archiving
				//     3. Document search

				// If there are words to the left of `paraWords`, add them.
				// We need to limit the number of words.
				n := pageWords.scanBand("", paraWords, partial(readingOverlapLeft, minInterReadingGap),
					paraWords.minDepth(), paraWords.maxDepth(),
					minInterReadingFontTol, true, false)
				if n > 0 {
					r := (paraWords.maxDepth() - paraWords.minDepth()) / paraWords.fontsize
					if (n > 1 && float64(n) > 0.3*r) || n <= 10 {
						if pageWords.scanBand("other", paraWords, partial(readingOverlapLeft, minInterReadingGap),
							paraWords.minDepth(), paraWords.maxDepth(),
							minInterReadingFontTol, false, true) > 0 {
							changed = true
						}
					}
				}
			}
			paraWordBags = append(paraWordBags, paraWords)
		}
	}

	return paraWordBags
}

// writeText writes the text in `paras` to `w`.
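// Paragraphs are separated by a blank line, except that two paras judged to be on the same
// physical line (see sameLine) are joined by a single space. The page text ends with a blank line.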
func (paras paraList) writeText(w io.Writer) {
	for ip, para := range paras {
		para.writeText(w)
		if ip != len(paras)-1 {
			if sameLine(para, paras[ip+1]) {
				w.Write([]byte(" "))
			} else {
				w.Write([]byte("\n"))
				w.Write([]byte("\n"))
			}
		}
	}
	w.Write([]byte("\n"))
	w.Write([]byte("\n"))
}

// toTextMarks creates the TextMarkArray corresponding to the extracted text created by
// paras.writeText().
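// The space and newline marks appended here mirror the separators that writeText() emits, so each
// TextMark's offset stays consistent with its position in the extracted page text.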
func (paras paraList) toTextMarks() []TextMark {
	offset := 0
	var marks []TextMark
	for ip, para := range paras {
		paraMarks := para.toTextMarks(&offset)
		marks = append(marks, paraMarks...)
		if ip != len(paras)-1 {
			if sameLine(para, paras[ip+1]) {
				marks = appendSpaceMark(marks, &offset, " ")
			} else {
				marks = appendSpaceMark(marks, &offset, "\n")
				marks = appendSpaceMark(marks, &offset, "\n")
			}
		}
	}
	marks = appendSpaceMark(marks, &offset, "\n")
	marks = appendSpaceMark(marks, &offset, "\n")
	return marks
}

// sameLine returns true if `para1` and `para2` are on the same line.
func sameLine(para1, para2 *textPara) bool {
	return isZero(para1.depth() - para2.depth())
}

// tables returns the tables from all the paras that contain them.
func (paras paraList) tables() []TextTable {
	var tables []TextTable
	for _, para := range paras {
		if para.table != nil {
			tables = append(tables, para.table.toTextTable())
		}
	}
	return tables
}

// sortReadingOrder sorts `paras` in reading order.
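// The paras are first sorted approximately by depth and reading position, then topologically
// sorted using the readBefore() ordering (Breuel's reading order rules) and reordered in place.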
func (paras paraList) sortReadingOrder() {
	common.Log.Trace("sortReadingOrder: paras=%d ===========x=============", len(paras))
	if len(paras) <= 1 {
		return
	}
	paras.computeEBBoxes()
	sort.Slice(paras, func(i, j int) bool { return diffDepthReading(paras[i], paras[j]) <= 0 })
	order := paras.topoOrder()
	paras.reorder(order)
}

// topoOrder returns the ordering of the topological sort of `paras` using readBefore() to determine
// the incoming nodes to each node.
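// The order is computed by a depth-first search: a para is appended to the order only after every
// para that must be read after it, and the resulting post-order is reversed to give reading order.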
func (paras paraList) topoOrder() []int {
	if verbosePage {
		common.Log.Info("topoOrder:")
	}
	n := len(paras)
	visited := make([]bool, n)
	order := make([]int, 0, n)
	llyOrder := paras.llyOrdering()

	// sortNode is a post-order depth-first traversal from node `idx`: it first visits every
	// unvisited node that must be read after `idx`, then appends `idx` to `order`.
	var sortNode func(idx int)
	sortNode = func(idx int) {
		visited[idx] = true
		for i := 0; i < n; i++ {
			if !visited[i] {
				if paras.readBefore(llyOrder, idx, i) {
					sortNode(i)
				}
			}
		}
		order = append(order, idx) // Should prepend but it's cheaper to append and reverse later.
	}

	for idx := 0; idx < n; idx++ {
		if !visited[idx] {
			sortNode(idx)
		}
	}

	return reversed(order)
}

// readBefore returns true if paras[`i`] comes before paras[`j`].
// readBefore defines an ordering over `paras`.
// a = paras[i], b = paras[j]
// 1. Line segment `a` comes before line segment `b` if their ranges of x-coordinates overlap and if
//    line segment `a` is above line segment `b` on the page.
// 2. Line segment `a` comes before line segment `b` if `a` is entirely to the left of `b` and if
//    there does not exist a line segment `c` whose y-coordinates are between `a` and `b` and whose
//    range of x-coordinates overlaps both `a` and `b`.
// From Thomas M. Breuel "High Performance Document Layout Analysis".
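// Example: in a two-column layout with paras a (top left), c (below a in the left column) and
// b (top right), rule 1 gives a before c, and rule 2 gives a before b and c before b, so the
// reading order is a, c, b.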
func (paras paraList) readBefore(ordering []int, i, j int) bool {
	a, b := paras[i], paras[j]
	// Breuel's rule 1
	if overlappedXPara(a, b) && a.Lly > b.Lly {
		return true
	}

	// Breuel's rule 2
	if !(a.eBBox.Urx < b.eBBox.Llx) {
		return false
	}

	lo, hi := a.Lly, b.Lly
	if lo > hi {
		hi, lo = lo, hi
	}
	llx := math.Max(a.eBBox.Llx, b.eBBox.Llx)
	urx := math.Min(a.eBBox.Urx, b.eBBox.Urx)

	llyOrder := paras.llyRange(ordering, lo, hi)
	for _, k := range llyOrder {
		if k == i || k == j {
			continue
		}
		c := paras[k]
		if c.eBBox.Llx <= urx && llx <= c.eBBox.Urx {
			return false
		}
	}
	return true
}

// overlappedXPara returns true if `r0` and `r1` overlap on the x-axis.
func overlappedXPara(r0, r1 *textPara) bool {
	return intersectsX(r0.eBBox, r1.eBBox)
}

// llyOrdering returns an ordering over the indexes of `paras` sorted by Lly in increasing order.
func (paras paraList) llyOrdering() []int {
	ordering := make([]int, len(paras))
	for i := range paras {
		ordering[i] = i
	}
	sort.SliceStable(ordering, func(i, j int) bool {
		oi, oj := ordering[i], ordering[j]
		return paras[oi].Lly < paras[oj].Lly
	})
	return ordering
}

// llyRange returns the indexes in `paras` (drawn from `ordering`, which is sorted by Lly) of the
// paras p with lo <= p.Lly <= hi.
func (paras paraList) llyRange(ordering []int, lo, hi float64) []int {
	n := len(paras)
	if hi < paras[ordering[0]].Lly || lo > paras[ordering[n-1]].Lly {
		return nil
	}

	// i0 is the lowest i: lly(i) >= lo
	// i1 is the lowest i: lly(i) > hi
	i0 := sort.Search(n, func(i int) bool { return paras[ordering[i]].Lly >= lo })
	i1 := sort.Search(n, func(i int) bool { return paras[ordering[i]].Lly > hi })

	return ordering[i0:i1]
}

// computeEBBoxes computes the eBBox fields in the elements of `paras`.
// The eBBoxes are the regions around the paras that don't intersect paras in other columns.
// This is needed for sortReadingOrder to work with skinny paras in a column of fat paras. The
// sorting assumes the skinny para bounding box is as wide as the fat para bounding boxes.
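// For each para `a` this takes two passes: first find the horizontal interval [llx, urx] that is
// not occupied by any para overlapping `a` vertically, then widen `a`'s eBBox within that interval
// as far as the paras below `a` allow.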
func (paras paraList) computeEBBoxes() {
	if verbose {
		common.Log.Info("computeEBBoxes:")
	}

	for _, para := range paras {
		para.eBBox = para.PdfRectangle
	}
	paraYNeighbours := paras.yNeighbours()

	for i, aa := range paras {
		a := aa.eBBox
		// [llx, urx] is the reading direction interval for which no paras overlap `a`.
		llx, urx := -1.0e9, +1.0e9

		for _, j := range paraYNeighbours[aa] {
			b := paras[j].eBBox
			if b.Urx < a.Llx { // `b` to left of `a`. no x overlap.
				llx = math.Max(llx, b.Urx)
			} else if a.Urx < b.Llx { // `b` to right of `a`. no x overlap.
				urx = math.Min(urx, b.Llx)
			}
		}

		// llx extends left from `a` and overlaps no other paras.
		// urx extends right from `a` and overlaps no other paras.

		// Go through all paras below `a` within interval [llx, urx] in the reading direction and
		// expand `a` as far as possible to left and right without overlapping any of them.
		for j, bb := range paras {
			b := bb.eBBox
			if i == j || b.Ury > a.Lly {
				continue
			}

			if llx <= b.Llx && b.Llx < a.Llx {
				// If `b` is completely to the right of `llx`, extend `a` left to `b`.
				a.Llx = b.Llx
			} else if b.Urx <= urx && a.Urx < b.Urx {
				// If `b` is completely to the left of `urx`, extend `a` right to `b`.
				a.Urx = b.Urx
			}
		}
		if verbose {
			fmt.Printf("%4d: %6.2f->%6.2f %q\n", i, aa.eBBox, a, truncate(aa.text(), 50))
		}
		aa.eBBox = a
	}
	if useEBBox {
		for _, para := range paras {
			para.PdfRectangle = para.eBBox
		}
	}
}

// reversed returns `order` reversed.
func reversed(order []int) []int {
	rev := make([]int, len(order))
	for i, v := range order {
		rev[len(order)-1-i] = v
	}
	return rev
}

// reorder reorders `paras` into the order given by `order`.
func (paras paraList) reorder(order []int) {
	sorted := make(paraList, len(paras))
	for i, k := range order {
		sorted[i] = paras[k]
	}
	copy(paras, sorted)
}