mirror of
https://github.com/unidoc/unipdf.git
synced 2025-05-13 19:29:10 +08:00
445 lines
13 KiB
Go
445 lines
13 KiB
Go
/*
 * This file is subject to the terms and conditions defined in
 * file 'LICENSE.md', which is part of this source code package.
 */
|
|
|
|
package extractor
|
|
|
|
import (
|
|
"fmt"
|
|
"io"
|
|
"math"
|
|
"unicode"
|
|
|
|
"github.com/unidoc/unipdf/v3/common"
|
|
"github.com/unidoc/unipdf/v3/model"
|
|
)
|
|
|
|
// paraList is a sequence of textPara pointers. It is used so often in this file that it is
// convenient to give it its own named type so that methods (writeText, toTextMarks,
// sortReadingOrder, ...) can be attached to it.
type paraList []*textPara
|
|
|
|
// makeTextPage builds a paraList from `marks`, the textMarks on a page.
|
|
func makeTextPage(marks []*textMark, pageSize model.PdfRectangle, rot int) paraList {
|
|
common.Log.Trace("makeTextPage: %d elements pageSize=%.2f", len(marks), pageSize)
|
|
|
|
// Break the marks into words
|
|
words := makeTextWords(marks, pageSize)
|
|
|
|
// Divide the words into depth bins with each the contents of each bin sorted by reading direction
|
|
page := makeTextStrata(words, pageSize.Ury)
|
|
// Divide the page into rectangular regions for each paragraph and creata a textStrata for each one.
|
|
paraStratas := dividePage(page, pageSize.Ury)
|
|
// Arrange the contents of each para into lines
|
|
paras := make(paraList, len(paraStratas))
|
|
for i, para := range paraStratas {
|
|
paras[i] = composePara(para)
|
|
}
|
|
if verbose || true {
|
|
common.Log.Info("unsorted=========----------=====")
|
|
for i, para := range paras {
|
|
common.Log.Info("paras[%d]=%.2f%q", i, para.PdfRectangle, truncate(paras[i].text(), 200))
|
|
}
|
|
}
|
|
|
|
paras.computeEBBoxes()
|
|
paras = paras.extractTables()
|
|
|
|
// Sort the paras into reading order.
|
|
paras.sortReadingOrder()
|
|
if verbose || true {
|
|
common.Log.Info("para sorted in reading order -----------=========")
|
|
for i, para := range paras {
|
|
tab := ""
|
|
if para.table != nil {
|
|
tab = fmt.Sprintf("[%dx%d]", para.table.w, para.table.h)
|
|
}
|
|
fmt.Printf("%4d: %6.2f %s %q\n", i, para.PdfRectangle, tab, truncate(para.text(), 50))
|
|
}
|
|
}
|
|
return paras
|
|
}
|
|
|
|
// dividePage divides page builds a list of paragraph textStrata from `page`, the page textStrata.
|
|
func dividePage(page *textStrata, pageHeight float64) []*textStrata {
|
|
var paraStratas []*textStrata
|
|
|
|
// We move words from `page` to paras until there no words left in page.
|
|
// We do this by iterating through `page` in depth bin order and, for each surving bin (see
|
|
// below), creating a paragraph with seed word, `words[0]` in the code below.
|
|
// We then move words from around the `para` region from `page` to `para` .
|
|
// This may empty some page bins before we iterate to them
|
|
// Some bins are emptied before they iterated to (seee "surving bin" above).
|
|
// If a `page` survives until it is iterated to then at least one `para` will be built around it.
|
|
|
|
if verbose {
|
|
common.Log.Info("dividePage")
|
|
}
|
|
cnt := 0
|
|
for _, depthIdx := range page.depthIndexes() {
|
|
changed := false
|
|
for ; !page.empty(depthIdx); cnt++ {
|
|
// Start a new paragraph region `para`.
|
|
// Build `para` out from the left-most (lowest in reading direction) word `words`[0],
|
|
// in the bins in and below `depthIdx`.
|
|
para := newTextStrata(pageHeight)
|
|
|
|
// words[0] is the leftmost word from the bins in and a few lines below `depthIdx`. We
|
|
// seed 'para` with this word.
|
|
firstReadingIdx := page.firstReadingIndex(depthIdx)
|
|
words := page.getStratum(firstReadingIdx)
|
|
moveWord(firstReadingIdx, page, para, words[0])
|
|
if verbose {
|
|
common.Log.Info("words[0]=%s", words[0].String())
|
|
}
|
|
|
|
// The following 3 numbers define whether words should be added to `para`.
|
|
minInterReadingGap := minInterReadingGapR * para.fontsize
|
|
maxIntraReadingGap := maxIntraReadingGapR * para.fontsize
|
|
maxIntraDepthGap := maxIntraDepthGapR * para.fontsize
|
|
|
|
// Add words to `para` until we pass through the following loop without a new word
|
|
// being added to a `para`.
|
|
for running := true; running; running = changed {
|
|
changed = false
|
|
|
|
// Add words that are within maxIntraDepthGap of `para` in the depth direction.
|
|
// i.e. Stretch para in the depth direction, vertically for English text.
|
|
if verbose {
|
|
common.Log.Info("para depth %.2f - %.2f maxIntraDepthGap=%.2f ",
|
|
para.minDepth(), para.maxDepth(), maxIntraDepthGap)
|
|
}
|
|
if page.scanBand("veritcal", para, partial(readingOverlapPlusGap, 0),
|
|
para.minDepth()-maxIntraDepthGap, para.maxDepth()+maxIntraDepthGap,
|
|
maxIntraDepthFontTolR, false, false) > 0 {
|
|
changed = true
|
|
}
|
|
// Add words that are within maxIntraReadingGap of `para` in the reading direction.
|
|
// i.e. Stretch para in the reading direction, horizontall for English text.
|
|
if page.scanBand("horizontal", para, partial(readingOverlapPlusGap, maxIntraReadingGap),
|
|
para.minDepth(), para.maxDepth(),
|
|
maxIntraReadingFontTol, false, false) > 0 {
|
|
changed = true
|
|
}
|
|
// The above stretching has got as far as it go. Repeating it won't pull in more words.
|
|
|
|
// Only try to combine other words if we can't grow para in the simple way above.
|
|
if changed {
|
|
continue
|
|
}
|
|
|
|
// In the following cases, we don't expand `para` while scanning. We look for words
|
|
// around para. If we find them, we add them then expand `para` when we are done.
|
|
// This pulls the numbers to the left of para into para
|
|
// e.g. From
|
|
// Regulatory compliance
|
|
// Archiving
|
|
// Document search
|
|
// to
|
|
// 1. Regulatory compliance
|
|
// 2. Archiving
|
|
// 3. Document search
|
|
|
|
// If there are words to the left of `para`, add them.
|
|
// We need to limit the number of word
|
|
n := page.scanBand("", para, partial(readingOverlapLeft, minInterReadingGap),
|
|
para.minDepth(), para.maxDepth(),
|
|
minInterReadingFontTol, true, false)
|
|
if n > 0 {
|
|
r := (para.maxDepth() - para.minDepth()) / para.fontsize
|
|
if (n > 1 && float64(n) > 0.3*r) || n <= 5 {
|
|
if page.scanBand("other", para, partial(readingOverlapLeft, minInterReadingGap),
|
|
para.minDepth(), para.maxDepth(),
|
|
minInterReadingFontTol, false, true) > 0 {
|
|
changed = true
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Sort the words in `para`'s bins in the reading direction.
|
|
para.sort()
|
|
paraStratas = append(paraStratas, para)
|
|
}
|
|
}
|
|
|
|
return paraStratas
|
|
}
|
|
|
|
const (
	// doHyphens enables removal of the trailing hyphen of hyphenated lines when text is written.
	doHyphens = true
	// useTables makes writeText and toTextMarks delegate to the per-paragraph implementations
	// instead of walking the lines of each paragraph directly.
	useTables = true
)
|
|
|
|
// writeText writes the text in `paras` to `w`.
|
|
func (paras paraList) writeText(w io.Writer) {
|
|
for ip, para := range paras {
|
|
if useTables {
|
|
para.writeText(w)
|
|
} else {
|
|
for il, line := range para.lines {
|
|
s := line.text()
|
|
reduced := false
|
|
if doHyphens {
|
|
if line.hyphenated && (il != len(para.lines)-1 || ip != len(paras)-1) {
|
|
// Line ending with hyphen. Remove it.
|
|
runes := []rune(s)
|
|
s = string(runes[:len(runes)-1])
|
|
reduced = true
|
|
}
|
|
}
|
|
w.Write([]byte(s))
|
|
if reduced {
|
|
// We removed the hyphen from the end of the line so we don't need a line ending.
|
|
continue
|
|
}
|
|
if il < len(para.lines)-1 && isZero(line.depth-para.lines[il+1].depth) {
|
|
// Next line is the same depth so it's the same line as this one in the extracted text
|
|
w.Write([]byte(" "))
|
|
continue
|
|
}
|
|
w.Write([]byte("\n"))
|
|
}
|
|
w.Write([]byte("\n"))
|
|
}
|
|
}
|
|
}
|
|
|
|
// toTextMarks creates the TextMarkArray corresponding to the extracted text created by
|
|
// paras `paras`.writeText().
|
|
func (paras paraList) toTextMarks() []TextMark {
|
|
offset := 0
|
|
var marks []TextMark
|
|
addMark := func(mark TextMark) {
|
|
mark.Offset = offset
|
|
marks = append(marks, mark)
|
|
offset += len(mark.Text)
|
|
}
|
|
addSpaceMark := func(spaceChar string) {
|
|
mark := spaceMark
|
|
mark.Text = spaceChar
|
|
addMark(mark)
|
|
}
|
|
for ip, para := range paras {
|
|
if useTables {
|
|
paraMarks := para.toTextMarks(&offset)
|
|
marks = append(marks, paraMarks...)
|
|
} else {
|
|
for il, line := range para.lines {
|
|
lineMarks := line.toTextMarks(&offset)
|
|
marks = append(marks, lineMarks...)
|
|
reduced := false
|
|
if doHyphens {
|
|
if line.hyphenated && (il != len(para.lines)-1 || ip != len(paras)-1) {
|
|
tm := marks[len(marks)-1]
|
|
r := []rune(tm.Text)
|
|
if unicode.IsSpace(r[len(r)-1]) {
|
|
panic(tm)
|
|
}
|
|
if len(r) == 1 {
|
|
marks = marks[:len(marks)-1]
|
|
offset = marks[len(marks)-1].Offset + len(marks[len(marks)-1].Text)
|
|
} else {
|
|
s := string(r[:len(r)-1])
|
|
offset += len(s) - len(tm.Text)
|
|
tm.Text = s
|
|
}
|
|
reduced = true
|
|
}
|
|
}
|
|
if reduced {
|
|
continue
|
|
}
|
|
if il != len(para.lines)-1 && isZero(line.depth-para.lines[il+1].depth) {
|
|
// Next line is the same depth so it's the same line as this one in the extracted text
|
|
addSpaceMark(" ")
|
|
continue
|
|
}
|
|
addSpaceMark("\n")
|
|
}
|
|
if ip != len(paras)-1 {
|
|
addSpaceMark("\n")
|
|
}
|
|
}
|
|
}
|
|
return marks
|
|
}
|
|
|
|
// sortReadingOrder sorts `paras` in reading order.
|
|
func (paras paraList) sortReadingOrder() {
|
|
common.Log.Debug("sortReadingOrder: paras=%d ===========x=============", len(paras))
|
|
if len(paras) <= 1 {
|
|
return
|
|
}
|
|
adj := paras.adjMatrix()
|
|
order := topoOrder(adj)
|
|
paras.reorder(order)
|
|
}
|
|
|
|
// adjMatrix creates an adjacency matrix for the DAG of connections over `paras`.
|
|
// Node i is connected to node j if i comes before j by Breuel's rules.
|
|
func (paras paraList) adjMatrix() [][]bool {
|
|
n := len(paras)
|
|
adj := make([][]bool, n)
|
|
reasons := make([][]string, n)
|
|
for i := range paras {
|
|
adj[i] = make([]bool, n)
|
|
reasons[i] = make([]string, n)
|
|
for j := range paras {
|
|
if i == j {
|
|
continue
|
|
}
|
|
adj[i][j], reasons[i][j] = paras.before(i, j)
|
|
}
|
|
}
|
|
if verbose && false {
|
|
common.Log.Info("adjMatrix =======")
|
|
for i := 0; i < n; i++ {
|
|
a := paras[i]
|
|
fmt.Printf("%4d: %q %.2f\n", i, truncate(a.text(), 50), a.PdfRectangle)
|
|
for j := 0; j < n; j++ {
|
|
if i == j {
|
|
continue
|
|
}
|
|
if !adj[i][j] {
|
|
continue
|
|
}
|
|
b := paras[j]
|
|
fmt.Printf("%8d: %10s %q %.2f\n", j,
|
|
reasons[i][j], truncate(b.text(), 40), b.PdfRectangle)
|
|
|
|
}
|
|
}
|
|
}
|
|
return adj
|
|
}
|
|
|
|
// before defines an ordering over `paras`.
|
|
// before returns true if `a` comes before `b`.
|
|
// 1. Line segment `a` comes before line segment `b` if their ranges of x-coordinates overlap and if
|
|
// line segment `a` is above line segment `b` on the page.
|
|
// 2. Line segment `a` comes before line segment `b` if `a` is entirely to the left of `b` and if
|
|
// there does not exist a line segment `c` whose y-coordinates are between `a` and `b` and whose
|
|
// range of x coordinates overlaps both `a` and `b`.
|
|
// From Thomas M. Breuel "High Performance Document Layout Analysis"
|
|
func (paras paraList) before(i, j int) (bool, string) {
|
|
a, b := paras[i], paras[j]
|
|
// Breuel's rule 1
|
|
if overlappedXPara(a, b) && a.Lly > b.Lly {
|
|
return true, "above"
|
|
}
|
|
|
|
// Breuel's rule 2
|
|
if !(a.eBBox.Urx < b.eBBox.Llx) {
|
|
return false, "NOT left"
|
|
}
|
|
for k, c := range paras {
|
|
if k == i || k == j {
|
|
continue
|
|
}
|
|
lo := a.Lly
|
|
hi := b.Lly
|
|
if lo > hi {
|
|
hi, lo = lo, hi
|
|
}
|
|
if !(lo < c.Lly && c.Lly < hi) {
|
|
continue
|
|
}
|
|
if overlappedXPara(a, c) && overlappedXPara(c, b) {
|
|
return false, "Y intervening"
|
|
}
|
|
}
|
|
return true, "TO LEFT"
|
|
}
|
|
|
|
// overlappedX returns true if `r0` and `r1` overlap on the x-axis. !@#$ There is another version
|
|
// of this!
|
|
func overlappedXPara(r0, r1 *textPara) bool {
|
|
return overlappedXRect(r0.eBBox, r1.eBBox)
|
|
}
|
|
|
|
// computeEBBoxes computes the eBBox fields in the elements of `paras`.
|
|
func (paras paraList) computeEBBoxes() {
|
|
common.Log.Trace("computeEBBoxes:")
|
|
|
|
for i, a := range paras {
|
|
// [llx, urx] is the reading direction interval for which no paras overlap `a`
|
|
llx := -1.0e9
|
|
urx := +1.0e9
|
|
for j, b := range paras {
|
|
if i == j || !(a.Lly <= b.Ury && b.Lly <= a.Ury) {
|
|
continue
|
|
}
|
|
// y overlap
|
|
|
|
// `b` to left of `a`. no x overlap.
|
|
if b.Urx < a.Llx {
|
|
llx = math.Max(llx, b.Urx)
|
|
}
|
|
// `b` to right of `a`. no x overlap.
|
|
if a.Urx < b.Llx {
|
|
urx = math.Min(urx, b.Llx)
|
|
}
|
|
|
|
}
|
|
// llx extends left from `a` and overlaps no other paras.
|
|
// urx extends right from `a` and overlaps no other paras.
|
|
|
|
// Go through all paras below `a` within interval [llx, urx] in the reading direction and
|
|
// expand `a` as far as possible to left and right without overlapping any of them.
|
|
a.eBBox = a.PdfRectangle
|
|
for j, b := range paras {
|
|
if i == j || b.Ury > a.Lly {
|
|
continue
|
|
}
|
|
|
|
// If `b` is completely to right of `llx`, extend `a` left to `b`.
|
|
if llx <= b.Llx {
|
|
a.eBBox.Llx = math.Min(a.eBBox.Llx, b.Llx)
|
|
}
|
|
|
|
// If `b` is completely to left of `urx`, extend `a` right to `b`.
|
|
if b.Urx <= urx {
|
|
a.eBBox.Urx = math.Max(a.eBBox.Urx, b.Urx)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// topoOrder returns the ordering of the topological sort of the nodes with adjacency matrix `adj`.
// adj[i][j] being true means there is an edge from node i to node j, so i is placed before j
// whenever the edges do not form a cycle. `adj` must be square.
//
// The sort is a depth-first search started from each unvisited node in index order. A node is
// recorded after everything reachable from it, and the list is reversed at the end. Nodes that
// were already visited are skipped, so the function also terminates when `adj` has cycles.
// It returns nil for an empty matrix.
func topoOrder(adj [][]bool) []int {
	n := len(adj)
	if n == 0 {
		return nil
	}
	visited := make([]bool, n)
	// Every node is recorded exactly once, so the final length is known up front.
	order := make([]int, 0, n)

	// sortNode recursively sorts below node `idx` in the adjacency matrix.
	var sortNode func(idx int)
	sortNode = func(idx int) {
		visited[idx] = true
		for i := 0; i < n; i++ {
			if adj[idx][i] && !visited[i] {
				sortNode(i)
			}
		}
		order = append(order, idx) // Should prepend but it's cheaper to append and reverse later.
	}

	for idx := 0; idx < n; idx++ {
		if !visited[idx] {
			sortNode(idx)
		}
	}

	// Order is currently reversed so change it to forward order.
	for lo, hi := 0, len(order)-1; lo < hi; lo, hi = lo+1, hi-1 {
		order[lo], order[hi] = order[hi], order[lo]
	}
	return order
}
|
|
|
|
// reorder reorders `para` to the order in `order`.
|
|
func (paras paraList) reorder(order []int) {
|
|
sorted := make(paraList, len(paras))
|
|
for i, k := range order {
|
|
sorted[i] = paras[k]
|
|
}
|
|
copy(paras, sorted)
|
|
}
|