/* * This file is subject to the terms and conditions defined in * file 'LICENSE.md', which is part of this source code package. */ package extractor import ( "io" "math" "sort" "github.com/unidoc/unipdf/v3/common" "github.com/unidoc/unipdf/v3/model" ) // paraList is a sequence of textPara. We use it so often that it is convenient to have its own // type so we can have methods on it. type paraList []*textPara // makeTextPage builds a paraList from `marks`, the textMarks on a page. func makeTextPage(marks []*textMark, pageSize model.PdfRectangle, rot int) paraList { common.Log.Trace("makeTextPage: %d elements pageSize=%.2f", len(marks), pageSize) // Break the marks into words words := makeTextWords(marks, pageSize) // Divide the words into depth bins with each the contents of each bin sorted by reading direction page := makeTextStrata(words, pageSize.Ury) // Divide the page into rectangular regions for each paragraph and creata a textStrata for each one. paraStratas := dividePage(page, pageSize.Ury) // Arrange the contents of each para into lines paras := make(paraList, len(paraStratas)) for i, para := range paraStratas { paras[i] = composePara(para) } // Sort the paras into reading order. paras.sortReadingOrder() return paras } // dividePage divides page builds a list of paragraph textStrata from `page`, the page textStrata. func dividePage(page *textStrata, pageHeight float64) []*textStrata { var paraStratas []*textStrata // Move words from `page` to paras until there no words left in page. // Iterate through page in depth bin order. // For each `page` bin, move words until is empty. This will likely move words from other // `page` bins to para bins. // Some bins are emptied before they iterated to. // If a bin is not empty then at least one para is built starting from it cnt := 0 for _, depthIdx := range page.depthIndexes() { changed := false for ; !page.empty(depthIdx); cnt++ { // Start a new paragraph region `para`. // Build `para` out from the left-most (lowest in reading direction) word `words`[0], // in the bins in and below `depthIdx`. para := newTextStrata(pageHeight) // words[0] is the leftmost word from bins near `depthIdx`. firstReadingIdx := page.firstReadingIndex(depthIdx) words := page.getStratum(firstReadingIdx) moveWord(firstReadingIdx, page, para, words[0]) // The following 3 numbers define whether words should be added to `para`. minInterReadingGap := minInterReadingGapR * para.fontsize maxIntraReadingGap := maxIntraReadingGapR * para.fontsize maxIntraDepthGap := maxIntraDepthGapR * para.fontsize // Add words to `para` until we pass through the following loop without a new word // being added to a `para`. for running := true; running; running = changed { changed = false // Add words that are within maxIntraDepthGap of `para` in the depth direction. // i.e. Stretch para in the depth direction, vertically for English text. if page.scanBand(para, partial(readingOverlapPlusGap, 0), para.minDepth()-maxIntraDepthGap, para.maxDepth()+maxIntraDepthGap, maxIntraDepthFontTolR, false, false) > 0 { changed = true } // Add words that are within maxIntraReadingGap of `para` in the reading direction. // i.e. Stretch para in the reading direction, horizontall for English text. if page.scanBand(para, partial(readingOverlapPlusGap, maxIntraReadingGap), para.minDepth(), para.maxDepth(), maxIntraReadingFontTol, false, false) > 0 { changed = true } // The above stretching has got as far as it go. Repeating it won't pull in more words. // Only try to combine other words if we can't grow para in the simple way above. if changed { continue } // In the following cases, we don't expand `para` while scanning. We look for words // around para. If we find them, we add them then expand `para` when we are done. // This pulls the numbers to the left of para into para // e.g. From // Regulatory compliance // Archiving // Document search // to // 1. Regulatory compliance // 2. Archiving // 3. Document search // If there are words to the left of `para`, add them. // We need to limit the number of word n := page.scanBand(para, partial(readingOverlapLeft, minInterReadingGap), para.minDepth(), para.maxDepth(), minInterReadingFontTol, true, false) if n > 0 { r := (para.maxDepth() - para.minDepth()) / para.fontsize if (n > 1 && float64(n) > 0.3*r) || n <= 5 { if page.scanBand(para, partial(readingOverlapLeft, minInterReadingGap), para.minDepth(), para.maxDepth(), minInterReadingFontTol, false, true) > 0 { changed = true } } } } // Sort the words in `para`'s bins in the reading direction. para.sort() paraStratas = append(paraStratas, para) } } return paraStratas } // writeText write the text in `pt` to `w`.`` func (paras paraList) writeText(w io.Writer) { for ip, para := range paras { for il, line := range para.lines { s := line.text() n := len(s) n0 := n if (il < len(para.lines)-1 || ip < len(paras)-1) && line.hyphenated { // Line ending with hyphen. Remove it n-- r := []rune(s) r = r[:len(r)-1] s = string(r) } w.Write([]byte(s)) if n < n0 { // We removed the hyphend from the end of the line so we don't need a line ending. continue } if il < len(para.lines)-1 && isZero(line.depth-para.lines[il+1].depth) { // Next line is the same depth so it's the same line as this one in the extracted text w.Write([]byte(" ")) continue } w.Write([]byte("\n")) } w.Write([]byte("\n")) } } // sortReadingOrder sorts `paras` in reading order. func (paras paraList) sortReadingOrder() { common.Log.Debug("sortReadingOrder: paras=%d ===========x=============", len(paras)) if len(paras) <= 1 { return } paras.computeEBBoxes() // Pre-sort by reading direction then depth sort.Slice(paras, func(i, j int) bool { return diffReadingDepth(paras[i], paras[j]) < 0 }) adj := paras.adjMatrix() order := topoOrder(adj) // `order` now contains the reading order. Set paras to that order. sorted := make(paraList, len(paras)) for i, k := range order { sorted[i] = paras[k] } copy(paras, sorted) } // adjMatrix creates an adjacency matrix for the DAG of connections over `paras`. // Node i is connected to node j if i comes before j by Breuel's rules. func (paras paraList) adjMatrix() [][]bool { n := len(paras) adj := make([][]bool, n) for i := range paras { adj[i] = make([]bool, n) for j := range paras { adj[i][j] = i != j && paras.before(i, j) } } return adj } // before defines an ordering over `paras`. // 1. Line segment `a` comes before line segment `b` if their ranges of x-coordinates overlap and if // line segment `a` is above line segment `b` on the page. // 2. Line segment `a` comes before line segment `b` if `a` is entirely to the left of `b` and if // there does not exist a line segment `c` whose y-coordinates are between `a` and `b` and whose // range of x coordinates overlaps both `a` and `b`. // From Thomas M. Breuel "High Performance Document Layout Analysis" func (paras paraList) before(i, j int) bool { a, b := paras[i], paras[j] // Breuel's rule 1 if overlappedX(a, b) && a.Ury > b.Ury { return true } // Breuel's rule 2 if !(a.eBBox.Urx < b.eBBox.Llx) { return false } for k, c := range paras { if k == i || k == j { continue } lo := a.Lly hi := b.Lly if lo > hi { hi, lo = lo, hi } if !(lo < c.Lly && c.Lly < hi) { continue } if overlappedX(a, c) && overlappedX(c, b) { return false } } return true } // overlappedX returns true if `r0` and `r1` overlap on the x-axis. !@#$ There is another version // of this! func overlappedX(r0, r1 *textPara) bool { return overlappedX01(r0, r1) || overlappedX01(r1, r0) } func overlappedX01(r0, r1 *textPara) bool { return overlappedXRect(r0.eBBox, r1.eBBox) } func overlappedXRect(r0, r1 model.PdfRectangle) bool { return (r0.Llx <= r1.Llx && r1.Llx <= r0.Urx) || (r0.Llx <= r1.Urx && r1.Urx <= r0.Urx) } // computeEBBoxes computes the eBBox fields in the elements of `paras`. func (paras paraList) computeEBBoxes() { common.Log.Trace("computeEBBoxes:") for i, a := range paras { // [llx, urx] is the reading direction interval for which no paras overlap `a` llx := -1.0e9 urx := +1.0e9 for j, b := range paras { if i == j || !(a.Lly <= b.Ury && b.Lly <= a.Ury) { continue } // y overlap // `b` to left of `a`. no x overlap. if b.Urx < a.Llx { llx = math.Max(llx, b.Urx) } // `b` to right of `a`. no x overlap. if a.Urx < b.Llx { urx = math.Min(urx, b.Llx) } } // llx extends left from `a` and overlaps no other paras. // urx extends right from `a` and overlaps no other paras. // Go through all paras below `a` within interval [llx, urx] in the reading direction and // expand `a` as far as possible to left and right without overlapping any of them. a.eBBox = a.PdfRectangle for j, b := range paras { if i == j || b.Ury > a.Lly { continue } // If `b` is completely to right of `llx`, extend `a` left to `b`. if llx <= b.Llx { a.eBBox.Llx = math.Min(a.eBBox.Llx, b.Llx) } // If `b` is completely to left of `urx`, extend `a` right to `b`. if b.Urx <= urx { a.eBBox.Urx = math.Max(a.eBBox.Urx, b.Urx) } } } } // topoOrder returns the ordering of the topological sort of the nodes with adjacency matrix `adj`. func topoOrder(adj [][]bool) []int { n := len(adj) visited := make([]bool, n) var order []int // sortNode recursively sorts below node `idx` in the adjacency matrix. var sortNode func(idx int) sortNode = func(idx int) { visited[idx] = true for i := 0; i < n; i++ { if adj[idx][i] && !visited[i] { sortNode(i) } } order = append(order, idx) // Should prepend but it's cheaper to append and reverse later. } for idx := 0; idx < n; idx++ { if !visited[idx] { sortNode(idx) } } // Order is currently reversed so change it to forward order. for i := 0; i < n/2; i++ { order[i], order[n-1-i] = order[n-1-i], order[i] } return order }