/*
 * This file is subject to the terms and conditions defined in
 * file 'LICENSE.md', which is part of this source code package.
 */

package extractor

import (
	"fmt"
	"io"
	"math"
	"sort"

	"github.com/unidoc/unipdf/v3/common"
	"github.com/unidoc/unipdf/v3/model"
)

// makeTextPage builds a paraList from `marks`, the textMarks on a page.
// The paraList contains the page arranged as
//  - a list of textPara in reading order
//  - each textPara contains a list of textLine (text lines or parts of text lines) in reading order
//  - each textLine contains a list of textWord (words or parts of words) in reading order
// The paraList is thus an ordering of words on a page.
//  - Users of the paraList are expected to work with words. This should be adequate for most uses
//    as words are the basic unit of meaning in written language.
//  - However we provide links back from the extracted text to the textMarks as follows.
//      * paraList.writeText() writes the extracted text for a page
//      * paras.toTextMarks() returns a TextMarkArray containing the marks
//      * TextMarkArray.RangeOffset(lo, hi) returns the marks corresponding to offsets [lo:hi] in
//        the extracted text.
// NOTE: The "parts of words" occur because of hyphenation. We do some weak coordinate-based
//  dehyphenation. Callers who need strong dehyphenation should use NLP libraries.
//  The "parts of lines" are an implementation detail. Line fragments are combined in
//  paraList.writeText().
// ALGORITHM:
// 1) Group the textMarks into textWords based on their bounding boxes.
// 2) Group the textWords into textParas based on their bounding boxes.
// 3) Detect textParas arranged as cells in a table and convert each one to a textPara containing a
//    textTable.
// 4) Sort the textParas in reading order.
func makeTextPage(marks []*textMark, pageSize model.PdfRectangle) paraList { common.Log.Trace("makeTextPage: %d elements pageSize=%.2f", len(marks), pageSize) if len(marks) == 0 { return nil } // Group the marks into word fragments words := makeTextWords(marks, pageSize) if len(words) == 0 { return nil } // Put the word fragments into a container that facilitates the grouping of words into paragraphs. pageWords := makeWordBag(words, pageSize.Ury) // Divide the page into rectangular regions for each paragraph and creata a wordBag for each one. paraWords := dividePage(pageWords, pageSize.Ury) paraWords = mergeWordBags(paraWords) // Arrange the contents of each paragraph wordBag into lines and the lines into whole words. paras := make(paraList, 0, len(paraWords)) for _, bag := range paraWords { para := bag.arrangeText() if para != nil { paras = append(paras, para) } } // Find paras that are cells in tables, convert the tables to paras and remove the cell paras. if len(paras) >= minTableParas { paras = paras.extractTables() } // Sort the paras into reading order. paras.sortReadingOrder() paras.log("sorted in reading order") return paras } // dividePage divides `pageWords`, the page wordBag, into a list of paragraph wordBags. func dividePage(pageWords *wordBag, pageHeight float64) []*wordBag { var paraWordBags []*wordBag // We move words from `page` to paras until there no words left in page. // We do this by iterating through `page` in depth bin order and, for each surving bin (see // below), creating a paragraph with seed word, `words[0]` in the code below. // We then move words from around the `para` region from `page` to `para` . // This may empty some page bins before we iterate to them // Some bins are emptied before they iterated to (seee "surving bin" above). // If a `page` survives until it is iterated to then at least one `para` will be built around it. 
for _, depthIdx := range pageWords.depthIndexes() { changed := false for !pageWords.empty(depthIdx) { // Start a new paragraph region `paraWords`. // Build `paraWords` out from the left-most (lowest in reading direction) word `words`[0], // in the bins in and below `depthIdx`. // `firstWord` is the left-most word from the bins in and a few lines below `depthIdx`. We // seed 'paraWords` with this word. firstReadingIdx := pageWords.firstReadingIndex(depthIdx) firstWord := pageWords.firstWord(firstReadingIdx) paraWords := newWordBag(firstWord, pageHeight) pageWords.removeWord(firstWord, firstReadingIdx) if verbosePage { common.Log.Info("words[0]=%s", firstWord.String()) } // The following 3 numbers define whether words should be added to `paraWords`. minInterReadingGap := minInterReadingGapR * paraWords.fontsize maxIntraReadingGap := maxIntraReadingGapR * paraWords.fontsize maxIntraDepthGap := maxIntraDepthGapR * paraWords.fontsize // Add words to `paraWords` until we pass through the following loop without adding a // new word. for running := true; running; running = changed { changed = false // Add words that are within maxIntraDepthGap of `paraWords` in the depth direction. // i.e. Stretch paraWords in the depth direction, vertically for English text. if verbosePage { common.Log.Info("paraWords depth %.2f - %.2f maxIntraDepthGap=%.2f ", paraWords.minDepth(), paraWords.maxDepth(), maxIntraDepthGap) } if pageWords.scanBand("vertical", paraWords, partial(readingOverlapPlusGap, 0), paraWords.minDepth()-maxIntraDepthGap, paraWords.maxDepth()+maxIntraDepthGap, maxIntraDepthFontTolR, false, false) > 0 { changed = true } // Add words that are within maxIntraReadingGap of `paraWords` in the reading direction. // i.e. Stretch paraWords in the reading direction, horizontall for English text. 
if pageWords.scanBand("horizontal", paraWords, partial(readingOverlapPlusGap, maxIntraReadingGap), paraWords.minDepth(), paraWords.maxDepth(), maxIntraReadingFontTol, false, false) > 0 { changed = true } // The above stretching has got as far as it can go. Repeating it won't pull in more words. // Only try to combine other words if we can't grow paraWords in the simple way above. if changed { continue } // In the following cases, we don't expand `paraWords` while scanning. We look for words // around paraWords. If we find them, we add them then expand `paraWords` when we are done. // This pulls the numbers to the left of paraWords into paraWords // e.g. From // Regulatory compliance // Archiving // Document search // to // 1. Regulatory compliance // 2. Archiving // 3. Document search // If there are words to the left of `paraWords`, add them. // We need to limit the number of words. n := pageWords.scanBand("", paraWords, partial(readingOverlapLeft, minInterReadingGap), paraWords.minDepth(), paraWords.maxDepth(), minInterReadingFontTol, true, false) if n > 0 { r := (paraWords.maxDepth() - paraWords.minDepth()) / paraWords.fontsize if (n > 1 && float64(n) > 0.3*r) || n <= 10 { if pageWords.scanBand("other", paraWords, partial(readingOverlapLeft, minInterReadingGap), paraWords.minDepth(), paraWords.maxDepth(), minInterReadingFontTol, false, true) > 0 { changed = true } } } } paraWordBags = append(paraWordBags, paraWords) } } return paraWordBags } // writeText writes the text in `paras` to `w`. func (paras paraList) writeText(w io.Writer) { for ip, para := range paras { para.writeText(w) if ip != len(paras)-1 { if sameLine(para, paras[ip+1]) { w.Write([]byte(" ")) } else { w.Write([]byte("\n")) w.Write([]byte("\n")) } } } w.Write([]byte("\n")) w.Write([]byte("\n")) } // toTextMarks creates the TextMarkArray corresponding to the extracted text created by // `paras`.writeText(). 
func (paras paraList) toTextMarks() []TextMark { offset := 0 var marks []TextMark for ip, para := range paras { paraMarks := para.toTextMarks(&offset) marks = append(marks, paraMarks...) if ip != len(paras)-1 { if sameLine(para, paras[ip+1]) { marks = appendSpaceMark(marks, &offset, " ") } else { marks = appendSpaceMark(marks, &offset, "\n") marks = appendSpaceMark(marks, &offset, "\n") } } } marks = appendSpaceMark(marks, &offset, "\n") marks = appendSpaceMark(marks, &offset, "\n") return marks } // sameLine returms true if `para1` and `para2` are on the same line. func sameLine(para1, para2 *textPara) bool { return isZero(para1.depth() - para2.depth()) } // tables returns the tables from all the paras that contain them. func (paras paraList) tables() []TextTable { var tables []TextTable for _, para := range paras { if para.table != nil { tables = append(tables, para.table.toTextTable()) } } return tables } // sortReadingOrder sorts `paras` in reading order. func (paras paraList) sortReadingOrder() { common.Log.Trace("sortReadingOrder: paras=%d ===========x=============", len(paras)) if len(paras) <= 1 { return } paras.computeEBBoxes() sort.Slice(paras, func(i, j int) bool { return diffDepthReading(paras[i], paras[j]) <= 0 }) order := paras.topoOrder() paras.reorder(order) } // topoOrder returns the ordering of the topological sort of `paras` using readBefore() to determine // the incoming nodes to each node. func (paras paraList) topoOrder() []int { if verbosePage { common.Log.Info("topoOrder:") } n := len(paras) visited := make([]bool, n) order := make([]int, 0, n) llyOrder := paras.llyOrdering() // sortNode recursively sorts below node `idx` in the adjacency matrix. var sortNode func(idx int) sortNode = func(idx int) { visited[idx] = true for i := 0; i < n; i++ { if !visited[i] { if paras.readBefore(llyOrder, idx, i) { sortNode(i) } } } order = append(order, idx) // Should prepend but it's cheaper to append and reverse later. 
} for idx := 0; idx < n; idx++ { if !visited[idx] { sortNode(idx) } } return reversed(order) } // readBefore returns true if paras[`i`] comes before paras[`j`]. // readBefore defines an ordering over `paras`. // a = paras[i], b= paras[j] // 1. Line segment `a` comes before line segment `b` if their ranges of x-coordinates overlap and if // line segment `a` is above line segment `b` on the page. // 2. Line segment `a` comes before line segment `b` if `a` is entirely to the left of `b` and if // there does not exist a line segment `c` whose y-coordinates are between `a` and `b` and whose // range of x coordinates overlaps both `a` and `b`. // From Thomas M. Breuel "High Performance Document Layout Analysis" func (paras paraList) readBefore(ordering []int, i, j int) bool { a, b := paras[i], paras[j] // Breuel's rule 1 if overlappedXPara(a, b) && a.Lly > b.Lly { return true } // Breuel's rule 2 if !(a.eBBox.Urx < b.eBBox.Llx) { return false } lo, hi := a.Lly, b.Lly if lo > hi { hi, lo = lo, hi } llx := math.Max(a.eBBox.Llx, b.eBBox.Llx) urx := math.Min(a.eBBox.Urx, b.eBBox.Urx) llyOrder := paras.llyRange(ordering, lo, hi) for _, k := range llyOrder { if k == i || k == j { continue } c := paras[k] if c.eBBox.Llx <= urx && llx <= c.eBBox.Urx { return false } } return true } // overlappedX returns true if `r0` and `r1` overlap on the x-axis. func overlappedXPara(r0, r1 *textPara) bool { return intersectsX(r0.eBBox, r1.eBBox) } // llyOrdering is ordering over the indexes of `paras` sorted by Llx is increasing order. 
func (paras paraList) llyOrdering() []int { ordering := make([]int, len(paras)) for i := range paras { ordering[i] = i } sort.SliceStable(ordering, func(i, j int) bool { oi, oj := ordering[i], ordering[j] return paras[oi].Lly < paras[oj].Lly }) return ordering } // llyRange returns the indexes in `paras` of paras p: lo <= p.Llx < hi func (paras paraList) llyRange(ordering []int, lo, hi float64) []int { n := len(paras) if hi < paras[ordering[0]].Lly || lo > paras[ordering[n-1]].Lly { return nil } // i0 is the lowest i: lly(i) >= lo // i1 is the lowest i: lly(i) > hi i0 := sort.Search(n, func(i int) bool { return paras[ordering[i]].Lly >= lo }) i1 := sort.Search(n, func(i int) bool { return paras[ordering[i]].Lly > hi }) return ordering[i0:i1] } // computeEBBoxes computes the eBBox fields in the elements of `paras`. // The EBBoxs are the regions around the paras that don't intersect paras in other columns. // This is needed for sortReadingOrder to work with skinny paras in a column of fat paras. The // sorting assumes the skinny para bounding box is as wide as the fat para bounding boxes. func (paras paraList) computeEBBoxes() { if verbose { common.Log.Info("computeEBBoxes:") } for _, para := range paras { para.eBBox = para.PdfRectangle } paraYNeighbours := paras.yNeighbours() for i, aa := range paras { a := aa.eBBox // [llx, urx] is the reading direction interval for which no paras overlap `a`. llx, urx := -1.0e9, +1.0e9 for _, j := range paraYNeighbours[aa] { b := paras[j].eBBox if b.Urx < a.Llx { // `b` to left of `a`. no x overlap. llx = math.Max(llx, b.Urx) } else if a.Urx < b.Llx { // `b` to right of `a`. no x overlap. urx = math.Min(urx, b.Llx) } } // llx extends left from `a` and overlaps no other paras. // urx extends right from `a` and overlaps no other paras. // Go through all paras below `a` within interval [llx, urx] in the reading direction and // expand `a` as far as possible to left and right without overlapping any of them. 
for j, bb := range paras { b := bb.eBBox if i == j || b.Ury > a.Lly { continue } if llx <= b.Llx && b.Llx < a.Llx { // If `b` is completely to right of `llx`, extend `a` left to `b`. a.Llx = b.Llx } else if b.Urx <= urx && a.Urx < b.Urx { // If `b` is completely to left of `urx`, extend `a` right to `b`. a.Urx = b.Urx } } if verbose { fmt.Printf("%4d: %6.2f->%6.2f %q\n", i, aa.eBBox, a, truncate(aa.text(), 50)) } aa.eBBox = a } if useEBBox { for _, para := range paras { para.PdfRectangle = para.eBBox } } } // reversed return `order` reversed. func reversed(order []int) []int { rev := make([]int, len(order)) for i, v := range order { rev[len(order)-1-i] = v } return rev } // reorder reorders `para` to the order in `order`. func (paras paraList) reorder(order []int) { sorted := make(paraList, len(paras)) for i, k := range order { sorted[i] = paras[k] } copy(paras, sorted) }