unipdf/extractor/text_page.go

/*
 * This file is subject to the terms and conditions defined in
 * file 'LICENSE.md', which is part of this source code package.
 */

package extractor

import (
	"io"
	"math"
	"sort"

	"github.com/unidoc/unipdf/v3/common"
	"github.com/unidoc/unipdf/v3/model"
)

// paraList is a sequence of textPara pointers. It is used often enough that it is convenient for
// it to have its own named type, so that methods (e.g. writeText and sortReadingOrder) can be
// defined on it.
type paraList []*textPara

// makeTextPage converts `marks`, the textMarks on a page, into a paraList sorted in reading order.
// `pageSize` is the page rectangle; its Ury value is what gets passed on as the page height.
// `rot` is accepted but is not used by this function.
func makeTextPage(marks []*textMark, pageSize model.PdfRectangle, rot int) paraList {
	common.Log.Trace("makeTextPage: %d elements pageSize=%.2f", len(marks), pageSize)

	pageHeight := pageSize.Ury

	// Step 1: group the marks into words.
	words := makeTextWords(marks, pageSize)

	// Step 2: distribute the words over depth bins, with the contents of each bin sorted in the
	// reading direction.
	page := makeTextStrata(words, pageHeight)

	// Step 3: split the page into paragraph regions, each with its own textStrata.
	regions := dividePage(page, pageHeight)

	// Step 4: arrange the contents of each paragraph region into lines.
	paras := make(paraList, 0, len(regions))
	for _, region := range regions {
		paras = append(paras, composePara(region))
	}

	// Step 5: put the paragraphs into reading order.
	paras.sortReadingOrder()
	return paras
}

// dividePage builds a list of paragraph textStrata from `page`, the page textStrata.
// `pageHeight` is passed to newTextStrata for every paragraph region that is created.
// The function loops until every depth bin of `page` reports empty, so `page` is consumed: its
// words end up in the returned paragraph textStrata.
func dividePage(page *textStrata, pageHeight float64) []*textStrata {
	var paraStratas []*textStrata

	// We move words from `page` to paras until there are no words left in `page`.
	// We do this by iterating through `page` in depth bin order and, for each surviving bin (see
	// below), creating a paragraph with a seed word, `words[0]` in the code below.
	// We then move words from around the `para` region from `page` to `para`.
	// This may empty some page bins before we iterate to them; those bins are skipped.
	// A "surviving bin" is one that is still non-empty when the iteration reaches it. At least one
	// `para` is built for every surviving bin.

	// cnt counts the paragraphs created. It is only incremented, never read, in this function.
	cnt := 0
	for _, depthIdx := range page.depthIndexes() {
		changed := false
		for ; !page.empty(depthIdx); cnt++ {
			// Start a new paragraph region `para`.
			// Build `para` out from the left-most (lowest in reading direction) word `words[0]`,
			// in the bins in and below `depthIdx`.
			para := newTextStrata(pageHeight)

			// words[0] is the leftmost word from the bins in and a few lines below `depthIdx`. We
			// seed `para` with this word.
			firstReadingIdx := page.firstReadingIndex(depthIdx)
			words := page.getStratum(firstReadingIdx)
			moveWord(firstReadingIdx, page, para, words[0])

			// The following 3 numbers define whether words should be added to `para`.
			// They are the package-level ratios scaled by para.fontsize as it is after seeding,
			// and are not recomputed as `para` grows.
			minInterReadingGap := minInterReadingGapR * para.fontsize
			maxIntraReadingGap := maxIntraReadingGapR * para.fontsize
			maxIntraDepthGap := maxIntraDepthGapR * para.fontsize

			// Add words to `para` until we pass through the following loop without a new word
			// being added to `para`. The loop body always runs at least once; after that it
			// repeats for as long as the previous pass set `changed`.
			for running := true; running; running = changed {
				changed = false

				// Add words that are within maxIntraDepthGap of `para` in the depth direction.
				// i.e. Stretch para in the depth direction, vertically for English text.
				if page.scanBand(para, partial(readingOverlapPlusGap, 0),
					para.minDepth()-maxIntraDepthGap, para.maxDepth()+maxIntraDepthGap,
					maxIntraDepthFontTolR, false, false) > 0 {
					changed = true
				}
				// Add words that are within maxIntraReadingGap of `para` in the reading direction.
				// i.e. Stretch para in the reading direction, horizontally for English text.
				if page.scanBand(para, partial(readingOverlapPlusGap, maxIntraReadingGap),
					para.minDepth(), para.maxDepth(),
					maxIntraReadingFontTol, false, false) > 0 {
					changed = true
				}
				// One pass of the above stretching has gone as far as it can go.

				// Only try to combine other words if we can't grow para in the simple way above.
				// `continue` still runs the loop's post statement (running = changed), so the
				// simple stretching is retried first.
				if changed {
					continue
				}

				// In the following cases, we don't expand `para` while scanning. We look for words
				// around para. If we find them, we add them then expand `para` when we are done.
				// This pulls the numbers to the left of para into para
				// e.g. From
				// 		Regulatory compliance
				// 		Archiving
				// 		Document search
				// to
				// 		1. Regulatory compliance
				// 		2. Archiving
				// 		3. Document search

				// If there are words to the left of `para`, add them.
				// We need to limit the number of words pulled in this way, so we scan twice.
				// NOTE(review): the first scan (flags true, false) appears to only count the
				// candidate words without moving them; the second scan (flags false, true)
				// appears to do the move. Confirm against scanBand.
				n := page.scanBand(para, partial(readingOverlapLeft, minInterReadingGap),
					para.minDepth(), para.maxDepth(),
					minInterReadingFontTol, true, false)
				if n > 0 {
					// r is the depth extent of `para` measured in units of its font size.
					r := (para.maxDepth() - para.minDepth()) / para.fontsize
					// Do the second scan if there are at most 5 candidates, or if there is more
					// than one candidate and the count exceeds 0.3*r.
					if (n > 1 && float64(n) > 0.3*r) || n <= 5 {
						if page.scanBand(para, partial(readingOverlapLeft, minInterReadingGap),
							para.minDepth(), para.maxDepth(),
							minInterReadingFontTol, false, true) > 0 {
							changed = true
						}
					}
				}
			}

			// Sort the words in `para`'s bins in the reading direction.
			para.sort()
			paraStratas = append(paraStratas, para)
		}
	}

	return paraStratas
}

// writeText writes the text of `paras` to `w`.
// Each line is followed by a newline, with two exceptions:
//   - a hyphenated line that is not the final line of the final paragraph has its last rune (the
//     hyphen) removed and gets no line ending, so that it joins up with the text that follows;
//   - a line whose successor in the same paragraph has the same depth is followed by a space.
// A newline is written after every paragraph. Errors returned by `w` are not checked.
func (paras paraList) writeText(w io.Writer) {
	emit := func(text string) { w.Write([]byte(text)) }

	lastPara := len(paras) - 1
	for ip, para := range paras {
		lastLine := len(para.lines) - 1
		for il, line := range para.lines {
			text := line.text()

			// The hyphen is kept only on the very last line of the very last paragraph.
			dehyphenate := line.hyphenated && (il < lastLine || ip < lastPara)
			if dehyphenate {
				// Drop the final rune (not the final byte) of the line.
				runes := []rune(text)
				text = string(runes[:len(runes)-1])
			}
			emit(text)

			switch {
			case dehyphenate:
				// The hyphen was removed from the end of the line so no line ending is needed.
			case il < lastLine && isZero(line.depth-para.lines[il+1].depth):
				// The next line has the same depth, so it continues this line in the output.
				emit(" ")
			default:
				emit("\n")
			}
		}
		emit("\n")
	}
}

// sortReadingOrder rearranges `paras` in place into reading order.
// It computes the extended bounding boxes, pre-sorts with diffReadingDepth, and then applies the
// topological order of the `before` relation (see adjMatrix) to the pre-sorted list.
func (paras paraList) sortReadingOrder() {
	common.Log.Debug("sortReadingOrder: paras=%d ===========x=============", len(paras))
	if len(paras) < 2 {
		// Nothing to reorder.
		return
	}

	paras.computeEBBoxes()

	// Pre-sort by reading direction then depth.
	byReadingThenDepth := func(i, j int) bool {
		return diffReadingDepth(paras[i], paras[j]) < 0
	}
	sort.Slice(paras, byReadingThenDepth)

	// order[i] is the index in the pre-sorted `paras` of the para that is i-th in reading order.
	order := topoOrder(paras.adjMatrix())

	// Build the reordered list separately so no entry is overwritten before it has been read.
	reordered := make(paraList, 0, len(paras))
	for _, k := range order {
		reordered = append(reordered, paras[k])
	}
	copy(paras, reordered)
}

// adjMatrix returns the adjacency matrix of the graph of connections over `paras`.
// Entry [i][j] is true if i != j and para i comes before para j by Breuel's rules (see `before`).
// The diagonal is always false.
func (paras paraList) adjMatrix() [][]bool {
	size := len(paras)
	adj := make([][]bool, size)
	for i := range adj {
		row := make([]bool, size)
		for j := range row {
			if j != i {
				row[j] = paras.before(i, j)
			}
		}
		adj[i] = row
	}
	return adj
}

// before reports whether paras[i] comes before paras[j] in the ordering defined by these rules.
// 1. Line segment `a` comes before line segment `b` if their ranges of x-coordinates overlap and
//    if line segment `a` is above line segment `b` on the page.
// 2. Line segment `a` comes before line segment `b` if `a` is entirely to the left of `b` and if
//    there does not exist a line segment `c` whose y-coordinates are between `a` and `b` and whose
//    range of x-coordinates overlaps both `a` and `b`.
// From Thomas M. Breuel "High Performance Document Layout Analysis"
// The x-range tests use the paras' extended bounding boxes (eBBox).
func (paras paraList) before(i, j int) bool {
	a, b := paras[i], paras[j]

	// Breuel's rule 1: x-ranges overlap and `a` is higher on the page.
	if overlappedX(a, b) && a.Ury > b.Ury {
		return true
	}

	// Breuel's rule 2 requires `a` to be entirely to the left of `b`.
	if !(a.eBBox.Urx < b.eBBox.Llx) {
		return false
	}

	// (lo, hi) is the open interval between the bottoms of `a` and `b`.
	lo, hi := a.Lly, b.Lly
	if hi < lo {
		lo, hi = hi, lo
	}

	// Rule 2 fails if some other para lies between `a` and `b` vertically and overlaps both.
	for k, c := range paras {
		if k == i || k == j {
			continue
		}
		between := lo < c.Lly && c.Lly < hi
		if between && overlappedX(a, c) && overlappedX(c, b) {
			return false
		}
	}
	return true
}

// overlappedX reports whether `r0` and `r1` overlap on the x-axis. The test is made symmetric by
// applying overlappedX01 in both argument orders.
// NOTE: the original author flagged that another version of this function exists (!@#$).
func overlappedX(r0, r1 *textPara) bool {
	if overlappedX01(r0, r1) {
		return true
	}
	return overlappedX01(r1, r0)
}

// overlappedX01 applies the one-directional overlappedXRect test to the extended bounding boxes
// (eBBox) of `r0` and `r1`.
func overlappedX01(r0, r1 *textPara) bool {
	box0, box1 := r0.eBBox, r1.eBBox
	return overlappedXRect(box0, box1)
}

// overlappedXRect reports whether the left or the right edge of `r1` lies within the closed
// x-interval [r0.Llx, r0.Urx]. The test is not symmetric: it is false when `r1` strictly contains
// `r0` on the x-axis, so callers that need symmetry must try both argument orders.
func overlappedXRect(r0, r1 model.PdfRectangle) bool {
	withinR0 := func(x float64) bool { return r0.Llx <= x && x <= r0.Urx }
	return withinR0(r1.Llx) || withinR0(r1.Urx)
}

// computeEBBoxes sets the eBBox (extended bounding box) field of every element of `paras`.
// Each eBBox starts as the para's own rectangle and is then widened on the x-axis to cover paras
// below it, limited by the nearest paras that sit beside it at the same height.
func (paras paraList) computeEBBoxes() {
	common.Log.Trace("computeEBBoxes:")

	for i, a := range paras {
		// Pass 1: find [leftLimit, rightLimit], the reading direction interval around `a` that
		// is bounded by the closest paras that overlap `a` vertically but not horizontally.
		leftLimit, rightLimit := -1.0e9, +1.0e9
		for j, b := range paras {
			if i == j {
				continue
			}
			if yOverlap := a.Lly <= b.Ury && b.Lly <= a.Ury; !yOverlap {
				continue
			}
			if b.Urx < a.Llx {
				// `b` is entirely to the left of `a`.
				leftLimit = math.Max(leftLimit, b.Urx)
			}
			if a.Urx < b.Llx {
				// `b` is entirely to the right of `a`.
				rightLimit = math.Min(rightLimit, b.Llx)
			}
		}

		// Pass 2: go through the paras below `a` and widen the box to cover those that respect
		// the limits found in pass 1.
		box := a.PdfRectangle
		for j, b := range paras {
			if i == j {
				continue
			}
			if b.Ury > a.Lly {
				// `b` is not entirely below `a`.
				continue
			}
			if leftLimit <= b.Llx {
				// `b` starts at or to the right of the left limit, so extend left to `b`.
				box.Llx = math.Min(box.Llx, b.Llx)
			}
			if b.Urx <= rightLimit {
				// `b` ends at or to the left of the right limit, so extend right to `b`.
				box.Urx = math.Max(box.Urx, b.Urx)
			}
		}
		a.eBBox = box
	}
}

// topoOrder returns the node indexes of the graph with adjacency matrix `adj` in the order given
// by a depth-first topological sort. adj[i][j] == true means there is an edge from node i to
// node j, so i is placed before j when the graph is acyclic.
// `adj` must be square. Every node is visited once, so the function also terminates on graphs
// with cycles. The result is nil for an empty matrix.
func topoOrder(adj [][]bool) []int {
	nodeCount := len(adj)
	seen := make([]bool, nodeCount)
	var postOrder []int

	// visit does a depth-first walk from `node`, recording each node after all of the unseen
	// nodes reachable from it.
	var visit func(node int)
	visit = func(node int) {
		seen[node] = true
		for next := range seen {
			if !adj[node][next] || seen[next] {
				continue
			}
			visit(next)
		}
		postOrder = append(postOrder, node)
	}

	for node := range seen {
		if seen[node] {
			continue
		}
		visit(node)
	}

	// A topological order is the reverse of the DFS post-order, so flip the slice in place.
	for lo, hi := 0, nodeCount-1; lo < hi; lo, hi = lo+1, hi-1 {
		postOrder[lo], postOrder[hi] = postOrder[hi], postOrder[lo]
	}
	return postOrder
}