// unipdf/extractor/text_page.go

/*
 * This file is subject to the terms and conditions defined in
 * file 'LICENSE.md', which is part of this source code package.
 */

package extractor

import (
	"fmt"
	"io"
	"math"
	"unicode"

	"github.com/unidoc/unipdf/v3/common"
	"github.com/unidoc/unipdf/v3/model"
)

// paraList is a sequence of paragraphs (*textPara). It is used often enough in this package that
// it has its own named type, which lets page-level operations such as writing text, building
// text marks and sorting into reading order be written as methods on the list.
type paraList []*textPara

// makeTextPage builds a paraList from `marks`, the textMarks on a page.
//
// The steps are:
//  1. Group the marks into words (makeTextWords).
//  2. Put the words into a page-wide textStrata (makeTextStrata).
//  3. Divide the page into one textStrata per paragraph (dividePage).
//  4. Compose each paragraph textStrata into a textPara (composePara).
//  5. Compute the extended bounding boxes, extract tables and sort the paragraphs into reading
//     order.
//
// `pageSize` is the page rectangle; its Ury is passed to makeTextStrata and dividePage.
// `rot` is accepted for the callers' benefit but is not used by this function.
//
// Diagnostic dumps are emitted through common.Log and only when `verbose` is set, so normal
// library use never writes to stdout.
func makeTextPage(marks []*textMark, pageSize model.PdfRectangle, rot int) paraList {
	common.Log.Trace("makeTextPage: %d elements pageSize=%.2f", len(marks), pageSize)

	// Break the marks into words.
	words := makeTextWords(marks, pageSize)

	// Divide the words into depth bins with the contents of each bin sorted by reading direction.
	page := makeTextStrata(words, pageSize.Ury)
	// Divide the page into rectangular regions for each paragraph and create a textStrata for
	// each one.
	paraStratas := dividePage(page, pageSize.Ury)
	// Arrange the contents of each para into lines.
	paras := make(paraList, len(paraStratas))
	for i, para := range paraStratas {
		paras[i] = composePara(para)
	}
	if verbose {
		common.Log.Info("unsorted=========----------=====")
		for i, para := range paras {
			common.Log.Info("paras[%d]=%.2f%q", i, para.PdfRectangle, truncate(para.text(), 200))
		}
	}

	// The extended bounding boxes are computed before table extraction and before sorting, as
	// the reading order rules in before() compare eBBox values.
	paras.computeEBBoxes()
	paras = paras.extractTables()

	// Sort the paras into reading order.
	paras.sortReadingOrder()
	if verbose {
		common.Log.Info("para sorted in reading order -----------=========")
		for i, para := range paras {
			tab := ""
			if para.table != nil {
				tab = fmt.Sprintf("[%dx%d]", para.table.w, para.table.h)
			}
			common.Log.Info("%4d: %6.2f %s %q", i, para.PdfRectangle, tab, truncate(para.text(), 50))
		}
	}
	return paras
}

// dividePage builds a list of paragraph textStrata from `page`, the page textStrata.
// Words are moved out of `page` and into the paragraph textStrata as they are assigned, so every
// depth bin of `page` that is iterated over is empty by the time its iteration finishes.
// `pageHeight` is passed to newTextStrata for each paragraph textStrata that is created.
// The returned paragraphs are in the order in which they were seeded.
func dividePage(page *textStrata, pageHeight float64) []*textStrata {
	var paraStratas []*textStrata

	// We move words from `page` to paras until there are no words left in `page`.
	// We do this by iterating through `page` in depth bin order and, for each surviving bin (see
	// below), creating a paragraph with a seed word, `words[0]` in the code below.
	// We then move words from around the `para` region from `page` to `para`.
	// This may empty some page bins before we iterate to them; such bins are skipped because the
	// inner loop condition `!page.empty(depthIdx)` is false for them.
	// If a bin survives until it is iterated to then at least one `para` will be built around it.

	if verbose {
		common.Log.Info("dividePage")
	}
	// cnt counts the paragraphs started. It is only incremented here; nothing in this function
	// reads it.
	cnt := 0
	for _, depthIdx := range page.depthIndexes() {
		// changed records whether the most recent pass of the growth loop below added any words
		// to the current para.
		changed := false
		for ; !page.empty(depthIdx); cnt++ {
			// Start a new paragraph region `para`.
			// Build `para` out from the left-most (lowest in reading direction) word `words`[0],
			// in the bins in and below `depthIdx`.
			para := newTextStrata(pageHeight)

			// words[0] is the leftmost word from the bins in and a few lines below `depthIdx`. We
			// seed `para` with this word.
			// NOTE(review): "leftmost" and "a few lines below" describe firstReadingIndex and
			// getStratum, which are defined elsewhere; confirm against their implementations.
			firstReadingIdx := page.firstReadingIndex(depthIdx)
			words := page.getStratum(firstReadingIdx)
			moveWord(firstReadingIdx, page, para, words[0])
			if verbose {
				common.Log.Info("words[0]=%s", words[0].String())
			}

			// The following 3 numbers define whether words should be added to `para`.
			// Each is a package-level ratio scaled by `para.fontsize`, read after the seed word
			// has been moved into `para` (presumably moveWord sets para.fontsize; verify).
			minInterReadingGap := minInterReadingGapR * para.fontsize
			maxIntraReadingGap := maxIntraReadingGapR * para.fontsize
			maxIntraDepthGap := maxIntraDepthGapR * para.fontsize

			// Add words to `para` until we pass through the following loop without a new word
			// being added to `para`. The loop's post statement `running = changed` is also
			// executed by the `continue` below, so a pass that added words is always followed by
			// another pass.
			for running := true; running; running = changed {
				changed = false

				// Add words that are within maxIntraDepthGap of `para` in the depth direction.
				// i.e. Stretch para in the depth direction, vertically for English text.
				if verbose {
					common.Log.Info("para depth %.2f - %.2f maxIntraDepthGap=%.2f ",
						para.minDepth(), para.maxDepth(), maxIntraDepthGap)
				}
				if page.scanBand("veritcal", para, partial(readingOverlapPlusGap, 0),
					para.minDepth()-maxIntraDepthGap, para.maxDepth()+maxIntraDepthGap,
					maxIntraDepthFontTolR, false, false) > 0 {
					changed = true
				}
				// Add words that are within maxIntraReadingGap of `para` in the reading direction.
				// i.e. Stretch para in the reading direction, horizontally for English text.
				if page.scanBand("horizontal", para, partial(readingOverlapPlusGap, maxIntraReadingGap),
					para.minDepth(), para.maxDepth(),
					maxIntraReadingFontTol, false, false) > 0 {
					changed = true
				}

				// If either stretch above pulled in words, repeat the simple stretching before
				// trying anything else. Only try to combine other words when `para` can no longer
				// be grown in the simple way above.
				if changed {
					continue
				}

				// In the following cases, we don't expand `para` while scanning. We look for words
				// around para. If we find them, we add them then expand `para` when we are done.
				// This pulls the numbers to the left of para into para
				// e.g. From
				//     Regulatory compliance
				//     Archiving
				//     Document search
				// to
				//     1. Regulatory compliance
				//     2. Archiving
				//     3. Document search

				// If there are words to the left of `para`, add them.
				// We need to limit the number of words pulled in this way, so the first scanBand
				// call only produces a count `n`, and the second call is made only if `n` passes
				// the test below.
				// NOTE(review): the two calls differ only in their title and their last two
				// boolean arguments; presumably the first is a count-only pass. Confirm against
				// scanBand.
				n := page.scanBand("", para, partial(readingOverlapLeft, minInterReadingGap),
					para.minDepth(), para.maxDepth(),
					minInterReadingFontTol, true, false)
				if n > 0 {
					// r is the depth extent of `para` measured in units of its font size.
					r := (para.maxDepth() - para.minDepth()) / para.fontsize
					// Accept the words if there are at most 5 of them, or if there is more than
					// one and the count exceeds 0.3 * r.
					if (n > 1 && float64(n) > 0.3*r) || n <= 5 {
						if page.scanBand("other", para, partial(readingOverlapLeft, minInterReadingGap),
							para.minDepth(), para.maxDepth(),
							minInterReadingFontTol, false, true) > 0 {
							changed = true
						}
					}
				}
			}

			// Sort the words in `para`'s bins in the reading direction.
			para.sort()
			paraStratas = append(paraStratas, para)
		}
	}

	return paraStratas
}

// doHyphens enables removal of the trailing hyphen (and the line break that would follow it)
// from hyphenated lines in the per-line paths of paraList.writeText and paraList.toTextMarks.
const doHyphens = true
// useTables makes paraList.writeText and paraList.toTextMarks delegate each paragraph to
// textPara.writeText / textPara.toTextMarks instead of emitting the paragraph's lines directly.
const useTables = true

// writeText writes the text of every paragraph in `paras` to `w`.
// When useTables is set each paragraph writes itself via textPara.writeText. Otherwise the
// paragraph's lines are written one by one:
//   - a hyphenated line that is not the very last line of the last paragraph loses its final
//     rune and gets no separator, so it joins directly onto the next line;
//   - a line followed by a line at the same depth is followed by a space;
//   - any other line is followed by a newline;
//
// and a newline is written after each paragraph. Errors returned by `w` are not checked.
func (paras paraList) writeText(w io.Writer) {
	lastPara := len(paras) - 1
	for paraIdx, para := range paras {
		if useTables {
			para.writeText(w)
			continue
		}

		lastLine := len(para.lines) - 1
		for lineIdx, line := range para.lines {
			text := line.text()

			finalLine := lineIdx == lastLine && paraIdx == lastPara
			if doHyphens && line.hyphenated && !finalLine {
				// Line ending with hyphen. Write it without the hyphen and without a line ending.
				runes := []rune(text)
				text = string(runes[:len(runes)-1])
				w.Write([]byte(text))
				continue
			}
			w.Write([]byte(text))

			// A following line at the same depth belongs to the same line of extracted text, so
			// it is separated by a space rather than a newline.
			separator := "\n"
			if lineIdx < lastLine && isZero(line.depth-para.lines[lineIdx+1].depth) {
				separator = " "
			}
			w.Write([]byte(separator))
		}
		w.Write([]byte("\n"))
	}
}

// toTextMarks creates the TextMarkArray corresponding to the extracted text created by
// `paras`.writeText().
//
// Each returned TextMark has its Offset set to the offset of its text within the extracted text,
// counted in bytes (len of the mark's Text).
// When useTables is set each paragraph supplies its own marks via textPara.toTextMarks, which is
// handed a pointer to the running offset. Otherwise marks are built line by line, with space and
// newline marks inserted between lines and paragraphs, and the trailing hyphen of a hyphenated
// line removed when doHyphens is set.
func (paras paraList) toTextMarks() []TextMark {
	offset := 0
	var marks []TextMark
	addMark := func(mark TextMark) {
		mark.Offset = offset
		marks = append(marks, mark)
		offset += len(mark.Text)
	}
	addSpaceMark := func(spaceChar string) {
		mark := spaceMark
		mark.Text = spaceChar
		addMark(mark)
	}
	// dropHyphen removes the final rune of the last mark in `marks` and rewinds `offset` to
	// match. It returns false, leaving `marks` and `offset` untouched, if there is no mark, the
	// mark has no text or its text ends in whitespace (i.e. not a hyphen).
	dropHyphen := func() bool {
		if len(marks) == 0 {
			common.Log.Debug("ERROR: hyphenated line has no text marks.")
			return false
		}
		// Work on the slice element itself. `marks` holds TextMark values, so editing a copy
		// would leave the stored mark's Text unchanged while offset was still adjusted.
		last := &marks[len(marks)-1]
		r := []rune(last.Text)
		if len(r) == 0 || unicode.IsSpace(r[len(r)-1]) {
			common.Log.Debug("ERROR: hyphenated line ends with %q, not a hyphen.", last.Text)
			return false
		}
		if len(r) == 1 {
			// The mark is just the hyphen. Remove the whole mark.
			marks = marks[:len(marks)-1]
			offset = 0
			if len(marks) > 0 {
				prev := marks[len(marks)-1]
				offset = prev.Offset + len(prev.Text)
			}
			return true
		}
		s := string(r[:len(r)-1])
		offset += len(s) - len(last.Text)
		last.Text = s
		return true
	}

	for ip, para := range paras {
		if useTables {
			paraMarks := para.toTextMarks(&offset)
			marks = append(marks, paraMarks...)
			continue
		}
		for il, line := range para.lines {
			lineMarks := line.toTextMarks(&offset)
			marks = append(marks, lineMarks...)
			if doHyphens && line.hyphenated && (il != len(para.lines)-1 || ip != len(paras)-1) {
				if dropHyphen() {
					// We removed the hyphen from the end of the line so no separator is needed.
					continue
				}
			}
			if il != len(para.lines)-1 && isZero(line.depth-para.lines[il+1].depth) {
				// Next line is the same depth so it's the same line as this one in the extracted text
				addSpaceMark(" ")
				continue
			}
			addSpaceMark("\n")
		}
		if ip != len(paras)-1 {
			addSpaceMark("\n")
		}
	}
	return marks
}

// sortReadingOrder rearranges `paras` in place into reading order.
// The order is a topological ordering of the "comes before" relation built by adjMatrix.
// Lists of fewer than two paragraphs are left as they are.
func (paras paraList) sortReadingOrder() {
	common.Log.Debug("sortReadingOrder: paras=%d ===========x=============", len(paras))
	if len(paras) < 2 {
		return
	}
	paras.reorder(topoOrder(paras.adjMatrix()))
}

// adjMatrix returns the adjacency matrix of the "comes before" graph over `paras`.
// adj[i][j] is true if paras[i] comes before paras[j] according to paras.before(i, j), which
// implements Breuel's rules. Diagonal entries are always false.
func (paras paraList) adjMatrix() [][]bool {
	n := len(paras)
	adj := make([][]bool, n)
	// reasons[i][j] is the explanation string returned by before(i, j). It is only used by the
	// debugging dump below.
	reasons := make([][]string, n)
	for i := 0; i < n; i++ {
		row := make([]bool, n)
		why := make([]string, n)
		for j := 0; j < n; j++ {
			if i != j {
				row[j], why[j] = paras.before(i, j)
			}
		}
		adj[i], reasons[i] = row, why
	}

	if verbose && false {
		common.Log.Info("adjMatrix =======")
		for i, a := range paras {
			fmt.Printf("%4d: %q %.2f\n", i, truncate(a.text(), 50), a.PdfRectangle)
			for j, b := range paras {
				if i == j || !adj[i][j] {
					continue
				}
				fmt.Printf("%8d: %10s %q %.2f\n", j,
					reasons[i][j], truncate(b.text(), 40), b.PdfRectangle)
			}
		}
	}
	return adj
}

// before defines the ordering over `paras` that is used to sort them into reading order.
// It reports whether a = paras[i] comes before b = paras[j], together with a short string naming
// the rule that decided the result. The rules are from Thomas M. Breuel, "High Performance
// Document Layout Analysis":
//  1. `a` comes before `b` if their ranges of x-coordinates overlap and `a` is above `b` on the
//     page. Here "above" means a.Lly > b.Lly and x overlap is tested on the extended bounding
//     boxes (eBBox).
//  2. `a` comes before `b` if `a` is entirely to the left of `b` (a.eBBox.Urx < b.eBBox.Llx) and
//     there is no other para `c` whose Lly lies strictly between those of `a` and `b` and whose
//     range of x-coordinates overlaps both `a` and `b`.
func (paras paraList) before(i, j int) (bool, string) {
	a, b := paras[i], paras[j]

	switch {
	case overlappedXPara(a, b) && a.Lly > b.Lly:
		// Breuel's rule 1
		return true, "above"
	case !(a.eBBox.Urx < b.eBBox.Llx):
		// Breuel's rule 2 requires `a` to be entirely left of `b`.
		return false, "NOT left"
	}

	// Breuel's rule 2: look for a para between `a` and `b` in y that spans both of them in x.
	lo, hi := a.Lly, b.Lly
	if lo > hi {
		lo, hi = hi, lo
	}
	for k, c := range paras {
		if k == i || k == j {
			continue
		}
		between := lo < c.Lly && c.Lly < hi
		if between && overlappedXPara(a, c) && overlappedXPara(c, b) {
			return false, "Y intervening"
		}
	}
	return true, "TO LEFT"
}

// overlappedXPara reports whether `r0` and `r1` overlap on the x-axis. The test is delegated to
// overlappedXRect and is applied to the paras' extended bounding boxes (eBBox).
func overlappedXPara(r0, r1 *textPara) bool {
	box0, box1 := r0.eBBox, r1.eBBox
	return overlappedXRect(box0, box1)
}

// computeEBBoxes sets the eBBox (extended bounding box) field of every element of `paras`.
// Each eBBox starts as the para's own PdfRectangle and is then widened in x to cover paras that
// lie below it, without reaching past the nearest paras that sit beside it. Only Llx and Urx are
// ever changed.
func (paras paraList) computeEBBoxes() {
	common.Log.Trace("computeEBBoxes:")

	for i, para := range paras {
		// [leftLimit, rightLimit] is the reading direction interval around `para` that is free
		// of paras which overlap `para` in y. It is bounded by the nearest such para on each side.
		leftLimit, rightLimit := -1.0e9, +1.0e9
		for j, other := range paras {
			if i == j {
				continue
			}
			if yOverlap := para.Lly <= other.Ury && other.Lly <= para.Ury; !yOverlap {
				continue
			}
			if other.Urx < para.Llx {
				// `other` is to the left of `para` with no x overlap.
				leftLimit = math.Max(leftLimit, other.Urx)
			}
			if para.Urx < other.Llx {
				// `other` is to the right of `para` with no x overlap.
				rightLimit = math.Min(rightLimit, other.Llx)
			}
		}

		// Go through all paras below `para` and stretch the eBBox left and right to cover those
		// that stay inside the limits found above.
		para.eBBox = para.PdfRectangle
		for j, other := range paras {
			if i == j || other.Ury > para.Lly {
				continue
			}
			if leftLimit <= other.Llx {
				// `other` starts at or to the right of the left limit: extend left to it.
				para.eBBox.Llx = math.Min(para.eBBox.Llx, other.Llx)
			}
			if other.Urx <= rightLimit {
				// `other` ends at or to the left of the right limit: extend right to it.
				para.eBBox.Urx = math.Max(para.eBBox.Urx, other.Urx)
			}
		}
	}
}

// topoOrder returns a topological ordering of the nodes of the graph with adjacency matrix
// `adj`, where adj[i][j] is true if there is an edge from node i to node j.
// The ordering is the reverse of the order in which a depth-first search finishes the nodes,
// with searches started from the lowest numbered unvisited node. It returns nil for an empty
// matrix.
func topoOrder(adj [][]bool) []int {
	n := len(adj)
	if n == 0 {
		return nil
	}
	seen := make([]bool, n)
	order := make([]int, n)
	// Nodes are written from the back of `order` as they finish, so the slice ends up in forward
	// order without a separate reversal pass. order[next:] holds the nodes finished so far.
	next := n

	// visit runs a depth-first search over the unvisited nodes reachable from `node`.
	var visit func(node int)
	visit = func(node int) {
		seen[node] = true
		for succ := 0; succ < n; succ++ {
			if !adj[node][succ] || seen[succ] {
				continue
			}
			visit(succ)
		}
		next--
		order[next] = node
	}

	for node := range adj {
		if !seen[node] {
			visit(node)
		}
	}
	return order
}

// reorder rearranges `paras` in place so that, for each position i in `order`, the element at
// position i becomes the para that was at index order[i].
func (paras paraList) reorder(order []int) {
	// Build the permuted list separately so that every lookup reads the original positions.
	permuted := make(paraList, len(paras))
	for dst := 0; dst < len(order); dst++ {
		permuted[dst] = paras[order[dst]]
	}
	copy(paras, permuted)
}