unipdf/extractor/text_bag.go
Peter Williams 88fda44e0a
Text extraction code for columns. (#366)
* Fixed filename:page in logging

* Got CMap working for multi-rune entries

* Treat CMap entries as strings instead of runes to handle multi-byte encodings.

* Added a test for multibyte encoding.

* First version of text extraction that recognizes columns

* Added an explanation of the text columns code to README.md.

* fixed typos

* Abstracted textWord depth calculation. This required changing textMark to *textMark in a lot of code.

* Added function comments.

* Fixed text state save/restore.

* Adjusted inter-word search distance to make paragraph division work for thanh.pdf

* Got text_test.go passing.

* Reinstated hyphen suppression

* Handle more cases of fonts not being set in text extraction code.

* Fixed typo

* More verbose logging

* Adding tables to text extractor.

* Added tests for columns extraction.

* Removed commented code

* Check for textParas that are on the same line when writing out extracted text.

* Absorb text to the left of paras into paras e.g. Footnote numbers

* Removed funny character from text_test.go

* Commented out a creator_test.go test that was broken by my text extraction changes.

* Big changes to columns text extraction code for PR.

Performance improvements in several places.
Commented code.

* Updated extractor/README

* Cleaned up some comments and removed a panic

* Increased threshold for truncating extracted text when there is no license 100 -> 102.

This is a workaround to let a test in creator_test.go pass.

With the old text extraction code the following extracted text was 100 chars. With the new code it
is 102 chars which looks correct.

"你好\n你好你好你好你好\n河上白云\n\nUnlicensed UniDoc - Get a license on https://unidoc.io\n\n"

* Improved an error message.

* Removed irrelevant spaces

* Commented code and removed unused functions.

* Reverted PdfRectangle changes

* Added duplicate text detection.

* Combine diacritic textMarks in text extraction

* Reinstated a diacritic recombination test.

* Small code reorganisation

* Reinstated handling of rotated text

* Addressed issues in PR review

* Added color fields to TextMark

* Updated README

* Reinstated the disabled tests I missed before.

* Tightened definition for tables to prevent detection of tables where there weren't any.

* Compute line splitting search range based on fontsize of first word in word bag.

* Use errors.Is(err, core.ErrNotSupported) to distinguish unsupported font errors.

See https://blog.golang.org/go1.13-errors

* Fixed some naming and added some comments.

* errors.Is -> xerrors.Is and %w -> %v for go 1.12 compatibility

* Removed code that doesn't ever get called.

* Removed unused test
2020-06-30 19:33:10 +00:00

376 lines
11 KiB
Go

/*
* This file is subject to the terms and conditions defined in
* file 'LICENSE.md', which is part of this source code package.
*/
package extractor
import (
"fmt"
"math"
"sort"
"strings"
"github.com/unidoc/unipdf/v3/common"
"github.com/unidoc/unipdf/v3/model"
)
// wordBag is just a list of textWords in a rectangular region. It is needed for efficient
// comparison of the bounding boxes of the words to arrange them into paragraph regions.
// The implementation is not important as long as it implements the main function scanBand()
// efficiently.
// In the current implementation, wordBag is a list of word fragment bins arranged by their depth on
// a page with the word fragments in each bin sorted in reading order.
// NOTE: Bins are deleted as soon as they become empty (see removeWord), so the presence of a key
// in `bins` implies a non-empty bin.
type wordBag struct {
	model.PdfRectangle                     // Bounding box of all the textWord in the wordBag.
	fontsize           float64             // The size of the largest font in the wordBag.
	// The following fields are for the current bin based implementation.
	pageHeight float64             // Used to calculate depths (depth is measured down the page).
	bins       map[int][]*textWord // bins[n] = w: n*depthBinPoints <= w.depth < (n+1)*depthBinPoints
}
// makeWordBag returns a wordBag containing `words`.
// In the current implementation, it does this by putting the words into the appropriate depth
// bins and sorting each bin in reading order.
// Caller must check that `words` has at least one element.
func makeWordBag(words []*textWord, pageHeight float64) *wordBag {
	bag := newWordBag(words[0], pageHeight)
	for _, word := range words[1:] {
		idx := depthIndex(word.depth)
		bag.bins[idx] = append(bag.bins[idx], word)
	}
	bag.sort()
	return bag
}
// newWordBag returns a wordBag over a page of height `pageHeight` holding the single word
// fragment `word`.
func newWordBag(word *textWord, pageHeight float64) *wordBag {
	return &wordBag{
		PdfRectangle: word.PdfRectangle,
		fontsize:     word.fontsize,
		pageHeight:   pageHeight,
		bins:         map[int][]*textWord{depthIndex(word.depth): {word}},
	}
}
// String returns a description of `b`: its bounding box, largest font size, word count and the
// texts of its words in reading order.
func (b *wordBag) String() string {
	var texts []string
	for _, depthIdx := range b.depthIndexes() {
		// Plain single-value map read: the original `words, _ := b.bins[depthIdx]` form is
		// redundant for a map access (staticcheck S1005).
		for _, w := range b.bins[depthIdx] {
			texts = append(texts, w.text)
		}
	}
	return fmt.Sprintf("%.2f fontsize=%.2f %d %q", b.PdfRectangle, b.fontsize, len(texts), texts)
}
// scanBand scans the bins of `b` for words w satisfying all of:
//     minDepth-lineDepth <= w.depth <= maxDepth+lineDepth   // in the depth direction
//     readingOverlap(para, w)                               // in the reading direction
//     min(|w.fontsize-fontsize|/fontsize, w.fontsize/fontsize) <= fontTol // font size tolerance
//         (the font check is skipped when fontTol <= 0)
// and moves each matching word from `b` into `para` (para.pullWord).
// If `detectOnly` is true, moveWord is not applied and the scan stops at the first match per bin.
// If `freezeDepth` is true, minDepth and maxDepth are not updated in the scan as words are added.
// `title` is used only for logging. Returns the number of matching words.
func (b *wordBag) scanBand(title string, para *wordBag,
	readingOverlap func(para *wordBag, word *textWord) bool,
	minDepth, maxDepth, fontTol float64,
	detectOnly, freezeDepth bool) int {
	fontsize := para.fontsize
	lineDepth := lineDepthR * fontsize // Margin added above and below the depth band.
	n := 0                             // Number of matching words found so far.
	minDepth0, maxDepth0 := minDepth, maxDepth // Original bounds, kept for logging.
	var newWords []*textWord
	for _, depthIdx := range b.depthBand(minDepth-lineDepth, maxDepth+lineDepth) {
		for _, word := range b.bins[depthIdx] {
			// Bins only bound depth coarsely; reject words outside the exact band.
			if !(minDepth-lineDepth <= word.depth && word.depth <= maxDepth+lineDepth) {
				continue
			}
			// Reject words that don't overlap `para` in the reading direction.
			if !readingOverlap(para, word) {
				continue
			}
			// Font similarity: take the smaller of the relative size difference and the
			// plain size ratio, so much-smaller words (e.g. subscripts) can still match.
			fontRatio1 := math.Abs(word.fontsize-fontsize) / fontsize
			fontRatio2 := word.fontsize / fontsize
			fontRatio := math.Min(fontRatio1, fontRatio2)
			if fontTol > 0 {
				if fontRatio > fontTol {
					continue
				}
			}
			if !detectOnly {
				// Move `word` from `b` into `para`.
				para.pullWord(b, word, depthIdx)
			}
			newWords = append(newWords, word)
			n++
			if !freezeDepth {
				// Grow the band to cover the depth of the newly accepted word.
				if word.depth < minDepth {
					minDepth = word.depth
				}
				if word.depth > maxDepth {
					maxDepth = word.depth
				}
			}
			// Has no effect on results
			// fontsize = para.fontsize
			// lineDepth = lineDepthR * fontsize
			if detectOnly {
				break
			}
		}
	}
	if verbose {
		if len(title) > 0 {
			common.Log.Info("scanBand: %s [%.2f %.2f]->[%.2f %.2f] para=%.2f fontsize=%.2f %q",
				title,
				minDepth0, maxDepth0,
				minDepth, maxDepth,
				para.PdfRectangle, para.fontsize, truncate(para.text(), 20))
			for i, word := range newWords {
				fmt.Printf(" %q", word.text)
				if i >= 5 {
					break
				}
			}
			if len(newWords) > 0 {
				fmt.Println()
			}
		}
	}
	return n
}
// highestWord returns the highest word w in b.bins[depthIdx] with minDepth <= w.depth <= maxDepth,
// or nil if there is no such word. (Words in a bin are in reading order; the first match is
// returned.)
func (b *wordBag) highestWord(depthIdx int, minDepth, maxDepth float64) *textWord {
	for _, w := range b.bins[depthIdx] {
		if w.depth >= minDepth && w.depth <= maxDepth {
			return w
		}
	}
	return nil
}
// depthBand returns the indexes of the bins with depth: `minDepth` <= depth <= `maxDepth`.
func (b *wordBag) depthBand(minDepth, maxDepth float64) []int {
	if len(b.bins) == 0 {
		return nil
	}
	lo := b.getDepthIdx(minDepth)
	hi := b.getDepthIdx(maxDepth)
	return b.depthRange(lo, hi)
}
// depthRange returns the sorted keys of b.bins that lie in [`minDepthIdx`, `maxDepthIdx`].
func (b *wordBag) depthRange(minDepthIdx, maxDepthIdx int) []int {
	var within []int
	for _, idx := range b.depthIndexes() {
		if idx >= minDepthIdx && idx <= maxDepthIdx {
			within = append(within, idx)
		}
	}
	return within
}
// firstReadingIndex returns the index of the bin containing the left-most word near the top of `b`.
// Precisely, this is the index of the depth bin whose first word has the smallest reading
// direction value in the depth region minDepthIdx < depth <= minDepthIdx + topWordRangeR*fontsize.
// The point of this function is to find the top-most left-most word in `b` that is not a
// superscript.
func (b *wordBag) firstReadingIndex(minDepthIdx int) int {
	size := b.firstWord(minDepthIdx).fontsize
	lo := float64(minDepthIdx+1) * depthBinPoints
	hi := lo + topWordRangeR*size
	best := minDepthIdx
	for _, idx := range b.depthBand(lo, hi) {
		if diffReading(b.firstWord(idx), b.firstWord(best)) < 0 {
			best = idx
		}
	}
	return best
}
// getDepthIdx returns the index into `b.bins` for depth axis value `depth`, clamped to the
// range of existing bin indexes.
// Caller must check that len(b.bins) > 0.
func (b *wordBag) getDepthIdx(depth float64) int {
	indexes := b.depthIndexes()
	idx := depthIndex(depth)
	switch {
	case idx < indexes[0]:
		return indexes[0]
	case idx > indexes[len(indexes)-1]:
		return indexes[len(indexes)-1]
	default:
		return idx
	}
}
// empty returns true if the depth bin with index `depthIdx` is empty.
// NOTE: Bins are deleted as soon as they become empty, so only existence needs checking.
func (b *wordBag) empty(depthIdx int) bool {
	if _, ok := b.bins[depthIdx]; ok {
		return false
	}
	return true
}
// firstWord returns the first word in reading order in bin `depthIdx`.
// Caller must ensure the bin exists (see empty()); bins are deleted when they become empty, so
// an existing bin always has at least one word.
func (b *wordBag) firstWord(depthIdx int) *textWord {
	return b.bins[depthIdx][0]
}
// stratum returns a copy of b.bins[`depthIdx`].
// stratum is guaranteed to return a non-nil value. It must be called with a valid depth index.
// NOTE: A copy is needed because remove() and other functions manipulate the array underlying
// the slice.
func (b *wordBag) stratum(depthIdx int) []*textWord {
	src := b.bins[depthIdx]
	return append(make([]*textWord, 0, len(src)), src...)
}
// pullWord adds `word` to `b` and removes it from `bag`, growing b's bounding box and font size
// to cover the word.
// `depthIdx` is the depth index of `word` in all wordBags.
// TODO(peterwilliams97): Compute depthIdx from `word` instead of passing it around.
func (b *wordBag) pullWord(bag *wordBag, word *textWord, depthIdx int) {
	b.PdfRectangle = rectUnion(b.PdfRectangle, word.PdfRectangle)
	b.fontsize = math.Max(b.fontsize, word.fontsize)
	b.bins[depthIdx] = append(b.bins[depthIdx], word)
	bag.removeWord(word, depthIdx)
}
// removeWord removes `word` from `b`.
// In the current implementation it removes `word` from b.bins[`depthIdx`].
// NOTE: Bins are deleted as soon as they become empty to save code that calls other wordBag
// functions from having to check for empty bins.
// TODO(peterwilliams97): Find a more efficient way of doing this.
func (b *wordBag) removeWord(word *textWord, depthIdx int) {
	remaining := removeWord(b.stratum(depthIdx), word)
	if len(remaining) > 0 {
		b.bins[depthIdx] = remaining
		return
	}
	delete(b.bins, depthIdx)
}
// mergeWordBags merges the bags that lie less than a character width to the left of a bag into
// that bag (e.g. footnote numbers absorbed into their paragraph).
func mergeWordBags(paraWords []*wordBag) []*wordBag {
	if len(paraWords) <= 1 {
		return paraWords
	}
	if verbose {
		common.Log.Info("mergeWordBags:")
	}
	// Order bags by decreasing area (ties: decreasing height, then original order) so that
	// larger bags get the chance to absorb smaller ones.
	sort.Slice(paraWords, func(i, j int) bool {
		pi, pj := paraWords[i], paraWords[j]
		ai := pi.Width() * pi.Height()
		aj := pj.Width() * pj.Height()
		if ai != aj {
			return ai > aj
		}
		if pi.Height() != pj.Height() {
			return pi.Height() > pj.Height()
		}
		return i < j
	})
	var merged []*wordBag
	absorbed := map[int]struct{}{}
	for i0 := 0; i0 < len(paraWords); i0++ {
		if _, ok := absorbed[i0]; ok {
			continue
		}
		para0 := paraWords[i0]
		for i1 := i0 + 1; i1 < len(paraWords); i1++ {
			// Fix: the original tested absorbed[i0] here, which is dead code (i0 was already
			// checked above and only indexes > i0 are ever added inside this loop). Testing
			// absorbed[i1] correctly skips bags that have already been absorbed.
			if _, ok := absorbed[i1]; ok {
				continue
			}
			para1 := paraWords[i1]
			// Expand para0 one fontsize to the left; if the expansion covers para1, absorb it.
			r := para0.PdfRectangle
			r.Llx -= para0.fontsize
			if rectContainsRect(r, para1.PdfRectangle) {
				para0.absorb(para1)
				absorbed[i1] = struct{}{}
			}
		}
		merged = append(merged, para0)
	}
	// Sanity check: every input bag is either kept or absorbed, never both or neither.
	if len(paraWords) != len(merged)+len(absorbed) {
		common.Log.Error("mergeWordBags: %d->%d absorbed=%d",
			len(paraWords), len(merged), len(absorbed))
	}
	return merged
}
// absorb combines the words from `bag` into `b`, emptying `bag`.
// NOTE: pullWord mutates bag.bins while we range over it, but this is safe: removeWord works on
// a copy of each bin (see stratum) so the `words` slice being iterated is never modified in
// place, and deleting map keys during a range is permitted in Go.
func (b *wordBag) absorb(bag *wordBag) {
	for depthIdx, words := range bag.bins {
		for _, word := range words {
			b.pullWord(bag, word, depthIdx)
		}
	}
}
// depthIndex returns a bin index for depth `depth`.
// The returned depthIdx obeys the following rule.
// depthIdx * depthBinPoints <= depth <= (depthIdx+1) * depthBinPoints
// Negative depths are shifted down one bin so that truncation toward zero still rounds down.
func depthIndex(depth float64) int {
	idx := int(depth / depthBinPoints)
	if depth < 0 {
		idx--
	}
	return idx
}
// depthIndexes returns the sorted keys of b.bins.
func (b *wordBag) depthIndexes() []int {
	if len(b.bins) == 0 {
		return nil
	}
	indexes := make([]int, 0, len(b.bins))
	for idx := range b.bins {
		indexes = append(indexes, idx)
	}
	sort.Ints(indexes)
	return indexes
}
// sort sorts the word fragments in each bin of `b` in the reading direction.
func (b *wordBag) sort() {
	for _, words := range b.bins {
		less := func(i, j int) bool { return diffReading(words[i], words[j]) < 0 }
		sort.Slice(words, less)
	}
}
// minDepth returns the minimum depth that word fragments in `b` touch.
// Depth is measured down from the top of the page, so this converts the bag's top edge (b.Ury)
// to a depth. NOTE(review): the fontsize term presumably offsets from the glyph top to the text
// baseline to match how textWord.depth is computed — confirm against the textWord code.
func (b *wordBag) minDepth() float64 {
	return b.pageHeight - (b.Ury - b.fontsize)
}
// maxDepth returns the maximum depth that word fragments in `b` touch, i.e. the bag's bottom
// edge (b.Lly) measured down from the top of the page.
func (b *wordBag) maxDepth() float64 {
	return b.pageHeight - b.Lly
}
// The following functions are used only for logging.

// text returns the texts of all the words in `b`, joined by single spaces.
func (b *wordBag) text() string {
	var sb strings.Builder
	for i, w := range b.allWords() {
		if i > 0 {
			sb.WriteString(" ")
		}
		sb.WriteString(w.text)
	}
	return sb.String()
}
// allWords returns all the words in `b` in no particular order.
func (b *wordBag) allWords() []*textWord {
	var all []*textWord
	for _, bin := range b.bins {
		all = append(all, bin...)
	}
	return all
}