mirror of
https://github.com/unidoc/unipdf.git
synced 2025-05-13 19:29:10 +08:00
281 lines
8.3 KiB
Go
281 lines
8.3 KiB
Go
/*
|
|
* This file is subject to the terms and conditions defined in
|
|
* file 'LICENSE.md', which is part of this source code package.
|
|
*/
|
|
|
|
package extractor
|
|
|
|
import (
|
|
"fmt"
|
|
"math"
|
|
"sort"
|
|
|
|
"github.com/unidoc/unipdf/v3/common"
|
|
"github.com/unidoc/unipdf/v3/model"
|
|
)
|
|
|
|
// textStrata is a list of word bins arranged by their depth on a page.
|
|
// The words in each bin are sorted in reading order.
|
|
type textStrata struct {
|
|
serial int // Sequence number for debugging.
|
|
model.PdfRectangle // Bounding box (union of words' in bins bounding boxes).
|
|
bins map[int][]*textWord // bins[n] = w: n*depthBinPoints <= w.depth < (n+1)*depthBinPoints
|
|
pageHeight float64
|
|
fontsize float64
|
|
}
|
|
|
|
// makeTextStrata builds a textStrata from `words` by putting the words into the appropriate
|
|
// depth bins.
|
|
func makeTextStrata(words []*textWord, pageHeight float64) *textStrata {
|
|
s := newTextStrata(pageHeight)
|
|
for _, w := range words {
|
|
depthIdx := depthIndex(w.depth)
|
|
s.bins[depthIdx] = append(s.bins[depthIdx], w)
|
|
}
|
|
s.sort()
|
|
return s
|
|
}
|
|
|
|
// newTextStrata returns an empty textStrata with page height `pageHeight`.
|
|
func newTextStrata(pageHeight float64) *textStrata {
|
|
bins := textStrata{
|
|
serial: serial.bins,
|
|
bins: map[int][]*textWord{},
|
|
PdfRectangle: model.PdfRectangle{Urx: -1.0, Ury: -1.0},
|
|
pageHeight: pageHeight,
|
|
}
|
|
serial.bins++
|
|
return &bins
|
|
}
|
|
|
|
// String returns a description of `s`.
|
|
func (s *textStrata) String() string {
|
|
var texts []string
|
|
for _, depthIdx := range s.depthIndexes() {
|
|
words, _ := s.bins[depthIdx]
|
|
for _, w := range words {
|
|
texts = append(texts, w.text())
|
|
}
|
|
}
|
|
return fmt.Sprintf("serial=%d %d %q", s.serial, len(texts), texts)
|
|
}
|
|
|
|
// sort sorts the words in each bin in `s` in the reading direction.
|
|
func (s *textStrata) sort() {
|
|
for _, bin := range s.bins {
|
|
sort.Slice(bin, func(i, j int) bool { return diffReading(bin[i], bin[j]) < 0 })
|
|
}
|
|
}
|
|
|
|
// minDepth returns the minimum depth that words in `s` touch.
|
|
func (s *textStrata) minDepth() float64 {
|
|
return s.pageHeight - s.Ury
|
|
}
|
|
|
|
// maxDepth returns the maximum depth that words in `s` touch.
|
|
func (s *textStrata) maxDepth() float64 {
|
|
return s.pageHeight - s.Lly
|
|
}
|
|
|
|
// depthIndex returns a bin index for depth `depth`.
|
|
// The returned depthIdx obeys the following rule.
|
|
// depthIdx * depthBinPoints <= depth <= (depthIdx+1) * depthBinPoint
|
|
func depthIndex(depth float64) int {
|
|
var depthIdx int
|
|
if depth >= 0 {
|
|
depthIdx = int(depth / depthBinPoints)
|
|
} else {
|
|
depthIdx = int(depth/depthBinPoints) - 1
|
|
}
|
|
return depthIdx
|
|
}
|
|
|
|
// depthIndexes returns the sorted keys of s.bins.
|
|
func (s *textStrata) depthIndexes() []int {
|
|
if len(s.bins) == 0 {
|
|
return nil
|
|
}
|
|
indexes := make([]int, len(s.bins))
|
|
i := 0
|
|
for idx := range s.bins {
|
|
indexes[i] = idx
|
|
i++
|
|
}
|
|
sort.Ints(indexes)
|
|
return indexes
|
|
}
|
|
|
|
// scanBand scans the bins for words w:
|
|
// `minDepth` <= w.depth <= `maxDepth` && // in the depth diraction
|
|
// `readingOverlap`(`para`, w) && // in the reading directon
|
|
// math.Abs(w.fontsize-fontsize) > `fontTol`*fontsize // font size tolerance
|
|
// and applies `moveWord`(depthIdx, s,para w) to them.
|
|
// If `detectOnly` is true, don't appy moveWord.
|
|
// If `freezeDepth` is true, don't update minDepth and maxDepth in scan as words are added.
|
|
func (s *textStrata) scanBand(title string, para *textStrata,
|
|
readingOverlap func(para *textStrata, word *textWord) bool,
|
|
minDepth, maxDepth, fontTol float64,
|
|
detectOnly, freezeDepth bool) int {
|
|
fontsize := para.fontsize
|
|
lineDepth := lineDepthR * fontsize
|
|
n := 0
|
|
// var newWords []*textWord
|
|
for _, depthIdx := range s.depthBand(minDepth-lineDepth, maxDepth+lineDepth) {
|
|
for _, word := range s.bins[depthIdx] {
|
|
if !(minDepth-lineDepth <= word.depth && word.depth <= maxDepth+lineDepth) {
|
|
continue
|
|
}
|
|
if !readingOverlap(para, word) {
|
|
continue
|
|
}
|
|
if fontTol > 0 && math.Abs(word.fontsize-fontsize) > fontTol*fontsize {
|
|
continue
|
|
}
|
|
if !detectOnly {
|
|
moveWord(depthIdx, s, para, word)
|
|
}
|
|
// newWords = append(newWords, word)
|
|
n++
|
|
if !freezeDepth {
|
|
if word.depth < minDepth {
|
|
minDepth = word.depth
|
|
}
|
|
if word.depth > maxDepth {
|
|
maxDepth = word.depth
|
|
}
|
|
}
|
|
// Has no effect on results
|
|
// fontsize = para.fontsize
|
|
// lineDepth = lineDepthR * fontsize
|
|
if detectOnly {
|
|
break
|
|
}
|
|
}
|
|
}
|
|
if verbose {
|
|
if len(title) > 0 {
|
|
common.Log.Info("scanBand: %s para=%.2f", title, para.PdfRectangle)
|
|
// for i, word := range newWords {
|
|
// fmt.Printf("%4d: %s\n", i, word)
|
|
// }
|
|
}
|
|
}
|
|
return n
|
|
}
|
|
|
|
// stratumBand returns the words in s.bins[depthIdx] w: minDepth <= w.depth <= maxDepth.
|
|
func (s *textStrata) stratumBand(depthIdx int, minDepth, maxDepth float64) []*textWord {
|
|
if len(s.bins) == 0 {
|
|
return nil
|
|
}
|
|
var words []*textWord
|
|
for _, word := range s.bins[depthIdx] {
|
|
if minDepth <= word.depth && word.depth <= maxDepth {
|
|
words = append(words, word)
|
|
}
|
|
}
|
|
return words
|
|
}
|
|
|
|
// depthBand returns the indexes of the bins with depth: `minDepth` <= depth <= `maxDepth`.
|
|
func (s *textStrata) depthBand(minDepth, maxDepth float64) []int {
|
|
if len(s.bins) == 0 {
|
|
return nil
|
|
}
|
|
return s.depthRange(s.getDepthIdx(minDepth), s.getDepthIdx(maxDepth))
|
|
}
|
|
|
|
// depthRange returns the sorted keys of s.bins for depths indexes [`minDepth`,`maxDepth`).
|
|
func (s *textStrata) depthRange(minDepthIdx, maxDepthIdx int) []int {
|
|
indexes := s.depthIndexes()
|
|
var rangeIndexes []int
|
|
for _, depthIdx := range indexes {
|
|
if minDepthIdx <= depthIdx && depthIdx <= maxDepthIdx {
|
|
rangeIndexes = append(rangeIndexes, depthIdx)
|
|
}
|
|
}
|
|
return rangeIndexes
|
|
}
|
|
|
|
// firstReadingIndex returns the index of the depth bin that starts with that word with the smallest
|
|
// reading direction value in the depth region `minDepthIndex` < depth <= minDepthIndex+ 4*fontsize
|
|
// This avoids choosing a bin that starts with a superscript word.
|
|
func (s *textStrata) firstReadingIndex(minDepthIdx int) int {
|
|
firstReadingIdx := minDepthIdx
|
|
firstReadingWords := s.getStratum(firstReadingIdx)
|
|
fontsize := firstReadingWords[0].fontsize
|
|
minDepth := float64(minDepthIdx+1) * depthBinPoints
|
|
for _, depthIdx := range s.depthBand(minDepth, minDepth+4*fontsize) {
|
|
words := s.getStratum(depthIdx)
|
|
if diffReading(words[0], firstReadingWords[0]) < 0 {
|
|
firstReadingIdx = depthIdx
|
|
firstReadingWords = s.getStratum(firstReadingIdx)
|
|
}
|
|
}
|
|
return firstReadingIdx
|
|
}
|
|
|
|
// getDepthIdx returns the index into `s.bins` for depth axis value `depth`.
|
|
func (s *textStrata) getDepthIdx(depth float64) int {
|
|
if len(s.bins) == 0 {
|
|
panic("NOT ALLOWED")
|
|
}
|
|
indexes := s.depthIndexes()
|
|
depthIdx := depthIndex(depth)
|
|
if depthIdx < indexes[0] {
|
|
return indexes[0]
|
|
}
|
|
if depthIdx > indexes[len(indexes)-1] {
|
|
return indexes[len(indexes)-1]
|
|
}
|
|
return depthIdx
|
|
}
|
|
|
|
// empty returns true if the depth bin with index `depthIdx` is empty.
|
|
// NOTE: We delete bins as soon as they become empty so we just have to check for the bin's existence.
|
|
func (s *textStrata) empty(depthIdx int) bool {
|
|
_, ok := s.bins[depthIdx]
|
|
return !ok
|
|
}
|
|
|
|
// getStratum returns a copy of `p`.bins[`depthIdx`].
|
|
// getStratum is guaranteed to return a non-nil value. It must be called with a valid depth index.
|
|
// NOTE: We need to return a copy because remove() and other functions manipulate the array
|
|
// underlying the slice.
|
|
func (s *textStrata) getStratum(depthIdx int) []*textWord {
|
|
words := s.bins[depthIdx]
|
|
if words == nil {
|
|
panic("NOT ALLOWED")
|
|
}
|
|
dup := make([]*textWord, len(words))
|
|
copy(dup, words)
|
|
return dup
|
|
}
|
|
|
|
// moveWord moves `word` from 'page'[`depthIdx`] to 'para'[`depthIdx`].
|
|
func moveWord(depthIdx int, page, para *textStrata, word *textWord) {
|
|
if para.Llx > para.Urx {
|
|
para.PdfRectangle = word.PdfRectangle
|
|
} else {
|
|
para.PdfRectangle = rectUnion(para.PdfRectangle, word.PdfRectangle)
|
|
}
|
|
if word.fontsize > para.fontsize {
|
|
para.fontsize = word.fontsize
|
|
}
|
|
para.bins[depthIdx] = append(para.bins[depthIdx], word)
|
|
page.removeWord(depthIdx, word)
|
|
}
|
|
|
|
// removeWord removes `word`from `s`.bins[`depthIdx`].
|
|
// NOTE: We delete bins as soon as they become empty to save code that calls other textStrata
|
|
// functions from having to check for empty bins.
|
|
// !@#$ Find a more efficient way of doing this.
|
|
func (s *textStrata) removeWord(depthIdx int, word *textWord) {
|
|
words := removeWord(s.getStratum(depthIdx), word)
|
|
if len(words) == 0 {
|
|
delete(s.bins, depthIdx)
|
|
} else {
|
|
s.bins[depthIdx] = words
|
|
}
|
|
}
|