/* * This file is subject to the terms and conditions defined in * file 'LICENSE.md', which is part of this source code package. */ package extractor import ( "fmt" "math" "sort" "github.com/unidoc/unipdf/v3/common" "github.com/unidoc/unipdf/v3/model" ) // textStrata is a list of word bins arranged by their depth on a page. // The words in each bin are sorted in reading order. type textStrata struct { serial int // Sequence number for debugging. model.PdfRectangle // Bounding box (union of words' in bins bounding boxes). bins map[int][]*textWord // bins[n] = w: n*depthBinPoints <= w.depth < (n+1)*depthBinPoints pageHeight float64 fontsize float64 } // makeTextStrata builds a textStrata from `words` by putting the words into the appropriate // depth bins. func makeTextStrata(words []*textWord, pageHeight float64) *textStrata { s := newTextStrata(pageHeight) for _, w := range words { depthIdx := depthIndex(w.depth) s.bins[depthIdx] = append(s.bins[depthIdx], w) } s.sort() return s } // newTextStrata returns an empty textStrata with page height `pageHeight`. func newTextStrata(pageHeight float64) *textStrata { bins := textStrata{ serial: serial.bins, bins: map[int][]*textWord{}, PdfRectangle: model.PdfRectangle{Urx: -1.0, Ury: -1.0}, pageHeight: pageHeight, } serial.bins++ return &bins } // String returns a description of `s`. func (s *textStrata) String() string { var texts []string for _, depthIdx := range s.depthIndexes() { words, _ := s.bins[depthIdx] for _, w := range words { texts = append(texts, w.text()) } } return fmt.Sprintf("serial=%d %d %q", s.serial, len(texts), texts) } // sort sorts the words in each bin in `s` in the reading direction. func (s *textStrata) sort() { for _, bin := range s.bins { sort.Slice(bin, func(i, j int) bool { return diffReading(bin[i], bin[j]) < 0 }) } } // minDepth returns the minimum depth that words in `s` touch. func (s *textStrata) minDepth() float64 { return s.pageHeight - s.Ury } // maxDepth returns the maximum depth that words in `s` touch. func (s *textStrata) maxDepth() float64 { return s.pageHeight - s.Lly } // depthIndex returns a bin index for depth `depth`. // The returned depthIdx obeys the following rule. // depthIdx * depthBinPoints <= depth <= (depthIdx+1) * depthBinPoint func depthIndex(depth float64) int { var depthIdx int if depth >= 0 { depthIdx = int(depth / depthBinPoints) } else { depthIdx = int(depth/depthBinPoints) - 1 } return depthIdx } // depthIndexes returns the sorted keys of s.bins. func (s *textStrata) depthIndexes() []int { if len(s.bins) == 0 { return nil } indexes := make([]int, len(s.bins)) i := 0 for idx := range s.bins { indexes[i] = idx i++ } sort.Ints(indexes) return indexes } // scanBand scans the bins for words w: // `minDepth` <= w.depth <= `maxDepth` && // in the depth diraction // `readingOverlap`(`para`, w) && // in the reading directon // math.Abs(w.fontsize-fontsize) > `fontTol`*fontsize // font size tolerance // and applies `moveWord`(depthIdx, s,para w) to them. // If `detectOnly` is true, don't appy moveWord. // If `freezeDepth` is true, don't update minDepth and maxDepth in scan as words are added. func (s *textStrata) scanBand(title string, para *textStrata, readingOverlap func(para *textStrata, word *textWord) bool, minDepth, maxDepth, fontTol float64, detectOnly, freezeDepth bool) int { fontsize := para.fontsize lineDepth := lineDepthR * fontsize n := 0 // var newWords []*textWord for _, depthIdx := range s.depthBand(minDepth-lineDepth, maxDepth+lineDepth) { for _, word := range s.bins[depthIdx] { if !(minDepth-lineDepth <= word.depth && word.depth <= maxDepth+lineDepth) { continue } if !readingOverlap(para, word) { continue } if fontTol > 0 && math.Abs(word.fontsize-fontsize) > fontTol*fontsize { continue } if !detectOnly { moveWord(depthIdx, s, para, word) } // newWords = append(newWords, word) n++ if !freezeDepth { if word.depth < minDepth { minDepth = word.depth } if word.depth > maxDepth { maxDepth = word.depth } } // Has no effect on results // fontsize = para.fontsize // lineDepth = lineDepthR * fontsize if detectOnly { break } } } if verbose { if len(title) > 0 { common.Log.Info("scanBand: %s para=%.2f", title, para.PdfRectangle) // for i, word := range newWords { // fmt.Printf("%4d: %s\n", i, word) // } } } return n } // stratumBand returns the words in s.bins[depthIdx] w: minDepth <= w.depth <= maxDepth. func (s *textStrata) stratumBand(depthIdx int, minDepth, maxDepth float64) []*textWord { if len(s.bins) == 0 { return nil } var words []*textWord for _, word := range s.bins[depthIdx] { if minDepth <= word.depth && word.depth <= maxDepth { words = append(words, word) } } return words } // depthBand returns the indexes of the bins with depth: `minDepth` <= depth <= `maxDepth`. func (s *textStrata) depthBand(minDepth, maxDepth float64) []int { if len(s.bins) == 0 { return nil } return s.depthRange(s.getDepthIdx(minDepth), s.getDepthIdx(maxDepth)) } // depthRange returns the sorted keys of s.bins for depths indexes [`minDepth`,`maxDepth`). func (s *textStrata) depthRange(minDepthIdx, maxDepthIdx int) []int { indexes := s.depthIndexes() var rangeIndexes []int for _, depthIdx := range indexes { if minDepthIdx <= depthIdx && depthIdx <= maxDepthIdx { rangeIndexes = append(rangeIndexes, depthIdx) } } return rangeIndexes } // firstReadingIndex returns the index of the depth bin that starts with that word with the smallest // reading direction value in the depth region `minDepthIndex` < depth <= minDepthIndex+ 4*fontsize // This avoids choosing a bin that starts with a superscript word. func (s *textStrata) firstReadingIndex(minDepthIdx int) int { firstReadingIdx := minDepthIdx firstReadingWords := s.getStratum(firstReadingIdx) fontsize := firstReadingWords[0].fontsize minDepth := float64(minDepthIdx+1) * depthBinPoints for _, depthIdx := range s.depthBand(minDepth, minDepth+4*fontsize) { words := s.getStratum(depthIdx) if diffReading(words[0], firstReadingWords[0]) < 0 { firstReadingIdx = depthIdx firstReadingWords = s.getStratum(firstReadingIdx) } } return firstReadingIdx } // getDepthIdx returns the index into `s.bins` for depth axis value `depth`. func (s *textStrata) getDepthIdx(depth float64) int { if len(s.bins) == 0 { panic("NOT ALLOWED") } indexes := s.depthIndexes() depthIdx := depthIndex(depth) if depthIdx < indexes[0] { return indexes[0] } if depthIdx > indexes[len(indexes)-1] { return indexes[len(indexes)-1] } return depthIdx } // empty returns true if the depth bin with index `depthIdx` is empty. // NOTE: We delete bins as soon as they become empty so we just have to check for the bin's existence. func (s *textStrata) empty(depthIdx int) bool { _, ok := s.bins[depthIdx] return !ok } // getStratum returns a copy of `p`.bins[`depthIdx`]. // getStratum is guaranteed to return a non-nil value. It must be called with a valid depth index. // NOTE: We need to return a copy because remove() and other functions manipulate the array // underlying the slice. func (s *textStrata) getStratum(depthIdx int) []*textWord { words := s.bins[depthIdx] if words == nil { panic("NOT ALLOWED") } dup := make([]*textWord, len(words)) copy(dup, words) return dup } // moveWord moves `word` from 'page'[`depthIdx`] to 'para'[`depthIdx`]. func moveWord(depthIdx int, page, para *textStrata, word *textWord) { if para.Llx > para.Urx { para.PdfRectangle = word.PdfRectangle } else { para.PdfRectangle = rectUnion(para.PdfRectangle, word.PdfRectangle) } if word.fontsize > para.fontsize { para.fontsize = word.fontsize } para.bins[depthIdx] = append(para.bins[depthIdx], word) page.removeWord(depthIdx, word) } // removeWord removes `word`from `s`.bins[`depthIdx`]. // NOTE: We delete bins as soon as they become empty to save code that calls other textStrata // functions from having to check for empty bins. // !@#$ Find a more efficient way of doing this. func (s *textStrata) removeWord(depthIdx int, word *textWord) { words := removeWord(s.getStratum(depthIdx), word) if len(words) == 0 { delete(s.bins, depthIdx) } else { s.bins[depthIdx] = words } }