From 603b5ff4e7cff7a2d0e274f2bf27c1c8be45b916 Mon Sep 17 00:00:00 2001 From: Peter Williams Date: Mon, 25 May 2020 14:00:00 +1000 Subject: [PATCH] Added function comments. --- extractor/README.md | 5 +++ extractor/text_const.go | 12 ++++--- extractor/text_line.go | 1 + extractor/text_mark.go | 1 + extractor/text_para.go | 1 + extractor/text_strata.go | 68 +++++++++++++++++++++------------------- extractor/text_word.go | 1 + 7 files changed, 53 insertions(+), 36 deletions(-) diff --git a/extractor/README.md b/extractor/README.md index a5e8ffc9..1fa4b671 100644 --- a/extractor/README.md +++ b/extractor/README.md @@ -39,3 +39,8 @@ WHERE TO START * The words in of each rectangular region are aranged inot`textLine`s. Each rectangular region and its constituent lines is a `textPara`. * The `textPara`s are sorted into reading order. + + +TODO +==== +Remove serial code. diff --git a/extractor/text_const.go b/extractor/text_const.go index daf6ac7b..bd336c29 100644 --- a/extractor/text_const.go +++ b/extractor/text_const.go @@ -10,6 +10,10 @@ const ( // Size of depth bins in points depthBinPoints = 6 + // Variation in line depth as a fraction of font size. +lineDepthR for subscripts, -lineDepthR for + // superscripts + lineDepthR = 0.5 + // All constants that end in R are relative to font size. // Max difference in font sizes allowed within a word. @@ -25,18 +29,18 @@ const ( // into the para. maxIntraReadingGapR = 0.3 // Max diffrence in font size for word and para for the above case - maxIntraReadingFontTol = 0.6 // maxIntraReadingGapR + maxIntraReadingFontTol = 0.6 // Minimum spacing between paras in the reading direction. minInterReadingGapR = 1.0 // Max diffrence in font size for word and para for the above case - minInterReadingFontTol = 0.1 // minInterReadingGapR + minInterReadingFontTol = 0.1 // Maximum inter-word spacing. 
- maxIntraWordGapR = 1.5 + maxIntraWordGapR = 1.4 // Maximum overlap between characters allowd within a line - maxIntraLineOverlapR = 0.5 + maxIntraLineOverlapR = 0.46 // Maximum spacing between characters within a line. maxIntraLineGapR = 0.03 diff --git a/extractor/text_line.go b/extractor/text_line.go index e771017b..72cc9b11 100644 --- a/extractor/text_line.go +++ b/extractor/text_line.go @@ -45,6 +45,7 @@ func (l *textLine) String() string { l.serial, l.depth, l.PdfRectangle, l.fontsize, l.text()) } +// bbox makes textLine implement the `bounded` interface. func (l *textLine) bbox() model.PdfRectangle { return l.PdfRectangle } diff --git a/extractor/text_mark.go b/extractor/text_mark.go index db72f000..c094bd59 100644 --- a/extractor/text_mark.go +++ b/extractor/text_mark.go @@ -102,6 +102,7 @@ func (tm *textMark) String() string { tm.serial, tm.PdfRectangle, tm.fontsize, tm.text) } +// bbox makes textMark implement the `bounded` interface. func (tm *textMark) bbox() model.PdfRectangle { return tm.PdfRectangle } diff --git a/extractor/text_para.go b/extractor/text_para.go index 919469ae..3d628f1f 100644 --- a/extractor/text_para.go +++ b/extractor/text_para.go @@ -38,6 +38,7 @@ func (p *textPara) String() string { return fmt.Sprintf("serial=%d %.2f %d lines", p.serial, p.PdfRectangle, len(p.lines)) } +// bbox makes textPara implement the `bounded` interface. func (p *textPara) bbox() model.PdfRectangle { return p.PdfRectangle } diff --git a/extractor/text_strata.go b/extractor/text_strata.go index 7b99aa31..58d6fe22 100644 --- a/extractor/text_strata.go +++ b/extractor/text_strata.go @@ -13,17 +13,17 @@ import ( "github.com/unidoc/unipdf/v3/model" ) -// textStrata is a list of word bings arranged by their depth on a page. +// textStrata is a list of word bins arranged by their depth on a page. // The words in each bin are sorted in reading order. type textStrata struct { serial int // Sequence number for debugging. 
model.PdfRectangle // Bounding box (union of words' in bins bounding boxes). - bins map[int][]*textWord // bins[n] = w: (n-1)*depthBinPoints <= w.depth < (n-1)*depthBinPoints + bins map[int][]*textWord // bins[n] = w: n*depthBinPoints <= w.depth < (n+1)*depthBinPoints pageHeight float64 fontsize float64 } -// makeTextStrata builds a textStrata from `words` but putting the words into the appropriate +// makeTextStrata builds a textStrata from `words` by putting the words into the appropriate // depth bins. func makeTextStrata(words []*textWord, pageHeight float64) *textStrata { s := newTextStrata(pageHeight) @@ -35,6 +35,7 @@ func makeTextStrata(words []*textWord, pageHeight float64) *textStrata { return s } +// newTextStrata returns an empty textStrata with page height `pageHeight`. func newTextStrata(pageHeight float64) *textStrata { bins := textStrata{ serial: serial.bins, @@ -58,17 +59,19 @@ func (s *textStrata) String() string { return fmt.Sprintf("serial=%d %d %q", s.serial, len(texts), texts) } -// sort sorts the words in each in `s` in the reading direction. +// sort sorts the words in each bin in `s` in the reading direction. func (s *textStrata) sort() { for _, bin := range s.bins { sort.Slice(bin, func(i, j int) bool { return diffReading(bin[i], bin[j]) < 0 }) } } +// minDepth returns the minimum depth that words in `s` touch. func (s *textStrata) minDepth() float64 { return s.pageHeight - s.Ury } +// maxDepth returns the maximum depth that words in `s` touch. func (s *textStrata) maxDepth() float64 { return s.pageHeight - s.Lly } @@ -86,14 +89,11 @@ func depthIndex(depth float64) int { return depthIdx } -func depthBand(depthIdx int) (float64, float64) { - minDepth := float64(depthIdx) * depthBinPoints - maxDepth := float64(depthIdx+1) * depthBinPoints - return minDepth, maxDepth -} - // depthIndexes returns the sorted keys of s.bins. 
func (s *textStrata) depthIndexes() []int { + if len(s.bins) == 0 { + return nil + } indexes := make([]int, len(s.bins)) i := 0 for idx := range s.bins { @@ -104,17 +104,13 @@ func (s *textStrata) depthIndexes() []int { return indexes } -// Variation in line depth as a fraction of font size. +lineDepthR for subscripts, -lineDepthR for -// superscripts -const lineDepthR = 0.5 - -// scanBand scans the bins for words -// w: `minDepth` <= w.depth <= `maxDepth` && // in the depth diraction -// `readingOverlap`(`para`, w) && in the reading directon +// scanBand scans the bins for words w: +// `minDepth` <= w.depth <= `maxDepth` && // in the depth direction +// `readingOverlap`(`para`, w) && // in the reading direction // math.Abs(w.fontsize-fontsize) > `fontTol`*fontsize // font size tolerance // and applies `moveWord`(depthIdx, s,para w) to them. // If `detectOnly` is true, don't appy moveWord. -// If `freezeDepth` is trus, don't update minDepth and maxDepth in scan as words are added/ +// If `freezeDepth` is true, don't update minDepth and maxDepth in scan as words are added. func (s *textStrata) scanBand(para *textStrata, readingOverlap func(para *textStrata, word *textWord) bool, minDepth, maxDepth, fontTol float64, @@ -158,6 +154,9 @@ func (s *textStrata) scanBand(para *textStrata, // stratumBand returns the words in s.bins[depthIdx] w: minDepth <= w.depth <= maxDepth. func (s *textStrata) stratumBand(depthIdx int, minDepth, maxDepth float64) []*textWord { + if len(s.bins) == 0 { + return nil + } var words []*textWord for _, word := range s.bins[depthIdx] { if minDepth <= word.depth && word.depth <= maxDepth { @@ -169,6 +168,9 @@ func (s *textStrata) stratumBand(depthIdx int, minDepth, maxDepth float64) []*te // depthBand returns the indexes of the bins with depth: `minDepth` <= depth <= `maxDepth`. 
func (s *textStrata) depthBand(minDepth, maxDepth float64) []int { + if len(s.bins) == 0 { + return nil + } return s.depthRange(s.getDepthIdx(minDepth), s.getDepthIdx(maxDepth)) } @@ -202,37 +204,37 @@ func (s *textStrata) firstReadingIndex(minDepthIdx int) int { return firstReadingIdx } -// getDepthIdx returns the index into `s.bins` for non-reading axis value `depth`. +// getDepthIdx returns the index into `s.bins` for depth axis value `depth`. func (s *textStrata) getDepthIdx(depth float64) int { - depthIdx, minIdx, maxIdx := -101, -101, -101 + if len(s.bins) == 0 { + panic("NOT ALLOWED") + } indexes := s.depthIndexes() - if len(indexes) > 0 { - depthIdx = depthIndex(depth) - minIdx = indexes[0] - maxIdx = indexes[len(indexes)-1] - if depthIdx < minIdx { - depthIdx = minIdx - } - if depthIdx > maxIdx { - depthIdx = maxIdx - } + depthIdx := depthIndex(depth) + if depthIdx < indexes[0] { + return indexes[0] + } + if depthIdx > indexes[len(indexes)-1] { + return indexes[len(indexes)-1] } return depthIdx } +// empty returns true if the depth bin with index `depthIdx` is empty. +// NOTE: We delete bins as soon as they become empty so we just have to check for the bin's existence. func (s *textStrata) empty(depthIdx int) bool { _, ok := s.bins[depthIdx] return !ok } // getStratum returns a copy of `p`.bins[`depthIdx`]. -// getStratum is guaranteed to return a non-nil value (!@#$ Will need to check it is called with valid index) +// getStratum is guaranteed to return a non-nil value. It must be called with a valid depth index. // NOTE: We need to return a copy because remove() and other functions manipulate the array // underlying the slice. 
func (s *textStrata) getStratum(depthIdx int) []*textWord { words := s.bins[depthIdx] if words == nil { - panic(depthIdx) + panic("NOT ALLOWED") } dup := make([]*textWord, len(words)) copy(dup, words) @@ -254,6 +256,8 @@ func moveWord(depthIdx int, page, para *textStrata, word *textWord) { } // removeWord removes `word`from `s`.bins[`depthIdx`]. +// NOTE: We delete bins as soon as they become empty to save code that calls other textStrata +// functions from having to check for empty bins. // !@#$ Find a more efficient way of doing this. func (s *textStrata) removeWord(depthIdx int, word *textWord) { words := removeWord(s.getStratum(depthIdx), word) diff --git a/extractor/text_word.go b/extractor/text_word.go index 3951a348..c6374665 100644 --- a/extractor/text_word.go +++ b/extractor/text_word.go @@ -106,6 +106,7 @@ func (w *textWord) String() string { w.serial, w.depth, w.PdfRectangle, w.fontsize, w.text()) } +// bbox makes textWord implement the `bounded` interface. func (w *textWord) bbox() model.PdfRectangle { return w.PdfRectangle }