mirror of
https://github.com/unidoc/unipdf.git
synced 2025-05-14 19:29:50 +08:00
Added function comments.
This commit is contained in:
parent
c515472849
commit
603b5ff4e7
@ -39,3 +39,8 @@ WHERE TO START
|
|||||||
* The words in each rectangular region are arranged into `textLine`s. Each rectangular region and
|
* The words in each rectangular region are arranged into `textLine`s. Each rectangular region and
|
||||||
its constituent lines is a `textPara`.
|
its constituent lines is a `textPara`.
|
||||||
* The `textPara`s are sorted into reading order.
|
* The `textPara`s are sorted into reading order.
|
||||||
|
|
||||||
|
|
||||||
|
TODO
|
||||||
|
====
|
||||||
|
Remove serial code.
|
||||||
|
@ -10,6 +10,10 @@ const (
|
|||||||
// Size of depth bins in points
|
// Size of depth bins in points
|
||||||
depthBinPoints = 6
|
depthBinPoints = 6
|
||||||
|
|
||||||
|
// Variation in line depth as a fraction of font size. +lineDepthR for subscripts, -lineDepthR for
|
||||||
|
// superscripts
|
||||||
|
lineDepthR = 0.5
|
||||||
|
|
||||||
// All constants that end in R are relative to font size.
|
// All constants that end in R are relative to font size.
|
||||||
|
|
||||||
// Max difference in font sizes allowed within a word.
|
// Max difference in font sizes allowed within a word.
|
||||||
@ -25,18 +29,18 @@ const (
|
|||||||
// into the para.
|
// into the para.
|
||||||
maxIntraReadingGapR = 0.3
|
maxIntraReadingGapR = 0.3
|
||||||
// Max difference in font size for word and para for the above case
|
// Max difference in font size for word and para for the above case
|
||||||
maxIntraReadingFontTol = 0.6 // maxIntraReadingGapR
|
maxIntraReadingFontTol = 0.6
|
||||||
|
|
||||||
// Minimum spacing between paras in the reading direction.
|
// Minimum spacing between paras in the reading direction.
|
||||||
minInterReadingGapR = 1.0
|
minInterReadingGapR = 1.0
|
||||||
// Max difference in font size for word and para for the above case
|
// Max difference in font size for word and para for the above case
|
||||||
minInterReadingFontTol = 0.1 // minInterReadingGapR
|
minInterReadingFontTol = 0.1
|
||||||
|
|
||||||
// Maximum inter-word spacing.
|
// Maximum inter-word spacing.
|
||||||
maxIntraWordGapR = 1.5
|
maxIntraWordGapR = 1.4
|
||||||
|
|
||||||
// Maximum overlap between characters allowed within a line
|
// Maximum overlap between characters allowed within a line
|
||||||
maxIntraLineOverlapR = 0.5
|
maxIntraLineOverlapR = 0.46
|
||||||
|
|
||||||
// Maximum spacing between characters within a line.
|
// Maximum spacing between characters within a line.
|
||||||
maxIntraLineGapR = 0.03
|
maxIntraLineGapR = 0.03
|
||||||
|
@ -45,6 +45,7 @@ func (l *textLine) String() string {
|
|||||||
l.serial, l.depth, l.PdfRectangle, l.fontsize, l.text())
|
l.serial, l.depth, l.PdfRectangle, l.fontsize, l.text())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// bbox makes textLine implement the `bounded` interface.
|
||||||
func (l *textLine) bbox() model.PdfRectangle {
|
func (l *textLine) bbox() model.PdfRectangle {
|
||||||
return l.PdfRectangle
|
return l.PdfRectangle
|
||||||
}
|
}
|
||||||
|
@ -102,6 +102,7 @@ func (tm *textMark) String() string {
|
|||||||
tm.serial, tm.PdfRectangle, tm.fontsize, tm.text)
|
tm.serial, tm.PdfRectangle, tm.fontsize, tm.text)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// bbox makes textMark implement the `bounded` interface.
|
||||||
func (tm *textMark) bbox() model.PdfRectangle {
|
func (tm *textMark) bbox() model.PdfRectangle {
|
||||||
return tm.PdfRectangle
|
return tm.PdfRectangle
|
||||||
}
|
}
|
||||||
|
@ -38,6 +38,7 @@ func (p *textPara) String() string {
|
|||||||
return fmt.Sprintf("serial=%d %.2f %d lines", p.serial, p.PdfRectangle, len(p.lines))
|
return fmt.Sprintf("serial=%d %.2f %d lines", p.serial, p.PdfRectangle, len(p.lines))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// bbox makes textPara implement the `bounded` interface.
|
||||||
func (p *textPara) bbox() model.PdfRectangle {
|
func (p *textPara) bbox() model.PdfRectangle {
|
||||||
return p.PdfRectangle
|
return p.PdfRectangle
|
||||||
}
|
}
|
||||||
|
@ -13,17 +13,17 @@ import (
|
|||||||
"github.com/unidoc/unipdf/v3/model"
|
"github.com/unidoc/unipdf/v3/model"
|
||||||
)
|
)
|
||||||
|
|
||||||
// textStrata is a list of word bings arranged by their depth on a page.
|
// textStrata is a list of word bins arranged by their depth on a page.
|
||||||
// The words in each bin are sorted in reading order.
|
// The words in each bin are sorted in reading order.
|
||||||
type textStrata struct {
|
type textStrata struct {
|
||||||
serial int // Sequence number for debugging.
|
serial int // Sequence number for debugging.
|
||||||
model.PdfRectangle // Bounding box (union of words' in bins bounding boxes).
|
model.PdfRectangle // Bounding box (union of words' in bins bounding boxes).
|
||||||
bins map[int][]*textWord // bins[n] = w: (n-1)*depthBinPoints <= w.depth < (n-1)*depthBinPoints
|
bins map[int][]*textWord // bins[n] = w: n*depthBinPoints <= w.depth < (n+1)*depthBinPoints
|
||||||
pageHeight float64
|
pageHeight float64
|
||||||
fontsize float64
|
fontsize float64
|
||||||
}
|
}
|
||||||
|
|
||||||
// makeTextStrata builds a textStrata from `words` but putting the words into the appropriate
|
// makeTextStrata builds a textStrata from `words` by putting the words into the appropriate
|
||||||
// depth bins.
|
// depth bins.
|
||||||
func makeTextStrata(words []*textWord, pageHeight float64) *textStrata {
|
func makeTextStrata(words []*textWord, pageHeight float64) *textStrata {
|
||||||
s := newTextStrata(pageHeight)
|
s := newTextStrata(pageHeight)
|
||||||
@ -35,6 +35,7 @@ func makeTextStrata(words []*textWord, pageHeight float64) *textStrata {
|
|||||||
return s
|
return s
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// newTextStrata returns an empty textStrata with page height `pageHeight`.
|
||||||
func newTextStrata(pageHeight float64) *textStrata {
|
func newTextStrata(pageHeight float64) *textStrata {
|
||||||
bins := textStrata{
|
bins := textStrata{
|
||||||
serial: serial.bins,
|
serial: serial.bins,
|
||||||
@ -58,17 +59,19 @@ func (s *textStrata) String() string {
|
|||||||
return fmt.Sprintf("serial=%d %d %q", s.serial, len(texts), texts)
|
return fmt.Sprintf("serial=%d %d %q", s.serial, len(texts), texts)
|
||||||
}
|
}
|
||||||
|
|
||||||
// sort sorts the words in each in `s` in the reading direction.
|
// sort sorts the words in each bin in `s` in the reading direction.
|
||||||
func (s *textStrata) sort() {
|
func (s *textStrata) sort() {
|
||||||
for _, bin := range s.bins {
|
for _, bin := range s.bins {
|
||||||
sort.Slice(bin, func(i, j int) bool { return diffReading(bin[i], bin[j]) < 0 })
|
sort.Slice(bin, func(i, j int) bool { return diffReading(bin[i], bin[j]) < 0 })
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// minDepth returns the minimum depth that words in `s` touch.
|
||||||
func (s *textStrata) minDepth() float64 {
|
func (s *textStrata) minDepth() float64 {
|
||||||
return s.pageHeight - s.Ury
|
return s.pageHeight - s.Ury
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// maxDepth returns the maximum depth that words in `s` touch.
|
||||||
func (s *textStrata) maxDepth() float64 {
|
func (s *textStrata) maxDepth() float64 {
|
||||||
return s.pageHeight - s.Lly
|
return s.pageHeight - s.Lly
|
||||||
}
|
}
|
||||||
@ -86,14 +89,11 @@ func depthIndex(depth float64) int {
|
|||||||
return depthIdx
|
return depthIdx
|
||||||
}
|
}
|
||||||
|
|
||||||
func depthBand(depthIdx int) (float64, float64) {
|
|
||||||
minDepth := float64(depthIdx) * depthBinPoints
|
|
||||||
maxDepth := float64(depthIdx+1) * depthBinPoints
|
|
||||||
return minDepth, maxDepth
|
|
||||||
}
|
|
||||||
|
|
||||||
// depthIndexes returns the sorted keys of s.bins.
|
// depthIndexes returns the sorted keys of s.bins.
|
||||||
func (s *textStrata) depthIndexes() []int {
|
func (s *textStrata) depthIndexes() []int {
|
||||||
|
if len(s.bins) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
indexes := make([]int, len(s.bins))
|
indexes := make([]int, len(s.bins))
|
||||||
i := 0
|
i := 0
|
||||||
for idx := range s.bins {
|
for idx := range s.bins {
|
||||||
@ -104,17 +104,13 @@ func (s *textStrata) depthIndexes() []int {
|
|||||||
return indexes
|
return indexes
|
||||||
}
|
}
|
||||||
|
|
||||||
// Variation in line depth as a fraction of font size. +lineDepthR for subscripts, -lineDepthR for
|
// scanBand scans the bins for words w:
|
||||||
// superscripts
|
// `minDepth` <= w.depth <= `maxDepth` && // in the depth direction
|
||||||
const lineDepthR = 0.5
|
// `readingOverlap`(`para`, w) && // in the reading direction
|
||||||
|
|
||||||
// scanBand scans the bins for words
|
|
||||||
// w: `minDepth` <= w.depth <= `maxDepth` && // in the depth diraction
|
|
||||||
// `readingOverlap`(`para`, w) && in the reading directon
|
|
||||||
// math.Abs(w.fontsize-fontsize) > `fontTol`*fontsize // font size tolerance
|
// math.Abs(w.fontsize-fontsize) > `fontTol`*fontsize // font size tolerance
|
||||||
// and applies `moveWord`(depthIdx, s, para, w) to them.
|
// and applies `moveWord`(depthIdx, s, para, w) to them.
|
||||||
// If `detectOnly` is true, don't apply moveWord.
|
// If `detectOnly` is true, don't apply moveWord.
|
||||||
// If `freezeDepth` is trus, don't update minDepth and maxDepth in scan as words are added/
|
// If `freezeDepth` is true, don't update minDepth and maxDepth in scan as words are added.
|
||||||
func (s *textStrata) scanBand(para *textStrata,
|
func (s *textStrata) scanBand(para *textStrata,
|
||||||
readingOverlap func(para *textStrata, word *textWord) bool,
|
readingOverlap func(para *textStrata, word *textWord) bool,
|
||||||
minDepth, maxDepth, fontTol float64,
|
minDepth, maxDepth, fontTol float64,
|
||||||
@ -158,6 +154,9 @@ func (s *textStrata) scanBand(para *textStrata,
|
|||||||
|
|
||||||
// stratumBand returns the words in s.bins[depthIdx] w: minDepth <= w.depth <= maxDepth.
|
// stratumBand returns the words in s.bins[depthIdx] w: minDepth <= w.depth <= maxDepth.
|
||||||
func (s *textStrata) stratumBand(depthIdx int, minDepth, maxDepth float64) []*textWord {
|
func (s *textStrata) stratumBand(depthIdx int, minDepth, maxDepth float64) []*textWord {
|
||||||
|
if len(s.bins) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
var words []*textWord
|
var words []*textWord
|
||||||
for _, word := range s.bins[depthIdx] {
|
for _, word := range s.bins[depthIdx] {
|
||||||
if minDepth <= word.depth && word.depth <= maxDepth {
|
if minDepth <= word.depth && word.depth <= maxDepth {
|
||||||
@ -169,6 +168,9 @@ func (s *textStrata) stratumBand(depthIdx int, minDepth, maxDepth float64) []*te
|
|||||||
|
|
||||||
// depthBand returns the indexes of the bins with depth: `minDepth` <= depth <= `maxDepth`.
|
// depthBand returns the indexes of the bins with depth: `minDepth` <= depth <= `maxDepth`.
|
||||||
func (s *textStrata) depthBand(minDepth, maxDepth float64) []int {
|
func (s *textStrata) depthBand(minDepth, maxDepth float64) []int {
|
||||||
|
if len(s.bins) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
return s.depthRange(s.getDepthIdx(minDepth), s.getDepthIdx(maxDepth))
|
return s.depthRange(s.getDepthIdx(minDepth), s.getDepthIdx(maxDepth))
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -202,37 +204,37 @@ func (s *textStrata) firstReadingIndex(minDepthIdx int) int {
|
|||||||
return firstReadingIdx
|
return firstReadingIdx
|
||||||
}
|
}
|
||||||
|
|
||||||
// getDepthIdx returns the index into `s.bins` for non-reading axis value `depth`.
|
// getDepthIdx returns the index into `s.bins` for depth axis value `depth`.
|
||||||
func (s *textStrata) getDepthIdx(depth float64) int {
|
func (s *textStrata) getDepthIdx(depth float64) int {
|
||||||
depthIdx, minIdx, maxIdx := -101, -101, -101
|
if len(s.bins) == 0 {
|
||||||
|
panic("NOT ALLOWED")
|
||||||
|
}
|
||||||
indexes := s.depthIndexes()
|
indexes := s.depthIndexes()
|
||||||
if len(indexes) > 0 {
|
depthIdx := depthIndex(depth)
|
||||||
depthIdx = depthIndex(depth)
|
if depthIdx < indexes[0] {
|
||||||
minIdx = indexes[0]
|
return indexes[0]
|
||||||
maxIdx = indexes[len(indexes)-1]
|
|
||||||
if depthIdx < minIdx {
|
|
||||||
depthIdx = minIdx
|
|
||||||
}
|
|
||||||
if depthIdx > maxIdx {
|
|
||||||
depthIdx = maxIdx
|
|
||||||
}
|
}
|
||||||
|
if depthIdx > indexes[len(indexes)-1] {
|
||||||
|
return indexes[len(indexes)-1]
|
||||||
}
|
}
|
||||||
return depthIdx
|
return depthIdx
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// empty returns true if the depth bin with index `depthIdx` is empty.
|
||||||
|
// NOTE: We delete bins as soon as they become empty so we just have to check for the bin's existence.
|
||||||
func (s *textStrata) empty(depthIdx int) bool {
|
func (s *textStrata) empty(depthIdx int) bool {
|
||||||
_, ok := s.bins[depthIdx]
|
_, ok := s.bins[depthIdx]
|
||||||
return !ok
|
return !ok
|
||||||
}
|
}
|
||||||
|
|
||||||
// getStratum returns a copy of `s`.bins[`depthIdx`].
|
// getStratum returns a copy of `s`.bins[`depthIdx`].
|
||||||
// getStratum is guaranteed to return a non-nil value (!@#$ Will need to check it is called with valid index)
|
// getStratum is guaranteed to return a non-nil value. It must be called with a valid depth index.
|
||||||
// NOTE: We need to return a copy because remove() and other functions manipulate the array
|
// NOTE: We need to return a copy because remove() and other functions manipulate the array
|
||||||
// underlying the slice.
|
// underlying the slice.
|
||||||
func (s *textStrata) getStratum(depthIdx int) []*textWord {
|
func (s *textStrata) getStratum(depthIdx int) []*textWord {
|
||||||
words := s.bins[depthIdx]
|
words := s.bins[depthIdx]
|
||||||
if words == nil {
|
if words == nil {
|
||||||
panic(depthIdx)
|
panic("NOT ALLOWED")
|
||||||
}
|
}
|
||||||
dup := make([]*textWord, len(words))
|
dup := make([]*textWord, len(words))
|
||||||
copy(dup, words)
|
copy(dup, words)
|
||||||
@ -254,6 +256,8 @@ func moveWord(depthIdx int, page, para *textStrata, word *textWord) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// removeWord removes `word` from `s`.bins[`depthIdx`].
|
// removeWord removes `word` from `s`.bins[`depthIdx`].
|
||||||
|
// NOTE: We delete bins as soon as they become empty to save code that calls other textStrata
|
||||||
|
// functions from having to check for empty bins.
|
||||||
// !@#$ Find a more efficient way of doing this.
|
// !@#$ Find a more efficient way of doing this.
|
||||||
func (s *textStrata) removeWord(depthIdx int, word *textWord) {
|
func (s *textStrata) removeWord(depthIdx int, word *textWord) {
|
||||||
words := removeWord(s.getStratum(depthIdx), word)
|
words := removeWord(s.getStratum(depthIdx), word)
|
||||||
|
@ -106,6 +106,7 @@ func (w *textWord) String() string {
|
|||||||
w.serial, w.depth, w.PdfRectangle, w.fontsize, w.text())
|
w.serial, w.depth, w.PdfRectangle, w.fontsize, w.text())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// bbox makes textWord implement the `bounded` interface.
|
||||||
func (w *textWord) bbox() model.PdfRectangle {
|
func (w *textWord) bbox() model.PdfRectangle {
|
||||||
return w.PdfRectangle
|
return w.PdfRectangle
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user