Abstracted textWord depth calculation. This required changing textMark to *textMark in a lot of code.

This commit is contained in:
Peter Williams 2020-05-25 09:39:30 +10:00
parent 83033182fa
commit c515472849
6 changed files with 43 additions and 41 deletions

View File

@ -5,7 +5,7 @@ understand the code you may wish to recombine this in the orginal `text.go`.
BASIC IDEAS BASIC IDEAS
----------- -----------
There are two directions There are two [directions](https://www.w3.org/International/questions/qa-scripts.en#directions).
- *reading* - *reading*
- *depth* - *depth*
@ -34,7 +34,7 @@ WHERE TO START
* A page's `textMark`s are obtained from its contentstream. * A page's `textMark`s are obtained from its contentstream.
* The `textMark`s are divided into `textWord`s. * The `textMark`s are divided into `textWord`s.
* The `textWord`s are grouped into depth bins with each the contents of each bin sorted by reading direction. * The `textWord`s are grouped into depth bins with the contents of each bin sorted by reading direction.
* The page area is divided into rectangular regions, one for each paragraph. * The page area is divided into rectangular regions, one for each paragraph.
* The words of each rectangular region are arranged into `textLine`s. Each rectangular region and * The words of each rectangular region are arranged into `textLine`s. Each rectangular region and
its constituent lines is a `textPara`. its constituent lines is a `textPara`.

View File

@ -663,7 +663,7 @@ type textObject struct {
state *textState state *textState
tm transform.Matrix // Text matrix. For the character pointer. tm transform.Matrix // Text matrix. For the character pointer.
tlm transform.Matrix // Text line matrix. For the start of line pointer. tlm transform.Matrix // Text line matrix. For the start of line pointer.
marks []textMark // Text marks get written here. marks []*textMark // Text marks get written here.
} }
// newTextState returns a default textState. // newTextState returns a default textState.
@ -812,7 +812,7 @@ func (to *textObject) renderText(data []byte) error {
} }
} }
common.Log.Trace("i=%d code=%d mark=%s trm=%s", i, code, mark, trm) common.Log.Trace("i=%d code=%d mark=%s trm=%s", i, code, mark, trm)
to.marks = append(to.marks, mark) to.marks = append(to.marks, &mark)
// update the text matrix by the displacement of the text location. // update the text matrix by the displacement of the text location.
to.tm.Concat(td) to.tm.Concat(td)
@ -859,7 +859,7 @@ func isTextSpace(text string) bool {
// PageText represents the layout of text on a device page. // PageText represents the layout of text on a device page.
type PageText struct { type PageText struct {
marks []textMark // Texts and their positions on a PDF page. marks []*textMark // Texts and their positions on a PDF page.
viewText string // Extracted page text. viewText string // Extracted page text.
viewMarks []TextMark // Public view of `marks`. viewMarks []TextMark // Public view of `marks`.
pageSize model.PdfRectangle pageSize model.PdfRectangle

View File

@ -48,6 +48,11 @@ type bounded interface {
bbox() model.PdfRectangle bbox() model.PdfRectangle
} }
// getDepth returns the depth of `a` on a page of size `pageSize`.
// The depth is measured from the top edge of the page (`pageSize.Ury`) to the bottom edge of
// `a`'s bounding box (`a.bbox().Lly`), so a smaller `Lly` gives a larger depth.
func getDepth(pageSize model.PdfRectangle, a bounded) float64 {
return pageSize.Ury - a.bbox().Lly
}
// diffReading returns `a` - `b` in the reading direction. // diffReading returns `a` - `b` in the reading direction.
func diffReading(a, b bounded) float64 { func diffReading(a, b bounded) float64 {
return a.bbox().Llx - b.bbox().Llx return a.bbox().Llx - b.bbox().Llx
@ -93,11 +98,6 @@ func readingOverlapLeft(para *textStrata, word *textWord, delta float64) bool {
return para.Urx <= word.Llx && word.Llx < para.Urx+delta return para.Urx <= word.Llx && word.Llx < para.Urx+delta
} }
// readingOverlaplapRight returns true is the left of `word` is in within `para` but at least delta from its left
func readingOverlaplapRight(para *textStrata, word *textWord, delta float64) bool {
return para.Llx+delta < word.Llx && word.Llx <= para.Urx
}
// readingOverlapPlusGap returns true if `word` overlaps [para.Llx-maxIntraReadingGap, para.Urx+maxIntraReadingGap] // readingOverlapPlusGap returns true if `word` overlaps [para.Llx-maxIntraReadingGap, para.Urx+maxIntraReadingGap]
// in the reading direction. // in the reading direction.
func readingOverlapPlusGap(para *textStrata, word *textWord, maxIntraReadingGap float64) bool { func readingOverlapPlusGap(para *textStrata, word *textWord, maxIntraReadingGap float64) bool {

View File

@ -101,17 +101,18 @@ func (tm *textMark) String() string {
return fmt.Sprintf("serial=%d %.2f fontsize=%.2f \"%s\"", return fmt.Sprintf("serial=%d %.2f fontsize=%.2f \"%s\"",
tm.serial, tm.PdfRectangle, tm.fontsize, tm.text) tm.serial, tm.PdfRectangle, tm.fontsize, tm.text)
} }
func (tm *textMark) bbox() model.PdfRectangle { func (tm *textMark) bbox() model.PdfRectangle {
return tm.PdfRectangle return tm.PdfRectangle
} }
// Width returns the width of `tm`.text in the text direction. // Width returns the width of `tm`.text in the text direction.
func (tm textMark) Width() float64 { func (tm *textMark) Width() float64 {
return math.Abs(tm.orientedStart.X - tm.orientedEnd.X) return math.Abs(tm.orientedStart.X - tm.orientedEnd.X)
} }
// ToTextMark returns the public view of `tm`. // ToTextMark returns the public view of `tm`.
func (tm textMark) ToTextMark() TextMark { func (tm *textMark) ToTextMark() TextMark {
return TextMark{ return TextMark{
count: int64(tm.serial), count: int64(tm.serial),
Text: tm.text, Text: tm.text,

View File

@ -19,7 +19,7 @@ import (
type paraList []*textPara type paraList []*textPara
// makeTextPage builds a paraList from `marks`, the textMarks on a page. // makeTextPage builds a paraList from `marks`, the textMarks on a page.
func makeTextPage(marks []textMark, pageSize model.PdfRectangle, rot int) paraList { func makeTextPage(marks []*textMark, pageSize model.PdfRectangle, rot int) paraList {
common.Log.Trace("makeTextPage: %d elements pageSize=%.2f", len(marks), pageSize) common.Log.Trace("makeTextPage: %d elements pageSize=%.2f", len(marks), pageSize)
// Break the marks into words // Break the marks into words

View File

@ -21,41 +21,42 @@ type textWord struct {
serial int // Sequence number for debugging. serial int // Sequence number for debugging.
model.PdfRectangle // Bounding box (union of `marks` bounding boxes). model.PdfRectangle // Bounding box (union of `marks` bounding boxes).
depth float64 // Distance from bottom of word to top of page. depth float64 // Distance from bottom of word to top of page.
marks []textMark // Marks in this word. marks []*textMark // Marks in this word.
fontsize float64 // Largest fontsize in `marks` w fontsize float64 // Largest fontsize in `marks` w
spaceAfter bool spaceAfter bool
} }
// makeTextWords builds a word list from `marks`, the textMarks on a page. // makeTextWords builds a word list from `marks`, the textMarks on a page.
// `pageSize` is used to calculate the words' depths on the page // `pageSize` is used to calculate the words' depths on the page.
func makeTextWords(marks []textMark, pageSize model.PdfRectangle) []*textWord { func makeTextWords(marks []*textMark, pageSize model.PdfRectangle) []*textWord {
var words []*textWord var words []*textWord
var cursor *textWord var newWord *textWord // The word being built.
// addWord adds `cursor` to `words` and resets it to nil // addNewWord adds `newWord` to `words` and resets `newWord` to nil.
addWord := func() { addNewWord := func() {
if cursor != nil { if newWord != nil {
if !isTextSpace(cursor.text()) { if !isTextSpace(newWord.text()) {
words = append(words, cursor) words = append(words, newWord)
} }
cursor = nil newWord = nil
} }
} }
for _, tm := range marks { for _, tm := range marks {
isSpace := isTextSpace(tm.text) isSpace := isTextSpace(tm.text)
if cursor == nil && !isSpace { if newWord == nil && !isSpace {
cursor = newTextWord([]textMark{tm}, pageSize) newWord = newTextWord([]*textMark{tm}, pageSize)
continue continue
} }
if isSpace { if isSpace {
addWord() addNewWord()
continue continue
} }
depthGap := pageSize.Ury - tm.Lly - cursor.depth depthGap := getDepth(pageSize, tm) - newWord.depth
readingGap := tm.Llx - cursor.Urx readingGap := gapReading(tm, newWord)
fontsize := cursor.fontsize
fontsize := newWord.fontsize
// These are the conditions for `tm` to be from a new word. // These are the conditions for `tm` to be from a new word.
// - Change in reading position is larger than a space which we guess to be 0.11*fontsize. // - Change in reading position is larger than a space which we guess to be 0.11*fontsize.
@ -64,20 +65,20 @@ func makeTextWords(marks []textMark, pageSize model.PdfRectangle) []*textWord {
sameWord := -0.19*fontsize <= readingGap && readingGap <= 0.11*fontsize && sameWord := -0.19*fontsize <= readingGap && readingGap <= 0.11*fontsize &&
math.Abs(depthGap) <= 0.04*fontsize math.Abs(depthGap) <= 0.04*fontsize
if !sameWord { if !sameWord {
addWord() addNewWord()
cursor = newTextWord([]textMark{tm}, pageSize) newWord = newTextWord([]*textMark{tm}, pageSize)
continue continue
} }
cursor.addMark(tm, pageSize) newWord.addMark(tm, pageSize)
} }
addWord() addNewWord()
return words return words
} }
// newTextWord creates a textWord containing `marks`. // newTextWord creates a textWord containing `marks`.
// `pageSize` is used to calculate the word's depth on the page. // `pageSize` is used to calculate the word's depth on the page.
func newTextWord(marks []textMark, pageSize model.PdfRectangle) *textWord { func newTextWord(marks []*textMark, pageSize model.PdfRectangle) *textWord {
r := marks[0].PdfRectangle r := marks[0].PdfRectangle
fontsize := marks[0].fontsize fontsize := marks[0].fontsize
for _, tm := range marks[1:] { for _, tm := range marks[1:] {
@ -111,7 +112,7 @@ func (w *textWord) bbox() model.PdfRectangle {
// addMark adds textMark `tm` to word `w`. // addMark adds textMark `tm` to word `w`.
// `pageSize` is used to calculate the word's depth on the page. // `pageSize` is used to calculate the word's depth on the page.
func (w *textWord) addMark(tm textMark, pageSize model.PdfRectangle) { func (w *textWord) addMark(tm *textMark, pageSize model.PdfRectangle) {
w.marks = append(w.marks, tm) w.marks = append(w.marks, tm)
w.PdfRectangle = rectUnion(w.PdfRectangle, tm.PdfRectangle) w.PdfRectangle = rectUnion(w.PdfRectangle, tm.PdfRectangle)
if tm.fontsize > w.fontsize { if tm.fontsize > w.fontsize {