mirror of
https://github.com/unidoc/unipdf.git
synced 2025-05-14 19:29:50 +08:00
Abstracted textWord depth calculation. This required changing textMark to *textMark in a lot of code.
This commit is contained in:
parent
83033182fa
commit
c515472849
@ -5,7 +5,7 @@ understand the code you may wish to recombine this in the original `text.go`.
|
|||||||
|
|
||||||
BASIC IDEAS
|
BASIC IDEAS
|
||||||
-----------
|
-----------
|
||||||
There are two directions
|
There are two [directions](https://www.w3.org/International/questions/qa-scripts.en#directions).
|
||||||
|
|
||||||
- *reading*
|
- *reading*
|
||||||
- *depth*
|
- *depth*
|
||||||
@ -34,7 +34,7 @@ WHERE TO START
|
|||||||
|
|
||||||
* A page's `textMark`s are obtained from its contentstream.
|
* A page's `textMark`s are obtained from its contentstream.
|
||||||
* The `textMark`s are divided into `textWord`s.
|
* The `textMark`s are divided into `textWord`s.
|
||||||
* The `textWord`s are grouped into depth bins with each the contents of each bin sorted by reading direction.
|
* The `textWord`s are grouped into depth bins with the contents of each bin sorted by reading direction.
|
||||||
* The page area is divided into rectangular regions, one for each paragraph.
|
* The page area is divided into rectangular regions, one for each paragraph.
|
||||||
* The words in each rectangular region are arranged into `textLine`s. Each rectangular region and
|
* The words in each rectangular region are arranged into `textLine`s. Each rectangular region and
|
||||||
its constituent lines is a `textPara`.
|
its constituent lines is a `textPara`.
|
||||||
|
@ -663,7 +663,7 @@ type textObject struct {
|
|||||||
state *textState
|
state *textState
|
||||||
tm transform.Matrix // Text matrix. For the character pointer.
|
tm transform.Matrix // Text matrix. For the character pointer.
|
||||||
tlm transform.Matrix // Text line matrix. For the start of line pointer.
|
tlm transform.Matrix // Text line matrix. For the start of line pointer.
|
||||||
marks []textMark // Text marks get written here.
|
marks []*textMark // Text marks get written here.
|
||||||
}
|
}
|
||||||
|
|
||||||
// newTextState returns a default textState.
|
// newTextState returns a default textState.
|
||||||
@ -812,7 +812,7 @@ func (to *textObject) renderText(data []byte) error {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
common.Log.Trace("i=%d code=%d mark=%s trm=%s", i, code, mark, trm)
|
common.Log.Trace("i=%d code=%d mark=%s trm=%s", i, code, mark, trm)
|
||||||
to.marks = append(to.marks, mark)
|
to.marks = append(to.marks, &mark)
|
||||||
|
|
||||||
// update the text matrix by the displacement of the text location.
|
// update the text matrix by the displacement of the text location.
|
||||||
to.tm.Concat(td)
|
to.tm.Concat(td)
|
||||||
@ -859,7 +859,7 @@ func isTextSpace(text string) bool {
|
|||||||
|
|
||||||
// PageText represents the layout of text on a device page.
|
// PageText represents the layout of text on a device page.
|
||||||
type PageText struct {
|
type PageText struct {
|
||||||
marks []textMark // Texts and their positions on a PDF page.
|
marks []*textMark // Texts and their positions on a PDF page.
|
||||||
viewText string // Extracted page text.
|
viewText string // Extracted page text.
|
||||||
viewMarks []TextMark // Public view of `marks`.
|
viewMarks []TextMark // Public view of `marks`.
|
||||||
pageSize model.PdfRectangle
|
pageSize model.PdfRectangle
|
||||||
|
@ -48,6 +48,11 @@ type bounded interface {
|
|||||||
bbox() model.PdfRectangle
|
bbox() model.PdfRectangle
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// getDepth returns the depth of `a` on a page of size `pageSize`.
|
||||||
|
func getDepth(pageSize model.PdfRectangle, a bounded) float64 {
|
||||||
|
return pageSize.Ury - a.bbox().Lly
|
||||||
|
}
|
||||||
|
|
||||||
// diffReading returns `a` - `b` in the reading direction.
|
// diffReading returns `a` - `b` in the reading direction.
|
||||||
func diffReading(a, b bounded) float64 {
|
func diffReading(a, b bounded) float64 {
|
||||||
return a.bbox().Llx - b.bbox().Llx
|
return a.bbox().Llx - b.bbox().Llx
|
||||||
@ -93,11 +98,6 @@ func readingOverlapLeft(para *textStrata, word *textWord, delta float64) bool {
|
|||||||
return para.Urx <= word.Llx && word.Llx < para.Urx+delta
|
return para.Urx <= word.Llx && word.Llx < para.Urx+delta
|
||||||
}
|
}
|
||||||
|
|
||||||
// readingOverlaplapRight returns true is the left of `word` is in within `para` but at least delta from its left
|
|
||||||
func readingOverlaplapRight(para *textStrata, word *textWord, delta float64) bool {
|
|
||||||
return para.Llx+delta < word.Llx && word.Llx <= para.Urx
|
|
||||||
}
|
|
||||||
|
|
||||||
// readingOverlapPlusGap returns true if `word` overlaps [para.Llx-maxIntraReadingGap, para.Urx+maxIntraReadingGap]
|
// readingOverlapPlusGap returns true if `word` overlaps [para.Llx-maxIntraReadingGap, para.Urx+maxIntraReadingGap]
|
||||||
// in the reading direction.
|
// in the reading direction.
|
||||||
func readingOverlapPlusGap(para *textStrata, word *textWord, maxIntraReadingGap float64) bool {
|
func readingOverlapPlusGap(para *textStrata, word *textWord, maxIntraReadingGap float64) bool {
|
||||||
|
@ -101,17 +101,18 @@ func (tm *textMark) String() string {
|
|||||||
return fmt.Sprintf("serial=%d %.2f fontsize=%.2f \"%s\"",
|
return fmt.Sprintf("serial=%d %.2f fontsize=%.2f \"%s\"",
|
||||||
tm.serial, tm.PdfRectangle, tm.fontsize, tm.text)
|
tm.serial, tm.PdfRectangle, tm.fontsize, tm.text)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (tm *textMark) bbox() model.PdfRectangle {
|
func (tm *textMark) bbox() model.PdfRectangle {
|
||||||
return tm.PdfRectangle
|
return tm.PdfRectangle
|
||||||
}
|
}
|
||||||
|
|
||||||
// Width returns the width of `tm`.text in the text direction.
|
// Width returns the width of `tm`.text in the text direction.
|
||||||
func (tm textMark) Width() float64 {
|
func (tm *textMark) Width() float64 {
|
||||||
return math.Abs(tm.orientedStart.X - tm.orientedEnd.X)
|
return math.Abs(tm.orientedStart.X - tm.orientedEnd.X)
|
||||||
}
|
}
|
||||||
|
|
||||||
// ToTextMark returns the public view of `tm`.
|
// ToTextMark returns the public view of `tm`.
|
||||||
func (tm textMark) ToTextMark() TextMark {
|
func (tm *textMark) ToTextMark() TextMark {
|
||||||
return TextMark{
|
return TextMark{
|
||||||
count: int64(tm.serial),
|
count: int64(tm.serial),
|
||||||
Text: tm.text,
|
Text: tm.text,
|
||||||
|
@ -19,7 +19,7 @@ import (
|
|||||||
type paraList []*textPara
|
type paraList []*textPara
|
||||||
|
|
||||||
// makeTextPage builds a paraList from `marks`, the textMarks on a page.
|
// makeTextPage builds a paraList from `marks`, the textMarks on a page.
|
||||||
func makeTextPage(marks []textMark, pageSize model.PdfRectangle, rot int) paraList {
|
func makeTextPage(marks []*textMark, pageSize model.PdfRectangle, rot int) paraList {
|
||||||
common.Log.Trace("makeTextPage: %d elements pageSize=%.2f", len(marks), pageSize)
|
common.Log.Trace("makeTextPage: %d elements pageSize=%.2f", len(marks), pageSize)
|
||||||
|
|
||||||
// Break the marks into words
|
// Break the marks into words
|
||||||
|
@ -21,41 +21,42 @@ type textWord struct {
|
|||||||
serial int // Sequence number for debugging.
|
serial int // Sequence number for debugging.
|
||||||
model.PdfRectangle // Bounding box (union of `marks` bounding boxes).
|
model.PdfRectangle // Bounding box (union of `marks` bounding boxes).
|
||||||
depth float64 // Distance from bottom of word to top of page.
|
depth float64 // Distance from bottom of word to top of page.
|
||||||
marks []textMark // Marks in this word.
|
marks []*textMark // Marks in this word.
|
||||||
fontsize float64 // Largest fontsize in `marks` w
|
fontsize float64 // Largest fontsize in `marks` w
|
||||||
spaceAfter bool
|
spaceAfter bool
|
||||||
}
|
}
|
||||||
|
|
||||||
// makeTextWords builds a word list from `marks`, the textMarks on a page.
|
// makeTextWords builds a word list from `marks`, the textMarks on a page.
|
||||||
// `pageSize` is used to calculate the words` depths depth on the page
|
// `pageSize` is used to calculate the words' depths on the page.
|
||||||
func makeTextWords(marks []textMark, pageSize model.PdfRectangle) []*textWord {
|
func makeTextWords(marks []*textMark, pageSize model.PdfRectangle) []*textWord {
|
||||||
var words []*textWord
|
var words []*textWord
|
||||||
var cursor *textWord
|
var newWord *textWord // The word being built.
|
||||||
|
|
||||||
// addWord adds `cursor` to `words` and resets it to nil
|
// addNewWord adds `newWord` to `words` and resets `newWord` to nil.
|
||||||
addWord := func() {
|
addNewWord := func() {
|
||||||
if cursor != nil {
|
if newWord != nil {
|
||||||
if !isTextSpace(cursor.text()) {
|
if !isTextSpace(newWord.text()) {
|
||||||
words = append(words, cursor)
|
words = append(words, newWord)
|
||||||
}
|
}
|
||||||
cursor = nil
|
newWord = nil
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, tm := range marks {
|
for _, tm := range marks {
|
||||||
isSpace := isTextSpace(tm.text)
|
isSpace := isTextSpace(tm.text)
|
||||||
if cursor == nil && !isSpace {
|
if newWord == nil && !isSpace {
|
||||||
cursor = newTextWord([]textMark{tm}, pageSize)
|
newWord = newTextWord([]*textMark{tm}, pageSize)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if isSpace {
|
if isSpace {
|
||||||
addWord()
|
addNewWord()
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
depthGap := pageSize.Ury - tm.Lly - cursor.depth
|
depthGap := getDepth(pageSize, tm) - newWord.depth
|
||||||
readingGap := tm.Llx - cursor.Urx
|
readingGap := gapReading(tm, newWord)
|
||||||
fontsize := cursor.fontsize
|
|
||||||
|
fontsize := newWord.fontsize
|
||||||
|
|
||||||
// These are the conditions for `tm` to be from a new word.
|
// These are the conditions for `tm` to be from a new word.
|
||||||
// - Change in reading position is larger than a space which we guess to be 0.11*fontsize.
|
// - Change in reading position is larger than a space which we guess to be 0.11*fontsize.
|
||||||
@ -64,20 +65,20 @@ func makeTextWords(marks []textMark, pageSize model.PdfRectangle) []*textWord {
|
|||||||
sameWord := -0.19*fontsize <= readingGap && readingGap <= 0.11*fontsize &&
|
sameWord := -0.19*fontsize <= readingGap && readingGap <= 0.11*fontsize &&
|
||||||
math.Abs(depthGap) <= 0.04*fontsize
|
math.Abs(depthGap) <= 0.04*fontsize
|
||||||
if !sameWord {
|
if !sameWord {
|
||||||
addWord()
|
addNewWord()
|
||||||
cursor = newTextWord([]textMark{tm}, pageSize)
|
newWord = newTextWord([]*textMark{tm}, pageSize)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
cursor.addMark(tm, pageSize)
|
newWord.addMark(tm, pageSize)
|
||||||
}
|
}
|
||||||
addWord()
|
addNewWord()
|
||||||
return words
|
return words
|
||||||
}
|
}
|
||||||
|
|
||||||
// newTextWord creates a textWord containing `marks`.
|
// newTextWord creates a textWord containing `marks`.
|
||||||
// `pageSize` is used to calculate the word's depth on the page.
|
// `pageSize` is used to calculate the word's depth on the page.
|
||||||
func newTextWord(marks []textMark, pageSize model.PdfRectangle) *textWord {
|
func newTextWord(marks []*textMark, pageSize model.PdfRectangle) *textWord {
|
||||||
r := marks[0].PdfRectangle
|
r := marks[0].PdfRectangle
|
||||||
fontsize := marks[0].fontsize
|
fontsize := marks[0].fontsize
|
||||||
for _, tm := range marks[1:] {
|
for _, tm := range marks[1:] {
|
||||||
@ -111,7 +112,7 @@ func (w *textWord) bbox() model.PdfRectangle {
|
|||||||
|
|
||||||
// addMark adds textMark `tm` to word `w`.
|
// addMark adds textMark `tm` to word `w`.
|
||||||
// `pageSize` is used to calculate the word's depth on the page.
|
// `pageSize` is used to calculate the word's depth on the page.
|
||||||
func (w *textWord) addMark(tm textMark, pageSize model.PdfRectangle) {
|
func (w *textWord) addMark(tm *textMark, pageSize model.PdfRectangle) {
|
||||||
w.marks = append(w.marks, tm)
|
w.marks = append(w.marks, tm)
|
||||||
w.PdfRectangle = rectUnion(w.PdfRectangle, tm.PdfRectangle)
|
w.PdfRectangle = rectUnion(w.PdfRectangle, tm.PdfRectangle)
|
||||||
if tm.fontsize > w.fontsize {
|
if tm.fontsize > w.fontsize {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user