mirror of
https://github.com/unidoc/unipdf.git
synced 2025-05-14 19:29:50 +08:00
Abstracted textWord depth calculation. This required changing textMark to *textMark in a lot of code.
This commit is contained in:
parent
83033182fa
commit
c515472849
@ -5,7 +5,7 @@ understand the code you may wish to recombine this in the original `text.go`.
|
||||
|
||||
BASIC IDEAS
|
||||
-----------
|
||||
There are two directions
|
||||
There are two [directions](https://www.w3.org/International/questions/qa-scripts.en#directions).
|
||||
|
||||
- *reading*
|
||||
- *depth*
|
||||
@ -34,7 +34,7 @@ WHERE TO START
|
||||
|
||||
* A page's `textMark`s are obtained from its contentstream.
|
||||
* The `textMark`s are divided into `textWord`s.
|
||||
* The `textWord`s are grouped into depth bins with each the contents of each bin sorted by reading direction.
|
||||
* The `textWord`s are grouped into depth bins with the contents of each bin sorted by reading direction.
|
||||
* The page area is divided into rectangular regions, one for each paragraph.
|
||||
* The words in each rectangular region are arranged into `textLine`s. Each rectangular region and
|
||||
its constituent lines is a `textPara`.
|
||||
|
@ -663,7 +663,7 @@ type textObject struct {
|
||||
state *textState
|
||||
tm transform.Matrix // Text matrix. For the character pointer.
|
||||
tlm transform.Matrix // Text line matrix. For the start of line pointer.
|
||||
marks []textMark // Text marks get written here.
|
||||
marks []*textMark // Text marks get written here.
|
||||
}
|
||||
|
||||
// newTextState returns a default textState.
|
||||
@ -812,7 +812,7 @@ func (to *textObject) renderText(data []byte) error {
|
||||
}
|
||||
}
|
||||
common.Log.Trace("i=%d code=%d mark=%s trm=%s", i, code, mark, trm)
|
||||
to.marks = append(to.marks, mark)
|
||||
to.marks = append(to.marks, &mark)
|
||||
|
||||
// update the text matrix by the displacement of the text location.
|
||||
to.tm.Concat(td)
|
||||
@ -859,9 +859,9 @@ func isTextSpace(text string) bool {
|
||||
|
||||
// PageText represents the layout of text on a device page.
|
||||
type PageText struct {
|
||||
marks []textMark // Texts and their positions on a PDF page.
|
||||
viewText string // Extracted page text.
|
||||
viewMarks []TextMark // Public view of `marks`.
|
||||
marks []*textMark // Texts and their positions on a PDF page.
|
||||
viewText string // Extracted page text.
|
||||
viewMarks []TextMark // Public view of `marks`.
|
||||
pageSize model.PdfRectangle
|
||||
}
|
||||
|
||||
|
@ -48,6 +48,11 @@ type bounded interface {
|
||||
bbox() model.PdfRectangle
|
||||
}
|
||||
|
||||
// getDepth returns the depth of `a` on a page of size `pageSize`.
|
||||
func getDepth(pageSize model.PdfRectangle, a bounded) float64 {
|
||||
return pageSize.Ury - a.bbox().Lly
|
||||
}
|
||||
|
||||
// diffReading returns `a` - `b` in the reading direction.
|
||||
func diffReading(a, b bounded) float64 {
|
||||
return a.bbox().Llx - b.bbox().Llx
|
||||
@ -93,11 +98,6 @@ func readingOverlapLeft(para *textStrata, word *textWord, delta float64) bool {
|
||||
return para.Urx <= word.Llx && word.Llx < para.Urx+delta
|
||||
}
|
||||
|
||||
// readingOverlaplapRight returns true if the left of `word` is within `para` but at least delta from its left
|
||||
func readingOverlaplapRight(para *textStrata, word *textWord, delta float64) bool {
|
||||
return para.Llx+delta < word.Llx && word.Llx <= para.Urx
|
||||
}
|
||||
|
||||
// readingOverlapPlusGap returns true if `word` overlaps [para.Llx-maxIntraReadingGap, para.Urx+maxIntraReadingGap]
|
||||
// in the reading direction.
|
||||
func readingOverlapPlusGap(para *textStrata, word *textWord, maxIntraReadingGap float64) bool {
|
||||
|
@ -101,17 +101,18 @@ func (tm *textMark) String() string {
|
||||
return fmt.Sprintf("serial=%d %.2f fontsize=%.2f \"%s\"",
|
||||
tm.serial, tm.PdfRectangle, tm.fontsize, tm.text)
|
||||
}
|
||||
|
||||
func (tm *textMark) bbox() model.PdfRectangle {
|
||||
return tm.PdfRectangle
|
||||
}
|
||||
|
||||
// Width returns the width of `tm`.text in the text direction.
|
||||
func (tm textMark) Width() float64 {
|
||||
func (tm *textMark) Width() float64 {
|
||||
return math.Abs(tm.orientedStart.X - tm.orientedEnd.X)
|
||||
}
|
||||
|
||||
// ToTextMark returns the public view of `tm`.
|
||||
func (tm textMark) ToTextMark() TextMark {
|
||||
func (tm *textMark) ToTextMark() TextMark {
|
||||
return TextMark{
|
||||
count: int64(tm.serial),
|
||||
Text: tm.text,
|
||||
|
@ -19,7 +19,7 @@ import (
|
||||
type paraList []*textPara
|
||||
|
||||
// makeTextPage builds a paraList from `marks`, the textMarks on a page.
|
||||
func makeTextPage(marks []textMark, pageSize model.PdfRectangle, rot int) paraList {
|
||||
func makeTextPage(marks []*textMark, pageSize model.PdfRectangle, rot int) paraList {
|
||||
common.Log.Trace("makeTextPage: %d elements pageSize=%.2f", len(marks), pageSize)
|
||||
|
||||
// Break the marks into words
|
||||
|
@ -18,44 +18,45 @@ import (
|
||||
// textWord represents a word. It's a sequence of textMarks that are close enough together in the
|
||||
// reading direction and doesn't have any space textMarks.
|
||||
type textWord struct {
|
||||
serial int // Sequence number for debugging.
|
||||
model.PdfRectangle // Bounding box (union of `marks` bounding boxes).
|
||||
depth float64 // Distance from bottom of word to top of page.
|
||||
marks []textMark // Marks in this word.
|
||||
fontsize float64 // Largest fontsize in `marks` w
|
||||
serial int // Sequence number for debugging.
|
||||
model.PdfRectangle // Bounding box (union of `marks` bounding boxes).
|
||||
depth float64 // Distance from bottom of word to top of page.
|
||||
marks []*textMark // Marks in this word.
|
||||
fontsize float64 // Largest fontsize in `marks` w
|
||||
spaceAfter bool
|
||||
}
|
||||
|
||||
// makeTextWords builds a word list from `marks`, the textMarks on a page.
|
||||
// `pageSize` is used to calculate the words` depths depth on the page
|
||||
func makeTextWords(marks []textMark, pageSize model.PdfRectangle) []*textWord {
|
||||
// `pageSize` is used to calculate the words' depths on the page.
|
||||
func makeTextWords(marks []*textMark, pageSize model.PdfRectangle) []*textWord {
|
||||
var words []*textWord
|
||||
var cursor *textWord
|
||||
var newWord *textWord // The word being built.
|
||||
|
||||
// addWord adds `cursor` to `words` and resets it to nil
|
||||
addWord := func() {
|
||||
if cursor != nil {
|
||||
if !isTextSpace(cursor.text()) {
|
||||
words = append(words, cursor)
|
||||
// addNewWord adds `newWord` to `words` and resets `newWord` to nil.
|
||||
addNewWord := func() {
|
||||
if newWord != nil {
|
||||
if !isTextSpace(newWord.text()) {
|
||||
words = append(words, newWord)
|
||||
}
|
||||
cursor = nil
|
||||
newWord = nil
|
||||
}
|
||||
}
|
||||
|
||||
for _, tm := range marks {
|
||||
isSpace := isTextSpace(tm.text)
|
||||
if cursor == nil && !isSpace {
|
||||
cursor = newTextWord([]textMark{tm}, pageSize)
|
||||
if newWord == nil && !isSpace {
|
||||
newWord = newTextWord([]*textMark{tm}, pageSize)
|
||||
continue
|
||||
}
|
||||
if isSpace {
|
||||
addWord()
|
||||
addNewWord()
|
||||
continue
|
||||
}
|
||||
|
||||
depthGap := pageSize.Ury - tm.Lly - cursor.depth
|
||||
readingGap := tm.Llx - cursor.Urx
|
||||
fontsize := cursor.fontsize
|
||||
depthGap := getDepth(pageSize, tm) - newWord.depth
|
||||
readingGap := gapReading(tm, newWord)
|
||||
|
||||
fontsize := newWord.fontsize
|
||||
|
||||
// These are the conditions for `tm` to be from a new word.
|
||||
// - Change in reading position is larger than a space which we guess to be 0.11*fontsize.
|
||||
@ -64,20 +65,20 @@ func makeTextWords(marks []textMark, pageSize model.PdfRectangle) []*textWord {
|
||||
sameWord := -0.19*fontsize <= readingGap && readingGap <= 0.11*fontsize &&
|
||||
math.Abs(depthGap) <= 0.04*fontsize
|
||||
if !sameWord {
|
||||
addWord()
|
||||
cursor = newTextWord([]textMark{tm}, pageSize)
|
||||
addNewWord()
|
||||
newWord = newTextWord([]*textMark{tm}, pageSize)
|
||||
continue
|
||||
}
|
||||
|
||||
cursor.addMark(tm, pageSize)
|
||||
newWord.addMark(tm, pageSize)
|
||||
}
|
||||
addWord()
|
||||
addNewWord()
|
||||
return words
|
||||
}
|
||||
|
||||
// newTextWord creates a textWord containing `marks`.
|
||||
// `pageSize` is used to calculate the word's depth on the page.
|
||||
func newTextWord(marks []textMark, pageSize model.PdfRectangle) *textWord {
|
||||
func newTextWord(marks []*textMark, pageSize model.PdfRectangle) *textWord {
|
||||
r := marks[0].PdfRectangle
|
||||
fontsize := marks[0].fontsize
|
||||
for _, tm := range marks[1:] {
|
||||
@ -111,7 +112,7 @@ func (w *textWord) bbox() model.PdfRectangle {
|
||||
|
||||
// addMark adds textMark `tm` to word `w`.
|
||||
// `pageSize` is used to calculate the word's depth on the page.
|
||||
func (w *textWord) addMark(tm textMark, pageSize model.PdfRectangle) {
|
||||
func (w *textWord) addMark(tm *textMark, pageSize model.PdfRectangle) {
|
||||
w.marks = append(w.marks, tm)
|
||||
w.PdfRectangle = rectUnion(w.PdfRectangle, tm.PdfRectangle)
|
||||
if tm.fontsize > w.fontsize {
|
||||
|
Loading…
x
Reference in New Issue
Block a user