mirror of
https://github.com/unidoc/unipdf.git
synced 2025-04-26 13:48:55 +08:00
Make PageText.sortPosition() sort order deterministic. (#153)
This commit is contained in:
parent
2d20058a26
commit
aea4cb1d55
@ -1180,14 +1180,46 @@ var (
|
||||
// sortPosition sorts a text list by its elements' positions on a page.
|
||||
// Sorting is by orientation then top to bottom, left to right when page is orientated so that text
|
||||
// is horizontal.
|
||||
// Text is considered to be on different lines if the lines' orientedStart.Y differs by more than `tol`.
|
||||
func (pt *PageText) sortPosition(tol float64) {
|
||||
if len(pt.marks) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
// For grouping data vertically into lines, it is necessary to have the data presorted by
|
||||
// descending y position.
|
||||
sort.SliceStable(pt.marks, func(i, j int) bool {
|
||||
ti, tj := pt.marks[i], pt.marks[j]
|
||||
if ti.orient != tj.orient {
|
||||
return ti.orient < tj.orient
|
||||
}
|
||||
if math.Abs(ti.orientedStart.Y-tj.orientedStart.Y) > tol {
|
||||
return ti.orientedStart.Y > tj.orientedStart.Y
|
||||
return ti.orientedStart.Y >= tj.orientedStart.Y
|
||||
})
|
||||
|
||||
// Cluster the marks into y-clusters by relative y proximity. Each cluster is our guess of what
|
||||
// makes up a line of text.
|
||||
clusters := make([]int, len(pt.marks))
|
||||
cluster := 0
|
||||
clusters[0] = cluster
|
||||
for i := 1; i < len(pt.marks); i++ {
|
||||
if pt.marks[i-1].orient != pt.marks[i].orient {
|
||||
cluster++
|
||||
} else {
|
||||
if pt.marks[i-1].orientedStart.Y - pt.marks[i].orientedStart.Y > tol {
|
||||
cluster++
|
||||
}
|
||||
}
|
||||
clusters[i] = cluster
|
||||
}
|
||||
|
||||
// Sort by y-cluster and x.
|
||||
sort.SliceStable(pt.marks, func(i, j int) bool {
|
||||
ti, tj := pt.marks[i], pt.marks[j]
|
||||
if ti.orient != tj.orient {
|
||||
return ti.orient < tj.orient
|
||||
}
|
||||
if clusters[i] != clusters[j] {
|
||||
return clusters[i] < clusters[j]
|
||||
}
|
||||
return ti.orientedStart.X < tj.orientedStart.X
|
||||
})
|
||||
|
@ -22,6 +22,7 @@ import (
|
||||
|
||||
"github.com/unidoc/unipdf/v3/common"
|
||||
"github.com/unidoc/unipdf/v3/creator"
|
||||
"github.com/unidoc/unipdf/v3/internal/transform"
|
||||
"github.com/unidoc/unipdf/v3/model"
|
||||
"golang.org/x/text/unicode/norm"
|
||||
)
|
||||
@ -180,6 +181,53 @@ func TestTermMarksFiles(t *testing.T) {
|
||||
testTermMarksFiles(t)
|
||||
}
|
||||
|
||||
// TestTextSort checks that PageText.sortPosition() gives the expected results.
|
||||
func TestTextSort(t *testing.T) {
|
||||
// marks0 is in the expected sort order for tol=15
|
||||
marks0 := []textMark{
|
||||
// y difference > tol => sorts by Y descending
|
||||
textMark{orientedStart: transform.Point{X: 300, Y: 160}, text: "00"},
|
||||
textMark{orientedStart: transform.Point{X: 200, Y: 140}, text: "01"},
|
||||
textMark{orientedStart: transform.Point{X: 100, Y: 120}, text: "02"},
|
||||
|
||||
// y difference < tol => sorts by X ascending for approx same Y
|
||||
textMark{orientedStart: transform.Point{X: 100, Y: 30}, text: "10"},
|
||||
textMark{orientedStart: transform.Point{X: 200, Y: 40}, text: "11"},
|
||||
textMark{orientedStart: transform.Point{X: 300, Y: 50}, text: "12"},
|
||||
|
||||
// y difference < tol => sorts by X ascending for approx same Y, different from previous Y
|
||||
textMark{orientedStart: transform.Point{X: 100, Y: 3}, text: "20"},
|
||||
textMark{orientedStart: transform.Point{X: 200, Y: 4}, text: "21"},
|
||||
textMark{orientedStart: transform.Point{X: 300, Y: 5}, text: "22"},
|
||||
}
|
||||
|
||||
// marks is a copy of marks0 with its order scrambled.
|
||||
marks := make([]textMark, len(marks0))
|
||||
copy(marks, marks0)
|
||||
sort.Slice(marks, func(i, j int) bool {
|
||||
ti, tj := marks[i], marks[j]
|
||||
if ti.orientedStart.X != tj.orientedStart.X {
|
||||
return ti.orientedStart.X > tj.orientedStart.X
|
||||
}
|
||||
if ti.orient != tj.orient {
|
||||
return ti.orient > tj.orient
|
||||
}
|
||||
return ti.orientedStart.Y < tj.orientedStart.Y
|
||||
})
|
||||
|
||||
// Copy marks to PageText and sort them. This should give the same order as marks0.
|
||||
pt := PageText{marks: marks}
|
||||
pt.sortPosition(15)
|
||||
|
||||
// Check that marks order is the same as marks0.
|
||||
for i, m0 := range marks0 {
|
||||
m := pt.marks[i]
|
||||
if m.orientedStart.X != m0.orientedStart.X || m.orientedStart.Y != m0.orientedStart.Y {
|
||||
t.Fatalf("i=%d m=%v != m0=%v", i, m, m0)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// fileExtractionTests are PDF file names and terms we expect to find on specified pages of those
|
||||
// PDF files.
|
||||
// `pageTerms`[pageNum] are the terms we expect to find on (1-offset) page number pageNum of
|
||||
|
Loading…
x
Reference in New Issue
Block a user