Make PageText.sortPosition() sort order deterministic. (#153)

This commit is contained in:
Peter Williams 2019-08-30 04:26:53 +10:00 committed by Gunnsteinn Hall
parent 2d20058a26
commit aea4cb1d55
2 changed files with 82 additions and 2 deletions

View File

@@ -1180,14 +1180,46 @@ var (
// sortPosition sorts a text list by its elements' positions on a page.
// Sorting is by orientation then top to bottom, left to right when page is orientated so that text
// is horizontal.
// Text is considered to be on different lines if the lines' orientedStart.Y differs by more than `tol`.
//
// The sort is done in three passes over pt.marks:
//  1. a stable sort by orient, then by descending orientedStart.Y;
//  2. a linear scan that assigns each mark a y-cluster (line) number;
//  3. a stable sort by orient, then y-cluster, then ascending orientedStart.X.
// pt.marks is reordered in place; nothing is returned.
func (pt *PageText) sortPosition(tol float64) {
// Nothing to sort. This guard also protects the unconditional clusters[0] write below.
if len(pt.marks) == 0 {
return
}
// For grouping data vertically into lines, it is necessary to have the data presorted by
// descending y position.
sort.SliceStable(pt.marks, func(i, j int) bool {
ti, tj := pt.marks[i], pt.marks[j]
// Marks with different orientations are never interleaved; lower orient values come first.
if ti.orient != tj.orient {
return ti.orient < tj.orient
}
// NOTE(review): the next two lines look like the pre-change, tolerance-based comparison carried
// over from the diff view; as shown, this `if` is never closed. Confirm against the committed
// file before relying on this text.
if math.Abs(ti.orientedStart.Y-tj.orientedStart.Y) > tol {
return ti.orientedStart.Y > tj.orientedStart.Y
// NOTE(review): `>=` reports true in both directions for marks with equal Y, so this is not a
// strict "less" as the sort package expects - consider `>` so equal-Y marks keep their order.
return ti.orientedStart.Y >= tj.orientedStart.Y
})
// Cluster the marks into y-clusters by relative y proximity. Each cluster is our guess of what
// makes up a line of text.
// clusters[i] is the cluster number of pt.marks[i] in the order produced by the sort above.
clusters := make([]int, len(pt.marks))
cluster := 0
clusters[0] = cluster
for i := 1; i < len(pt.marks); i++ {
// A change of orientation always starts a new cluster.
if pt.marks[i-1].orient != pt.marks[i].orient {
cluster++
} else {
// Only adjacent marks are compared, so a run of marks that each lie within `tol` of their
// predecessor stays in one cluster even if the first and last differ by more than `tol`.
if pt.marks[i-1].orientedStart.Y - pt.marks[i].orientedStart.Y > tol {
cluster++
}
}
clusters[i] = cluster
}
// Sort by y-cluster and x.
// NOTE(review): clusters is indexed by slice position and is not permuted together with pt.marks.
// This appears to rely on the marks already being grouped by cluster, so that a mark only ever
// moves within its own cluster's index range - confirm.
sort.SliceStable(pt.marks, func(i, j int) bool {
ti, tj := pt.marks[i], pt.marks[j]
if ti.orient != tj.orient {
return ti.orient < tj.orient
}
if clusters[i] != clusters[j] {
return clusters[i] < clusters[j]
}
// Same line: order left to right.
return ti.orientedStart.X < tj.orientedStart.X
})

View File

@@ -22,6 +22,7 @@ import (
"github.com/unidoc/unipdf/v3/common"
"github.com/unidoc/unipdf/v3/creator"
"github.com/unidoc/unipdf/v3/internal/transform"
"github.com/unidoc/unipdf/v3/model"
"golang.org/x/text/unicode/norm"
)
@@ -180,6 +181,53 @@ func TestTermMarksFiles(t *testing.T) {
testTermMarksFiles(t)
}
// TestTextSort checks that PageText.sortPosition() gives expected results.
// It builds a list of marks in the expected order, scrambles a copy of it, sorts the copy with
// sortPosition() and checks that the expected order is restored.
func TestTextSort(t *testing.T) {
	// tol is the line tolerance passed to sortPosition(). The Y values below are chosen relative
	// to it.
	const tol = 15

	// marks0 is in the expected sort order for `tol`. The `text` labels are <line group><index>.
	marks0 := []textMark{
		// y difference > tol => every mark is on its own line => sorts by Y descending.
		{orientedStart: transform.Point{X: 300, Y: 160}, text: "00"},
		{orientedStart: transform.Point{X: 200, Y: 140}, text: "01"},
		{orientedStart: transform.Point{X: 100, Y: 120}, text: "02"},
		// y difference < tol => same line => sorts by X ascending, ignoring the small Y changes.
		{orientedStart: transform.Point{X: 100, Y: 30}, text: "10"},
		{orientedStart: transform.Point{X: 200, Y: 40}, text: "11"},
		{orientedStart: transform.Point{X: 300, Y: 50}, text: "12"},
		// y difference < tol => same line => sorts by X ascending. The Y values are more than tol
		// below the previous group so this is a separate line.
		{orientedStart: transform.Point{X: 100, Y: 3}, text: "20"},
		{orientedStart: transform.Point{X: 200, Y: 4}, text: "21"},
		{orientedStart: transform.Point{X: 300, Y: 5}, text: "22"},
	}

	// marks is a copy of marks0 with its order scrambled. The scramble order (X descending, then
	// Y ascending) is deliberately the opposite of what sortPosition() should produce.
	marks := make([]textMark, len(marks0))
	copy(marks, marks0)
	sort.Slice(marks, func(i, j int) bool {
		ti, tj := marks[i], marks[j]
		if ti.orientedStart.X != tj.orientedStart.X {
			return ti.orientedStart.X > tj.orientedStart.X
		}
		if ti.orient != tj.orient {
			return ti.orient > tj.orient
		}
		return ti.orientedStart.Y < tj.orientedStart.Y
	})

	// Copy marks to PageText and sort them. This should give the same order as marks0.
	pt := PageText{marks: marks}
	pt.sortPosition(tol)

	// Fail cleanly, rather than panic on an out-of-range index, if the sort changed the number of
	// marks.
	if len(pt.marks) != len(marks0) {
		t.Fatalf("len(pt.marks)=%d != len(marks0)=%d", len(pt.marks), len(marks0))
	}

	// Check that marks order is the same as marks0.
	for i, m0 := range marks0 {
		m := pt.marks[i]
		if m.text != m0.text ||
			m.orientedStart.X != m0.orientedStart.X || m.orientedStart.Y != m0.orientedStart.Y {
			t.Fatalf("i=%d m=%v != m0=%v", i, m, m0)
		}
	}
}
// fileExtractionTests are PDF file names and terms we expect to find on specified pages of those
// PDF files.
// `pageTerms`[pageNum] are the terms we expect to find on (1-offset) page number pageNum of