diff --git a/extractor/text.go b/extractor/text.go index 7d66a636..53758825 100644 --- a/extractor/text.go +++ b/extractor/text.go @@ -1180,14 +1180,46 @@ var ( // sortPosition sorts a text list by its elements' positions on a page. // Sorting is by orientation then top to bottom, left to right when page is orientated so that text // is horizontal. +// Marks are put on different lines when their orientation differs or when, after sorting by descending orientedStart.Y, consecutive marks' orientedStart.Y differ by more than `tol`. func (pt *PageText) sortPosition(tol float64) { + if len(pt.marks) == 0 { + return + } + + // For grouping data vertically into lines, it is necessary to have the data presorted by + // descending y position. sort.SliceStable(pt.marks, func(i, j int) bool { ti, tj := pt.marks[i], pt.marks[j] if ti.orient != tj.orient { return ti.orient < tj.orient } - if math.Abs(ti.orientedStart.Y-tj.orientedStart.Y) > tol { - return ti.orientedStart.Y > tj.orientedStart.Y + return ti.orientedStart.Y >= tj.orientedStart.Y + }) + + // Cluster the marks into y-clusters by relative y proximity. Each cluster is our guess of what + // makes up a line of text. + clusters := make([]int, len(pt.marks)) + cluster := 0 + clusters[0] = cluster + for i := 1; i < len(pt.marks); i++ { + if pt.marks[i-1].orient != pt.marks[i].orient { + cluster++ + } else { + if pt.marks[i-1].orientedStart.Y - pt.marks[i].orientedStart.Y > tol { + cluster++ + } + } + clusters[i] = cluster + } + + // Sort by y-cluster and x. 
+ sort.SliceStable(pt.marks, func(i, j int) bool { + ti, tj := pt.marks[i], pt.marks[j] + if ti.orient != tj.orient { + return ti.orient < tj.orient + } + if clusters[i] != clusters[j] { + return clusters[i] < clusters[j] } return ti.orientedStart.X < tj.orientedStart.X }) diff --git a/extractor/text_test.go b/extractor/text_test.go index 49c71181..c6b6c547 100644 --- a/extractor/text_test.go +++ b/extractor/text_test.go @@ -22,6 +22,7 @@ import ( "github.com/unidoc/unipdf/v3/common" "github.com/unidoc/unipdf/v3/creator" + "github.com/unidoc/unipdf/v3/internal/transform" "github.com/unidoc/unipdf/v3/model" "golang.org/x/text/unicode/norm" ) @@ -180,6 +181,53 @@ func TestTermMarksFiles(t *testing.T) { testTermMarksFiles(t) } +// TestTextSort checks that PageText.sortPosition() gives expected results. +func TestTextSort(t *testing.T) { + // marks0 is in the expected sort order for tol=15 + marks0 := []textMark{ + // y difference > tol => sorts by Y descending + textMark{orientedStart: transform.Point{X: 300, Y: 160}, text: "00"}, + textMark{orientedStart: transform.Point{X: 200, Y: 140}, text: "01"}, + textMark{orientedStart: transform.Point{X: 100, Y: 120}, text: "02"}, + + // y difference < tol => sort by X ascending for approx same Y + textMark{orientedStart: transform.Point{X: 100, Y: 30}, text: "10"}, + textMark{orientedStart: transform.Point{X: 200, Y: 40}, text: "11"}, + textMark{orientedStart: transform.Point{X: 300, Y: 50}, text: "12"}, + + // y difference < tol => sorts by X ascending for approx same Y, different from previous Y + textMark{orientedStart: transform.Point{X: 100, Y: 3}, text: "20"}, + textMark{orientedStart: transform.Point{X: 200, Y: 4}, text: "21"}, + textMark{orientedStart: transform.Point{X: 300, Y: 5}, text: "22"}, + } + + // marks is a copy of marks0 with its order scrambled. 
+ marks := make([]textMark, len(marks0)) + copy(marks, marks0) + sort.Slice(marks, func(i, j int) bool { + ti, tj := marks[i], marks[j] + if ti.orientedStart.X != tj.orientedStart.X { + return ti.orientedStart.X > tj.orientedStart.X + } + if ti.orient != tj.orient { + return ti.orient > tj.orient + } + return ti.orientedStart.Y < tj.orientedStart.Y + }) + + // Copy marks to PageText and sort them. This should give the same order as marks0. + pt := PageText{marks: marks} + pt.sortPosition(15) + + // Check that marks order is the same as marks0. + for i, m0 := range marks0 { + m := pt.marks[i] + if m.orientedStart.X != m0.orientedStart.X || m.orientedStart.Y != m0.orientedStart.Y { + t.Fatalf("i=%d m=%v != m0=%v", i, m, m0) + } + } +} + // fileExtractionTests are PDF file names and terms we expect to find on specified pages of those // PDF files. // `pageTerms`[pageNum] are the terms we expect to find on (1-offset) page number pageNum of