Added duplicate text detection.

2025-05-13 19:29:10 +08:00 · 2020-06-23 15:33:34 +10:00 · 2020-06-23 15:33:34 +10:00 · 5933a3dd81
commit 5933a3dd81
parent e65fb041e5
4 changed files with 53 additions and 4 deletions
--- a/extractor/text_const.go
+++ b/extractor/text_const.go
@ -67,9 +67,12 @@ const (
 	// Maximum spacing between characters within a line.
 	maxIntraLineGapR = 0.02

+	// Maximum difference in the coordinates of duplicated textWords, as a fraction of font size.
+	maxDuplicateWordR = 0.2
+
 	minHyphenation = 4

-	//
+	// The distance we look down from the top of a wordBag for the leftmost word.
 	topWordRangeR = 4.0
 	// minimum number of cells in a textTable
 	minTableParas = 6
--- a/extractor/text_page.go
+++ b/extractor/text_page.go
@ -58,9 +58,12 @@ func makeTextPage(marks []*textMark, pageSize model.PdfRectangle, rot int) paraL
 	paraWords = mergWordBags(paraWords)

 	// Arrange the contents of each paragraph wordBag into lines and the lines into whole words.
-	paras := make(paraList, len(paraWords))
-	for i, para := range paraWords {
-		paras[i] = para.arrangeText()
+	paras := make(paraList, 0, len(paraWords))
+	for _, bag := range paraWords {
+		para := bag.arrangeText()
+		if para != nil {
+			paras = append(paras, para)
+		}
 	}

 	// Find paras that are cells in tables, convert the tables to paras and remove the cell paras.
--- a/extractor/text_para.go
+++ b/extractor/text_para.go
@ -9,6 +9,7 @@ import (
 	"bytes"
 	"fmt"
 	"io"
+	"math"
 	"sort"

 	"github.com/unidoc/unipdf/v3/common"
@ -192,6 +193,41 @@ func (p *textPara) fontsize() float64 {
 	return p.lines[0].fontsize
 }

+// removeDuplicates removes duplicate word fragments such as those used for bolding.
+//
+// For each depth bin, the first word in the bin is taken as the reference word. Within the bins
+// returned by depthBand(depth, depth+delta), any other word that has the same text and whose
+// Llx, Urx, Lly and Ury are all within delta of the reference word's is dropped, where
+// delta = maxDuplicateWordR * the reference word's font size. Bins that become empty are
+// deleted from b.bins.
+//
+// NOTE(review): only the first word of each bin is used as a reference, so duplicates of later
+// words in a bin are not detected here — confirm this is intended.
+func (b *wordBag) removeDuplicates() {
+	for _, depthIdx := range b.depthIndexes() {
+		// The range expression above is evaluated once, before the deletions below. A bin that
+		// was emptied and deleted while processing an earlier reference word must be skipped,
+		// otherwise indexing element 0 of the missing (nil) bin would panic.
+		if len(b.bins[depthIdx]) == 0 {
+			continue
+		}
+		word := b.bins[depthIdx][0]
+		delta := maxDuplicateWordR * word.fontsize
+		minDepth := word.depth
+		for _, idx := range b.depthBand(minDepth, minDepth+delta) {
+			words := b.bins[idx]
+			// Filter in place, reusing the backing array of `words`.
+			kept := words[:0]
+			for _, w := range words {
+				// `word` itself is always kept; the pointer comparison excludes it.
+				if w != word && w.text == word.text &&
+					math.Abs(w.Llx-word.Llx) < delta &&
+					math.Abs(w.Urx-word.Urx) < delta &&
+					math.Abs(w.Lly-word.Lly) < delta &&
+					math.Abs(w.Ury-word.Ury) < delta {
+					continue
+				}
+				kept = append(kept, w)
+			}
+			if len(kept) == len(words) {
+				// No duplicates in this bin; leave it untouched.
+				continue
+			}
+			if len(kept) == 0 {
+				delete(b.bins, idx)
+			} else {
+				b.bins[idx] = kept
+			}
+		}
+	}
+}
+
 // arrangeText arranges the word fragments (textWords) in `b` into lines and words.
 // The lines are groups of textWords of similar depths.
 // The textWords in each line are sorted in reading order and those that start whole words (as
@ -199,6 +235,8 @@ func (p *textPara) fontsize() float64 {
 func (b *wordBag) arrangeText() *textPara {
 	b.sort() // Sort the words in `b`'s bins in the reading direction.

+	b.removeDuplicates()
+
 	var lines []*textLine

 	// Build the lines by iterating through the words from top to bottom.
@ -257,6 +295,10 @@ func (b *wordBag) arrangeText() *textPara {
 		}
 	}

+	if len(lines) == 0 {
+		return nil
+	}
+
 	sort.Slice(lines, func(i, j int) bool {
 		return diffDepthReading(lines[i], lines[j]) < 0
 	})
--- a/extractor/text_test.go
+++ b/extractor/text_test.go
@ -719,6 +719,7 @@ var extractReferenceTests = []extractReference{
 	{"eu.pdf", 5},
 	{"we-dms.pdf", 1},
 	{"Productivity.pdf", 1},
+	{"Nuance.pdf", 1},
 }

 // extractReference describes a PDF file and page number.