Added duplicate text detection.

This commit is contained in:
Peter Williams 2020-06-23 15:33:34 +10:00
parent e65fb041e5
commit 5933a3dd81
4 changed files with 53 additions and 4 deletions

View File

@ -67,9 +67,12 @@ const (
// Maximum spacing between characters within a line.
maxIntraLineGapR = 0.02
// Max difference in coordinates of duplicated textWords, as a fraction of the font size.
maxDuplicateWordR = 0.2
minHyphenation = 4
//
// The distance we look down from the top of a wordBag for the leftmost word.
topWordRangeR = 4.0
// Minimum number of cells (paras) in a textTable.
minTableParas = 6

View File

@ -58,9 +58,12 @@ func makeTextPage(marks []*textMark, pageSize model.PdfRectangle, rot int) paraL
paraWords = mergWordBags(paraWords)
// Arrange the contents of each paragraph wordBag into lines and the lines into whole words.
paras := make(paraList, len(paraWords))
for i, para := range paraWords {
paras[i] = para.arrangeText()
paras := make(paraList, 0, len(paraWords))
for _, bag := range paraWords {
para := bag.arrangeText()
if para != nil {
paras = append(paras, para)
}
}
// Find paras that are cells in tables, convert the tables to paras and remove the cell paras.

View File

@ -9,6 +9,7 @@ import (
"bytes"
"fmt"
"io"
"math"
"sort"
"github.com/unidoc/unipdf/v3/common"
@ -192,6 +193,41 @@ func (p *textPara) fontsize() float64 {
return p.lines[0].fontsize
}
// removeDuplicates removes duplicate word fragments such as those used for bolding.
//
// Each depth bin's first word is used as a reference. In every bin returned by
// b.depthBand(depth, depth+delta), where delta is maxDuplicateWordR times the reference's
// font size, any other word with the same text whose Llx, Urx, Lly and Ury are all within
// delta of the reference's is dropped. Bins left empty are deleted from b.bins.
func (b *wordBag) removeDuplicates() {
	// The range expression is evaluated once, so indexes of bins that get deleted below are
	// still visited by this loop.
	for _, depthIdx := range b.depthIndexes() {
		bin := b.bins[depthIdx]
		if len(bin) == 0 {
			// Every word in this bin was a duplicate of an earlier reference word and the bin
			// has already been deleted. Indexing it would panic.
			continue
		}
		word := bin[0]
		delta := maxDuplicateWordR * word.fontsize
		minDepth := word.depth
		for _, idx := range b.depthBand(minDepth, minDepth+delta) {
			words := b.bins[idx]
			// Filter in place, reusing the bin's backing array. Writes never get ahead of reads.
			kept := words[:0]
			for _, w := range words {
				isDuplicate := w != word && w.text == word.text &&
					math.Abs(w.Llx-word.Llx) < delta &&
					math.Abs(w.Urx-word.Urx) < delta &&
					math.Abs(w.Lly-word.Lly) < delta &&
					math.Abs(w.Ury-word.Ury) < delta
				if !isDuplicate {
					kept = append(kept, w)
				}
			}
			if len(kept) == len(words) {
				continue // Nothing was removed from this bin.
			}
			if len(kept) == 0 {
				delete(b.bins, idx)
			} else {
				b.bins[idx] = kept
			}
		}
	}
}
// arrangeText arranges the word fragments (textWords) in `b` into lines and words.
// The lines are groups of textWords of similar depths.
// The textWords in each line are sorted in reading order and those that start whole words (as
@ -199,6 +235,8 @@ func (p *textPara) fontsize() float64 {
func (b *wordBag) arrangeText() *textPara {
b.sort() // Sort the words in `b`'s bins in the reading direction.
b.removeDuplicates()
var lines []*textLine
// Build the lines by iterating through the words from top to bottom.
@ -257,6 +295,10 @@ func (b *wordBag) arrangeText() *textPara {
}
}
if len(lines) == 0 {
return nil
}
sort.Slice(lines, func(i, j int) bool {
return diffDepthReading(lines[i], lines[j]) < 0
})

View File

@ -719,6 +719,7 @@ var extractReferenceTests = []extractReference{
{"eu.pdf", 5},
{"we-dms.pdf", 1},
{"Productivity.pdf", 1},
{"Nuance.pdf", 1},
}
// extractReference describes a PDF file and page number.