mirror of
https://github.com/unidoc/unipdf.git
synced 2025-05-13 19:29:10 +08:00
Added duplicate text detection.
This commit is contained in:
parent
e65fb041e5
commit
5933a3dd81
@ -67,9 +67,12 @@ const (
|
||||
// Maximum spacing between characters within a line.
|
||||
maxIntraLineGapR = 0.02
|
||||
|
||||
// Max difference in coordinates of duplicated textWords.
|
||||
maxDuplicateWordR = 0.2
|
||||
|
||||
minHyphenation = 4
|
||||
|
||||
//
|
||||
// The distance we look down from the top of a wordBag for the leftmost word.
|
||||
topWordRangeR = 4.0
|
||||
// Minimum number of cells in a textTable.
|
||||
minTableParas = 6
|
||||
|
@ -58,9 +58,12 @@ func makeTextPage(marks []*textMark, pageSize model.PdfRectangle, rot int) paraL
|
||||
paraWords = mergWordBags(paraWords)
|
||||
|
||||
// Arrange the contents of each paragraph wordBag into lines and the lines into whole words.
|
||||
paras := make(paraList, len(paraWords))
|
||||
for i, para := range paraWords {
|
||||
paras[i] = para.arrangeText()
|
||||
paras := make(paraList, 0, len(paraWords))
|
||||
for _, bag := range paraWords {
|
||||
para := bag.arrangeText()
|
||||
if para != nil {
|
||||
paras = append(paras, para)
|
||||
}
|
||||
}
|
||||
|
||||
// Find paras that are cells in tables, convert the tables to paras and remove the cell paras.
|
||||
|
@ -9,6 +9,7 @@ import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"io"
|
||||
"math"
|
||||
"sort"
|
||||
|
||||
"github.com/unidoc/unipdf/v3/common"
|
||||
@ -192,6 +193,41 @@ func (p *textPara) fontsize() float64 {
|
||||
return p.lines[0].fontsize
|
||||
}
|
||||
|
||||
// removeDuplicates removes duplicate word fragments such as those used for bolding.
|
||||
func (b *wordBag) removeDuplicates() {
|
||||
for _, depthIdx := range b.depthIndexes() {
|
||||
word := b.bins[depthIdx][0]
|
||||
delta := maxDuplicateWordR * word.fontsize
|
||||
minDepth := word.depth
|
||||
for _, idx := range b.depthBand(minDepth, minDepth+delta) {
|
||||
duplicates := map[*textWord]struct{}{}
|
||||
words := b.bins[idx]
|
||||
for _, w := range words {
|
||||
if w != word && w.text == word.text &&
|
||||
math.Abs(w.Llx-word.Llx) < delta &&
|
||||
math.Abs(w.Urx-word.Urx) < delta &&
|
||||
math.Abs(w.Lly-word.Lly) < delta &&
|
||||
math.Abs(w.Ury-word.Ury) < delta {
|
||||
duplicates[w] = struct{}{}
|
||||
}
|
||||
}
|
||||
if len(duplicates) > 0 {
|
||||
i := 0
|
||||
for _, w := range words {
|
||||
if _, ok := duplicates[w]; !ok {
|
||||
words[i] = w
|
||||
i++
|
||||
}
|
||||
}
|
||||
b.bins[idx] = words[:len(words)-len(duplicates)]
|
||||
if len(b.bins[idx]) == 0 {
|
||||
delete(b.bins, idx)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// arrangeText arranges the word fragments (textWords) in `b` into lines and words.
|
||||
// The lines are groups of textWords of similar depths.
|
||||
// The textWords in each line are sorted in reading order and those that start whole words (as
|
||||
@ -199,6 +235,8 @@ func (p *textPara) fontsize() float64 {
|
||||
func (b *wordBag) arrangeText() *textPara {
|
||||
b.sort() // Sort the words in `b`'s bins in the reading direction.
|
||||
|
||||
b.removeDuplicates()
|
||||
|
||||
var lines []*textLine
|
||||
|
||||
// Build the lines by iterating through the words from top to bottom.
|
||||
@ -257,6 +295,10 @@ func (b *wordBag) arrangeText() *textPara {
|
||||
}
|
||||
}
|
||||
|
||||
if len(lines) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
sort.Slice(lines, func(i, j int) bool {
|
||||
return diffDepthReading(lines[i], lines[j]) < 0
|
||||
})
|
||||
|
@ -719,6 +719,7 @@ var extractReferenceTests = []extractReference{
|
||||
{"eu.pdf", 5},
|
||||
{"we-dms.pdf", 1},
|
||||
{"Productivity.pdf", 1},
|
||||
{"Nuance.pdf", 1},
|
||||
}
|
||||
|
||||
// extractReference describes a PDF file and page number.
|
||||
|
Loading…
x
Reference in New Issue
Block a user