mirror of
https://github.com/unidoc/unipdf.git
synced 2025-04-27 13:48:51 +08:00
178 lines
4.5 KiB
Go
178 lines
4.5 KiB
Go
/*
 * This file is subject to the terms and conditions defined in
 * file 'LICENSE.md', which is part of this source code package.
 */
|
||
|
||
package extractor
|
||
|
||
import (
|
||
"flag"
|
||
"os"
|
||
"regexp"
|
||
"sort"
|
||
"strings"
|
||
"testing"
|
||
|
||
"github.com/unidoc/unidoc/common"
|
||
"github.com/unidoc/unidoc/pdf/model"
|
||
)
|
||
|
||
func init() {
|
||
common.SetLogger(common.NewConsoleLogger(common.LogLevelError))
|
||
if flag.Lookup("test.v") != nil {
|
||
isTesting = true
|
||
}
|
||
}
|
||
|
||
// testContents1 is the content stream fed to the extractor in TestTextExtraction1.
// It holds a single text object (BT ... ET) that selects the font /UniDocCourier at
// size 24, shows "Hello World!", applies a 0 -10 Td move and then shows "Doink".
// The raw string must contain only these operators, one per line: any extra line
// inside the back quotes becomes part of the stream handed to the extractor.
const testContents1 = `
BT
/UniDocCourier 24 Tf
(Hello World!)Tj
0 -10 Td
(Doink)Tj
ET
`
|
||
|
||
// testExpected1 is the text TestTextExtraction1 expects ExtractText to return for
// testContents1: the two shown strings separated by a single line feed.
const testExpected1 = `Hello World!
Doink`
|
||
|
||
func TestTextExtraction1(t *testing.T) {
|
||
e := Extractor{}
|
||
e.contents = testContents1
|
||
|
||
s, err := e.ExtractText()
|
||
if err != nil {
|
||
t.Errorf("Error extracting text: %v", err)
|
||
return
|
||
}
|
||
if s != testExpected1 {
|
||
t.Errorf("Text mismatch. Got %q. Expected %q", s, testExpected1)
|
||
return
|
||
}
|
||
}
|
||
|
||
func TestTextExtraction2(t *testing.T) {
|
||
for _, test := range extract2Tests {
|
||
testExtract2(t, test.filename, test.expectedPageText)
|
||
}
|
||
}
|
||
|
||
// extract2Tests is the table driving TestTextExtraction2. Each entry names a PDF
// file and, per page number, the strings that testExtract2 requires to appear in
// the text extracted from that page.
var extract2Tests = []struct {
	filename         string
	expectedPageText map[int][]string
}{
	{filename: "testdata/reader.pdf",
		expectedPageText: map[int][]string{
			1: {"A Research UNIX Reader:",
				"Annotated Excerpts from the Programmer’s Manual,",
				"1. Introduction",
				"To keep the size of this report",
				"last common ancestor of a radiative explosion",
			},
		},
	},
	{filename: "testdata/000026.pdf",
		expectedPageText: map[int][]string{
			1: {"Fresh Flower",
				// NOTE(review): this literal was split across two lines right after
				// "Handling", which is not valid in an interpreted string. Whatever
				// character followed is unknown, so only the printable prefix is kept;
				// the check is a substring match, so the prefix still matches any text
				// the longer string would have matched.
				"Care & Handling",
			},
		},
	},
	{filename: "testdata/search_sim_key.pdf",
		expectedPageText: map[int][]string{
			2: {"A cryptographic scheme which enables searching",
				"Untrusted server should not be able to search for a word without authorization",
			},
		},
	},
	{filename: "testdata/Theil_inequality.pdf",
		expectedPageText: map[int][]string{
			1: {"London School of Economics and Political Science"},
			4: {"The purpose of this paper is to set Theil’s approach"},
		},
	},
	{filename: "testdata/8207.pdf",
		expectedPageText: map[int][]string{
			1: {"In building graphic systems for use with raster devices,"},
			2: {"The imaging model specifies how geometric shapes and colors are"},
			3: {"The transformation matrix T that maps application defined"},
		},
	},
}
|
||
|
||
func testExtract2(t *testing.T, filename string, expectedPageText map[int][]string) {
|
||
_, actualPageText := extractPageTexts(t, filename)
|
||
for _, pageNum := range sortedKeys(expectedPageText) {
|
||
expectedSentences, ok := expectedPageText[pageNum]
|
||
actualText, ok := actualPageText[pageNum]
|
||
if !ok {
|
||
t.Fatalf("%q doesn't have page %d", filename, pageNum)
|
||
}
|
||
if !containsSentences(t, expectedSentences, actualText) {
|
||
t.Fatalf("Text mismatch filename=%q page=%d", filename, pageNum)
|
||
}
|
||
}
|
||
}
|
||
|
||
// containsSentences reports whether every string in `expectedSentences` occurs
// as a substring of `actualText`. On the first string that does not occur it
// records a test error naming that string and returns false.
func containsSentences(t *testing.T, expectedSentences []string, actualText string) bool {
	for _, sentence := range expectedSentences {
		if strings.Contains(actualText, sentence) {
			continue
		}
		t.Errorf("No match for %+q", sentence)
		return false
	}
	return true
}
|
||
|
||
// sortedKeys returns the keys of `m` in ascending order. The result is a
// non-nil slice even when `m` is empty.
func sortedKeys(m map[int][]string) []int {
	keys := make([]int, len(m))
	i := 0
	for pageNum := range m {
		keys[i] = pageNum
		i++
	}
	sort.Ints(keys)
	return keys
}
|
||
|
||
func extractPageTexts(t *testing.T, filename string) (int, map[int]string) {
|
||
f, err := os.Open(filename)
|
||
if err != nil {
|
||
t.Fatalf("Couldn't open filename=%q err=%v", filename, err)
|
||
}
|
||
defer f.Close()
|
||
|
||
pdfReader, err := model.NewPdfReader(f)
|
||
if err != nil {
|
||
t.Fatalf("NewPdfReader failed. filename=%q err=%v", filename, err)
|
||
}
|
||
numPages, err := pdfReader.GetNumPages()
|
||
if err != nil {
|
||
t.Fatalf("GetNumPages failed. filename=%q err=%v", filename, err)
|
||
}
|
||
pageText := map[int]string{}
|
||
for pageNum := 1; pageNum <= numPages; pageNum++ {
|
||
|
||
page, err := pdfReader.GetPage(pageNum)
|
||
if err != nil {
|
||
t.Fatalf("GetPage failed. filename=%q page=%d err=%v", filename, pageNum, err)
|
||
}
|
||
ex, err := New(page)
|
||
if err != nil {
|
||
t.Fatalf("extractor.New failed. filename=%q page=%d err=%v", filename, pageNum, err)
|
||
}
|
||
text, _, _, err := ex.ExtractText2()
|
||
if err != nil {
|
||
t.Fatalf("ExtractText2 failed. filename=%q page=%d err=%v", filename, pageNum, err)
|
||
}
|
||
pageText[pageNum] = reduceSpaces(text)
|
||
}
|
||
return numPages, pageText
|
||
}
|
||
|
||
// reSpace matches one or more consecutive characters of the regexp class `\s`.
var reSpace = regexp.MustCompile(`(?m)\s+`)

// reduceSpaces returns `text` with every run matched by reSpace replaced by a
// single space, and with leading and trailing spaces, tabs, line feeds, carriage
// returns and vertical tabs removed.
func reduceSpaces(text string) string {
	collapsed := reSpace.ReplaceAllString(text, " ")
	const cutset = " \t\n\r\v"
	return strings.Trim(collapsed, cutset)
}
|