2018-03-22 14:03:47 +00:00
|
|
|
/*
|
|
|
|
* This file is subject to the terms and conditions defined in
|
|
|
|
* file 'LICENSE.md', which is part of this source code package.
|
|
|
|
*/
|
|
|
|
|
2018-03-22 13:01:04 +00:00
|
|
|
package extractor
|
|
|
|
|
2018-03-22 13:53:12 +00:00
|
|
|
import (
|
|
|
|
"flag"
|
2018-11-12 11:04:09 +11:00
|
|
|
"os"
|
|
|
|
"sort"
|
|
|
|
"strings"
|
2018-03-22 13:53:12 +00:00
|
|
|
"testing"
|
2018-11-02 15:13:48 +11:00
|
|
|
|
|
|
|
"github.com/unidoc/unidoc/common"
|
2018-11-12 11:04:09 +11:00
|
|
|
"github.com/unidoc/unidoc/pdf/model"
|
2018-03-22 13:53:12 +00:00
|
|
|
)
|
|
|
|
|
|
|
|
func init() {
|
2018-11-02 15:13:48 +11:00
|
|
|
common.SetLogger(common.NewConsoleLogger(common.LogLevelDebug))
|
2018-03-22 13:53:12 +00:00
|
|
|
if flag.Lookup("test.v") != nil {
|
|
|
|
isTesting = true
|
|
|
|
}
|
|
|
|
}
|
2018-03-22 13:01:04 +00:00
|
|
|
|
|
|
|
// testContents1 is a minimal PDF content stream used by TestTextExtraction1.
// It holds a single text object (BT ... ET) that selects the /UniDocCourier
// font at size 24, shows "Hello World!", applies a 0 -10 Td text-position
// offset, and then shows "Doink".
const testContents1 = `
BT
/UniDocCourier 24 Tf
(Hello World!)Tj
0 -10 Td
(Doink)Tj
ET
`
|
2018-07-07 09:45:55 +10:00
|
|
|
|
2018-03-22 13:01:04 +00:00
|
|
|
// testExpected1 is the text TestTextExtraction1 expects ExtractText to return
// for testContents1.
const testExpected1 = "Hello World!\nDoink"
|
|
|
|
|
|
|
|
func TestTextExtraction1(t *testing.T) {
|
|
|
|
e := Extractor{}
|
|
|
|
e.contents = testContents1
|
|
|
|
|
|
|
|
s, err := e.ExtractText()
|
|
|
|
if err != nil {
|
|
|
|
t.Errorf("Error extracting text: %v", err)
|
|
|
|
return
|
|
|
|
}
|
|
|
|
if s != testExpected1 {
|
2018-06-27 16:31:28 +10:00
|
|
|
t.Errorf("Text mismatch. Got %q. Expected %q", s, testExpected1)
|
2018-03-22 13:01:04 +00:00
|
|
|
return
|
|
|
|
}
|
|
|
|
}
|
2018-11-12 11:04:09 +11:00
|
|
|
|
|
|
|
func TestTextExtraction2(t *testing.T) {
|
|
|
|
for _, test := range extract2Tests {
|
|
|
|
testExtract2(t, test.filename, test.expectedPageText)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// extract2Tests is the table driving TestTextExtraction2. Each entry names a
// PDF file and, per page number, the strings that testExtract2 requires to be
// present in that page's extracted text.
var extract2Tests = []struct {
	filename         string
	expectedPageText map[int][]string
}{
	{
		filename: "testdata/reader.pdf",
		expectedPageText: map[int][]string{
			1: {
				"A Research UNIX Reader:",
				"Annotated Excerpts from the Programmer's Manual,",
				"To keep the size of this report",
			},
		},
	},
}
|
|
|
|
|
|
|
|
func testExtract2(t *testing.T, filename string, expectedPageText map[int][]string) {
|
|
|
|
_, actualPageText := extractPageTexts(t, filename)
|
|
|
|
for _, pageNum := range sortedKeys(expectedPageText) {
|
|
|
|
expectedSentences, ok := expectedPageText[pageNum]
|
|
|
|
actualText, ok := actualPageText[pageNum]
|
|
|
|
if !ok {
|
|
|
|
t.Fatalf("%q doesn't have page %d", filename, pageNum)
|
|
|
|
}
|
|
|
|
if !containsSentences(t, expectedSentences, actualText) {
|
|
|
|
t.Fatalf("Text mismatch filename=%q page=%d", filename, pageNum)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func containsSentences(t *testing.T, expectedSentences []string, actualText string) bool {
|
|
|
|
actualSentences := asSet(strings.Split(actualText, "\n"))
|
|
|
|
for _, e := range expectedSentences {
|
|
|
|
if _, ok := actualSentences[e]; !ok {
|
|
|
|
t.Errorf("No match for %q", e)
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
|
|
|
|
// sortedKeys returns the keys of `m` in ascending order. The result is always
// a non-nil slice, empty when `m` is nil or has no entries.
func sortedKeys(m map[int][]string) []int {
	// The final length is known up front, so allocate once instead of
	// growing the slice through repeated appends.
	keys := make([]int, 0, len(m))
	for k := range m {
		keys = append(keys, k)
	}
	sort.Ints(keys)
	return keys
}
|
|
|
|
|
|
|
|
// asSet returns a map containing every distinct string in `keys` mapped to
// true. The result is always a non-nil map, empty when `keys` is empty.
func asSet(keys []string) map[string]bool {
	// len(keys) is an upper bound on the number of entries, so size the map
	// once rather than letting it grow incrementally.
	set := make(map[string]bool, len(keys))
	for _, k := range keys {
		set[k] = true
	}
	return set
}
|
|
|
|
|
|
|
|
func extractPageTexts(t *testing.T, filename string) (int, map[int]string) {
|
|
|
|
f, err := os.Open(filename)
|
|
|
|
if err != nil {
|
|
|
|
t.Fatalf("Couldn't open filename=%q err=%v", filename, err)
|
|
|
|
}
|
|
|
|
defer f.Close()
|
|
|
|
|
|
|
|
pdfReader, err := model.NewPdfReader(f)
|
|
|
|
if err != nil {
|
|
|
|
t.Fatalf("NewPdfReader failed. filename=%q err=%v", filename, err)
|
|
|
|
}
|
|
|
|
numPages, err := pdfReader.GetNumPages()
|
|
|
|
if err != nil {
|
|
|
|
t.Fatalf("GetNumPages failed. filename=%q err=%v", filename, err)
|
|
|
|
}
|
|
|
|
pageText := map[int]string{}
|
|
|
|
for pageNum := 1; pageNum <= numPages; pageNum++ {
|
|
|
|
|
|
|
|
page, err := pdfReader.GetPage(pageNum)
|
|
|
|
if err != nil {
|
|
|
|
t.Fatalf("GetPage failed. filename=%q page=%d err=%v", filename, pageNum, err)
|
|
|
|
}
|
|
|
|
ex, err := New(page)
|
|
|
|
if err != nil {
|
|
|
|
t.Fatalf("extractor.New failed. filename=%q page=%d err=%v", filename, pageNum, err)
|
|
|
|
}
|
|
|
|
text, _, _, err := ex.ExtractText2()
|
|
|
|
if err != nil {
|
|
|
|
t.Fatalf("ExtractText2 failed. filename=%q page=%d err=%v", filename, pageNum, err)
|
|
|
|
}
|
|
|
|
pageText[pageNum] = text
|
|
|
|
}
|
|
|
|
return numPages, pageText
|
|
|
|
}
|