unipdf/pdf/extractor/text_test.go

279 lines
8.2 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*
* This file is subject to the terms and conditions defined in
* file 'LICENSE.md', which is part of this source code package.
*/
package extractor
import (
"flag"
"os"
"os/user"
"path/filepath"
"regexp"
"sort"
"strings"
"testing"
"github.com/unidoc/unidoc/common"
"github.com/unidoc/unidoc/pdf/model"
)
// XXX(peterwilliams97) NOTE: PDF test files are not kept in this repo, so the tests make a best
// effort at locating them. Set up `corpusFolders` to point at your corpus directory.

// forceTest should be set to true to force running all tests. When it is true, testExtract2 fails
// on a PDF file that cannot be found instead of silently passing over it.
const forceTest = false

// corpusFolders lists the directories that are searched, in order, for test files.
var corpusFolders = []string{"./testdata", "~/testdata", "."}
// init switches the package logger to an error-level console logger and sets isTesting when the
// "test.v" flag has been registered.
func init() {
	common.SetLogger(common.NewConsoleLogger(common.LogLevelError))
	if testFlag := flag.Lookup("test.v"); testFlag != nil {
		isTesting = true
	}
}
// Fixture for TestTextExtraction1: the PDF fragment handed to the extractor and the text that is
// expected back from it.
const (
	testContents1 = `
BT
/UniDocCourier 24 Tf
(Hello World!)Tj
0 -10 Td
(Doink)Tj
ET
`
	testExpected1 = "Hello World!\nDoink"
)
// TestTextExtraction1 runs ExtractText on the in-memory PDF fragment `testContents1` and checks
// that the extracted text equals `testExpected1`.
func TestTextExtraction1(t *testing.T) {
	ex := Extractor{contents: testContents1}
	got, err := ex.ExtractText()
	switch {
	case err != nil:
		t.Errorf("Error extracting text: %v", err)
	case got != testExpected1:
		t.Errorf("Text mismatch. Got %q. Expected %q", got, testExpected1)
	}
}
// TestTextExtraction2 checks text extraction against a set of PDF files. For every entry in
// `extract2Tests` it verifies that the listed strings of words occur on the listed pages.
// Matching is currently done within lines only, because our line ordering is still improving.
func TestTextExtraction2(t *testing.T) {
	for i := range extract2Tests {
		tc := &extract2Tests[i]
		testExtract2(t, tc.filename, tc.expectedPageText)
	}
}
// extract2Tests is the table driving TestTextExtraction2. Each entry names a PDF file and maps
// page numbers to the strings that must be found in the text extracted from that page.
var extract2Tests = []struct {
	filename         string
	expectedPageText map[int][]string
}{
	{
		filename: "reader.pdf",
		expectedPageText: map[int][]string{
			1: {
				"A Research UNIX Reader:",
				"Annotated Excerpts from the Programmers Manual,",
				"1. Introduction",
				"To keep the size of this report",
				"last common ancestor of a radiative explosion",
			},
		},
	},
	{
		filename: "000026.pdf",
		expectedPageText: map[int][]string{
			1: {
				"Fresh Flower",
				"Care & Handling",
			},
		},
	},
	{
		filename: "search_sim_key.pdf",
		expectedPageText: map[int][]string{
			2: {
				"A cryptographic scheme which enables searching",
				"Untrusted server should not be able to search for a word without authorization",
			},
		},
	},
	{
		filename: "Theil_inequality.pdf",
		expectedPageText: map[int][]string{
			1: {"London School of Economics and Political Science"},
			4: {"The purpose of this paper is to set Theils approach"},
		},
	},
	{
		filename: "8207.pdf",
		expectedPageText: map[int][]string{
			1: {"In building graphic systems for use with raster devices,"},
			2: {"The imaging model specifies how geometric shapes and colors are"},
			3: {"The transformation matrix T that maps application defined"},
		},
	},
	{
		filename: "ling-2013-0040ad.pdf",
		expectedPageText: map[int][]string{
			1: {"Although the linguistic variation among texts is continuous"},
			2: {"distinctions. For example, much of the research on spoken/written"},
		},
	},
	{
		filename: "26-Hazard-Thermal-environment.pdf",
		expectedPageText: map[int][]string{
			1: {"OHS Body of Knowledge"},
			2: {"Copyright notice and licence terms"},
		},
	},
	{
		filename: "Threshold_survey.pdf",
		expectedPageText: map[int][]string{
			1: {"clustering, entropy, object attributes, spatial correlation, and local"},
		},
	},
	// {filename:"Ito_Formula.pdf",
	// expectedPageText: map[int][]string{
	// 1: []string{"In the Itô stochastic calculus"},
	// },
	// },
	{
		filename: "circ2.pdf",
		expectedPageText: map[int][]string{
			1: {"Understanding and complying with copyright law can be a challenge"},
		},
	},
	{
		filename: "rare_word.pdf",
		expectedPageText: map[int][]string{
			6: {"words in the test set, we increase the BLEU score"},
		},
	},
	// {filename: "Planck_Wien.pdf",
	// expectedPageText: map[int][]string{
	// 1: []string{"entropy of a system of n identical resonators in a stationary radiation field"},
	// },
	// },
}
// testExtract2 runs the ExtractText2 text extractor on `filename` and checks that every string in
// `expectedPageText` occurs in the text extracted from the corresponding page. Pages are checked
// in ascending page-number order and the test is aborted on the first missing page or mismatch.
// XXX(peterwilliams97) NOTE: We do a best effort at finding the PDF file because we don't keep PDF
// test files in this repo so you will need to setup `corpusFolders` to point at the corpus directory.
// If `filename` cannot be found in `corpusFolders` then the file is passed over with a log message,
// unless `forceTest` is true, in which case the test fails.
func testExtract2(t *testing.T, filename string, expectedPageText map[int][]string) {
	homeDir, hasHome := getHomeDir()
	path, ok := searchDirectories(homeDir, hasHome, corpusFolders, filename)
	if !ok {
		if forceTest {
			t.Fatalf("filename=%q does not exist", filename)
		}
		// Log rather than t.Skip: TestTextExtraction2 calls this helper once per file from a
		// single test, so skipping here would also drop every remaining file.
		t.Logf("skipping filename=%q: not found in corpusFolders", filename)
		return
	}

	_, actualPageText := extractPageTexts(t, path)
	for _, pageNum := range sortedKeys(expectedPageText) {
		actualText, ok := actualPageText[pageNum]
		if !ok {
			t.Fatalf("%q doesn't have page %d", filename, pageNum)
		}
		// pageNum comes from the keys of expectedPageText, so a plain index is sufficient.
		if !containsSentences(t, expectedPageText[pageNum], actualText) {
			t.Fatalf("Text mismatch filename=%q page=%d", path, pageNum)
		}
	}
}
// extractPageTexts opens the PDF `filename`, runs ExtractText2 on every page and returns the
// number of pages together with a map {page number: page text}. Page numbers start at 1 and each
// page text has been passed through reduceSpaces. Any failure aborts the test via t.Fatalf.
func extractPageTexts(t *testing.T, filename string) (int, map[int]string) {
	file, err := os.Open(filename)
	if err != nil {
		t.Fatalf("Couldn't open filename=%q err=%v", filename, err)
	}
	defer file.Close()

	reader, err := model.NewPdfReader(file)
	if err != nil {
		t.Fatalf("NewPdfReader failed. filename=%q err=%v", filename, err)
	}
	pageCount, err := reader.GetNumPages()
	if err != nil {
		t.Fatalf("GetNumPages failed. filename=%q err=%v", filename, err)
	}

	texts := make(map[int]string)
	for n := 1; n <= pageCount; n++ {
		page, err := reader.GetPage(n)
		if err != nil {
			t.Fatalf("GetPage failed. filename=%q page=%d err=%v", filename, n, err)
		}
		pageExtractor, err := New(page)
		if err != nil {
			t.Fatalf("extractor.New failed. filename=%q page=%d err=%v", filename, n, err)
		}
		raw, _, _, err := pageExtractor.ExtractText2()
		if err != nil {
			t.Fatalf("ExtractText2 failed. filename=%q page=%d err=%v", filename, n, err)
		}
		// XXX(peterwilliams97)TODO: Improve text extraction space insertion so we don't need reduceSpaces.
		texts[n] = reduceSpaces(raw)
	}
	return pageCount, texts
}
// containsSentences reports whether every string in `expectedSentences` occurs in `actualText`.
// The first string that is not found is reported through t.Errorf and ends the search.
func containsSentences(t *testing.T, expectedSentences []string, actualText string) bool {
	for _, sentence := range expectedSentences {
		if strings.Contains(actualText, sentence) {
			continue
		}
		t.Errorf("No match for %#q", sentence)
		return false
	}
	return true
}
// reSpace matches a run of one or more characters in the regexp `\s` class.
var reSpace = regexp.MustCompile(`(?m)\s+`)

// reduceSpaces returns `text` with every run of whitespace matched by reSpace (spaces, tabs, line
// breaks, etc) collapsed to a single space, and with leading and trailing whitespace removed.
func reduceSpaces(text string) string {
	collapsed := reSpace.ReplaceAllString(text, " ")
	return strings.Trim(collapsed, " \t\n\r\v")
}
// searchDirectories searches `directories`, in order, for `filename` and returns the full path of
// the first match together with true. It returns "", false if no directory contains `filename`.
//
// When `hasHome` is true, a leading "~" path element ("~" alone, or "~" followed by a path
// separator) is replaced by `homeDir`. A "~" anywhere else in a directory name is left untouched,
// so directories such as "/data/a~b" are searched as written.
func searchDirectories(homeDir string, hasHome bool, directories []string, filename string) (string, bool) {
	sep := string(filepath.Separator)
	for _, dir := range directories {
		// Expand only a leading "~" element; replacing the first "~" found anywhere would corrupt
		// directory names that merely contain the character.
		if hasHome && (dir == "~" || strings.HasPrefix(dir, "~/") || strings.HasPrefix(dir, "~"+sep)) {
			dir = homeDir + dir[1:]
		}
		path := filepath.Join(dir, filename)
		if _, err := os.Stat(path); err == nil {
			return path, true
		}
	}
	return "", false
}
// getHomeDir returns the home directory of the current user and true. If the current user cannot
// be determined, the error is logged and "", false is returned.
func getHomeDir() (string, bool) {
	current, err := user.Current()
	if err == nil {
		return current.HomeDir, true
	}
	common.Log.Error("No current user. err=%v", err)
	return "", false
}
// sortedKeys returns the keys of `m` in ascending order. The result is an empty, non-nil slice
// when `m` has no entries.
func sortedKeys(m map[int][]string) []int {
	keys := make([]int, len(m))
	i := 0
	for key := range m {
		keys[i] = key
		i++
	}
	sort.Ints(keys)
	return keys
}