unipdf/pdf/extractor/text_test.go

349 lines
10 KiB
Go
Raw Normal View History

/*
* This file is subject to the terms and conditions defined in
* file 'LICENSE.md', which is part of this source code package.
*/
package extractor
2018-03-22 13:53:12 +00:00
import (
"flag"
"os"
"path/filepath"
"regexp"
"sort"
"strings"
2018-03-22 13:53:12 +00:00
"testing"
"github.com/unidoc/unipdf/v3/common"
"github.com/unidoc/unipdf/v3/pdf/model"
2018-11-28 18:06:03 +11:00
"golang.org/x/text/unicode/norm"
2018-03-22 13:53:12 +00:00
)
2018-11-28 23:25:17 +00:00
// NOTE: We do a best effort at finding the PDF files because we don't keep PDF test files in this
// repo, so you will need to set the environment variable UNIDOC_EXTRACT_TESTDATA to point at the
// corpus directory.
var (
	// forceTest should be set to true to force running all tests.
	// NOTE: Setting environment variable UNIDOC_EXTRACT_FORCETEST = 1 sets this to true.
	forceTest = os.Getenv("UNIDOC_EXTRACT_FORCETEST") == "1"

	// corpusFolder is the directory that corpus file names are joined to. It is read from the
	// environment variable UNIDOC_EXTRACT_TESTDATA and is empty when that variable is not set.
	corpusFolder = os.Getenv("UNIDOC_EXTRACT_TESTDATA")
)
2018-03-22 13:53:12 +00:00
func init() {
common.SetLogger(common.NewConsoleLogger(common.LogLevelError))
2018-03-22 13:53:12 +00:00
if flag.Lookup("test.v") != nil {
isTesting = true
}
}
2018-11-28 23:25:17 +00:00
// TestTextExtractionFragments tests text extraction on the PDF fragments in `fragmentTests`.
func TestTextExtractionFragments(t *testing.T) {
fragmentTests := []struct {
name string
contents string
text string
}{
{
name: "portrait",
contents: `
BT
/UniDocCourier 24 Tf
(Hello World!)Tj
0 -10 Td
(Doink)Tj
ET
`,
2018-11-28 23:25:17 +00:00
text: "Hello World!\nDoink",
},
{
name: "landscape",
contents: `
BT
/UniDocCourier 24 Tf
0 1 -1 0 0 0 Tm
(Hello World!)Tj
0 -10 Td
(Doink)Tj
ET
`,
2018-11-28 23:25:17 +00:00
text: "Hello World!\nDoink",
},
{
name: "180 degree rotation",
contents: `
BT
/UniDocCourier 24 Tf
-1 0 0 -1 0 0 Tm
(Hello World!)Tj
0 -10 Td
(Doink)Tj
ET
`,
2018-11-28 23:25:17 +00:00
text: "Hello World!\nDoink",
},
{
name: "Helvetica",
contents: `
BT
/UniDocHelvetica 24 Tf
0 -1 1 0 0 0 Tm
(Hello World!)Tj
0 -10 Td
(Doink)Tj
ET
`,
2018-11-28 23:25:17 +00:00
text: "Hello World!\nDoink",
},
}
2018-11-28 23:25:17 +00:00
// Setup mock resources.
resources := model.NewPdfPageResources()
{
courier := model.NewStandard14FontMustCompile(model.CourierName)
helvetica := model.NewStandard14FontMustCompile(model.HelveticaName)
2018-11-28 23:25:17 +00:00
resources.SetFontByName("UniDocHelvetica", helvetica.ToPdfObject())
resources.SetFontByName("UniDocCourier", courier.ToPdfObject())
}
2018-11-28 23:25:17 +00:00
for _, f := range fragmentTests {
t.Run(f.name, func(t *testing.T) {
e := Extractor{resources: resources, contents: f.contents}
text, err := e.ExtractText()
if err != nil {
t.Fatalf("Error extracting text: %q err=%v", f.name, err)
return
}
if text != f.text {
t.Fatalf("Text mismatch: %q Got %q. Expected %q", f.name, text, f.text)
return
}
})
}
}
2018-11-28 23:25:17 +00:00
// TestTextExtractionFiles tests text extraction on a set of PDF files.
// It checks for the existence of specified strings of words on specified pages.
// We currently only check within lines as our line order is still improving.
2018-11-28 23:25:17 +00:00
func TestTextExtractionFiles(t *testing.T) {
if len(corpusFolder) == 0 && !forceTest {
t.Log("Corpus folder not set - skipping")
return
}
for _, test := range fileExtractionTests {
t.Run(test.filename, func(t *testing.T) {
testExtractFile(t, test.filename, test.expectedPageText)
})
}
}
2018-11-28 23:25:17 +00:00
// fileExtractionTests are the PDFs and texts we are looking for on specified pages.
// `filename` is the file name that is joined to the corpus folder and `expectedPageText` maps a
// page number to the strings that must all occur in the text extracted from that page.
var fileExtractionTests = []struct {
	filename         string
	expectedPageText map[int][]string
}{
	{filename: "reader.pdf",
		expectedPageText: map[int][]string{
			1: {"A Research UNIX Reader:",
				"Annotated Excerpts from the Programmers Manual,",
				"1. Introduction",
				"To keep the size of this report",
				"last common ancestor of a radiative explosion",
			},
		},
	},
	{filename: "000026.pdf",
		expectedPageText: map[int][]string{
			1: {"Fresh Flower",
				"Care & Handling",
			},
		},
	},
	{filename: "search_sim_key.pdf",
		expectedPageText: map[int][]string{
			2: {"A cryptographic scheme which enables searching",
				"Untrusted server should not be able to search for a word without authorization",
			},
		},
	},
	{filename: "Theil_inequality.pdf",
		expectedPageText: map[int][]string{
			1: {"London School of Economics and Political Science"},
			4: {"The purpose of this paper is to set Theils approach"},
		},
	},
	{filename: "8207.pdf",
		expectedPageText: map[int][]string{
			1: {"In building graphic systems for use with raster devices,"},
			2: {"The imaging model specifies how geometric shapes and colors are"},
			3: {"The transformation matrix T that maps application defined"},
		},
	},
	{filename: "ling-2013-0040ad.pdf",
		expectedPageText: map[int][]string{
			1: {"Although the linguistic variation among texts is continuous"},
			2: {"distinctions. For example, much of the research on spoken/written"},
		},
	},
	{filename: "26-Hazard-Thermal-environment.pdf",
		expectedPageText: map[int][]string{
			1: {"OHS Body of Knowledge"},
			2: {"Copyright notice and licence terms"},
		},
	},
	{filename: "Threshold_survey.pdf",
		expectedPageText: map[int][]string{
			1: {"clustering, entropy, object attributes, spatial correlation, and local"},
		},
	},
	{filename: "circ2.pdf",
		expectedPageText: map[int][]string{
			1: {"Understanding and complying with copyright law can be a challenge"},
		},
	},
	{filename: "rare_word.pdf",
		expectedPageText: map[int][]string{
			6: {"words in the test set, we increase the BLEU score"},
		},
	},
	{filename: "Planck_Wien.pdf",
		expectedPageText: map[int][]string{
			1: {"entropy of a system of n identical resonators in a stationary radiation field"},
		},
	},
	// Case where combineDiacritics was combining ' and " with preceding letters.
	// NOTE(peterwilliams97): Part of the reason this test fails is that we don't currently read
	// Type0:CIDFontType0 font metrics and assume zero displacement so that we place the ' and " too
	// close to the preceding letters.
	{filename: "rfc6962.txt.pdf",
		expectedPageText: map[int][]string{
			4: {
				"timestamps for certificates they then dont log",
				`The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD",`},
		},
	},
	// TODO(peterwilliams97): Reinstate these 2 tests when diacritic combination is fixed.
	// {filename: "Ito_Formula.pdf",
	// 	expectedPageText: map[int][]string{
	// 		1: []string{
	// 			"In the Itô stochastic calculus",
	// 			"In standard, non-stochastic calculus, one computes a derivative"},
	// 		2: []string{"Financial Economics Itôs Formula"},
	// 	},
	// },
	// {filename: "thanh.pdf",
	// 	expectedPageText: map[int][]string{
	// 		1: []string{"Hàn Thé̂ Thành"},
	// 	},
	// },
}
2018-11-28 23:25:17 +00:00
// testExtractFile tests the ExtractTextWithStats text extractor on `filename` and compares the extracted
// text to `expectedPageText`.
2018-11-28 23:25:17 +00:00
//
// NOTE: We do a best effort at finding the PDF file because we don't keep PDF test files in this repo
// so you will need to set the environment variable UNIDOC_EXTRACT_TESTDATA to point at
// the corpus directory.
//
// If `filename` cannot be found in `corpusFolders` then the test is skipped unless `forceTest` global
// variable is true (e.g. setting environment variable UNIDOC_EXTRACT_FORCETESTS = 1).
func testExtractFile(t *testing.T, filename string, expectedPageText map[int][]string) {
filepath := filepath.Join(corpusFolder, filename)
exists := checkFileExists(filepath)
if !exists {
if forceTest {
t.Fatalf("filename=%q does not exist", filename)
}
2018-11-28 23:25:17 +00:00
t.Logf("%s not found", filename)
return
}
2018-11-28 23:25:17 +00:00
_, actualPageText := extractPageTexts(t, filepath)
for _, pageNum := range sortedKeys(expectedPageText) {
expectedSentences, ok := expectedPageText[pageNum]
actualText, ok := actualPageText[pageNum]
if !ok {
t.Fatalf("%q doesn't have page %d", filename, pageNum)
}
2018-11-28 18:06:03 +11:00
actualText = norm.NFKC.String(actualText)
if !containsSentences(t, expectedSentences, actualText) {
2018-11-28 23:25:17 +00:00
t.Fatalf("Text mismatch filepath=%q page=%d", filepath, pageNum)
}
}
}
2018-11-28 23:25:17 +00:00
// extractPageTexts runs ExtractTextWithStats on all pages in PDF `filename` and returns the result as a map
// {page number: page text}
func extractPageTexts(t *testing.T, filename string) (int, map[int]string) {
f, err := os.Open(filename)
if err != nil {
t.Fatalf("Couldn't open filename=%q err=%v", filename, err)
}
defer f.Close()
pdfReader, err := model.NewPdfReader(f)
if err != nil {
t.Fatalf("NewPdfReader failed. filename=%q err=%v", filename, err)
}
numPages, err := pdfReader.GetNumPages()
if err != nil {
t.Fatalf("GetNumPages failed. filename=%q err=%v", filename, err)
}
pageText := map[int]string{}
for pageNum := 1; pageNum <= numPages; pageNum++ {
page, err := pdfReader.GetPage(pageNum)
if err != nil {
t.Fatalf("GetPage failed. filename=%q page=%d err=%v", filename, pageNum, err)
}
ex, err := New(page)
if err != nil {
t.Fatalf("extractor.New failed. filename=%q page=%d err=%v", filename, pageNum, err)
}
2018-11-28 23:25:17 +00:00
text, _, _, err := ex.ExtractTextWithStats()
if err != nil {
2018-11-28 23:25:17 +00:00
t.Fatalf("ExtractTextWithStats failed. filename=%q page=%d err=%v", filename, pageNum, err)
}
2018-11-28 23:25:17 +00:00
// TODO(peterwilliams97): Improve text extraction space insertion so we don't need reduceSpaces.
pageText[pageNum] = reduceSpaces(text)
}
return numPages, pageText
}
// containsSentences returns true if all strings `expectedSentences` are contained in `actualText`.
func containsSentences(t *testing.T, expectedSentences []string, actualText string) bool {
for _, e := range expectedSentences {
2018-11-28 18:06:03 +11:00
e = norm.NFKC.String(e)
if !strings.Contains(actualText, e) {
2018-11-28 18:06:03 +11:00
t.Errorf("No match for %q", e)
return false
}
}
return true
}
// reSpace matches one or more consecutive white space characters as defined by the regexp `\s`
// class.
var reSpace = regexp.MustCompile(`(?m)\s+`)

// reduceSpaces collapses every run of white space in `text` that `reSpace` matches (spaces, tabs,
// line breaks, etc) into a single space and then strips leading and trailing white space from
// the result.
func reduceSpaces(text string) string {
	collapsed := reSpace.ReplaceAllString(text, " ")
	const cutset = " \t\n\r\v"
	return strings.Trim(collapsed, cutset)
}

2018-11-28 23:25:17 +00:00
// checkFileExists reports whether `filepath` can be stat'ed. Any error from os.Stat, not only a
// missing file, is treated as the file not existing.
func checkFileExists(filepath string) bool {
	if _, statErr := os.Stat(filepath); statErr != nil {
		return false
	}
	return true
}

// sortedKeys collects the keys of `m` and returns them in increasing order. The returned slice is
// empty (not nil) when `m` has no entries.
func sortedKeys(m map[int][]string) []int {
	keys := make([]int, len(m))
	i := 0
	for pageNum := range m {
		keys[i] = pageNum
		i++
	}
	sort.Ints(keys)
	return keys
}