unipdf/extractor/text_test.go
2019-05-16 20:44:51 +00:00

349 lines
10 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*
* This file is subject to the terms and conditions defined in
* file 'LICENSE.md', which is part of this source code package.
*/
package extractor
import (
"flag"
"os"
"path/filepath"
"regexp"
"sort"
"strings"
"testing"
"github.com/unidoc/unipdf/v3/common"
"github.com/unidoc/unipdf/v3/model"
"golang.org/x/text/unicode/norm"
)
// NOTE: We do a best effort at finding the PDF file because we don't keep PDF test files in this repo so you
// will need to setup UNIDOC_EXTRACT_TESTDATA to point at the corpus directory.
// forceTest should be set to true to force running all tests.
// NOTE: Setting environment variable UNIDOC_EXTRACT_FORCETEST = 1 sets this to true.
var forceTest = os.Getenv("UNIDOC_EXTRACT_FORCETEST") == "1"
// corpusFolder is the directory holding the test corpus PDFs, taken from UNIDOC_EXTRACT_TESTDATA.
var corpusFolder = os.Getenv("UNIDOC_EXTRACT_TESTDATA")
// init configures quiet logging for test runs and flips the package-level
// isTesting flag when the standard `go test` verbose flag is registered.
func init() {
	common.SetLogger(common.NewConsoleLogger(common.LogLevelError))
	if verboseFlag := flag.Lookup("test.v"); verboseFlag != nil {
		isTesting = true
	}
}
// TestTextExtractionFragments tests text extraction on the PDF fragments in `fragmentTests`.
func TestTextExtractionFragments(t *testing.T) {
	fragmentTests := []struct {
		name     string
		contents string
		text     string
	}{
		{
			name: "portrait",
			contents: `
BT
/UniDocCourier 24 Tf
(Hello World!)Tj
0 -10 Td
(Doink)Tj
ET
`,
			text: "Hello World!\nDoink",
		},
		{
			name: "landscape",
			contents: `
BT
/UniDocCourier 24 Tf
0 1 -1 0 0 0 Tm
(Hello World!)Tj
0 -10 Td
(Doink)Tj
ET
`,
			text: "Hello World!\nDoink",
		},
		{
			name: "180 degree rotation",
			contents: `
BT
/UniDocCourier 24 Tf
-1 0 0 -1 0 0 Tm
(Hello World!)Tj
0 -10 Td
(Doink)Tj
ET
`,
			text: "Hello World!\nDoink",
		},
		{
			name: "Helvetica",
			contents: `
BT
/UniDocHelvetica 24 Tf
0 -1 1 0 0 0 Tm
(Hello World!)Tj
0 -10 Td
(Doink)Tj
ET
`,
			text: "Hello World!\nDoink",
		},
	}

	// Build mock page resources exposing the two standard-14 fonts that the
	// fragments reference by name.
	resources := model.NewPdfPageResources()
	courier := model.NewStandard14FontMustCompile(model.CourierName)
	resources.SetFontByName("UniDocCourier", courier.ToPdfObject())
	helvetica := model.NewStandard14FontMustCompile(model.HelveticaName)
	resources.SetFontByName("UniDocHelvetica", helvetica.ToPdfObject())

	for _, tc := range fragmentTests {
		t.Run(tc.name, func(t *testing.T) {
			e := Extractor{resources: resources, contents: tc.contents}
			text, err := e.ExtractText()
			if err != nil {
				t.Fatalf("Error extracting text: %q err=%v", tc.name, err)
			}
			if text != tc.text {
				t.Fatalf("Text mismatch: %q Got %q. Expected %q", tc.name, text, tc.text)
			}
		})
	}
}
// TestTextExtractionFiles tests text extraction on a set of PDF files.
// It checks for the existence of specified strings of words on specified pages.
// We currently only check within lines as our line order is still improving.
func TestTextExtractionFiles(t *testing.T) {
	if corpusFolder == "" && !forceTest {
		t.Log("Corpus folder not set - skipping")
		return
	}
	for _, tc := range fileExtractionTests {
		t.Run(tc.filename, func(t *testing.T) {
			testExtractFile(t, tc.filename, tc.expectedPageText)
		})
	}
}
// fileExtractionTests are the PDFs and texts we are looking for on specified pages.
// Each entry names a corpus PDF and maps page numbers to sentences that must appear
// in that page's extracted text.
var fileExtractionTests = []struct {
filename string
expectedPageText map[int][]string
}{
{filename: "reader.pdf",
expectedPageText: map[int][]string{
1: []string{"A Research UNIX Reader:",
"Annotated Excerpts from the Programmers Manual,",
"1. Introduction",
"To keep the size of this report",
"last common ancestor of a radiative explosion",
},
},
},
{filename: "000026.pdf",
expectedPageText: map[int][]string{
1: []string{"Fresh Flower",
"Care & Handling",
},
},
},
{filename: "search_sim_key.pdf",
expectedPageText: map[int][]string{
2: []string{"A cryptographic scheme which enables searching",
"Untrusted server should not be able to search for a word without authorization",
},
},
},
{filename: "Theil_inequality.pdf",
expectedPageText: map[int][]string{
1: []string{"London School of Economics and Political Science"},
4: []string{"The purpose of this paper is to set Theils approach"},
},
},
{filename: "8207.pdf",
expectedPageText: map[int][]string{
1: []string{"In building graphic systems for use with raster devices,"},
2: []string{"The imaging model specifies how geometric shapes and colors are"},
3: []string{"The transformation matrix T that maps application defined"},
},
},
{filename: "ling-2013-0040ad.pdf",
expectedPageText: map[int][]string{
1: []string{"Although the linguistic variation among texts is continuous"},
2: []string{"distinctions. For example, much of the research on spoken/written"},
},
},
{filename: "26-Hazard-Thermal-environment.pdf",
expectedPageText: map[int][]string{
1: []string{"OHS Body of Knowledge"},
2: []string{"Copyright notice and licence terms"},
},
},
{filename: "Threshold_survey.pdf",
expectedPageText: map[int][]string{
1: []string{"clustering, entropy, object attributes, spatial correlation, and local"},
},
},
{filename: "circ2.pdf",
expectedPageText: map[int][]string{
1: []string{"Understanding and complying with copyright law can be a challenge"},
},
},
{filename: "rare_word.pdf",
expectedPageText: map[int][]string{
6: []string{"words in the test set, we increase the BLEU score"},
},
},
{filename: "Planck_Wien.pdf",
expectedPageText: map[int][]string{
1: []string{"entropy of a system of n identical resonators in a stationary radiation field"},
},
},
// Case where combineDiacritics was combining ' and " with preceding letters.
// NOTE(peterwilliams97): Part of the reason this test fails is that we don't currently read
// Type0:CIDFontType0 font metrics and assume zero displacement so that we place the ' and " too
// close to the preceding letters.
{filename: "/rfc6962.txt.pdf",
expectedPageText: map[int][]string{
4: []string{
"timestamps for certificates they then dont log",
`The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD",`},
},
},
// TODO(peterwilliams97): Reinstate these 2 tests when diacritic combination is fixed.
// {filename: "Ito_Formula.pdf",
// expectedPageText: map[int][]string{
// 1: []string{
// "In the Itô stochastic calculus",
// "In standard, non-stochastic calculus, one computes a derivative"},
// 2: []string{"Financial Economics Itôs Formula"},
// },
// },
// {filename: "thanh.pdf",
// expectedPageText: map[int][]string{
// 1: []string{"Hàn Thé̂ Thành"},
// },
// },
}
// testExtractFile tests the ExtractTextWithStats text extractor on `filename` and compares the
// extracted text to `expectedPageText`.
//
// NOTE: We do a best effort at finding the PDF file because we don't keep PDF test files in this
// repo so you will need to set the environment variable UNIDOC_EXTRACT_TESTDATA to point at
// the corpus directory.
//
// If `filename` cannot be found in `corpusFolder` then the test is skipped unless the `forceTest`
// global variable is true (e.g. setting environment variable UNIDOC_EXTRACT_FORCETEST = 1).
func testExtractFile(t *testing.T, filename string, expectedPageText map[int][]string) {
	// Don't shadow the path/filepath package with a local variable named `filepath`.
	fullpath := filepath.Join(corpusFolder, filename)
	if !checkFileExists(fullpath) {
		if forceTest {
			t.Fatalf("filename=%q does not exist", filename)
		}
		t.Logf("%s not found", filename)
		return
	}
	_, actualPageText := extractPageTexts(t, fullpath)
	for _, pageNum := range sortedKeys(expectedPageText) {
		// The page numbers come from expectedPageText itself, so this lookup cannot miss;
		// don't discard a lookup flag that is never checked.
		expectedSentences := expectedPageText[pageNum]
		actualText, ok := actualPageText[pageNum]
		if !ok {
			t.Fatalf("%q doesn't have page %d", filename, pageNum)
		}
		// Normalize so expected and extracted text use the same Unicode composition.
		actualText = norm.NFKC.String(actualText)
		if !containsSentences(t, expectedSentences, actualText) {
			t.Fatalf("Text mismatch filepath=%q page=%d", fullpath, pageNum)
		}
	}
}
// extractPageTexts runs ExtractTextWithStats on every page in the PDF `filename` and returns the
// page count together with the extracted text as a map of {page number: page text}.
func extractPageTexts(t *testing.T, filename string) (int, map[int]string) {
	f, err := os.Open(filename)
	if err != nil {
		t.Fatalf("Couldn't open filename=%q err=%v", filename, err)
	}
	defer f.Close()

	pdfReader, err := model.NewPdfReader(f)
	if err != nil {
		t.Fatalf("NewPdfReader failed. filename=%q err=%v", filename, err)
	}
	numPages, err := pdfReader.GetNumPages()
	if err != nil {
		t.Fatalf("GetNumPages failed. filename=%q err=%v", filename, err)
	}

	pageTexts := make(map[int]string, numPages)
	for pageNum := 1; pageNum <= numPages; pageNum++ {
		page, err := pdfReader.GetPage(pageNum)
		if err != nil {
			t.Fatalf("GetPage failed. filename=%q page=%d err=%v", filename, pageNum, err)
		}
		ex, err := New(page)
		if err != nil {
			t.Fatalf("extractor.New failed. filename=%q page=%d err=%v", filename, pageNum, err)
		}
		text, _, _, err := ex.ExtractTextWithStats()
		if err != nil {
			t.Fatalf("ExtractTextWithStats failed. filename=%q page=%d err=%v", filename, pageNum, err)
		}
		// TODO(peterwilliams97): Improve text extraction space insertion so we don't need reduceSpaces.
		pageTexts[pageNum] = reduceSpaces(text)
	}
	return numPages, pageTexts
}
// containsSentences returns true if every string in `expectedSentences` occurs in `actualText`.
// The first missing sentence is reported through t.Errorf.
func containsSentences(t *testing.T, expectedSentences []string, actualText string) bool {
	for _, sentence := range expectedSentences {
		normalized := norm.NFKC.String(sentence)
		if !strings.Contains(actualText, normalized) {
			t.Errorf("No match for %q", normalized)
			return false
		}
	}
	return true
}
// reduceSpaces collapses each run of whitespace (spaces, tabs, line breaks, etc.) in `text`
// into a single space and strips leading/trailing whitespace from the result.
func reduceSpaces(text string) string {
	collapsed := reSpace.ReplaceAllString(text, " ")
	return strings.Trim(collapsed, " \t\n\r\v")
}

// reSpace matches one or more consecutive whitespace characters.
var reSpace = regexp.MustCompile(`(?m)\s+`)
// checkFileExists returns true if `filepath` exists (i.e. os.Stat succeeds on it).
func checkFileExists(filepath string) bool {
	if _, err := os.Stat(filepath); err != nil {
		return false
	}
	return true
}
// sortedKeys returns the integer keys of `m` in ascending order.
func sortedKeys(m map[int][]string) []int {
	keys := make([]int, 0, len(m))
	for pageNum := range m {
		keys = append(keys, pageNum)
	}
	sort.Ints(keys)
	return keys
}