unipdf/pdf/extractor/text_test.go

/*
 * This file is subject to the terms and conditions defined in
 * file 'LICENSE.md', which is part of this source code package.
 */

package extractor

import (
	"flag"
	"os"
	"path/filepath"
	"regexp"
	"sort"
	"strings"
	"testing"

	"github.com/unidoc/unidoc/common"
	"github.com/unidoc/unidoc/pdf/model"

	"golang.org/x/text/unicode/norm"
)

// NOTE: PDF test files are not kept in this repo, so locating them is best effort. Set the environment
// variable UNIDOC_EXTRACT_TESTDATA to point at the corpus directory.

var (
	// forceTest forces all tests to run. It is true when the environment variable
	// UNIDOC_EXTRACT_FORCETEST is set to exactly "1".
	forceTest = os.Getenv("UNIDOC_EXTRACT_FORCETEST") == "1"

	// corpusFolder is the directory holding the PDF test corpus, read from the environment
	// variable UNIDOC_EXTRACT_TESTDATA. It is empty when that variable is not set.
	corpusFolder = os.Getenv("UNIDOC_EXTRACT_TESTDATA")
)

// init installs an error-level console logger and sets isTesting when the "test.v" flag has been
// registered.
func init() {
	logger := common.NewConsoleLogger(common.LogLevelError)
	common.SetLogger(logger)

	if verboseFlag := flag.Lookup("test.v"); verboseFlag != nil {
		isTesting = true
	}
}

// TestTextExtractionFragments runs the extractor over small hand-written content streams and checks
// the extracted text. Every fragment shows the same two strings, drawn with a different text matrix
// or font, so every case expects the same output.
func TestTextExtractionFragments(t *testing.T) {
	// Mock page resources providing the two fonts the fragments refer to by name.
	resources := model.NewPdfPageResources()
	courier := model.NewStandard14FontMustCompile(model.CourierName)
	helvetica := model.NewStandard14FontMustCompile(model.HelveticaName)
	resources.SetFontByName("UniDocHelvetica", helvetica.ToPdfObject())
	resources.SetFontByName("UniDocCourier", courier.ToPdfObject())

	cases := []struct {
		name     string
		contents string
		want     string
	}{
		{
			name: "portrait",
			contents: `
        BT
        /UniDocCourier 24 Tf
        (Hello World!)Tj
        0 -10 Td
        (Doink)Tj
        ET
        `,
			want: "Hello World!\nDoink",
		},
		{
			name: "landscape",
			contents: `
        BT
        /UniDocCourier 24 Tf
        0 1 -1 0 0 0 Tm
        (Hello World!)Tj
        0 -10 Td
        (Doink)Tj
        ET
        `,
			want: "Hello World!\nDoink",
		},
		{
			name: "180 degree rotation",
			contents: `
        BT
        /UniDocCourier 24 Tf
        -1 0 0 -1 0 0 Tm
        (Hello World!)Tj
        0 -10 Td
        (Doink)Tj
        ET
        `,
			want: "Hello World!\nDoink",
		},
		{
			name: "Helvetica",
			contents: `
        BT
        /UniDocHelvetica 24 Tf
        0 -1 1 0 0 0 Tm
        (Hello World!)Tj
        0 -10 Td
        (Doink)Tj
        ET
        `,
			want: "Hello World!\nDoink",
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			ex := Extractor{resources: resources, contents: tc.contents}
			got, err := ex.ExtractText()
			// t.Fatalf stops the subtest, so no explicit return is needed after it.
			switch {
			case err != nil:
				t.Fatalf("Error extracting text: %q err=%v", tc.name, err)
			case got != tc.want:
				t.Fatalf("Text mismatch: %q Got %q. Expected %q", tc.name, got, tc.want)
			}
		})
	}
}

// TestTextExtractionFiles tests text extraction on a set of PDF files.
// It checks for the existence of specified strings of words on specified pages.
// We currently only check within lines as our line order is still improving.
//
// The test is reported as skipped when no corpus folder is configured and `forceTest` is false.
// Each entry of `fileExtractionTests` runs as a subtest named after its file.
func TestTextExtractionFiles(t *testing.T) {
	if len(corpusFolder) == 0 && !forceTest {
		// t.Skip marks the test as skipped instead of letting it show up as a pass.
		t.Skip("Corpus folder not set - skipping")
	}

	for _, test := range fileExtractionTests {
		t.Run(test.filename, func(t *testing.T) {
			testExtractFile(t, test.filename, test.expectedPageText)
		})
	}
}

// fileExtractionTests are the PDFs and texts we are looking for on specified pages.
//
// `filename` is joined onto the corpus folder and is also used as the subtest name, so it is written
// without a leading slash. `expectedPageText` maps a page number to the strings that must all be
// found in the text extracted from that page.
var fileExtractionTests = []struct {
	filename         string
	expectedPageText map[int][]string
}{
	{filename: "reader.pdf",
		expectedPageText: map[int][]string{
			1: []string{"A Research UNIX Reader:",
				"Annotated Excerpts from the Programmer’s Manual,",
				"1. Introduction",
				"To keep the size of this report",
				"last common ancestor of a radiative explosion",
			},
		},
	},
	{filename: "000026.pdf",
		expectedPageText: map[int][]string{
			1: []string{"Fresh Flower",
				"Care & Handling ",
			},
		},
	},
	{filename: "search_sim_key.pdf",
		expectedPageText: map[int][]string{
			2: []string{"A cryptographic scheme which enables searching",
				"Untrusted server should not be able to search for a word without authorization",
			},
		},
	},
	{filename: "Theil_inequality.pdf",
		expectedPageText: map[int][]string{
			1: []string{"London School of Economics and Political Science"},
			4: []string{"The purpose of this paper is to set Theil’s approach"},
		},
	},
	{filename: "8207.pdf",
		expectedPageText: map[int][]string{
			1: []string{"In building graphic systems for use with raster devices,"},
			2: []string{"The imaging model specifies how geometric shapes and colors are"},
			3: []string{"The transformation matrix T that maps application defined"},
		},
	},
	{filename: "ling-2013-0040ad.pdf",
		expectedPageText: map[int][]string{
			1: []string{"Although the linguistic variation among texts is continuous"},
			2: []string{"distinctions. For example, much of the research on spoken/written"},
		},
	},
	{filename: "26-Hazard-Thermal-environment.pdf",
		expectedPageText: map[int][]string{
			1: []string{"OHS Body of Knowledge"},
			2: []string{"Copyright notice and licence terms"},
		},
	},
	{filename: "Threshold_survey.pdf",
		expectedPageText: map[int][]string{
			1: []string{"clustering, entropy, object attributes, spatial correlation, and local"},
		},
	},
	{filename: "circ2.pdf",
		expectedPageText: map[int][]string{
			1: []string{"Understanding and complying with copyright law can be a challenge"},
		},
	},
	{filename: "rare_word.pdf",
		expectedPageText: map[int][]string{
			6: []string{"words in the test set, we increase the BLEU score"},
		},
	},
	{filename: "Planck_Wien.pdf",
		expectedPageText: map[int][]string{
			1: []string{"entropy of a system of n identical resonators in a stationary radiation field"},
		},
	},
	// Case where combineDiacritics was combining ' and " with preceding letters.
	// NOTE(peterwilliams97): Part of the reason this test fails is that we don't currently read
	// Type0:CIDFontType0 font metrics and assume zero displacement so that we place the ' and " too
	// close to the preceding letters.
	{filename: "rfc6962.txt.pdf",
		expectedPageText: map[int][]string{
			4: []string{
				"timestamps for certificates they then don't log",
				`The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD",`},
		},
	},
	// TODO(peterwilliams97): Reinstate these 2 tests when diacritic combination is fixed.
	// {filename: "Ito_Formula.pdf",
	// 	expectedPageText: map[int][]string{
	// 		1: []string{
	// 			"In the Itô stochastic calculus",
	// 			"In standard, non-stochastic calculus, one computes a derivative"},
	// 		2: []string{"Financial Economics Itô’s Formula"},
	// 	},
	// },
	// {filename: "thanh.pdf",
	// 	expectedPageText: map[int][]string{
	// 		1: []string{"Hàn Thé̂ Thành"},
	// 	},
	// },
}

// testExtractFile tests the ExtractTextWithStats text extractor on `filename` and compares the extracted
// text to `expectedPageText`, a map {page number: strings that must all appear on that page}.
//
// NOTE: We do a best effort at finding the PDF file because we don't keep PDF test files in this repo
// so you will need to set the environment variable UNIDOC_EXTRACT_TESTDATA to point at
// the corpus directory.
//
// If `filename` cannot be found in `corpusFolder` then the test is skipped, unless the `forceTest`
// global variable is true (e.g. setting environment variable UNIDOC_EXTRACT_FORCETEST = 1), in which
// case the test fails.
func testExtractFile(t *testing.T, filename string, expectedPageText map[int][]string) {
	// Named pdfPath rather than filepath so that the path/filepath package is not shadowed.
	pdfPath := filepath.Join(corpusFolder, filename)
	if !checkFileExists(pdfPath) {
		if forceTest {
			t.Fatalf("filename=%q does not exist", filename)
		}
		// Report a missing corpus file as a skip rather than letting the test pass silently.
		t.Skipf("%s not found", filename)
	}

	_, actualPageText := extractPageTexts(t, pdfPath)
	// Visit pages in ascending order so that the first reported failure is deterministic.
	for _, pageNum := range sortedKeys(expectedPageText) {
		actualText, ok := actualPageText[pageNum]
		if !ok {
			t.Fatalf("%q doesn't have page %d", filename, pageNum)
		}
		// Compare in NFKC form; containsSentences normalizes the expected strings the same way.
		actualText = norm.NFKC.String(actualText)
		if !containsSentences(t, expectedPageText[pageNum], actualText) {
			t.Fatalf("Text mismatch filepath=%q page=%d", pdfPath, pageNum)
		}
	}
}

// extractPageTexts opens the PDF `filename`, runs ExtractTextWithStats on each of its pages and
// returns the number of pages together with a map {page number: page text}.
// Page numbers start at 1 and each page's text is passed through reduceSpaces before being stored.
// Any error along the way fails the test through t.Fatalf.
func extractPageTexts(t *testing.T, filename string) (int, map[int]string) {
	file, err := os.Open(filename)
	if err != nil {
		t.Fatalf("Couldn't open filename=%q err=%v", filename, err)
	}
	defer file.Close()

	reader, err := model.NewPdfReader(file)
	if err != nil {
		t.Fatalf("NewPdfReader failed. filename=%q err=%v", filename, err)
	}

	pageCount, err := reader.GetNumPages()
	if err != nil {
		t.Fatalf("GetNumPages failed. filename=%q err=%v", filename, err)
	}

	texts := make(map[int]string)
	for n := 1; n <= pageCount; n++ {
		page, err := reader.GetPage(n)
		if err != nil {
			t.Fatalf("GetPage failed. filename=%q page=%d err=%v", filename, n, err)
		}

		pageExtractor, err := New(page)
		if err != nil {
			t.Fatalf("extractor.New failed. filename=%q page=%d err=%v", filename, n, err)
		}

		raw, _, _, err := pageExtractor.ExtractTextWithStats()
		if err != nil {
			t.Fatalf("ExtractTextWithStats failed. filename=%q page=%d err=%v", filename, n, err)
		}

		// TODO(peterwilliams97): Improve text extraction space insertion so we don't need reduceSpaces.
		texts[n] = reduceSpaces(raw)
	}
	return pageCount, texts
}

// containsSentences reports whether every string in `expectedSentences` occurs in `actualText`.
// Each expected string is NFKC-normalized before the substring search. The first string that is not
// found is reported with t.Errorf and the function returns false without checking the rest.
func containsSentences(t *testing.T, expectedSentences []string, actualText string) bool {
	for i := range expectedSentences {
		sentence := norm.NFKC.String(expectedSentences[i])
		if strings.Contains(actualText, sentence) {
			continue
		}
		t.Errorf("No match for %q", sentence)
		return false
	}
	return true
}

// reduceSpaces returns `text` with runs of spaces of any kind (spaces, tabs, line breaks, vertical
// tabs, Unicode spaces such as no-break space, etc) reduced to a single space, and with leading and
// trailing whitespace removed.
func reduceSpaces(text string) string {
	// Every whitespace run is a single " " after the replacement, so TrimSpace only has to drop at
	// most one space from each end.
	return strings.TrimSpace(reSpace.ReplaceAllString(text, " "))
}

// reSpace matches a run of whitespace. RE2's `\s` only covers [\t\n\f\r ], so vertical tab, NEL
// (U+0085) and the Unicode separator category (which contains no-break space) are listed explicitly.
var reSpace = regexp.MustCompile(`[\s\v\x{0085}\p{Z}]+`)

// checkFileExists reports whether os.Stat succeeds on `filepath`. Any Stat error, not only
// "does not exist", makes it return false.
func checkFileExists(filepath string) bool {
	if _, err := os.Stat(filepath); err != nil {
		return false
	}
	return true
}

// sortedKeys collects the keys of `m` and returns them in ascending order.
// The result is an empty, non-nil slice when `m` has no entries.
func sortedKeys(m map[int][]string) []int {
	pages := make([]int, len(m))
	i := 0
	for page := range m {
		pages[i] = page
		i++
	}
	sort.Sort(sort.IntSlice(pages))
	return pages
}
-												Add LICENSE.md with reference to AGPL and Commercial license.  Add license header info to code.

											
										
										
											2018-03-22 14:03:47 +00:00
+								/*
 								 * This file is subject to the terms and conditions defined in
 								 * file 'LICENSE.md', which is part of this source code package.
 								 */
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+								package extractor
-												Fixes in extractor testing

											
										
										
											2018-03-22 13:53:12 +00:00
+								import (
 									"flag"
-												Added test for position based text extraction

											
										
										
											2018-11-12 11:04:09 +11:00
+									"os"
-												Premultiply coordinate transforms to text matrix in text extraction.

											
										
										
											2018-11-26 08:09:52 +11:00
+									"path/filepath"
-												Handle missing widths in text extraction

											
										
										
											2018-11-20 15:49:28 +11:00
+									"regexp"
-												Added test for position based text extraction

											
										
										
											2018-11-12 11:04:09 +11:00
+									"sort"
 									"strings"
-												Fixes in extractor testing

											
										
										
											2018-03-22 13:53:12 +00:00
+									"testing"
-												Merge branch 'render.v3.hungarian' into extract

											
										
										
											2018-11-02 15:13:48 +11:00
 									"github.com/unidoc/unidoc/common"
-												Added test for position based text extraction

											
										
										
											2018-11-12 11:04:09 +11:00
+									"github.com/unidoc/unidoc/pdf/model"
-												Move model/fonts to model/internal/fonts - reducing export surface

- Move the folder
- Update imports
- Add type aliases to access needed types from model (fonts.StdFont, fonts.CharMetrics and the font names)

											
										
										
											2019-03-09 18:02:57 +00:00
-												Combine diacritics in text extraction.

											
										
										
											2018-11-28 18:06:03 +11:00
+									"golang.org/x/text/unicode/norm"
-												Fixes in extractor testing

											
										
										
											2018-03-22 13:53:12 +00:00
+								)
-												Addressing review comments

											
										
										
											2018-11-28 23:25:17 +00:00
+								// NOTE: We do a best effort at finding the PDF file because we don't keep PDF test files in this repo so you
 								// will need to setup UNIDOC_EXTRACT_TESTDATA to point at the corpus directory.
-												Premultiply coordinate transforms to text matrix in text extraction.

											
										
										
											2018-11-26 08:09:52 +11:00
 								// forceTest should be set to true to force running all tests.
-												Addressing review comments

											
										
										
											2018-11-28 23:25:17 +00:00
+								// NOTE: Setting environment variable UNIDOC_EXTRACT_FORCETEST = 1 sets this to true.
 								var forceTest = os.Getenv("UNIDOC_EXTRACT_FORCETEST") == "1"
-												Premultiply coordinate transforms to text matrix in text extraction.

											
										
										
											2018-11-26 08:09:52 +11:00
-												Addressing review comments

											
										
										
											2018-11-28 23:25:17 +00:00
+								var corpusFolder = os.Getenv("UNIDOC_EXTRACT_TESTDATA")
-												Premultiply coordinate transforms to text matrix in text extraction.

											
										
										
											2018-11-26 08:09:52 +11:00
-												Fixes in extractor testing

											
										
										
											2018-03-22 13:53:12 +00:00
+								func init() {
-												Fixes for text extraction corpus testing.

- Correct matrix multiplication order in text.go
- Look up standard 14 font widths after applying custom encoding.

											
										
										
											2018-11-18 17:21:30 +11:00
+									common.SetLogger(common.NewConsoleLogger(common.LogLevelError))
-												Fixes in extractor testing

											
										
										
											2018-03-22 13:53:12 +00:00
+									if flag.Lookup("test.v") != nil {
 										isTesting = true
 									}
 								}
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
-												Addressing review comments

											
										
										
											2018-11-28 23:25:17 +00:00
+								// TestTextExtractionFragments tests text extraction on the PDF fragments in `fragmentTests`.
 								func TestTextExtractionFragments(t *testing.T) {
 									fragmentTests := []struct {
 										name     string
 										contents string
 										text     string
 									}{
 										{
 											name: "portrait",
 											contents: `
-												Fixed orientation handling in text extraction.

											
										
										
											2018-11-26 17:17:17 +11:00
+								        BT
 								        /UniDocCourier 24 Tf
 								        (Hello World!)Tj
 -10 Td
 								        (Doink)Tj
 								        ET
 								        `,
-												Addressing review comments

											
										
										
											2018-11-28 23:25:17 +00:00
+											text: "Hello World!\nDoink",
 										},
 										{
 											name: "landscape",
 											contents: `
-												Fixed orientation handling in text extraction.

											
										
										
											2018-11-26 17:17:17 +11:00
+								        BT
 								        /UniDocCourier 24 Tf
 1 -1 0 0 0 Tm
 								        (Hello World!)Tj
 -10 Td
 								        (Doink)Tj
 								        ET
 								        `,
-												Addressing review comments

											
										
										
											2018-11-28 23:25:17 +00:00
+											text: "Hello World!\nDoink",
 										},
 										{
 											name: "180 degree rotation",
 											contents: `
-												Fixed orientation handling in text extraction.

											
										
										
											2018-11-26 17:17:17 +11:00
+								        BT
 								        /UniDocCourier 24 Tf
 								        -1 0 0 -1 0 0 Tm
 								        (Hello World!)Tj
 -10 Td
 								        (Doink)Tj
 								        ET
 								        `,
-												Addressing review comments

											
										
										
											2018-11-28 23:25:17 +00:00
+											text: "Hello World!\nDoink",
 										},
 										{
 											name: "Helvetica",
 											contents: `
-												Fixed orientation handling in text extraction.

											
										
										
											2018-11-26 17:17:17 +11:00
+								        BT
 								        /UniDocHelvetica 24 Tf
 -1 1 0 0 0 Tm
 								        (Hello World!)Tj
 -10 Td
 								        (Doink)Tj
 								        ET
 								        `,
-												Addressing review comments

											
										
										
											2018-11-28 23:25:17 +00:00
+											text: "Hello World!\nDoink",
 										},
 									}
-												Fixed orientation handling in text extraction.

											
										
										
											2018-11-26 17:17:17 +11:00
-												Addressing review comments

											
										
										
											2018-11-28 23:25:17 +00:00
+									// Setup mock resources.
 									resources := model.NewPdfPageResources()
 									{
-												Move model/fonts to model/internal/fonts - reducing export surface

- Move the folder
- Update imports
- Add type aliases to access needed types from model (fonts.StdFont, fonts.CharMetrics and the font names)

											
										
										
											2019-03-09 18:02:57 +00:00
+										courier := model.NewStandard14FontMustCompile(model.CourierName)
 										helvetica := model.NewStandard14FontMustCompile(model.HelveticaName)
-												Addressing review comments

											
										
										
											2018-11-28 23:25:17 +00:00
+										resources.SetFontByName("UniDocHelvetica", helvetica.ToPdfObject())
 										resources.SetFontByName("UniDocCourier", courier.ToPdfObject())
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+									}
-												Addressing review comments

											
										
										
											2018-11-28 23:25:17 +00:00
 									for _, f := range fragmentTests {
 										t.Run(f.name, func(t *testing.T) {
 											e := Extractor{resources: resources, contents: f.contents}
 											text, err := e.ExtractText()
 											if err != nil {
 												t.Fatalf("Error extracting text: %q err=%v", f.name, err)
 												return
 											}
 											if text != f.text {
 												t.Fatalf("Text mismatch: %q Got %q. Expected %q", f.name, text, f.text)
 												return
 											}
 										})
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+									}
 								}
-												Added test for position based text extraction

											
										
										
											2018-11-12 11:04:09 +11:00
-												Addressing review comments

											
										
										
											2018-11-28 23:25:17 +00:00
+								// TestTextExtractionFiles tests text extraction on a set of PDF files.
-												Premultiply coordinate transforms to text matrix in text extraction.

											
										
										
											2018-11-26 08:09:52 +11:00
+								// It checks for the existence of specified strings of words on specified pages.
 								// We currently only check within lines as our line order is still improving.
-												Addressing review comments

											
										
										
											2018-11-28 23:25:17 +00:00
+								func TestTextExtractionFiles(t *testing.T) {
 									if len(corpusFolder) == 0 && !forceTest {
 										t.Log("Corpus folder not set - skipping")
 										return
 									}
 									for _, test := range fileExtractionTests {
 										t.Run(test.filename, func(t *testing.T) {
 											testExtractFile(t, test.filename, test.expectedPageText)
 										})
-												Added test for position based text extraction

											
										
										
											2018-11-12 11:04:09 +11:00
+									}
 								}
-												Addressing review comments

											
										
										
											2018-11-28 23:25:17 +00:00
+								// fileExtractionTests are the PDFs and texts we are looking for on specified pages.
 								var fileExtractionTests = []struct {
-												Added test for position based text extraction

											
										
										
											2018-11-12 11:04:09 +11:00
+									filename         string
 									expectedPageText map[int][]string
 								}{
-												Premultiply coordinate transforms to text matrix in text extraction.

											
										
										
											2018-11-26 08:09:52 +11:00
+									{filename: "reader.pdf",
-												Added test for position based text extraction

											
										
										
											2018-11-12 11:04:09 +11:00
+										expectedPageText: map[int][]string{
 : []string{"A Research UNIX Reader:",
-												Fixes for text extraction corpus testing.

- Correct matrix multiplication order in text.go
- Look up standard 14 font widths after applying custom encoding.

											
										
										
											2018-11-18 17:21:30 +11:00
+												"Annotated Excerpts from the Programmer’s Manual,",
 												"1. Introduction",
-												Added test for position based text extraction

											
										
										
											2018-11-12 11:04:09 +11:00
+												"To keep the size of this report",
-												Fixes for text extraction corpus testing.

- Correct matrix multiplication order in text.go
- Look up standard 14 font widths after applying custom encoding.

											
										
										
											2018-11-18 17:21:30 +11:00
+												"last common ancestor of a radiative explosion",
 											},
 										},
 									},
-												Premultiply coordinate transforms to text matrix in text extraction.

											
										
										
											2018-11-26 08:09:52 +11:00
+									{filename: "000026.pdf",
-												Fixes for text extraction corpus testing.

- Correct matrix multiplication order in text.go
- Look up standard 14 font widths after applying custom encoding.

											
										
										
											2018-11-18 17:21:30 +11:00
+										expectedPageText: map[int][]string{
 : []string{"Fresh Flower",
 												"Care & Handling ",
-												Added test for position based text extraction

											
										
										
											2018-11-12 11:04:09 +11:00
+											},
 										},
 									},
-												Premultiply coordinate transforms to text matrix in text extraction.

											
										
										
											2018-11-26 08:09:52 +11:00
+									{filename: "search_sim_key.pdf",
-												Handle missing widths in text extraction

											
										
										
											2018-11-20 15:49:28 +11:00
+										expectedPageText: map[int][]string{
 : []string{"A cryptographic scheme which enables searching",
 												"Untrusted server should not be able to search for a word without authorization",
 											},
 										},
 									},
-												Premultiply coordinate transforms to text matrix in text extraction.

											
										
										
											2018-11-26 08:09:52 +11:00
+									{filename: "Theil_inequality.pdf",
-												Handle missing widths in text extraction

											
										
										
											2018-11-20 15:49:28 +11:00
+										expectedPageText: map[int][]string{
 : []string{"London School of Economics and Political Science"},
 : []string{"The purpose of this paper is to set Theil’s approach"},
 										},
 									},
-												Premultiply coordinate transforms to text matrix in text extraction.

											
										
										
											2018-11-26 08:09:52 +11:00
+									{filename: "8207.pdf",
-												Handle standard 14 TrueType fonts and stanard 14 font aliases in text extraction.

											
										
										
											2018-11-20 17:49:37 +11:00
+										expectedPageText: map[int][]string{
 : []string{"In building graphic systems for use with raster devices,"},
 : []string{"The imaging model specifies how geometric shapes and colors are"},
 : []string{"The transformation matrix T that maps application defined"},
 										},
 									},
-												Premultiply coordinate transforms to text matrix in text extraction.

											
										
										
											2018-11-26 08:09:52 +11:00
+									{filename: "ling-2013-0040ad.pdf",
-												Set font even when Tf operator is not between BT and ET.

											
										
										
											2018-11-21 13:14:11 +11:00
+										expectedPageText: map[int][]string{
 : []string{"Although the linguistic variation among texts is continuous"},
 : []string{"distinctions. For example, much of the research on spoken/written"},
 										},
 									},
-												Premultiply coordinate transforms to text matrix in text extraction.

											
										
										
											2018-11-26 08:09:52 +11:00
+									{filename: "26-Hazard-Thermal-environment.pdf",
-												Set font even when Tf operator is not between BT and ET.

											
										
										
											2018-11-21 13:14:11 +11:00
+										expectedPageText: map[int][]string{
 : []string{"OHS Body of Knowledge"},
 : []string{"Copyright notice and licence terms"},
 										},
 									},
-												Premultiply coordinate transforms to text matrix in text extraction.

											
										
										
											2018-11-26 08:09:52 +11:00
+									{filename: "Threshold_survey.pdf",
 										expectedPageText: map[int][]string{
 : []string{"clustering, entropy, object attributes, spatial correlation, and local"},
 										},
 									},
 									{filename: "circ2.pdf",
 										expectedPageText: map[int][]string{
 : []string{"Understanding and complying with copyright law can be a challenge"},
 										},
 									},
 									{filename: "rare_word.pdf",
 										expectedPageText: map[int][]string{
 : []string{"words in the test set, we increase the BLEU score"},
 										},
 									},
-												Fixed orientation handling in text extraction.

											
										
										
											2018-11-26 17:17:17 +11:00
+									{filename: "Planck_Wien.pdf",
 										expectedPageText: map[int][]string{
 : []string{"entropy of a system of n identical resonators in a stationary radiation field"},
 										},
 									},
-												Removed combineDiacritics from text extraction because it was causing ' and " to be combined with the letters proceeding them.
Need to fix this and reinstate combineDiacritics.

											
										
										
											2019-01-01 12:22:39 +11:00
+									// Case where combineDiacritics was combining ' and " with preceeding letters.
 									// NOTE(peterwilliams97): Part of the reason this test fails is that we don't currently read
 									// Type0:CIDFontType0 font metrics and assume zero displacemet so that we place the ' and " too
 									// close to the preceeding letters.
 									{filename: "/rfc6962.txt.pdf",
-												Combine diacritics in text extraction.

											
										
										
											2018-11-28 18:06:03 +11:00
+										expectedPageText: map[int][]string{
-												Removed combineDiacritics from text extraction because it was causing ' and " to be combined with the letters proceeding them.
Need to fix this and reinstate combineDiacritics.

											
										
										
											2019-01-01 12:22:39 +11:00
+: []string{
 												"timestamps for certificates they then don't log",
 												`The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD",`},
-												Combine diacritics in text extraction.

											
										
										
											2018-11-28 18:06:03 +11:00
+										},
 									},
-												Removed combineDiacritics from text extraction because it was causing ' and " to be combined with the letters proceeding them.
Need to fix this and reinstate combineDiacritics.

											
										
										
											2019-01-01 12:22:39 +11:00
+									// TODO(peterwilliams97): Reinstate these 2 tests when diacritic combination is fixed.
 									// {filename: "Ito_Formula.pdf",
 									// 	expectedPageText: map[int][]string{
 									// 		1: []string{
 									// 			"In the Itô stochastic calculus",
 									// 			"In standard, non-stochastic calculus, one computes a derivative"},
 									// 		2: []string{"Financial Economics Itô’s Formula"},
 									// 	},
 									// },
 									// {filename: "thanh.pdf",
 									// 	expectedPageText: map[int][]string{
 									// 		1: []string{"Hàn Thé̂ Thành"},
 									// 	},
 									// },
-												Added test for position based text extraction

											
										
										
											2018-11-12 11:04:09 +11:00
+								}
-												Addressing review comments

											
										
										
											2018-11-28 23:25:17 +00:00
+								// testExtractFile tests the ExtractTextWithStats text extractor on `filename` and compares the extracted
-												Premultiply coordinate transforms to text matrix in text extraction.

											
										
										
											2018-11-26 08:09:52 +11:00
+								// text to `expectedPageText`.
-												Addressing review comments

											
										
										
											2018-11-28 23:25:17 +00:00
+								//
 								// NOTE: We do a best effort at finding the PDF file because we don't keep PDF test files in this repo
 								// so you will need to set the environment variable UNIDOC_EXTRACT_TESTDATA to point at
 								// the corpus directory.
 								//
 								// If `filename` cannot be found in `corpusFolders` then the test is skipped unless `forceTest` global
 								// variable is true (e.g. setting environment variable UNIDOC_EXTRACT_FORCETESTS = 1).
 								func testExtractFile(t *testing.T, filename string, expectedPageText map[int][]string) {
 									filepath := filepath.Join(corpusFolder, filename)
 									exists := checkFileExists(filepath)
 									if !exists {
-												Premultiply coordinate transforms to text matrix in text extraction.

											
										
										
											2018-11-26 08:09:52 +11:00
+										if forceTest {
 											t.Fatalf("filename=%q does not exist", filename)
 										}
-												Addressing review comments

											
										
										
											2018-11-28 23:25:17 +00:00
+										t.Logf("%s not found", filename)
-												Premultiply coordinate transforms to text matrix in text extraction.

											
										
										
											2018-11-26 08:09:52 +11:00
+										return
 									}
-												Addressing review comments

											
										
										
											2018-11-28 23:25:17 +00:00
 									_, actualPageText := extractPageTexts(t, filepath)
-												Added test for position based text extraction

											
										
										
											2018-11-12 11:04:09 +11:00
+									for _, pageNum := range sortedKeys(expectedPageText) {
 										expectedSentences, ok := expectedPageText[pageNum]
 										actualText, ok := actualPageText[pageNum]
 										if !ok {
 											t.Fatalf("%q doesn't have page %d", filename, pageNum)
 										}
-												Combine diacritics in text extraction.

											
										
										
											2018-11-28 18:06:03 +11:00
+										actualText = norm.NFKC.String(actualText)
-												Added test for position based text extraction

											
										
										
											2018-11-12 11:04:09 +11:00
+										if !containsSentences(t, expectedSentences, actualText) {
-												Addressing review comments

											
										
										
											2018-11-28 23:25:17 +00:00
+											t.Fatalf("Text mismatch filepath=%q page=%d", filepath, pageNum)
-												Added test for position based text extraction

											
										
										
											2018-11-12 11:04:09 +11:00
+										}
 									}
 								}
-												Addressing review comments

											
										
										
											2018-11-28 23:25:17 +00:00
+								// extractPageTexts runs ExtractTextWithStats on all pages in PDF `filename` and returns the result as a map
-												Premultiply coordinate transforms to text matrix in text extraction.

											
										
										
											2018-11-26 08:09:52 +11:00
+								// {page number: page text}
-												Added test for position based text extraction

											
										
										
											2018-11-12 11:04:09 +11:00
+								func extractPageTexts(t *testing.T, filename string) (int, map[int]string) {
 									f, err := os.Open(filename)
 									if err != nil {
 										t.Fatalf("Couldn't open filename=%q err=%v", filename, err)
 									}
 									defer f.Close()
 									pdfReader, err := model.NewPdfReader(f)
 									if err != nil {
 										t.Fatalf("NewPdfReader failed. filename=%q err=%v", filename, err)
 									}
 									numPages, err := pdfReader.GetNumPages()
 									if err != nil {
 										t.Fatalf("GetNumPages failed. filename=%q err=%v", filename, err)
 									}
 									pageText := map[int]string{}
 									for pageNum := 1; pageNum <= numPages; pageNum++ {
 										page, err := pdfReader.GetPage(pageNum)
 										if err != nil {
 											t.Fatalf("GetPage failed. filename=%q page=%d err=%v", filename, pageNum, err)
 										}
 										ex, err := New(page)
 										if err != nil {
 											t.Fatalf("extractor.New failed. filename=%q page=%d err=%v", filename, pageNum, err)
 										}
-												Addressing review comments

											
										
										
											2018-11-28 23:25:17 +00:00
+										text, _, _, err := ex.ExtractTextWithStats()
-												Added test for position based text extraction

											
										
										
											2018-11-12 11:04:09 +11:00
+										if err != nil {
-												Addressing review comments

											
										
										
											2018-11-28 23:25:17 +00:00
+											t.Fatalf("ExtractTextWithStats failed. filename=%q page=%d err=%v", filename, pageNum, err)
-												Added test for position based text extraction

											
										
										
											2018-11-12 11:04:09 +11:00
+										}
-												Addressing review comments

											
										
										
											2018-11-28 23:25:17 +00:00
+										// TODO(peterwilliams97): Improve text extraction space insertion so we don't need reduceSpaces.
-												Handle missing widths in text extraction

											
										
										
											2018-11-20 15:49:28 +11:00
+										pageText[pageNum] = reduceSpaces(text)
-												Added test for position based text extraction

											
										
										
											2018-11-12 11:04:09 +11:00
+									}
 									return numPages, pageText
 								}
-												Handle missing widths in text extraction

											
										
										
											2018-11-20 15:49:28 +11:00
-												Premultiply coordinate transforms to text matrix in text extraction.

											
										
										
											2018-11-26 08:09:52 +11:00
+								// containsSentences returns true if all strings `expectedSentences` are contained in `actualText`.
 								func containsSentences(t *testing.T, expectedSentences []string, actualText string) bool {
 									for _, e := range expectedSentences {
-												Combine diacritics in text extraction.

											
										
										
											2018-11-28 18:06:03 +11:00
+										e = norm.NFKC.String(e)
-												Premultiply coordinate transforms to text matrix in text extraction.

											
										
										
											2018-11-26 08:09:52 +11:00
+										if !strings.Contains(actualText, e) {
-												Combine diacritics in text extraction.

											
										
										
											2018-11-28 18:06:03 +11:00
+											t.Errorf("No match for %q", e)
-												Premultiply coordinate transforms to text matrix in text extraction.

											
										
										
											2018-11-26 08:09:52 +11:00
+											return false
 										}
 									}
 									return true
 								}
-												Handle missing widths in text extraction

											
										
										
											2018-11-20 15:49:28 +11:00
+								// reduceSpaces returns `text` with runs of spaces of any kind (spaces, tabs, line breaks, etc)
 								// reduced to a single space.
 								func reduceSpaces(text string) string {
 									// Collapse each whitespace run to one space, then strip whitespace left at either end.
 									collapsed := reSpace.ReplaceAllString(text, " ")
 									return strings.Trim(collapsed, " \t\n\r\v")
 								}

 								// reSpace matches one or more consecutive characters of the regexp `\s` class.
 								var reSpace = regexp.MustCompile(`(?m)\s+`)
-												Premultiply coordinate transforms to text matrix in text extraction.

											
										
										
											2018-11-26 08:09:52 +11:00
-												Addressing review comments

											
										
										
											2018-11-28 23:25:17 +00:00
+								// checkFileExists returns true if `filepath` exists.
 								func checkFileExists(filepath string) bool {
 									// Any os.Stat error, whatever its cause, is reported as "does not exist".
 									if _, err := os.Stat(filepath); err != nil {
 										return false
 									}
 									return true
 								}
 								// sortedKeys collects every key of `m` and returns them in ascending order.
 								// An empty (or nil) map yields an empty, non-nil slice.
 								func sortedKeys(m map[int][]string) []int {
 									sorted := make([]int, len(m))
 									i := 0
 									for pageNum := range m {
 										sorted[i] = pageNum
 										i++
 									}
 									// Map iteration order is random, so sort for a stable result.
 									sort.Ints(sorted)
 									return sorted
 								}