unipdf/pdf/extractor/text_test.go

/*
 * This file is subject to the terms and conditions defined in
 * file 'LICENSE.md', which is part of this source code package.
 */

package extractor

import (
	"flag"
	"os"
	"regexp"
	"sort"
	"strings"
	"testing"

	"github.com/unidoc/unidoc/common"
	"github.com/unidoc/unidoc/pdf/model"
)

// init configures the package for test runs: it installs a console logger created with
// common.LogLevelError and sets isTesting when the "test.v" flag is registered.
//
// NOTE(review): from Go 1.13 onwards the testing flags are registered by testing.Init, which the
// generated test main calls after package init functions have run, so this lookup may find
// nothing on newer toolchains. Confirm that isTesting is still being set as intended.
func init() {
	common.SetLogger(common.NewConsoleLogger(common.LogLevelError))
	if verboseFlag := flag.Lookup("test.v"); verboseFlag != nil {
		isTesting = true
	}
}

// Fixtures used by TestTextExtraction1.
const (
	// testContents1 is the raw content stream assigned to Extractor.contents.
	testContents1 = `
    BT
    /UniDocCourier 24 Tf
    (Hello World!)Tj
    0 -10 Td
    (Doink)Tj
    ET
`

	// testExpected1 is the text ExtractText is expected to return for testContents1.
	testExpected1 = "Hello World!\nDoink"
)

// TestTextExtraction1 runs ExtractText on an Extractor whose contents are testContents1 and
// checks that the result equals testExpected1.
func TestTextExtraction1(t *testing.T) {
	ex := Extractor{contents: testContents1}

	got, err := ex.ExtractText()
	switch {
	case err != nil:
		t.Errorf("Error extracting text: %v", err)
	case got != testExpected1:
		t.Errorf("Text mismatch. Got %q. Expected %q", got, testExpected1)
	}
}

// TestTextExtraction2 checks every entry of extract2Tests with testExtract2.
//
// Each file runs as its own subtest named after the file. testExtract2 reports problems with
// t.Fatalf, so without subtests the first failing file would abort the whole table and hide the
// results for the remaining files.
func TestTextExtraction2(t *testing.T) {
	for _, test := range extract2Tests {
		// The subtest body runs synchronously inside t.Run, so capturing `test` is safe.
		t.Run(test.filename, func(t *testing.T) {
			testExtract2(t, test.filename, test.expectedPageText)
		})
	}
}

// extract2Tests is the table driving TestTextExtraction2. Each entry names a PDF file and, per
// page number, the sentences that must occur in the text extracted from that page.
var extract2Tests = []struct {
	filename         string
	expectedPageText map[int][]string
}{
	{
		filename: "testdata/reader.pdf",
		expectedPageText: map[int][]string{
			1: {
				"A Research UNIX Reader:",
				"Annotated Excerpts from the Programmer’s Manual,",
				"1. Introduction",
				"To keep the size of this report",
				"last common ancestor of a radiative explosion",
			},
		},
	},
	{
		filename: "testdata/000026.pdf",
		expectedPageText: map[int][]string{
			1: {
				"Fresh Flower",
				"Care & Handling ",
			},
		},
	},
	{
		filename: "testdata/search_sim_key.pdf",
		expectedPageText: map[int][]string{
			2: {
				"A cryptographic scheme which enables searching",
				"Untrusted server should not be able to search for a word without authorization",
			},
		},
	},
	{
		filename: "testdata/Theil_inequality.pdf",
		expectedPageText: map[int][]string{
			1: {"London School of Economics and Political Science"},
			4: {"The purpose of this paper is to set Theil’s approach"},
		},
	},
	{
		filename: "testdata/8207.pdf",
		expectedPageText: map[int][]string{
			1: {"In building graphic systems for use with raster devices,"},
			2: {"The imaging model specifies how geometric shapes and colors are"},
			3: {"The transformation matrix T that maps application defined"},
		},
	},
}

// testExtract2 extracts the text of every page of the PDF at `filename` and checks, for each page
// number in `expectedPageText` (visited in ascending order), that the page exists and that its
// text contains all of the expected sentences. The first problem found stops the test via
// t.Fatalf.
func testExtract2(t *testing.T, filename string, expectedPageText map[int][]string) {
	_, actualPageText := extractPageTexts(t, filename)
	for _, pageNum := range sortedKeys(expectedPageText) {
		// pageNum is one of expectedPageText's own keys, so only the lookup in the extracted
		// pages needs a presence check.
		expectedSentences := expectedPageText[pageNum]
		actualText, ok := actualPageText[pageNum]
		if !ok {
			t.Fatalf("%q doesn't have page %d", filename, pageNum)
		}
		if !containsSentences(t, expectedSentences, actualText) {
			t.Fatalf("Text mismatch filename=%q page=%d", filename, pageNum)
		}
	}
}

// containsSentences reports whether every string in `expectedSentences` occurs as a substring of
// `actualText`. At the first sentence that is missing it records a test error naming that
// sentence and returns false without checking the rest.
func containsSentences(t *testing.T, expectedSentences []string, actualText string) bool {
	for i := 0; i < len(expectedSentences); i++ {
		sentence := expectedSentences[i]
		if strings.Contains(actualText, sentence) {
			continue
		}
		t.Errorf("No match for %+q", sentence)
		return false
	}
	return true
}

// sortedKeys returns the keys of `m` in ascending order. The result is always non-nil, so an
// empty or nil map yields an empty slice.
func sortedKeys(m map[int][]string) []int {
	// The final length is known up front, so allocate once instead of growing on append.
	keys := make([]int, 0, len(m))
	for k := range m {
		keys = append(keys, k)
	}
	sort.Ints(keys)
	return keys
}

// extractPageTexts opens the PDF at `filename`, runs ExtractText2 on each of its pages and
// returns the page count together with a map from page number (starting at 1) to that page's
// text after reduceSpaces has been applied. Any failure along the way stops the calling test
// via t.Fatalf.
func extractPageTexts(t *testing.T, filename string) (int, map[int]string) {
	file, err := os.Open(filename)
	if err != nil {
		t.Fatalf("Couldn't open filename=%q err=%v", filename, err)
	}
	defer file.Close()

	reader, err := model.NewPdfReader(file)
	if err != nil {
		t.Fatalf("NewPdfReader failed. filename=%q err=%v", filename, err)
	}

	pageCount, err := reader.GetNumPages()
	if err != nil {
		t.Fatalf("GetNumPages failed. filename=%q err=%v", filename, err)
	}

	texts := make(map[int]string)
	for n := 1; n <= pageCount; n++ {
		page, err := reader.GetPage(n)
		if err != nil {
			t.Fatalf("GetPage failed. filename=%q page=%d err=%v", filename, n, err)
		}

		pageExtractor, err := New(page)
		if err != nil {
			t.Fatalf("extractor.New failed. filename=%q page=%d err=%v", filename, n, err)
		}

		// Only the text result is used; the two middle return values are discarded.
		raw, _, _, err := pageExtractor.ExtractText2()
		if err != nil {
			t.Fatalf("ExtractText2 failed. filename=%q page=%d err=%v", filename, n, err)
		}

		texts[n] = reduceSpaces(raw)
	}
	return pageCount, texts
}

// reSpace matches one or more consecutive whitespace characters. In Go's regexp syntax `\s` is
// the ASCII class [\t\n\f\r ], so a vertical tab is not part of a match.
var reSpace = regexp.MustCompile(`(?m)\s+`)

// reduceSpaces returns `text` with every run matched by reSpace (spaces, tabs, line breaks, form
// feeds) collapsed to a single space, and with leading and trailing spaces, tabs, line breaks and
// vertical tabs removed. Vertical tabs inside the text are left as they are.
func reduceSpaces(text string) string {
	// The replacement is a plain space with no `$` expansion, so the literal variant is equivalent.
	collapsed := reSpace.ReplaceAllLiteralString(text, " ")
	return strings.Trim(collapsed, " \t\n\r\v")
}
-												Add LICENSE.md with reference to AGPL and Commercial license.  Add license header info to code.

											
										
										
											2018-03-22 14:03:47 +00:00
+								/*
 								 * This file is subject to the terms and conditions defined in
 								 * file 'LICENSE.md', which is part of this source code package.
 								 */
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+								package extractor
-												Fixes in extractor testing

											
										
										
											2018-03-22 13:53:12 +00:00
+								import (
 									"flag"
-												Added test for position based text extraction

											
										
										
											2018-11-12 11:04:09 +11:00
+									"os"
-												Handle missing widths in text extraction

											
										
										
											2018-11-20 15:49:28 +11:00
+									"regexp"
-												Added test for position based text extraction

											
										
										
											2018-11-12 11:04:09 +11:00
+									"sort"
 									"strings"
-												Fixes in extractor testing

											
										
										
											2018-03-22 13:53:12 +00:00
+									"testing"
-												Merge branch 'render.v3.hungarian' into extract

											
										
										
											2018-11-02 15:13:48 +11:00
 									"github.com/unidoc/unidoc/common"
-												Added test for position based text extraction

											
										
										
											2018-11-12 11:04:09 +11:00
+									"github.com/unidoc/unidoc/pdf/model"
-												Fixes in extractor testing

											
										
										
											2018-03-22 13:53:12 +00:00
+								)
 								func init() {
-												Fixes for text extraction corpus testing.

- Correct matrix multiplication order in text.go
- Look up standard 14 font widths after applying custom encoding.

											
										
										
											2018-11-18 17:21:30 +11:00
+									common.SetLogger(common.NewConsoleLogger(common.LogLevelError))
-												Fixes in extractor testing

											
										
										
											2018-03-22 13:53:12 +00:00
+									if flag.Lookup("test.v") != nil {
 										isTesting = true
 									}
 								}
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
 								const testContents1 = `
-												Added NewStandard14Font() to make existing fonts.Font code work with *PdfFont

											
										
										
											2018-07-07 09:45:55 +10:00
+								    BT
-												Don't import core anonymously

											
										
										
											2018-07-15 17:22:00 +10:00
+								    /UniDocCourier 24 Tf
-												Added NewStandard14Font() to make existing fonts.Font code work with *PdfFont

											
										
										
											2018-07-07 09:45:55 +10:00
+								    (Hello World!)Tj
 -10 Td
 								    (Doink)Tj
 								    ET
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+								`
-												Added NewStandard14Font() to make existing fonts.Font code work with *PdfFont

											
										
										
											2018-07-07 09:45:55 +10:00
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+								const testExpected1 = "Hello World!\nDoink"
 								func TestTextExtraction1(t *testing.T) {
 									e := Extractor{}
 									e.contents = testContents1
 									s, err := e.ExtractText()
 									if err != nil {
 										t.Errorf("Error extracting text: %v", err)
 										return
 									}
 									if s != testExpected1 {
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+										t.Errorf("Text mismatch. Got %q. Expected %q", s, testExpected1)
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+										return
 									}
 								}
-												Added test for position based text extraction

											
										
										
											2018-11-12 11:04:09 +11:00
 								func TestTextExtraction2(t *testing.T) {
 									for _, test := range extract2Tests {
 										testExtract2(t, test.filename, test.expectedPageText)
 									}
 								}
 								var extract2Tests = []struct {
 									filename         string
 									expectedPageText map[int][]string
 								}{
-												Fixes for text extraction corpus testing.

- Correct matrix multiplication order in text.go
- Look up standard 14 font widths after applying custom encoding.

											
										
										
											2018-11-18 17:21:30 +11:00
+									{filename: "testdata/reader.pdf",
-												Added test for position based text extraction

											
										
										
											2018-11-12 11:04:09 +11:00
+										expectedPageText: map[int][]string{
 : []string{"A Research UNIX Reader:",
-												Fixes for text extraction corpus testing.

- Correct matrix multiplication order in text.go
- Look up standard 14 font widths after applying custom encoding.

											
										
										
											2018-11-18 17:21:30 +11:00
+												"Annotated Excerpts from the Programmer’s Manual,",
 												"1. Introduction",
-												Added test for position based text extraction

											
										
										
											2018-11-12 11:04:09 +11:00
+												"To keep the size of this report",
-												Fixes for text extraction corpus testing.

- Correct matrix multiplication order in text.go
- Look up standard 14 font widths after applying custom encoding.

											
										
										
											2018-11-18 17:21:30 +11:00
+												"last common ancestor of a radiative explosion",
 											},
 										},
 									},
 									{filename: "testdata/000026.pdf",
 										expectedPageText: map[int][]string{
 : []string{"Fresh Flower",
 												"Care & Handling ",
-												Added test for position based text extraction

											
										
										
											2018-11-12 11:04:09 +11:00
+											},
 										},
 									},
-												Handle missing widths in text extraction

											
										
										
											2018-11-20 15:49:28 +11:00
+									{filename: "testdata/search_sim_key.pdf",
 										expectedPageText: map[int][]string{
 : []string{"A cryptographic scheme which enables searching",
 												"Untrusted server should not be able to search for a word without authorization",
 											},
 										},
 									},
 									{filename: "testdata/Theil_inequality.pdf",
 										expectedPageText: map[int][]string{
 : []string{"London School of Economics and Political Science"},
 : []string{"The purpose of this paper is to set Theil’s approach"},
 										},
 									},
-												Handle standard 14 TrueType fonts and stanard 14 font aliases in text extraction.

											
										
										
											2018-11-20 17:49:37 +11:00
+									{filename: "testdata/8207.pdf",
 										expectedPageText: map[int][]string{
 : []string{"In building graphic systems for use with raster devices,"},
 : []string{"The imaging model specifies how geometric shapes and colors are"},
 : []string{"The transformation matrix T that maps application defined"},
 										},
 									},
-												Added test for position based text extraction

											
										
										
											2018-11-12 11:04:09 +11:00
+								}
 								func testExtract2(t *testing.T, filename string, expectedPageText map[int][]string) {
 									_, actualPageText := extractPageTexts(t, filename)
 									for _, pageNum := range sortedKeys(expectedPageText) {
 										expectedSentences, ok := expectedPageText[pageNum]
 										actualText, ok := actualPageText[pageNum]
 										if !ok {
 											t.Fatalf("%q doesn't have page %d", filename, pageNum)
 										}
 										if !containsSentences(t, expectedSentences, actualText) {
 											t.Fatalf("Text mismatch filename=%q page=%d", filename, pageNum)
 										}
 									}
 								}
 								func containsSentences(t *testing.T, expectedSentences []string, actualText string) bool {
 									for _, e := range expectedSentences {
-												Fixes for text extraction corpus testing.

- Correct matrix multiplication order in text.go
- Look up standard 14 font widths after applying custom encoding.

											
										
										
											2018-11-18 17:21:30 +11:00
+										if !strings.Contains(actualText, e) {
 											t.Errorf("No match for %+q", e)
-												Added test for position based text extraction

											
										
										
											2018-11-12 11:04:09 +11:00
+											return false
 										}
 									}
 									return true
 								}
 								func sortedKeys(m map[int][]string) []int {
 									keys := []int{}
 									for k := range m {
 										keys = append(keys, k)
 									}
 									sort.Ints(keys)
 									return keys
 								}
 								func extractPageTexts(t *testing.T, filename string) (int, map[int]string) {
 									f, err := os.Open(filename)
 									if err != nil {
 										t.Fatalf("Couldn't open filename=%q err=%v", filename, err)
 									}
 									defer f.Close()
 									pdfReader, err := model.NewPdfReader(f)
 									if err != nil {
 										t.Fatalf("NewPdfReader failed. filename=%q err=%v", filename, err)
 									}
 									numPages, err := pdfReader.GetNumPages()
 									if err != nil {
 										t.Fatalf("GetNumPages failed. filename=%q err=%v", filename, err)
 									}
 									pageText := map[int]string{}
 									for pageNum := 1; pageNum <= numPages; pageNum++ {
 										page, err := pdfReader.GetPage(pageNum)
 										if err != nil {
 											t.Fatalf("GetPage failed. filename=%q page=%d err=%v", filename, pageNum, err)
 										}
 										ex, err := New(page)
 										if err != nil {
 											t.Fatalf("extractor.New failed. filename=%q page=%d err=%v", filename, pageNum, err)
 										}
 										text, _, _, err := ex.ExtractText2()
 										if err != nil {
 											t.Fatalf("ExtractText2 failed. filename=%q page=%d err=%v", filename, pageNum, err)
 										}
-												Handle missing widths in text extraction

											
										
										
											2018-11-20 15:49:28 +11:00
+										pageText[pageNum] = reduceSpaces(text)
-												Added test for position based text extraction

											
										
										
											2018-11-12 11:04:09 +11:00
+									}
 									return numPages, pageText
 								}
-												Handle missing widths in text extraction

											
										
										
											2018-11-20 15:49:28 +11:00
 								// reduceSpaces returns `text` with runs of spaces of any kind (spaces, tabs, line breaks, etc)
 								// reduced to a single space.
 								func reduceSpaces(text string) string {
 									text = reSpace.ReplaceAllString(text, " ")
 									return strings.Trim(text, " \t\n\r\v")
 								}
 								var reSpace = regexp.MustCompile(`(?m)\s+`)