unipdf/internal/cmap/cmap_test.go

/*
 * This file is subject to the terms and conditions defined in
 * file 'LICENSE.md', which is part of this source code package.
 */

package cmap

import (
	"sort"
	"strings"
	"testing"
)

// init is currently a no-op. It is kept as a convenient place to switch on verbose logging
// while debugging the tests in this file.
func init() {
	// Uncomment when debugging to get debug or trace logging output.
	// NOTE(review): these calls use a `common` package that this file's import block does not
	// list, so the import has to be added along with uncommenting one of the lines.
	//common.SetLogger(common.NewConsoleLogger(common.LogLevelDebug))
	//common.SetLogger(common.NewConsoleLogger(common.LogLevelTrace))
}

// cmap1Data is the fixture for TestCMapParser1: a basic CMap named Adobe-Identity-UCS of
// CMapType 2. It declares a single 2-byte codespace range <0000>-<FFFF>, 8 individual bfchar
// mappings and 7 bfrange mappings.
const cmap1Data = `
	/CIDInit /ProcSet findresource begin
	12 dict begin
	begincmap
	/CIDSystemInfo
	<<  /Registry (Adobe)
	/Ordering (UCS)
	/Supplement 0
	>> def
	/CMapName /Adobe-Identity-UCS def
	/CMapType 2 def
	1 begincodespacerange
	<0000> <FFFF>
	endcodespacerange
	8 beginbfchar
	<0003> <0020>
	<0007> <0024>
	<0033> <0050>
	<0035> <0052>
	<0037> <0054>
	<005A> <0077>
	<005C> <0079>
	<005F> <007C>
	endbfchar
	7 beginbfrange
	<000F> <0017> <002C>
	<001B> <001D> <0038>
	<0025> <0026> <0042>
	<002F> <0031> <004C>
	<0044> <004C> <0061>
	<004F> <0053> <006C>
	<0055> <0057> <0072>
	endbfrange
	endcmap
	CMapName currentdict /CMap defineresource pop
	end
	end
`

// TestCMapParser1 loads cmap1Data and verifies the parsed name, type and codespace range, a
// sample of the bfchar/bfrange mappings, the result for an unmapped code, and the decoding of
// a short byte sequence.
func TestCMapParser1(t *testing.T) {
	parsed, loadErr := LoadCmapFromDataCID([]byte(cmap1Data))
	if loadErr != nil {
		t.Fatal("Failed: ", loadErr)
	}

	if name := parsed.Name(); name != "Adobe-Identity-UCS" {
		t.Fatalf("CMap name incorrect (%s)", name)
	}
	if parsed.Type() != 2 {
		t.Fatalf("CMap type incorrect")
	}

	// Exactly one codespace range is declared: <0000>-<FFFF>.
	if count := len(parsed.codespaces); count != 1 {
		t.Fatalf("len codespace != 1 (%d)", count)
	}
	if low := parsed.codespaces[0].Low; low != 0 {
		t.Fatalf("code space low range != 0 (%d)", low)
	}
	if high := parsed.codespaces[0].High; high != 0xFFFF {
		t.Fatalf("code space high range != 0xffff (%d)", high)
	}

	// Samples taken from the bfchar entries and from the start and interior of bfrange entries.
	wantRunes := map[CharCode]rune{
		0x0003:     0x0020,
		0x005F:     0x007C,
		0x000F:     0x002C,
		0x000F + 5: 0x002C + 5,
		0x001B:     0x0038,
		0x001B + 2: 0x0038 + 2,
		0x002F:     0x004C,
		0x0044:     0x0061,
		0x004F:     0x006C,
		0x0055:     0x0072,
	}
	for code, want := range wantRunes {
		got, found := parsed.CharcodeToUnicode(code)
		if found && got == want {
			continue
		}
		t.Fatalf("incorrect mapping, expecting 0x%X ➞ 0x%X (%#v)", code, want, got)
	}

	// 0x99 has no entry in cmap1Data. Only the returned rune is checked; the second result is
	// deliberately not inspected here.
	if unmapped, _ := parsed.CharcodeToUnicode(0x99); unmapped != MissingCodeRune { //!= "notdef" {
		t.Fatalf("Unmapped code, expected to map to undefined")
	}

	// Two 2-byte codes: 0x0003 ➞ ' ' and 0x000F ➞ ','. The second result is deliberately not
	// inspected; only the decoded text is under test.
	decoded, _ := parsed.CharcodeBytesToUnicode([]byte{0x00, 0x03, 0x00, 0x0F})
	if decoded != " ," {
		t.Fatal("Incorrect charcode bytes ➞ string mapping")
	}
}

// cmap2Data is the fixture for TestCMapParser2. It declares a single 2-byte codespace range
// <0000>-<FFFF> and bfrange mappings whose codes include values with a zero high byte
// (<0080>-<00FF>) as well as values with a non-zero high byte (<802F>-<902F>).
// NOTE(review): the section header says "7 beginbfrange" but only two ranges are listed;
// TestCMapParser2 expects the data to load without error regardless.
const cmap2Data = `
	/CIDInit /ProcSet findresource begin
	12 dict begin
	begincmap
	/CIDSystemInfo
	<<  /Registry (Adobe)
	/Ordering (UCS)
	/Supplement 0
	>> def
	/CMapName /Adobe-Identity-UCS def
	/CMapType 2 def
	1 begincodespacerange
	<0000> <FFFF>
	endcodespacerange
	7 beginbfrange
	<0080> <00FF> <002C>
	<802F> <902F> <0038>
	endbfrange
	endcmap
	CMapName currentdict /CMap defineresource pop
	end
	end
`

// TestCMapParser2 is a regression test for 2-byte character codes whose high byte is 0
// (e.g. 0x0080): the character map used to ignore the number of bytes of the input code and
// therefore mishandled such codes.
func TestCMapParser2(t *testing.T) {
	parsed, loadErr := LoadCmapFromDataCID([]byte(cmap2Data))
	if loadErr != nil {
		t.Fatal("Failed: ", loadErr)
	}

	if name := parsed.Name(); name != "Adobe-Identity-UCS" {
		t.Fatalf("CMap name incorrect (%s)", name)
	}
	if parsed.Type() != 2 {
		t.Fatalf("CMap type incorrect")
	}

	// Exactly one codespace range is declared: <0000>-<FFFF>.
	if count := len(parsed.codespaces); count != 1 {
		t.Fatalf("len codespace != 1 (%d)", count)
	}
	if low := parsed.codespaces[0].Low; low != 0 {
		t.Fatalf("code space low range != 0 (%d)", low)
	}
	if high := parsed.codespaces[0].High; high != 0xFFFF {
		t.Fatalf("code space high range != 0xffff (%d)", high)
	}

	// First code of each bfrange in cmap2Data.
	wantRunes := map[CharCode]rune{
		0x0080: 0x002C,
		0x802F: 0x0038,
	}
	for code, want := range wantRunes {
		got, found := parsed.CharcodeToUnicode(code)
		if found && got == want {
			continue
		}
		t.Fatalf("incorrect mapping, expecting 0x%X ➞ 0x%X (got 0x%X)", code, want, got)
	}

	// Check byte sequence mappings.
	sequences := []struct {
		input []byte
		want  string
	}{
		{[]byte{0x80, 0x2F, 0x00, 0x80}, string([]rune{0x0038, 0x002C})},
	}
	for _, seq := range sequences {
		// The second result is deliberately not inspected; only the decoded text is under test.
		got, _ := parsed.CharcodeBytesToUnicode(seq.input)
		if got == seq.want {
			continue
		}
		t.Fatalf("Incorrect byte sequence mapping % X ➞ % X (got % X)",
			seq.input, []rune(seq.want), []rune(got))
	}
}

// cmapData3 is the fixture for TestCMapParser3: a CMap named test-1 of CMapType 1 with a
// mixture of 1 and 2 byte codespaces. Two of the four codespace ranges are 1-byte
// (<00>-<80>, <a0>-<d0>) and two are 2-byte (<8100>-<9fff>, <d140>-<fbfc>).
// NOTE(review): the section header says "7 beginbfrange" but only four ranges are listed;
// TestCMapParser3 expects the data to load without error regardless.
const cmapData3 = `
	/CIDInit /ProcSet findresource begin
	12 dict begin begincmap
	/CIDSystemInfo
	3 dict dup begin
	/Registry (Adobe) def
	/Supplement 2 def
	end def

	/CMapName /test-1 def
	/CMapType 1 def

	4 begincodespacerange
	<00> <80>
	<8100> <9fff>
	<a0> <d0>
	<d140> <fbfc>
	endcodespacerange
	7 beginbfrange
	<00> <80> <10>
	<8100> <9f00> <1000>
	<a0> <d0> <90>
	<d140> <f000> <a000>
	endbfrange
	endcmap
`

// TestCMapParser3 covers a CMap whose codespace ranges mix 1-byte and 2-byte codes. It checks
// the parsed codespaces, individual code mappings, and the decoding of a byte sequence that
// interleaves codes of both widths.
func TestCMapParser3(t *testing.T) {
	parsed, loadErr := LoadCmapFromDataCID([]byte(cmapData3))
	if loadErr != nil {
		t.Fatal("Failed: ", loadErr)
	}

	if name := parsed.Name(); name != "test-1" {
		t.Fatalf("CMap name incorrect (%s)", name)
	}
	if parsed.Type() != 1 {
		t.Fatalf("CMap type incorrect")
	}

	// Check codespaces. The expected order lists the 1-byte ranges before the 2-byte ranges,
	// which differs from the order in which cmapData3 declares them.
	wantSpaces := []Codespace{
		{1, 0x00, 0x80},
		{1, 0xa0, 0xd0},
		{2, 0x8100, 0x9fff},
		{2, 0xd140, 0xfbfc},
	}
	if len(parsed.codespaces) != len(wantSpaces) {
		t.Fatalf("len codespace != %d (%d)", len(wantSpaces), len(parsed.codespaces))
	}
	for i, want := range wantSpaces {
		got := parsed.codespaces[i]
		switch {
		case got.NumBytes != want.NumBytes:
			t.Fatalf("code space number of bytes != %d (%d) %x", want.NumBytes, got.NumBytes, want)
		case got.Low != want.Low:
			t.Fatalf("code space low range != %d (%d) %x", want.Low, got.Low, want)
		case got.High != want.High:
			t.Fatalf("code space high range != 0x%X (0x%X) %x", want.High, got.High, want)
		}
	}

	// Check mappings.
	wantRunes := map[CharCode]rune{
		0x80:   0x10 + 0x80,
		0x8100: 0x1000,
		0xa0:   0x90,
		0xd140: 0xa000,
	}
	for code, want := range wantRunes {
		got, found := parsed.CharcodeToUnicode(code)
		if found && got == want {
			continue
		}
		t.Fatalf("incorrect mapping: expecting 0x%02X ➞ 0x%02X (got 0x%02X)", code, want, got)
	}

	// Check byte sequence mappings. The input is expected to split into the codes
	// 0x80, 0x8100, 0xa1, 0xd180 and 0x00.
	sequences := []struct {
		input []byte
		want  string
	}{
		{
			[]byte{0x80, 0x81, 0x00, 0xa1, 0xd1, 0x80, 0x00},
			string([]rune{
				0x90,
				0x1000,
				0x91,
				0xa000 + 0x40,
				0x10,
			}),
		},
	}
	for _, seq := range sequences {
		// The second result is deliberately not inspected; only the decoded text is under test.
		got, _ := parsed.CharcodeBytesToUnicode(seq.input)
		if got == seq.want {
			continue
		}
		t.Fatalf("Incorrect byte sequence mapping: % 02X ➞ % 02X (got % 02X)",
			seq.input, []rune(seq.want), []rune(got))
	}
}

// cmap4Data is the fixture for TestCMapParser4: a CMap with some UTF-16 encoded unicode
// strings that contain surrogates. Several bfchar targets are 4-byte values of the form
// <D835DCxx>, i.e. a high/low surrogate pair, alongside ordinary 2-byte targets.
const cmap4Data = `
    /CIDInit /ProcSet findresource begin
    11 dict begin
    begincmap
    /CIDSystemInfo
    << /Registry (Adobe)
    /Ordering (UCS)
    /Supplement 0
    >> def
    /CMapName /Adobe-Identity-UCS def
    /CMapType 2 def
    1 begincodespacerange
    <0000> <FFFF>
    endcodespacerange
    15 beginbfchar
    <01E1> <002C>
    <0201> <007C>
    <059C> <21D2>
    <05CA> <2200>
    <05CC> <2203>
    <05D0> <2208>
    <0652> <2295>
    <073F> <D835DC50>
    <0749> <D835DC5A>
    <0889> <D835DC84>
    <0893> <D835DC8E>
    <08DD> <D835DC9E>
    <08E5> <D835DCA6>
    <08E7> <2133>
    <0D52> <2265>
    endbfchar
    1 beginbfrange
    <0E36> <0E37> <27F5>
    endbfrange
    endcmap
`

// TestCMapParser4 checks that UTF-16 encoded unicode strings, including surrogate pairs, are
// interpreted correctly and yield the corresponding supplementary-plane runes.
func TestCMapParser4(t *testing.T) {
	parsed, loadErr := LoadCmapFromDataCID([]byte(cmap4Data))
	if loadErr != nil {
		t.Fatal("Failed to load CMap: ", loadErr)
	}

	if name := parsed.Name(); name != "Adobe-Identity-UCS" {
		t.Fatalf("CMap name incorrect (%s)", name)
	}
	if parsed.Type() != 2 {
		t.Fatalf("CMap type incorrect")
	}

	// Exactly one codespace range is declared: <0000>-<FFFF>.
	if count := len(parsed.codespaces); count != 1 {
		t.Fatalf("len codespace != 1 (%d)", count)
	}
	if low := parsed.codespaces[0].Low; low != 0 {
		t.Fatalf("code space low range != 0 (%d)", low)
	}
	if high := parsed.codespaces[0].High; high != 0xFFFF {
		t.Fatalf("code space high range != 0xffff (%d)", high)
	}

	// Codes whose targets are written as surrogate pairs in cmap4Data.
	wantRunes := map[CharCode]rune{
		0x0889: '\U0001d484', // `𝒄`
		0x0893: '\U0001d48e', // `𝒎`
		0x08DD: '\U0001d49e', // `𝒞`
		0x08E5: '\U0001d4a6', // `𝒦`
	}
	for code, want := range wantRunes {
		got, found := parsed.CharcodeToUnicode(code)
		if found && got == want {
			continue
		}
		t.Fatalf("incorrect mapping, expecting 0x%04X ➞ %+q (got %+q)", code, want, got)
	}

	// Check byte sequence mappings.
	sequences := []struct {
		input []byte
		want  string
	}{
		{[]byte{0x07, 0x3F, 0x07, 0x49}, "\U0001d450\U0001d45a"}, // `𝑐𝑚`
		{[]byte{0x08, 0x89, 0x08, 0x93}, "\U0001d484\U0001d48e"}, // `𝒄𝒎`
		{[]byte{0x08, 0xDD, 0x08, 0xE5}, "\U0001d49e\U0001d4a6"}, // `𝒞𝒦`
		{[]byte{0x08, 0xE7, 0x0D, 0x52}, "\u2133\u2265"},         // `ℳ≥`
	}
	for _, seq := range sequences {
		// The second result is deliberately not inspected; only the decoded text is under test.
		got, _ := parsed.CharcodeBytesToUnicode(seq.input)
		if got == seq.want {
			continue
		}
		t.Fatalf("Incorrect byte sequence mapping % 02X ➞ %+q (got %+q)",
			seq.input, seq.want, got)
	}
}

// Character code ➞ rune tables used as inputs for the CMap creation tests below.
var (
	// codeToUnicode1 is the input for TestBfData and one of the inputs for TestCMapCreation.
	// It mixes isolated codes with runs of consecutive codes; bfData1 lists the bfchar and
	// bfrange sections expected for it.
	codeToUnicode1 = map[CharCode]rune{ // 40 entries
		0x02ca: 'ˊ',
		0x02cb: 'ˋ',
		0x02cd: 'ˍ',
		0x039c: 'Μ',
		0x039d: 'Ν',
		0x039e: 'Ξ',
		0x039f: 'Ο',
		0x03a0: 'Π',
		0x03a1: 'Ρ',
		0x03a6: 'Φ',
		0x03b1: 'α',
		0x03b2: 'β',
		0x03b3: 'γ',
		0x03b4: 'δ',
		0x03b5: 'ε',
		0x03b6: 'ζ',
		0x03b7: 'η',
		0x03c6: 'φ',
		0x03c7: 'χ',
		0x03c9: 'ω',
		0x2013: '–',
		0x2014: '—',
		0x2018: '‘',
		0x2019: '’',
		0x203e: '‾',
		0x20ac: '€',
		0x2163: 'Ⅳ',
		0x2164: 'Ⅴ',
		0x2165: 'Ⅵ',
		0x2166: 'Ⅶ',
		0x2167: 'Ⅷ',
		0x2168: 'Ⅸ',
		0x2169: 'Ⅹ',
		0x2190: '←',
		0x2191: '↑',
		0x2192: '→',
		0x2193: '↓',
		0x2220: '∠',
		0x2223: '∣',
		0x222a: '∪',
	}

	// codeToUnicode2 is a second input for TestCMapCreation.
	// NOTE(review): each key appears to equal the code point of its rune — confirm if relied on.
	codeToUnicode2 = map[CharCode]rune{ // 40 entries
		0x0100: 'Ā',
		0x0101: 'ā',
		0x0102: 'Ă',
		0x0111: 'đ',
		0x0112: 'Ē',
		0x0113: 'ē',
		0x0114: 'Ĕ',
		0x0115: 'ĕ',
		0x0116: 'Ė',
		0x011b: 'ě',
		0x0126: 'Ħ',
		0x0127: 'ħ',
		0x0128: 'Ĩ',
		0x0129: 'ĩ',
		0x012a: 'Ī',
		0x012b: 'ī',
		0x012c: 'Ĭ',
		0x013b: 'Ļ',
		0x013c: 'ļ',
		0x013e: 'ľ',
		0x013f: 'Ŀ',
		0x0140: 'ŀ',
		0x0141: 'Ł',
		0x0150: 'Ő',
		0x0151: 'ő',
		0x0152: 'Œ',
		0x0153: 'œ',
		0x0154: 'Ŕ',
		0x0155: 'ŕ',
		0x015a: 'Ś',
		0x0165: 'ť',
		0x0166: 'Ŧ',
		0x0167: 'ŧ',
		0x0168: 'Ũ',
		0x0169: 'ũ',
		0x016a: 'Ū',
		0x016b: 'ū',
		0x017a: 'ź',
		0x017b: 'Ż',
		0x017d: 'Ž',
	}
	// codeToUnicode3 is a third, larger input for TestCMapCreation.
	// NOTE(review): each key appears to equal the code point of its rune — confirm if relied on.
	codeToUnicode3 = map[CharCode]rune{ // 93 entries
		0x0124: 'Ĥ',
		0x0125: 'ĥ',
		0x0126: 'Ħ',
		0x0127: 'ħ',
		0x0134: 'Ĵ',
		0x0135: 'ĵ',
		0x0136: 'Ķ',
		0x0137: 'ķ',
		0x0138: 'ĸ',
		0x0144: 'ń',
		0x0145: 'Ņ',
		0x0146: 'ņ',
		0x0147: 'Ň',
		0x0154: 'Ŕ',
		0x0155: 'ŕ',
		0x0156: 'Ŗ',
		0x0157: 'ŗ',
		0x0164: 'Ť',
		0x0169: 'ũ',
		0x0174: 'Ŵ',
		0x0175: 'ŵ',
		0x0176: 'Ŷ',
		0x0177: 'ŷ',
		0x0184: 'Ƅ',
		0x0185: 'ƅ',
		0x0186: 'Ɔ',
		0x0187: 'Ƈ',
		0x0194: 'Ɣ',
		0x019a: 'ƚ',
		0x01a4: 'Ƥ',
		0x01a5: 'ƥ',
		0x01a6: 'Ʀ',
		0x01a7: 'Ƨ',
		0x01b4: 'ƴ',
		0x01b5: 'Ƶ',
		0x01b6: 'ƶ',
		0x01b7: 'Ʒ',
		0x01c4: 'Ǆ',
		0x01cb: 'ǋ',
		0x01d4: 'ǔ',
		0x01d5: 'Ǖ',
		0x01d6: 'ǖ',
		0x01d7: 'Ǘ',
		0x01e4: 'Ǥ',
		0x01e5: 'ǥ',
		0x01e6: 'Ǧ',
		0x01e7: 'ǧ',
		0x01f4: 'Ǵ',
		0x01f5: 'ǵ',
		0x0204: 'Ȅ',
		0x0205: 'ȅ',
		0x0206: 'Ȇ',
		0x0207: 'ȇ',
		0x0214: 'Ȕ',
		0x0215: 'ȕ',
		0x0216: 'Ȗ',
		0x0217: 'ȗ',
		0x0224: 'Ȥ',
		0x0226: 'Ȧ',
		0x0227: 'ȧ',
		0x0254: 'ɔ',
		0x0255: 'ɕ',
		0x0256: 'ɖ',
		0x0257: 'ɗ',
		0x0264: 'ɤ',
		0x0265: 'ɥ',
		0x0266: 'ɦ',
		0x0267: 'ɧ',
		0x0273: 'ɳ',
		0x0274: 'ɴ',
		0x0275: 'ɵ',
		0x0276: 'ɶ',
		0x0277: 'ɷ',
		0x0284: 'ʄ',
		0x0285: 'ʅ',
		0x0286: 'ʆ',
		0x0287: 'ʇ',
		0x0294: 'ʔ',
		0x0296: 'ʖ',
		0x0297: 'ʗ',
		0x02a4: 'ʤ',
		0x02a5: 'ʥ',
		0x02c6: 'ˆ',
		0x02c7: 'ˇ',
		0x0304: '̄',
		0x0305: '̅',
		0x0306: '̆',
		0x0307: '̇',
		0x030d: '̍',
		0x0314: '̔',
		0x0315: '̕',
		0x0316: '̖',
		0x0317: '̗',
	}
)

// bfData1 is the bfchar/bfrange text that TestBfData expects toBfData to produce for
// codeToUnicode1: 8 isolated codes as bfchar entries followed by 8 runs of consecutive codes
// as bfrange entries. TestBfData trims the leading and trailing newlines before comparing, so
// every other byte here, including spacing, is significant.
const bfData1 = `
8 beginbfchar
<02cd> <02cd>
<03a6> <03a6>
<03c9> <03c9>
<203e> <203e>
<20ac> <20ac>
<2220> <2220>
<2223> <2223>
<222a> <222a>
endbfchar
8 beginbfrange
<02ca><02cb> <02ca>
<039c><03a1> <039c>
<03b1><03b7> <03b1>
<03c6><03c7> <03c6>
<2013><2014> <2013>
<2018><2019> <2018>
<2163><2169> <2163>
<2190><2193> <2190>
endbfrange
`

// TestBfData checks that cmap.toBfData produces the expected output.
func TestBfData(t *testing.T) {
	cmap := NewToUnicodeCMap(codeToUnicode1)

	bfDataExpected := strings.Trim(bfData1, "\n")
	bfDataTest := cmap.toBfData()

	if bfDataTest != bfDataExpected {
		t.Errorf("Incorrect bfData")
		return
	}
}

// TestBfData checks that cmap.toBfData produces the expected output.
func TestCMapCreation(t *testing.T) {
	checkCmapWriteRead(t, codeToUnicode1)
	checkCmapWriteRead(t, codeToUnicode2)
	checkCmapWriteRead(t, codeToUnicode3)
}

// checkCmapWriteRead creates CMap data from `codeToUnicode`, parses that data back, and checks
// that the parsed CMap holds exactly the same code ➞ rune mappings.
//
// Failures are reported on `t`; the function returns after the first mismatch.
func checkCmapWriteRead(t *testing.T, codeToUnicode map[CharCode]rune) {
	// Attribute failures to the calling test rather than to this helper.
	t.Helper()

	cmap0 := NewToUnicodeCMap(codeToUnicode)

	data := cmap0.Bytes()
	cmap, err := LoadCmapFromDataCID(data)
	if err != nil {
		t.Error("Failed to load CMap: ", err)
		return
	}

	// Compare sizes first: the loop below indexes both sorted key slices in lock step and
	// relies on them having equal length. The expected size is that of the table passed in,
	// not of any particular package-level table.
	if len(cmap.codeToUnicode) != len(codeToUnicode) {
		t.Errorf("Incorrect length. expected=%d test=%d", len(codeToUnicode), len(cmap.codeToUnicode))
		return
	}

	// Map iteration order is random, so sort the keys of both maps to compare them pairwise.
	codes0 := make([]CharCode, 0, len(codeToUnicode))
	for code := range codeToUnicode {
		codes0 = append(codes0, code)
	}
	sort.Slice(codes0, func(i, j int) bool { return codes0[i] < codes0[j] })

	codes := make([]CharCode, 0, len(cmap.codeToUnicode))
	for code := range cmap.codeToUnicode {
		codes = append(codes, code)
	}
	sort.Slice(codes, func(i, j int) bool { return codes[i] < codes[j] })

	for i, code := range codes0 {
		if code != codes[i] {
			t.Errorf("Code mismatch: i=%d expected=0x%04x test=0x%04x", i, code, codes[i])
			return
		}
		u0 := codeToUnicode[code]
		u := cmap.codeToUnicode[code]
		if u != u0 {
			t.Errorf("Unicode mismatch: i=%d code0=0x%04x expected=%q test=%q", i, code, u0, u)
			return
		}
	}
}
-												Add LICENSE.md with reference to AGPL and Commercial license.  Add license header info to code.

											
										
										
											2018-03-22 14:03:47 +00:00
+								/*
 								 * This file is subject to the terms and conditions defined in
 								 * file 'LICENSE.md', which is part of this source code package.
 								 */
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+								package cmap
 								import (
-												Added tests for CMap creation

											
										
										
											2018-09-21 15:39:31 +10:00
+									"sort"
 									"strings"
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+									"testing"
 								)
 								func init() {
-												Disable trace logging when running tests

											
										
										
											2018-08-01 13:19:05 +00:00
+									// Uncomment when debugging to get debug or trace logging output.
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+									//common.SetLogger(common.NewConsoleLogger(common.LogLevelDebug))
-												Disable trace logging when running tests

											
										
										
											2018-08-01 13:19:05 +00:00
+									//common.SetLogger(common.NewConsoleLogger(common.LogLevelTrace))
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+								}
-												Track number of bytes per character code for mappings.  Fixes problem posed in PR #156  in a generic fashion.

											
										
										
											2018-06-03 01:05:46 +00:00
+								// cmap1Data represents a basic CMap.
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+								const cmap1Data = `
-												Major changes to font code
- Added Type1 font parsing.
- Added Standard 14 font parsing.
- Fixed some bugs in cmap code.
- Started re-structuring of font code. Moved common font fields to `fontSkeleton`

											
										
										
											2018-06-27 12:25:59 +10:00
+									/CIDInit /ProcSet findresource begin
 dict begin
 									begincmap
 									/CIDSystemInfo
 									<<  /Registry (Adobe)
 									/Ordering (UCS)
 									/Supplement 0
 									>> def
 									/CMapName /Adobe-Identity-UCS def
 									/CMapType 2 def
 begincodespacerange
 									<0000> <FFFF>
 									endcodespacerange
 beginbfchar
 									<0003> <0020>
 									<0007> <0024>
 									<0033> <0050>
 									<0035> <0052>
 									<0037> <0054>
 									<005A> <0077>
 									<005C> <0079>
 									<005F> <007C>
 									endbfchar
 beginbfrange
 									<000F> <0017> <002C>
 									<001B> <001D> <0038>
 									<0025> <0026> <0042>
 									<002F> <0031> <004C>
 									<0044> <004C> <0061>
 									<004F> <0053> <006C>
 									<0055> <0057> <0072>
 									endbfrange
 									endcmap
 									CMapName currentdict /CMap defineresource pop
 									end
 									end
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+								`
-												Track number of bytes per character code for mappings.  Fixes problem posed in PR #156  in a generic fashion.

											
										
										
											2018-06-03 01:05:46 +00:00
+								// TestCMapParser tests basic loading of a simple CMap.
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+								func TestCMapParser1(t *testing.T) {
-												Major changes to font code
- Added Type1 font parsing.
- Added Standard 14 font parsing.
- Fixed some bugs in cmap code.
- Started re-structuring of font code. Moved common font fields to `fontSkeleton`

											
										
										
											2018-06-27 12:25:59 +10:00
+									cmap, err := LoadCmapFromDataCID([]byte(cmap1Data))
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+									if err != nil {
 										t.Error("Failed: ", err)
 										return
 									}
 									if cmap.Name() != "Adobe-Identity-UCS" {
 										t.Errorf("CMap name incorrect (%s)", cmap.Name())
 										return
 									}
 									if cmap.Type() != 2 {
 										t.Errorf("CMap type incorrect")
 										return
 									}
 									if len(cmap.codespaces) != 1 {
 										t.Errorf("len codespace != 1 (%d)", len(cmap.codespaces))
 										return
 									}
-												Major changes to font code
- Added Type1 font parsing.
- Added Standard 14 font parsing.
- Fixed some bugs in cmap code.
- Started re-structuring of font code. Moved common font fields to `fontSkeleton`

											
										
										
											2018-06-27 12:25:59 +10:00
+									if cmap.codespaces[0].Low != 0 {
 										t.Errorf("code space low range != 0 (%d)", cmap.codespaces[0].Low)
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+										return
 									}
-												Major changes to font code
- Added Type1 font parsing.
- Added Standard 14 font parsing.
- Fixed some bugs in cmap code.
- Started re-structuring of font code. Moved common font fields to `fontSkeleton`

											
										
										
											2018-06-27 12:25:59 +10:00
+									if cmap.codespaces[0].High != 0xFFFF {
 										t.Errorf("code space high range != 0xffff (%d)", cmap.codespaces[0].High)
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+										return
 									}
-												Major changes to font code
- Added Type1 font parsing.
- Added Standard 14 font parsing.
- Fixed some bugs in cmap code.
- Started re-structuring of font code. Moved common font fields to `fontSkeleton`

											
										
										
											2018-06-27 12:25:59 +10:00
+									expectedMappings := map[CharCode]rune{
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+x0003:     0x0020,
 x005F:     0x007C,
 x000F:     0x002C,
 x000F + 5: 0x002C + 5,
 x001B:     0x0038,
 x001B + 2: 0x0038 + 2,
 x002F:     0x004C,
 x0044:     0x0061,
 x004F:     0x006C,
 x0055:     0x0072,
 									}
 									for k, expected := range expectedMappings {
-												cmap: mapped values are runes, not strings

											
										
										
											2018-11-29 03:22:46 +02:00
+										if v, ok := cmap.CharcodeToUnicode(k); !ok || v != expected {
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+											t.Errorf("incorrect mapping, expecting 0x%X ➞ 0x%X (%#v)", k, expected, v)
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+											return
 										}
 									}
-												Changes after pull request review

											
										
										
											2018-07-24 21:32:02 +10:00
+									v, _ := cmap.CharcodeToUnicode(0x99)
-												cmap: mapped values are runes, not strings

											
										
										
											2018-11-29 03:22:46 +02:00
+									if v != MissingCodeRune { //!= "notdef" {
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+										t.Errorf("Unmapped code, expected to map to undefined")
 										return
 									}
 									charcodes := []byte{0x00, 0x03, 0x00, 0x0F}
-												Fall back to font encoding when ToUnicode doesn't match

											
										
										
											2018-06-27 22:01:17 +10:00
+									s, _ := cmap.CharcodeBytesToUnicode(charcodes)
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+									if s != " ," {
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+										t.Error("Incorrect charcode bytes ➞ string mapping")
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+										return
 									}
 								}
-												Track number of bytes per character code for mappings.  Fixes problem posed in PR #156  in a generic fashion.

											
										
										
											2018-06-03 01:05:46 +00:00
 								const cmap2Data = `
-												Major changes to font code
- Added Type1 font parsing.
- Added Standard 14 font parsing.
- Fixed some bugs in cmap code.
- Started re-structuring of font code. Moved common font fields to `fontSkeleton`

											
										
										
											2018-06-27 12:25:59 +10:00
+									/CIDInit /ProcSet findresource begin
 dict begin
 									begincmap
 									/CIDSystemInfo
 									<<  /Registry (Adobe)
 									/Ordering (UCS)
 									/Supplement 0
 									>> def
 									/CMapName /Adobe-Identity-UCS def
 									/CMapType 2 def
 begincodespacerange
 									<0000> <FFFF>
 									endcodespacerange
 beginbfrange
 									<0080> <00FF> <002C>
 									<802F> <902F> <0038>
 									endbfrange
 									endcmap
 									CMapName currentdict /CMap defineresource pop
 									end
 									end
-												Track number of bytes per character code for mappings.  Fixes problem posed in PR #156  in a generic fashion.

											
										
										
											2018-06-03 01:05:46 +00:00
+								`
 								// TestCMapParser2 tests a bug that came up when 2-byte character codes had the higher byte set to 0,
 								// e.g. 0x0080, and the character map was not taking the number of bytes of the input codemap into account.
 								func TestCMapParser2(t *testing.T) {
-												Major changes to font code
- Added Type1 font parsing.
- Added Standard 14 font parsing.
- Fixed some bugs in cmap code.
- Started re-structuring of font code. Moved common font fields to `fontSkeleton`

											
										
										
											2018-06-27 12:25:59 +10:00
+									cmap, err := LoadCmapFromDataCID([]byte(cmap2Data))
-												Track number of bytes per character code for mappings.  Fixes problem posed in PR #156  in a generic fashion.

											
										
										
											2018-06-03 01:05:46 +00:00
+									if err != nil {
 										t.Error("Failed: ", err)
 										return
 									}
 									if cmap.Name() != "Adobe-Identity-UCS" {
 										t.Errorf("CMap name incorrect (%s)", cmap.Name())
 										return
 									}
 									if cmap.Type() != 2 {
 										t.Errorf("CMap type incorrect")
 										return
 									}
 									if len(cmap.codespaces) != 1 {
 										t.Errorf("len codespace != 1 (%d)", len(cmap.codespaces))
 										return
 									}
-												Major changes to font code
- Added Type1 font parsing.
- Added Standard 14 font parsing.
- Fixed some bugs in cmap code.
- Started re-structuring of font code. Moved common font fields to `fontSkeleton`

											
										
										
											2018-06-27 12:25:59 +10:00
+									if cmap.codespaces[0].Low != 0 {
 										t.Errorf("code space low range != 0 (%d)", cmap.codespaces[0].Low)
-												Track number of bytes per character code for mappings.  Fixes problem posed in PR #156  in a generic fashion.

											
										
										
											2018-06-03 01:05:46 +00:00
+										return
 									}
-												Major changes to font code
- Added Type1 font parsing.
- Added Standard 14 font parsing.
- Fixed some bugs in cmap code.
- Started re-structuring of font code. Moved common font fields to `fontSkeleton`

											
										
										
											2018-06-27 12:25:59 +10:00
+									if cmap.codespaces[0].High != 0xFFFF {
 										t.Errorf("code space high range != 0xffff (%d)", cmap.codespaces[0].High)
-												Track number of bytes per character code for mappings.  Fixes problem posed in PR #156  in a generic fashion.

											
										
										
											2018-06-03 01:05:46 +00:00
+										return
 									}
-												Major changes to font code
- Added Type1 font parsing.
- Added Standard 14 font parsing.
- Fixed some bugs in cmap code.
- Started re-structuring of font code. Moved common font fields to `fontSkeleton`

											
										
										
											2018-06-27 12:25:59 +10:00
+									expectedMappings := map[CharCode]rune{
-												Track number of bytes per character code for mappings.  Fixes problem posed in PR #156  in a generic fashion.

											
										
										
											2018-06-03 01:05:46 +00:00
+x0080: 0x002C,
 x802F: 0x0038,
 									}
 									for k, expected := range expectedMappings {
-												cmap: mapped values are runes, not strings

											
										
										
											2018-11-29 03:22:46 +02:00
+										if v, ok := cmap.CharcodeToUnicode(k); !ok || v != expected {
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+											t.Errorf("incorrect mapping, expecting 0x%X ➞ 0x%X (got 0x%X)", k, expected, v)
-												Track number of bytes per character code for mappings.  Fixes problem posed in PR #156  in a generic fashion.

											
										
										
											2018-06-03 01:05:46 +00:00
+											return
 										}
 									}
 									// Check byte sequence mappings.
-												Major changes to font code
- Added Type1 font parsing.
- Added Standard 14 font parsing.
- Fixed some bugs in cmap code.
- Started re-structuring of font code. Moved common font fields to `fontSkeleton`

											
										
										
											2018-06-27 12:25:59 +10:00
+									expectedSequenceMappings := []struct {
-												Track number of bytes per character code for mappings.  Fixes problem posed in PR #156  in a generic fashion.

											
										
										
											2018-06-03 01:05:46 +00:00
+										bytes    []byte
 										expected string
 									}{
 										{[]byte{0x80, 0x2F, 0x00, 0x80}, string([]rune{0x0038, 0x002C})},
 									}
-												Major changes to font code
- Added Type1 font parsing.
- Added Standard 14 font parsing.
- Fixed some bugs in cmap code.
- Started re-structuring of font code. Moved common font fields to `fontSkeleton`

											
										
										
											2018-06-27 12:25:59 +10:00
+									for _, exp := range expectedSequenceMappings {
-												Fall back to font encoding when ToUnicode doesn't match

											
										
										
											2018-06-27 22:01:17 +10:00
+										str, _ := cmap.CharcodeBytesToUnicode(exp.bytes)
-												Track number of bytes per character code for mappings.  Fixes problem posed in PR #156  in a generic fashion.

											
										
										
											2018-06-03 01:05:46 +00:00
+										if str != exp.expected {
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+											t.Errorf("Incorrect byte sequence mapping % X ➞ % X (got % X)",
-												Major changes to font code
- Added Type1 font parsing.
- Added Standard 14 font parsing.
- Fixed some bugs in cmap code.
- Started re-structuring of font code. Moved common font fields to `fontSkeleton`

											
										
										
											2018-06-27 12:25:59 +10:00
+												exp.bytes, []rune(exp.expected), []rune(str))
-												Track number of bytes per character code for mappings.  Fixes problem posed in PR #156  in a generic fashion.

											
										
										
											2018-06-03 01:05:46 +00:00
+											return
 										}
 									}
 								}
 								// cmapData3 is a CMap with a mixture of 1 and 2 byte codespaces.
//
// It declares two 1-byte codespace ranges (<00>-<80>, <a0>-<d0>) and two 2-byte ranges
// (<8100>-<9fff>, <d140>-<fbfc>), plus one bfrange starting at the low end of each of them.
//
// NOTE(review): the operand counts in front of `dict begin`, `dict dup begin`,
// `begincodespacerange` and `beginbfrange` were missing from this literal and have been
// restored from the usual CMap boilerplate and the number of entries in each section —
// confirm against the upstream fixture.
const cmapData3 = `
	/CIDInit /ProcSet findresource begin
	12 dict begin begincmap
	/CIDSystemInfo
	3 dict dup begin
	/Registry (Adobe) def
	/Supplement 2 def
	end def
	/CMapName /test-1 def
	/CMapType 1 def
	4 begincodespacerange
	<00> <80>
	<8100> <9fff>
	<a0> <d0>
	<d140> <fbfc>
	endcodespacerange
	4 beginbfrange
	<00> <80> <10>
	<8100> <9f00> <1000>
	<a0> <d0> <90>
	<d140> <f000> <a000>
	endbfrange
	endcmap
`
 								// TestCMapParser3 test case of a CMap with mixed number of 1 and 2 bytes in the codespace range.
 								func TestCMapParser3(t *testing.T) {
-												Major changes to font code
- Added Type1 font parsing.
- Added Standard 14 font parsing.
- Fixed some bugs in cmap code.
- Started re-structuring of font code. Moved common font fields to `fontSkeleton`

											
										
										
											2018-06-27 12:25:59 +10:00
+									cmap, err := LoadCmapFromDataCID([]byte(cmapData3))
-												Track number of bytes per character code for mappings.  Fixes problem posed in PR #156  in a generic fashion.

											
										
										
											2018-06-03 01:05:46 +00:00
+									if err != nil {
 										t.Error("Failed: ", err)
 										return
 									}
 									if cmap.Name() != "test-1" {
 										t.Errorf("CMap name incorrect (%s)", cmap.Name())
 										return
 									}
 									if cmap.Type() != 1 {
 										t.Errorf("CMap type incorrect")
 										return
 									}
 									// Check codespaces.
-												Major changes to font code
- Added Type1 font parsing.
- Added Standard 14 font parsing.
- Fixed some bugs in cmap code.
- Started re-structuring of font code. Moved common font fields to `fontSkeleton`

											
										
										
											2018-06-27 12:25:59 +10:00
+									expectedCodespaces := []Codespace{
-												Track number of bytes per character code for mappings.  Fixes problem posed in PR #156  in a generic fashion.

											
										
										
											2018-06-03 01:05:46 +00:00
+										{1, 0x00, 0x80},
-												Major changes to font code
- Added Type1 font parsing.
- Added Standard 14 font parsing.
- Fixed some bugs in cmap code.
- Started re-structuring of font code. Moved common font fields to `fontSkeleton`

											
										
										
											2018-06-27 12:25:59 +10:00
+										{1, 0xa0, 0xd0},
-												Track number of bytes per character code for mappings.  Fixes problem posed in PR #156  in a generic fashion.

											
										
										
											2018-06-03 01:05:46 +00:00
+										{2, 0x8100, 0x9fff},
-												Major changes to font code
- Added Type1 font parsing.
- Added Standard 14 font parsing.
- Fixed some bugs in cmap code.
- Started re-structuring of font code. Moved common font fields to `fontSkeleton`

											
										
										
											2018-06-27 12:25:59 +10:00
+										{2, 0xd140, 0xfbfc},
-												Track number of bytes per character code for mappings.  Fixes problem posed in PR #156  in a generic fashion.

											
										
										
											2018-06-03 01:05:46 +00:00
+									}
 									if len(cmap.codespaces) != len(expectedCodespaces) {
 										t.Errorf("len codespace != %d (%d)", len(expectedCodespaces), len(cmap.codespaces))
 										return
 									}
 									for i, cs := range cmap.codespaces {
 										exp := expectedCodespaces[i]
-												Major changes to font code
- Added Type1 font parsing.
- Added Standard 14 font parsing.
- Fixed some bugs in cmap code.
- Started re-structuring of font code. Moved common font fields to `fontSkeleton`

											
										
										
											2018-06-27 12:25:59 +10:00
+										if cs.NumBytes != exp.NumBytes {
 											t.Errorf("code space number of bytes != %d (%d) %x", exp.NumBytes, cs.NumBytes, exp)
-												Track number of bytes per character code for mappings.  Fixes problem posed in PR #156  in a generic fashion.

											
										
										
											2018-06-03 01:05:46 +00:00
+											return
 										}
-												Major changes to font code
- Added Type1 font parsing.
- Added Standard 14 font parsing.
- Fixed some bugs in cmap code.
- Started re-structuring of font code. Moved common font fields to `fontSkeleton`

											
										
										
											2018-06-27 12:25:59 +10:00
+										if cs.Low != exp.Low {
 											t.Errorf("code space low range != %d (%d) %x", exp.Low, cs.Low, exp)
-												Track number of bytes per character code for mappings.  Fixes problem posed in PR #156  in a generic fashion.

											
										
										
											2018-06-03 01:05:46 +00:00
+											return
 										}
-												Major changes to font code
- Added Type1 font parsing.
- Added Standard 14 font parsing.
- Fixed some bugs in cmap code.
- Started re-structuring of font code. Moved common font fields to `fontSkeleton`

											
										
										
											2018-06-27 12:25:59 +10:00
+										if cs.High != exp.High {
 											t.Errorf("code space high range != 0x%X (0x%X) %x", exp.High, cs.High, exp)
-												Track number of bytes per character code for mappings.  Fixes problem posed in PR #156  in a generic fashion.

											
										
										
											2018-06-03 01:05:46 +00:00
+											return
 										}
 									}
 									// Check mappings.
-												Major changes to font code
- Added Type1 font parsing.
- Added Standard 14 font parsing.
- Fixed some bugs in cmap code.
- Started re-structuring of font code. Moved common font fields to `fontSkeleton`

											
										
										
											2018-06-27 12:25:59 +10:00
+									expectedMappings := map[CharCode]rune{
 x80:   0x10 + 0x80,
-												Track number of bytes per character code for mappings.  Fixes problem posed in PR #156  in a generic fashion.

											
										
										
											2018-06-03 01:05:46 +00:00
+x8100: 0x1000,
-												Major changes to font code
- Added Type1 font parsing.
- Added Standard 14 font parsing.
- Fixed some bugs in cmap code.
- Started re-structuring of font code. Moved common font fields to `fontSkeleton`

											
										
										
											2018-06-27 12:25:59 +10:00
+xa0:   0x90,
-												Track number of bytes per character code for mappings.  Fixes problem posed in PR #156  in a generic fashion.

											
										
										
											2018-06-03 01:05:46 +00:00
+xd140: 0xa000,
 									}
 									for k, expected := range expectedMappings {
-												cmap: mapped values are runes, not strings

											
										
										
											2018-11-29 03:22:46 +02:00
+										if v, ok := cmap.CharcodeToUnicode(k); !ok || v != expected {
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+											t.Errorf("incorrect mapping: expecting 0x%02X ➞ 0x%02X (got 0x%02X)", k, expected, v)
-												Major changes to font code
- Added Type1 font parsing.
- Added Standard 14 font parsing.
- Fixed some bugs in cmap code.
- Started re-structuring of font code. Moved common font fields to `fontSkeleton`

											
										
										
											2018-06-27 12:25:59 +10:00
+											return
 										}
 									}
 									// Check byte sequence mappings.
 									expectedSequenceMappings := []struct {
 										bytes    []byte
 										expected string
 									}{
 										{[]byte{0x80, 0x81, 0x00, 0xa1, 0xd1, 0x80, 0x00},
 											string([]rune{
 x90,
 x1000,
 x91,
 xa000 + 0x40,
 x10})},
 									}
 									for _, exp := range expectedSequenceMappings {
-												Fall back to font encoding when ToUnicode doesn't match

											
										
										
											2018-06-27 22:01:17 +10:00
+										str, _ := cmap.CharcodeBytesToUnicode(exp.bytes)
-												Major changes to font code
- Added Type1 font parsing.
- Added Standard 14 font parsing.
- Fixed some bugs in cmap code.
- Started re-structuring of font code. Moved common font fields to `fontSkeleton`

											
										
										
											2018-06-27 12:25:59 +10:00
+										if str != exp.expected {
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+											t.Errorf("Incorrect byte sequence mapping: % 02X ➞ % 02X (got % 02X)",
-												Major changes to font code
- Added Type1 font parsing.
- Added Standard 14 font parsing.
- Fixed some bugs in cmap code.
- Started re-structuring of font code. Moved common font fields to `fontSkeleton`

											
										
										
											2018-06-27 12:25:59 +10:00
+												exp.bytes, []rune(exp.expected), []rune(str))
 											return
 										}
 									}
 								}
// cmap4Data is a CMap with some utf16 encoded unicode strings that contain surrogates.
//
// Six of the bfchar destinations (<D835DC50> ... <D835DCA6>) are 4-byte values made of a
// UTF-16 high/low surrogate pair; the remaining destinations are plain 2-byte values.
//
// NOTE(review): the operand counts in front of `dict begin`, `begincodespacerange`,
// `beginbfchar` and `beginbfrange` were missing from this literal and have been restored
// from the usual CMap boilerplate and the number of entries in each section — confirm
// against the upstream fixture.
const cmap4Data = `
    /CIDInit /ProcSet findresource begin
    12 dict begin
    begincmap
    /CIDSystemInfo
    << /Registry (Adobe)
    /Ordering (UCS)
    /Supplement 0
    >> def
    /CMapName /Adobe-Identity-UCS def
    /CMapType 2 def
    1 begincodespacerange
    <0000> <FFFF>
    endcodespacerange
    15 beginbfchar
    <01E1> <002C>
    <0201> <007C>
    <059C> <21D2>
    <05CA> <2200>
    <05CC> <2203>
    <05D0> <2208>
    <0652> <2295>
    <073F> <D835DC50>
    <0749> <D835DC5A>
    <0889> <D835DC84>
    <0893> <D835DC8E>
    <08DD> <D835DC9E>
    <08E5> <D835DCA6>
    <08E7> <2133>
    <0D52> <2265>
    endbfchar
    1 beginbfrange
    <0E36> <0E37> <27F5>
    endbfrange
    endcmap
`
-												Added tests for CMap creation

											
										
										
											2018-09-21 15:39:31 +10:00
+								// TestCMapParser4 checks that ut16 encoded unicode strings are interpreted correctly.
-												Major changes to font code
- Added Type1 font parsing.
- Added Standard 14 font parsing.
- Fixed some bugs in cmap code.
- Started re-structuring of font code. Moved common font fields to `fontSkeleton`

											
										
										
											2018-06-27 12:25:59 +10:00
+								func TestCMapParser4(t *testing.T) {
 									cmap, err := LoadCmapFromDataCID([]byte(cmap4Data))
 									if err != nil {
 										t.Error("Failed to load CMap: ", err)
 										return
 									}
 									if cmap.Name() != "Adobe-Identity-UCS" {
 										t.Errorf("CMap name incorrect (%s)", cmap.Name())
 										return
 									}
 									if cmap.Type() != 2 {
 										t.Errorf("CMap type incorrect")
 										return
 									}
 									if len(cmap.codespaces) != 1 {
 										t.Errorf("len codespace != 1 (%d)", len(cmap.codespaces))
 										return
 									}
 									if cmap.codespaces[0].Low != 0 {
 										t.Errorf("code space low range != 0 (%d)", cmap.codespaces[0].Low)
 										return
 									}
 									if cmap.codespaces[0].High != 0xFFFF {
 										t.Errorf("code space high range != 0xffff (%d)", cmap.codespaces[0].High)
 										return
 									}
-												cmap: mapped values are runes, not strings

											
										
										
											2018-11-29 03:22:46 +02:00
+									expectedMappings := map[CharCode]rune{
 x0889: '\U0001d484', // `𝒄`
 x0893: '\U0001d48e', // `𝒎`
 x08DD: '\U0001d49e', // `𝒞`
 x08E5: '\U0001d4a6', // `𝒦
-												Major changes to font code
- Added Type1 font parsing.
- Added Standard 14 font parsing.
- Fixed some bugs in cmap code.
- Started re-structuring of font code. Moved common font fields to `fontSkeleton`

											
										
										
											2018-06-27 12:25:59 +10:00
+									}
 									for k, expected := range expectedMappings {
-												Changes after pull request review

											
										
										
											2018-07-24 21:32:02 +10:00
+										if v, ok := cmap.CharcodeToUnicode(k); !ok || v != expected {
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+											t.Errorf("incorrect mapping, expecting 0x%04X ➞ %+q (got %+q)", k, expected, v)
-												Track number of bytes per character code for mappings.  Fixes problem posed in PR #156  in a generic fashion.

											
										
										
											2018-06-03 01:05:46 +00:00
+											return
 										}
 									}
 									// Check byte sequence mappings.
-												Major changes to font code
- Added Type1 font parsing.
- Added Standard 14 font parsing.
- Fixed some bugs in cmap code.
- Started re-structuring of font code. Moved common font fields to `fontSkeleton`

											
										
										
											2018-06-27 12:25:59 +10:00
+									expectedSequenceMappings := []struct {
-												Track number of bytes per character code for mappings.  Fixes problem posed in PR #156  in a generic fashion.

											
										
										
											2018-06-03 01:05:46 +00:00
+										bytes    []byte
 										expected string
 									}{
-												Major changes to font code
- Added Type1 font parsing.
- Added Standard 14 font parsing.
- Fixed some bugs in cmap code.
- Started re-structuring of font code. Moved common font fields to `fontSkeleton`

											
										
										
											2018-06-27 12:25:59 +10:00
+										{[]byte{0x07, 0x3F, 0x07, 0x49}, "\U0001d450\U0001d45a"}, // `𝑐𝑚`
 										{[]byte{0x08, 0x89, 0x08, 0x93}, "\U0001d484\U0001d48e"}, // `𝒄𝒎`
 										{[]byte{0x08, 0xDD, 0x08, 0xE5}, "\U0001d49e\U0001d4a6"}, // `𝒞𝒦`
 										{[]byte{0x08, 0xE7, 0x0D, 0x52}, "\u2133\u2265"},         // `ℳ≥`
-												Track number of bytes per character code for mappings.  Fixes problem posed in PR #156  in a generic fashion.

											
										
										
											2018-06-03 01:05:46 +00:00
+									}
-												Major changes to font code
- Added Type1 font parsing.
- Added Standard 14 font parsing.
- Fixed some bugs in cmap code.
- Started re-structuring of font code. Moved common font fields to `fontSkeleton`

											
										
										
											2018-06-27 12:25:59 +10:00
+									for _, exp := range expectedSequenceMappings {
-												Fall back to font encoding when ToUnicode doesn't match

											
										
										
											2018-06-27 22:01:17 +10:00
+										str, _ := cmap.CharcodeBytesToUnicode(exp.bytes)
-												Track number of bytes per character code for mappings.  Fixes problem posed in PR #156  in a generic fashion.

											
										
										
											2018-06-03 01:05:46 +00:00
+										if str != exp.expected {
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+											t.Errorf("Incorrect byte sequence mapping % 02X ➞ %+q (got %+q)",
-												Major changes to font code
- Added Type1 font parsing.
- Added Standard 14 font parsing.
- Fixed some bugs in cmap code.
- Started re-structuring of font code. Moved common font fields to `fontSkeleton`

											
										
										
											2018-06-27 12:25:59 +10:00
+												exp.bytes, exp.expected, str)
-												Track number of bytes per character code for mappings.  Fixes problem posed in PR #156  in a generic fashion.

											
										
										
											2018-06-03 01:05:46 +00:00
+											return
 										}
 									}
 								}
-												Added tests for CMap creation

											
										
										
											2018-09-21 15:39:31 +10:00
 								var (
-												cmap: mapped values are runes, not strings

											
										
										
											2018-11-29 03:22:46 +02:00
+									codeToUnicode1 = map[CharCode]rune{ // 40 entries
 x02ca: 'ˊ',
 x02cb: 'ˋ',
 x02cd: 'ˍ',
 x039c: 'Μ',
 x039d: 'Ν',
 x039e: 'Ξ',
 x039f: 'Ο',
 x03a0: 'Π',
 x03a1: 'Ρ',
 x03a6: 'Φ',
 x03b1: 'α',
 x03b2: 'β',
 x03b3: 'γ',
 x03b4: 'δ',
 x03b5: 'ε',
 x03b6: 'ζ',
 x03b7: 'η',
 x03c6: 'φ',
 x03c7: 'χ',
 x03c9: 'ω',
 x2013: '–',
 x2014: '—',
 x2018: '‘',
 x2019: '’',
 x203e: '‾',
 x20ac: '€',
 x2163: 'Ⅳ',
 x2164: 'Ⅴ',
 x2165: 'Ⅵ',
 x2166: 'Ⅶ',
 x2167: 'Ⅷ',
 x2168: 'Ⅸ',
 x2169: 'Ⅹ',
 x2190: '←',
 x2191: '↑',
 x2192: '→',
 x2193: '↓',
 x2220: '∠',
 x2223: '∣',
 x222a: '∪',
 									}
 									codeToUnicode2 = map[CharCode]rune{ // 40 entries
 x0100: 'Ā',
 x0101: 'ā',
 x0102: 'Ă',
 x0111: 'đ',
 x0112: 'Ē',
 x0113: 'ē',
 x0114: 'Ĕ',
 x0115: 'ĕ',
 x0116: 'Ė',
 x011b: 'ě',
 x0126: 'Ħ',
 x0127: 'ħ',
 x0128: 'Ĩ',
 x0129: 'ĩ',
 x012a: 'Ī',
 x012b: 'ī',
 x012c: 'Ĭ',
 x013b: 'Ļ',
 x013c: 'ļ',
 x013e: 'ľ',
 x013f: 'Ŀ',
 x0140: 'ŀ',
 x0141: 'Ł',
 x0150: 'Ő',
 x0151: 'ő',
 x0152: 'Œ',
 x0153: 'œ',
 x0154: 'Ŕ',
 x0155: 'ŕ',
 x015a: 'Ś',
 x0165: 'ť',
 x0166: 'Ŧ',
 x0167: 'ŧ',
 x0168: 'Ũ',
 x0169: 'ũ',
 x016a: 'Ū',
 x016b: 'ū',
 x017a: 'ź',
 x017b: 'Ż',
 x017d: 'Ž',
 									}
 									codeToUnicode3 = map[CharCode]rune{ // 93 entries
 x0124: 'Ĥ',
 x0125: 'ĥ',
 x0126: 'Ħ',
 x0127: 'ħ',
 x0134: 'Ĵ',
 x0135: 'ĵ',
 x0136: 'Ķ',
 x0137: 'ķ',
 x0138: 'ĸ',
 x0144: 'ń',
 x0145: 'Ņ',
 x0146: 'ņ',
 x0147: 'Ň',
 x0154: 'Ŕ',
 x0155: 'ŕ',
 x0156: 'Ŗ',
 x0157: 'ŗ',
 x0164: 'Ť',
 x0169: 'ũ',
 x0174: 'Ŵ',
 x0175: 'ŵ',
 x0176: 'Ŷ',
 x0177: 'ŷ',
 x0184: 'Ƅ',
 x0185: 'ƅ',
 x0186: 'Ɔ',
 x0187: 'Ƈ',
 x0194: 'Ɣ',
 x019a: 'ƚ',
 x01a4: 'Ƥ',
 x01a5: 'ƥ',
 x01a6: 'Ʀ',
 x01a7: 'Ƨ',
 x01b4: 'ƴ',
 x01b5: 'Ƶ',
 x01b6: 'ƶ',
 x01b7: 'Ʒ',
 x01c4: 'Ǆ',
 x01cb: 'ǋ',
 x01d4: 'ǔ',
 x01d5: 'Ǖ',
 x01d6: 'ǖ',
 x01d7: 'Ǘ',
 x01e4: 'Ǥ',
 x01e5: 'ǥ',
 x01e6: 'Ǧ',
 x01e7: 'ǧ',
 x01f4: 'Ǵ',
 x01f5: 'ǵ',
 x0204: 'Ȅ',
 x0205: 'ȅ',
 x0206: 'Ȇ',
 x0207: 'ȇ',
 x0214: 'Ȕ',
 x0215: 'ȕ',
 x0216: 'Ȗ',
 x0217: 'ȗ',
 x0224: 'Ȥ',
 x0226: 'Ȧ',
 x0227: 'ȧ',
 x0254: 'ɔ',
 x0255: 'ɕ',
 x0256: 'ɖ',
 x0257: 'ɗ',
 x0264: 'ɤ',
 x0265: 'ɥ',
 x0266: 'ɦ',
 x0267: 'ɧ',
 x0273: 'ɳ',
 x0274: 'ɴ',
 x0275: 'ɵ',
 x0276: 'ɶ',
 x0277: 'ɷ',
 x0284: 'ʄ',
 x0285: 'ʅ',
 x0286: 'ʆ',
 x0287: 'ʇ',
 x0294: 'ʔ',
 x0296: 'ʖ',
 x0297: 'ʗ',
 x02a4: 'ʤ',
 x02a5: 'ʥ',
 x02c6: 'ˆ',
 x02c7: 'ˇ',
 x0304: '̄',
 x0305: '̅',
 x0306: '̆',
 x0307: '̇',
 x030d: '̍',
 x0314: '̔',
 x0315: '̕',
 x0316: '̖',
 x0317: '̗',
-												Added tests for CMap creation

											
										
										
											2018-09-21 15:39:31 +10:00
+									}
 								)
 								// bfData1 is the bf section that TestBfData expects cmap.toBfData to produce for
// codeToUnicode1: isolated codes are listed under beginbfchar, runs of consecutive codes
// under beginbfrange.
//
// The literal is compared byte-for-byte (after trimming the surrounding newlines), so its
// lines carry no indentation.
//
// NOTE(review): the `8` entry counts in front of `beginbfchar` and `beginbfrange` were
// missing from this literal and have been restored from the number of entries in each
// section; the column-0 layout is likewise inferred — confirm against toBfData's output.
const bfData1 = `
8 beginbfchar
<02cd> <02cd>
<03a6> <03a6>
<03c9> <03c9>
<203e> <203e>
<20ac> <20ac>
<2220> <2220>
<2223> <2223>
<222a> <222a>
endbfchar
8 beginbfrange
<02ca><02cb> <02ca>
<039c><03a1> <039c>
<03b1><03b7> <03b1>
<03c6><03c7> <03c6>
<2013><2014> <2013>
<2018><2019> <2018>
<2163><2169> <2163>
<2190><2193> <2190>
endbfrange
`
 								// TestBfData checks that cmap.toBfData produces the expected output.
//
// The CMap is built from codeToUnicode1 and its bf section is compared against bfData1 with
// the leading and trailing newlines of the raw string literal trimmed off.
func TestBfData(t *testing.T) {
	cmap := NewToUnicodeCMap(codeToUnicode1)

	expected := strings.Trim(bfData1, "\n")
	got := cmap.toBfData()
	if got != expected {
		// Print both sides so a failure can be diagnosed from the test log alone.
		t.Fatalf("Incorrect bfData\nexpected:\n%s\ngot:\n%s", expected, got)
	}
}
 								// TestBfData checks that cmap.toBfData produces the expected output.
 								func TestCMapCreation(t *testing.T) {
 									checkCmapWriteRead(t, codeToUnicode1)
 									checkCmapWriteRead(t, codeToUnicode2)
 									checkCmapWriteRead(t, codeToUnicode3)
 								}
 								// checkCmapWriteRead creates CMap data from `codeToUnicode` then parses it and checks that the
 								// same codeToUnicode is returned.
-												cmap: mapped values are runes, not strings

											
										
										
											2018-11-29 03:22:46 +02:00
+								func checkCmapWriteRead(t *testing.T, codeToUnicode map[CharCode]rune) {
-												Added tests for CMap creation

											
										
										
											2018-09-21 15:39:31 +10:00
+									cmap0 := NewToUnicodeCMap(codeToUnicode)
 									data := cmap0.Bytes()
 									cmap, err := LoadCmapFromDataCID(data)
 									if err != nil {
 										t.Error("Failed to load CMap: ", err)
 										return
 									}
-												refactor some receiver and method names; fix typos in comments

											
										
										
											2018-12-11 04:37:00 +02:00
+									codes0 := make([]CharCode, 0, len(codeToUnicode))
-												Added tests for CMap creation

											
										
										
											2018-09-21 15:39:31 +10:00
+									for code := range codeToUnicode {
 										codes0 = append(codes0, code)
 									}
 									sort.Slice(codes0, func(i, j int) bool { return codes0[i] < codes0[j] })
-												refactor some receiver and method names; fix typos in comments

											
										
										
											2018-12-11 04:37:00 +02:00
+									codes := make([]CharCode, 0, len(cmap.codeToUnicode))
-												Added tests for CMap creation

											
										
										
											2018-09-21 15:39:31 +10:00
+									for code := range cmap.codeToUnicode {
 										codes = append(codes, code)
 									}
 									sort.Slice(codes, func(i, j int) bool { return codes[i] < codes[j] })
 									if len(cmap.codeToUnicode) != len(codeToUnicode) {
 										t.Errorf("Incorrect length. expected=%d test=%d", len(codeToUnicode1), len(cmap.codeToUnicode))
 										return
 									}
 									for i, code := range codes0 {
 										if code != codes[i] {
 											t.Errorf("Code mismatch: i=%d expected=0x%04x test=0x%04x", i, code, codes[i])
 											return
 										}
 										u0 := codeToUnicode[code]
 										u := cmap.codeToUnicode[code]
 										if u != u0 {
 											t.Errorf("Unicode mismatch: i=%d code0=0x%04x expected=%q test=%q", i, code, u0, u)
 											return
 										}
 									}
 								}