From 96dba88f57633b05d6457ca56b60700848b91877 Mon Sep 17 00:00:00 2001
From: Peter Williams <peter.williams@papercut.com>
Date: Tue, 17 Jul 2018 17:43:11 +1000
Subject: [PATCH] Added a test for CharcodeBytesToUnicode for TrueType fonts
 with ToUnicode cmaps

---
 pdf/core/parser.go     |   1 +
 pdf/model/font_test.go | 112 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 113 insertions(+)

diff --git a/pdf/core/parser.go b/pdf/core/parser.go
index 9ff22319..40a942bb 100644
--- a/pdf/core/parser.go
+++ b/pdf/core/parser.go
@@ -1507,6 +1507,7 @@ func NewParserFromString(txt string) *PdfParser {
 	parser.reader = bufferedReader
 
 	parser.fileSize = int64(len(txt))
+	parser.streamLengthReferenceLookupInProgress = map[int64]bool{}
 
 	return &parser
 }
diff --git a/pdf/model/font_test.go b/pdf/model/font_test.go
index 85c77fcb..eb4ec0e1 100644
--- a/pdf/model/font_test.go
+++ b/pdf/model/font_test.go
@@ -2,6 +2,8 @@ package model_test
 
 import (
 	"errors"
+	"fmt"
+	"io"
 	"testing"
 
 	"github.com/unidoc/unidoc/common"
@@ -138,6 +140,53 @@ func TestCompositeFonts(t *testing.T) {
 	}
 }
 
+// TestTrueTypeToUnicode checks that CharcodeBytesToUnicode decodes character codes to the expected
+// text for a TrueType font with a ToUnicode cmap (the ttToUnicode fragment).
+func TestTrueTypeToUnicode(t *testing.T) {
+	numObj, err := parsePdfObjects(ttToUnicode)
+	if err != nil {
+		t.Fatalf("Failed to parse ttToUnicode object. err=%v", err)
+	}
+	// Object 9 is the font dictionary in ttToUnicode.
+	fontObj := numObj[9]
+	font, err := model.NewPdfFontFromPdfObject(fontObj)
+	if err != nil {
+		t.Fatalf("Failed to create font. err=%v", err)
+	}
+
+	data := []byte{43, 40, 41, 34, 37, 42, 38, 49, 36, 38, 48, 34, 35, 36, 37, 35, 36, 58}
+	expectedText := "Alerts on printing"
+	actualText, numChars, numMisses := font.CharcodeBytesToUnicode(data)
+	if numMisses != 0 {
+		t.Fatalf("Some codes not decoded. numMisses=%d", numMisses)
+	}
+	if actualText != expectedText {
+		t.Errorf("Incorrect decoding.\nexpected=%q\n  actual=%q", expectedText, actualText)
+	}
+	// Count runes rather than bytes so the check stays valid for non-ASCII text.
+	// NOTE(review): assumes numChars is a rune count — confirm against CharcodeBytesToUnicode.
+	if n := len([]rune(actualText)); numChars != n {
+		t.Errorf("Incorrect numChars=%d expected=%d", numChars, n)
+	}
+}
+
+// ttToUnicode is a PDF fragment holding a TrueType font object (obj 9) and its ToUnicode cmap (obj 26).
+// The stream data in obj 26 is passed in via Sprintf to keep binary data out of the `` string.
+var ttToUnicode = fmt.Sprintf(`9 0 obj
+<< /Type /Font /Subtype /TrueType /BaseFont /AHSHJL+.SFUIText /ToUnicode 26 0 R /FirstChar 33 /LastChar 79 /Widths [ 635 381 246
+583 363 282 609 252 571 523 674 560 594 542 543 609 591 637 584 874 614 614
+362 246 268 609 742 870 644 596 604 771 716 653 297 525 657 543 297 774 539
+382 382 726 968 295 538 ] >>
+endobj
+26 0 obj
+<< /Length 497 /Filter /FlateDecode >>
+stream
+%s
+endstream
+endobj
+`,
+	"\x78\x01\x5d\x93\xcd\x8a\xdb\x30\x14\x46\xf7\x7e\x0a\x2d\xa7\x8b\xc1\x8a\xe5\x24\x33\x60\x0c\xc3\x94\x81\x2c\xfa\x43\xd3\x3e\x80\x6d\xc9\xc1\xd0\xd8\xc6\x71\x16\x79\xfb\x9e\xef\x66\x3a\x85\x2e\xbe\xc5\xf1\xd5\x55\xee\x51\xa4\xfc\xf5\xf0\xf9\x30\x0e\xab\xcb\xbf\x2f\x53\x77\x4c\xab\xeb\x87\x31\x2e\xe9\x32\x5d\x97\x2e\xb9\x36\x9d\x86\x31\xdb\x14\x2e\x0e\xdd\xfa\x4e\xf6\xad\x3b\x37\x73\x96\xd3\x7c\xbc\x5d\xd6\x74\x3e\x8c\xfd\xe4\xaa\x2a\x73\x2e\xff\x41\xcb\x65\x5d\x6e\xee\xe1\x25\x4e\x6d\xfa\xa4\x6f\xdf\x96\x98\x96\x61\x3c\xb9\x87\x5f\xaf\x47\xfb\x72\xbc\xce\xf3\xef\x74\x4e\xe3\xea\x7c\x56\xd7\x2e\xa6\x9e\xed\xbe\x34\xf3\xd7\xe6\x9c\x5c\x6e\xad\x8f\x87\x48\x7d\x58\x6f\x8f\x74\xfd\x5b\xf1\xf3\x36\x27\xc7\x44\x74\x6c\xee\x23\x75\x53\x4c\x97\xb9\xe9\xd2\xd2\x8c\xa7\x94\x55\xde\xd7\xd5\xdb\x5b\x9d\xa5\x31\xfe\x57\x2a\xb7\xf7\x8e\xb6\x7f\x5f\x5a\x6c\xea\x4a\xf1\x7e\xeb\xeb\xac\x2a\x0a\x90\x78\xbf\x2f\x84\x01\x24\xde\xef\x9e\x85\x25\x48\xc0\x24\xdc\x82\x84\xc5\xa5\x70\x07\x12\xef\x0b\xdb\xea\x09\x24\x2c\xee\x54\x7d\x06\x09\xb8\x15\x36\x20\xa1\x37\x08\x5b\x90\x78\x5f\x6e\x84\x1d\x48\x58\x6c\xd5\x08\x12\xf0\x49\xd5\x04\x12\x7a\x77\xc2\x1e\x24\xa0\x86\x0c\xc8\x2b\xa0\xc6\x08\xc8\x29\xf4\xf6\x42\xe4\x14\x7c\xb5\x73\x40\x4e\x61\xb1\xa6\x0a\xc8\x29\x8c\x11\x85\xc8\x29\xf4\xea\x34\x02\x72\x0a\x28\xdf\xb0\x07\x09\xa8\x31\x02\xae\x0a\xd8\x08\x71\x55\xd8\xca\xa6\xc2\x35\x98\xef\x6e\xaf\x2a\xae\x0a\x55\x9d\x64\xc0\x55\xa1\xd7\x7e\x17\xd7\x60\xbe\x0c\x43\x15\x57\x85\xc5\x36\x24\xae\xc1\x7c\x11\xc9\xaa\x12\x57\x85\xaa\x04\x39\x3f\x0b\x28\xc1\x12\x57\x05\x5f\x5b\x8c\x2b\xdf\x41\x0e\x90\x2a\xae\x0a\xfa\x3a\x58\xb6\xb7\xd0\x6b\x8b\x71\x2d\xef\xbe\xad\xaa\xb8\x2a\xf4\xea\x0f\x2d\x71\x55\xe8\x95\x11\x96\x16\x50\xfa\x25\xae\xa5\x09\x72\x07\x40\xe4\x14\x76\x96\x11\xa7\x6b\x61\x2a\xeb\x45\x8e\x73\xa8\x7c\xd1\xda\xce\xc8\xe1\xa2\xc5\x6c\xc5\x25\xfe\x7b\x5b\x75\x9f\xf5\xee\x3e\xde\x49\x77\x5d\x16\x9e\x88\x3d\x4e\x7b\x3d\x7a\x15\xc3\x98\x3e\xde\xef\x3c\xcd\xda\xc0\xf2\x07\xc5\xfd\x00\x4f")
+
 // objFontObj parses `fontDict` to a make a Font, creates a PDF object from the Font and checks that
 // the new PDF object is the same as the input object
 func objFontObj(t *testing.T, fontDict string) error {
@@ -166,3 +215,66 @@ func objFontObj(t *testing.T, fontDict string) error {
 
 	return nil
 }
+
+// parsePdfObjects reads the indirect objects in PDF fragment `text` (e.g. ttToUnicode above) into a
+// map keyed by object number. Dictionary entries referencing objects in `text` point to those objects.
+func parsePdfObjects(text string) (map[int64]core.PdfObject, error) {
+	objects := map[int64]core.PdfObject{}
+	parser := core.NewParserFromString(text)
+	for {
+		parsed, err := parser.ParseIndirectObject()
+		if err == io.EOF {
+			break
+		}
+		if err != nil {
+			return objects, err
+		}
+		switch v := parsed.(type) {
+		case *core.PdfIndirectObject:
+			objects[v.ObjectNumber] = parsed
+		case *core.PdfObjectStream:
+			objects[v.ObjectNumber] = parsed
+		}
+	}
+
+	for _, parsed := range objects {
+		indirect, isIndirect := parsed.(*core.PdfIndirectObject)
+		if !isIndirect {
+			continue
+		}
+		dict, isDict := indirect.PdfObject.(*core.PdfObjectDictionary)
+		if !isDict {
+			continue
+		}
+		for _, key := range dict.Keys() {
+			if ref, isRef := dict.Get(key).(*core.PdfObjectReference); isRef {
+				if target, found := objects[ref.ObjectNumber]; found {
+					dict.Set(key, target)
+				}
+			}
+		}
+	}
+	return objects, nil
+}
+
+// func isFontObject(obj core.PdfObject) bool {
+// 	var dict *core.PdfObjectDictionary
+// 	switch t := obj.(type) {
+// 	case *core.PdfIndirectObject:
+// 		dict = t.PdfObject.(*core.PdfObjectDictionary)
+// 	case *core.PdfObjectDictionary:
+// 		dict = t
+// 	default:
+// 		return false
+// 	}
+// 	name, err := core.GetName(dict.Get("Type"))
+// 	return err == nil && name == "Font"
+// }
+
+// func showDict(dict *core.PdfObjectDictionary) string {
+// 	parts := []string{}
+// 	for _, k := range dict.Keys() {
+// 		parts = append(parts, fmt.Sprintf("%s: %T", k, dict.Get(k)))
+// 	}
+// 	return fmt.Sprintf("{%s}", strings.Join(parts, ", "))
+// }