From 96dba88f57633b05d6457ca56b60700848b91877 Mon Sep 17 00:00:00 2001 From: Peter Williams Date: Tue, 17 Jul 2018 17:43:11 +1000 Subject: [PATCH] Added a test for CharcodeBytesToUnicode for TrueType fonts with ToUnicode cmaps --- pdf/core/parser.go | 1 + pdf/model/font_test.go | 112 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 113 insertions(+) diff --git a/pdf/core/parser.go b/pdf/core/parser.go index 9ff22319..40a942bb 100644 --- a/pdf/core/parser.go +++ b/pdf/core/parser.go @@ -1507,6 +1507,7 @@ func NewParserFromString(txt string) *PdfParser { parser.reader = bufferedReader parser.fileSize = int64(len(txt)) + parser.streamLengthReferenceLookupInProgress = map[int64]bool{} return &parser } diff --git a/pdf/model/font_test.go b/pdf/model/font_test.go index 85c77fcb..eb4ec0e1 100644 --- a/pdf/model/font_test.go +++ b/pdf/model/font_test.go @@ -2,6 +2,8 @@ package model_test import ( "errors" + "fmt" + "io" "testing" "github.com/unidoc/unidoc/common" @@ -138,6 +140,53 @@ func TestCompositeFonts(t *testing.T) { } } +// TestTrueTypeToUnicode checks that CharcodeBytesToUnicode is working for a TrueType font with a +// ToUnicode cmap. +func TestTrueTypeToUnicode(t *testing.T) { + numObj, err := parsePdfObjects(ttToUnicode) + if err != nil { + t.Errorf("Failed to parse ttToUnicode object. err=%v", err) + return + } + fontObj := numObj[9] + font, err := model.NewPdfFontFromPdfObject(fontObj) + if err != nil { + t.Errorf("Failed to create font. err=%v", err) + return + } + + data := []byte{43, 40, 41, 34, 37, 42, 38, 49, 36, 38, 48, 34, 35, 36, 37, 35, 36, 58} + expectedText := "Alerts on printing" + actualText, numChars, numMisses := font.CharcodeBytesToUnicode(data) + if numMisses != 0 { + t.Errorf("Some codes not decoded. 
numMisses=%d", numMisses) + return + } + if actualText != expectedText { + t.Errorf("Incorrect decoding.\nexpected=%q\n actual=%q", expectedText, actualText) + } + if numChars != len(actualText) { + t.Errorf("Incorrect numChars=%d expected=%d", numChars, len(actualText)) + } +} + +// ttToUnicode is a TrueType font object and its ToUnicode cmap. +// The stream data in obj 26 (the ToUnicode cmap) is Sprintf'd to avoid binary data in the `` string. +var ttToUnicode = fmt.Sprintf(`9 0 obj +<< /Type /Font /Subtype /TrueType /BaseFont /AHSHJL+.SFUIText /ToUnicode 26 0 R /FirstChar 33 /LastChar 79 /Widths [ 635 381 246 +583 363 282 609 252 571 523 674 560 594 542 543 609 591 637 584 874 614 614 +362 246 268 609 742 870 644 596 604 771 716 653 297 525 657 543 297 774 539 +382 382 726 968 295 538 ] >> +endobj +26 0 obj +<< /Length 497 /Filter /FlateDecode >> +stream +%s +endstream +endobj +`, + "\x78\x01\x5d\x93\xcd\x8a\xdb\x30\x14\x46\xf7\x7e\x0a\x2d\xa7\x8b\xc1\x8a\xe5\x24\x33\x60\x0c\xc3\x94\x81\x2c\xfa\x43\xd3\x3e\x80\x6d\xc9\xc1\xd0\xd8\xc6\x71\x16\x79\xfb\x9e\xef\x66\x3a\x85\x2e\xbe\xc5\xf1\xd5\x55\xee\x51\xa4\xfc\xf5\xf0\xf9\x30\x0e\xab\xcb\xbf\x2f\x53\x77\x4c\xab\xeb\x87\x31\x2e\xe9\x32\x5d\x97\x2e\xb9\x36\x9d\x86\x31\xdb\x14\x2e\x0e\xdd\xfa\x4e\xf6\xad\x3b\x37\x73\x96\xd3\x7c\xbc\x5d\xd6\x74\x3e\x8c\xfd\xe4\xaa\x2a\x73\x2e\xff\x41\xcb\x65\x5d\x6e\xee\xe1\x25\x4e\x6d\xfa\xa4\x6f\xdf\x96\x98\x96\x61\x3c\xb9\x87\x5f\xaf\x47\xfb\x72\xbc\xce\xf3\xef\x74\x4e\xe3\xea\x7c\x56\xd7\x2e\xa6\x9e\xed\xbe\x34\xf3\xd7\xe6\x9c\x5c\x6e\xad\x8f\x87\x48\x7d\x58\x6f\x8f\x74\xfd\x5b\xf1\xf3\x36\x27\xc7\x44\x74\x6c\xee\x23\x75\x53\x4c\x97\xb9\xe9\xd2\xd2\x8c\xa7\x94\x55\xde\xd7\xd5\xdb\x5b\x9d\xa5\x31\xfe\x57\x2a\xb7\xf7\x8e\xb6\x7f\x5f\x5a\x6c\xea\x4a\xf1\x7e\xeb\xeb\xac\x2a\x0a\x90\x78\xbf\x2f\x84\x01\x24\xde\xef\x9e\x85\x25\x48\xc0\x24\xdc\x82\x84\xc5\xa5\x70\x07\x12\xef\x0b\xdb\xea\x09\x24\x2c\xee\x54\x7d\x06\x09\xb8\x15\x36\x20\xa1\x37\x08\x5b\x90\x78\x5f\x6e\x84\x1d\x48\x58\x6c\xd5
\x08\x12\xf0\x49\xd5\x04\x12\x7a\x77\xc2\x1e\x24\xa0\x86\x0c\xc8\x2b\xa0\xc6\x08\xc8\x29\xf4\xf6\x42\xe4\x14\x7c\xb5\x73\x40\x4e\x61\xb1\xa6\x0a\xc8\x29\x8c\x11\x85\xc8\x29\xf4\xea\x34\x02\x72\x0a\x28\xdf\xb0\x07\x09\xa8\x31\x02\xae\x0a\xd8\x08\x71\x55\xd8\xca\xa6\xc2\x35\x98\xef\x6e\xaf\x2a\xae\x0a\x55\x9d\x64\xc0\x55\xa1\xd7\x7e\x17\xd7\x60\xbe\x0c\x43\x15\x57\x85\xc5\x36\x24\xae\xc1\x7c\x11\xc9\xaa\x12\x57\x85\xaa\x04\x39\x3f\x0b\x28\xc1\x12\x57\x05\x5f\x5b\x8c\x2b\xdf\x41\x0e\x90\x2a\xae\x0a\xfa\x3a\x58\xb6\xb7\xd0\x6b\x8b\x71\x2d\xef\xbe\xad\xaa\xb8\x2a\xf4\xea\x0f\x2d\x71\x55\xe8\x95\x11\x96\x16\x50\xfa\x25\xae\xa5\x09\x72\x07\x40\xe4\x14\x76\x96\x11\xa7\x6b\x61\x2a\xeb\x45\x8e\x73\xa8\x7c\xd1\xda\xce\xc8\xe1\xa2\xc5\x6c\xc5\x25\xfe\x7b\x5b\x75\x9f\xf5\xee\x3e\xde\x49\x77\x5d\x16\x9e\x88\x3d\x4e\x7b\x3d\x7a\x15\xc3\x98\x3e\xde\xef\x3c\xcd\xda\xc0\xf2\x07\xc5\xfd\x00\x4f") + // objFontObj parses `fontDict` to a make a Font, creates a PDF object from the Font and checks that // the new PDF object is the same as the input object func objFontObj(t *testing.T, fontDict string) error { @@ -166,3 +215,66 @@ func objFontObj(t *testing.T, fontDict string) error { return nil } + +// parsePdfObjects parses a fragment of a PDF `text` (e.g. ttToUnicode above) and returns a map of +// {object number: object} with indirect objects replaced by their values if they are in `text`. 
+func parsePdfObjects(text string) (map[int64]core.PdfObject, error) {
+	numObj := map[int64]core.PdfObject{}
+	parser := core.NewParserFromString(text)
+	for { // Pass 1: index every indirect object and stream by its object number.
+		obj, err := parser.ParseIndirectObject()
+		if err == io.EOF {
+			break // End of `text`: every object has been read.
+		}
+		if err != nil {
+			return nil, err // Never hand back a half-filled map together with an error.
+		}
+		switch t := obj.(type) {
+		case *core.PdfIndirectObject:
+			numObj[t.ObjectNumber] = obj
+		case *core.PdfObjectStream:
+			numObj[t.ObjectNumber] = obj
+		}
+	}
+
+	for _, obj := range numObj { // Pass 2: resolve references held directly in each object's dictionary.
+		iobj, ok := obj.(*core.PdfIndirectObject)
+		if !ok {
+			continue
+		}
+		dict, ok := iobj.PdfObject.(*core.PdfObjectDictionary)
+		if !ok {
+			continue
+		}
+		for _, k := range dict.Keys() {
+			if ref, ok := dict.Get(k).(*core.PdfObjectReference); ok {
+				if o, ok := numObj[ref.ObjectNumber]; ok {
+					dict.Set(k, o) // References to objects missing from `text` are left as they are.
+				}
+			}
+		}
+	}
+	return numObj, nil
+}
+
+// func isFontObject(obj core.PdfObject) bool {
+// 	var dict *core.PdfObjectDictionary
+// 	switch t := obj.(type) {
+// 	case *core.PdfIndirectObject:
+// 		dict = t.PdfObject.(*core.PdfObjectDictionary)
+// 	case *core.PdfObjectDictionary:
+// 		dict = t
+// 	default:
+// 		return false
+// 	}
+// 	name, err := core.GetName(dict.Get("Type"))
+// 	return err == nil && name == "Font"
+// }
+
+// func showDict(dict *core.PdfObjectDictionary) string {
+// 	parts := []string{}
+// 	for _, k := range dict.Keys() {
+// 		parts = append(parts, fmt.Sprintf("%s: %T", k, dict.Get(k)))
+// 	}
+// 	return fmt.Sprintf("{%s}", strings.Join(parts, ", "))
+// }