Added a test for CharcodeBytesToUnicode for TrueType fonts with ToUnicode cmaps

This commit is contained in:
Peter Williams 2018-07-17 17:43:11 +10:00
parent 79aa75acf8
commit 96dba88f57
2 changed files with 113 additions and 0 deletions

View File

@ -1507,6 +1507,7 @@ func NewParserFromString(txt string) *PdfParser {
parser.reader = bufferedReader parser.reader = bufferedReader
parser.fileSize = int64(len(txt)) parser.fileSize = int64(len(txt))
parser.streamLengthReferenceLookupInProgress = map[int64]bool{}
return &parser return &parser
} }

View File

@ -2,6 +2,8 @@ package model_test
import ( import (
"errors" "errors"
"fmt"
"io"
"testing" "testing"
"github.com/unidoc/unidoc/common" "github.com/unidoc/unidoc/common"
@ -138,6 +140,53 @@ func TestCompositeFonts(t *testing.T) {
} }
} }
// TestTrueTypeToUnicode checks that CharcodeBytesToUnicode is working for a TrueType font with a
// ToUnicode cmap.
func TestTrueTypeToUnicode(t *testing.T) {
numObj, err := parsePdfObjects(ttToUnicode)
if err != nil {
t.Errorf("Failed to parse ttToUnicode object. err=%v", err)
return
}
fontObj := numObj[9]
font, err := model.NewPdfFontFromPdfObject(fontObj)
if err != nil {
t.Errorf("Failed to create font. err=%v", err)
return
}
data := []byte{43, 40, 41, 34, 37, 42, 38, 49, 36, 38, 48, 34, 35, 36, 37, 35, 36, 58}
expectedText := "Alerts on printing"
actualText, numChars, numMisses := font.CharcodeBytesToUnicode(data)
if numMisses != 0 {
t.Errorf("Some codes not decoded. numMisses=%d", numMisses)
return
}
if actualText != expectedText {
t.Errorf("Incorrect decoding.\nexpected=%q\n actual=%q", expectedText, actualText)
}
if numChars != len(actualText) {
t.Errorf("Incorrect numChars=%d expected=%d", numChars, len(actualText))
}
}
// ttToUnicode is a TrueType font object and its ToUnicode cmap.
// The stream data in obj 26 (the ToUnicode cmap) is Sprintf'd to avoid binary data in the `` string.
var ttToUnicode = fmt.Sprintf(`9 0 obj
<< /Type /Font /Subtype /TrueType /BaseFont /AHSHJL+.SFUIText /ToUnicode 26 0 R /FirstChar 33 /LastChar 79 /Widths [ 635 381 246
583 363 282 609 252 571 523 674 560 594 542 543 609 591 637 584 874 614 614
362 246 268 609 742 870 644 596 604 771 716 653 297 525 657 543 297 774 539
382 382 726 968 295 538 ] >>
endobj
26 0 obj
<< /Length 497 /Filter /FlateDecode >>
stream
%s
endstream
endobj
`,
"\x78\x01\x5d\x93\xcd\x8a\xdb\x30\x14\x46\xf7\x7e\x0a\x2d\xa7\x8b\xc1\x8a\xe5\x24\x33\x60\x0c\xc3\x94\x81\x2c\xfa\x43\xd3\x3e\x80\x6d\xc9\xc1\xd0\xd8\xc6\x71\x16\x79\xfb\x9e\xef\x66\x3a\x85\x2e\xbe\xc5\xf1\xd5\x55\xee\x51\xa4\xfc\xf5\xf0\xf9\x30\x0e\xab\xcb\xbf\x2f\x53\x77\x4c\xab\xeb\x87\x31\x2e\xe9\x32\x5d\x97\x2e\xb9\x36\x9d\x86\x31\xdb\x14\x2e\x0e\xdd\xfa\x4e\xf6\xad\x3b\x37\x73\x96\xd3\x7c\xbc\x5d\xd6\x74\x3e\x8c\xfd\xe4\xaa\x2a\x73\x2e\xff\x41\xcb\x65\x5d\x6e\xee\xe1\x25\x4e\x6d\xfa\xa4\x6f\xdf\x96\x98\x96\x61\x3c\xb9\x87\x5f\xaf\x47\xfb\x72\xbc\xce\xf3\xef\x74\x4e\xe3\xea\x7c\x56\xd7\x2e\xa6\x9e\xed\xbe\x34\xf3\xd7\xe6\x9c\x5c\x6e\xad\x8f\x87\x48\x7d\x58\x6f\x8f\x74\xfd\x5b\xf1\xf3\x36\x27\xc7\x44\x74\x6c\xee\x23\x75\x53\x4c\x97\xb9\xe9\xd2\xd2\x8c\xa7\x94\x55\xde\xd7\xd5\xdb\x5b\x9d\xa5\x31\xfe\x57\x2a\xb7\xf7\x8e\xb6\x7f\x5f\x5a\x6c\xea\x4a\xf1\x7e\xeb\xeb\xac\x2a\x0a\x90\x78\xbf\x2f\x84\x01\x24\xde\xef\x9e\x85\x25\x48\xc0\x24\xdc\x82\x84\xc5\xa5\x70\x07\x12\xef\x0b\xdb\xea\x09\x24\x2c\xee\x54\x7d\x06\x09\xb8\x15\x36\x20\xa1\x37\x08\x5b\x90\x78\x5f\x6e\x84\x1d\x48\x58\x6c\xd5\x08\x12\xf0\x49\xd5\x04\x12\x7a\x77\xc2\x1e\x24\xa0\x86\x0c\xc8\x2b\xa0\xc6\x08\xc8\x29\xf4\xf6\x42\xe4\x14\x7c\xb5\x73\x40\x4e\x61\xb1\xa6\x0a\xc8\x29\x8c\x11\x85\xc8\x29\xf4\xea\x34\x02\x72\x0a\x28\xdf\xb0\x07\x09\xa8\x31\x02\xae\x0a\xd8\x08\x71\x55\xd8\xca\xa6\xc2\x35\x98\xef\x6e\xaf\x2a\xae\x0a\x55\x9d\x64\xc0\x55\xa1\xd7\x7e\x17\xd7\x60\xbe\x0c\x43\x15\x57\x85\xc5\x36\x24\xae\xc1\x7c\x11\xc9\xaa\x12\x57\x85\xaa\x04\x39\x3f\x0b\x28\xc1\x12\x57\x05\x5f\x5b\x8c\x2b\xdf\x41\x0e\x90\x2a\xae\x0a\xfa\x3a\x58\xb6\xb7\xd0\x6b\x8b\x71\x2d\xef\xbe\xad\xaa\xb8\x2a\xf4\xea\x0f\x2d\x71\x55\xe8\x95\x11\x96\x16\x50\xfa\x25\xae\xa5\x09\x72\x07\x40\xe4\x14\x76\x96\x11\xa7\x6b\x61\x2a\xeb\x45\x8e\x73\xa8\x7c\xd1\xda\xce\xc8\xe1\xa2\xc5\x6c\xc5\x25\xfe\x7b\x5b\x75\x9f\xf5\xee\x3e\xde\x49\x77\x5d\x16\x9e\x88\x3d\x4e\x7b\x3d\x7a\x15\xc3\x98\x3e\xde\xef\x3c\xcd\xda\xc0\xf2\x07\xc5\xfd\x00\x4f")
// objFontObj parses `fontDict` to a make a Font, creates a PDF object from the Font and checks that // objFontObj parses `fontDict` to a make a Font, creates a PDF object from the Font and checks that
// the new PDF object is the same as the input object // the new PDF object is the same as the input object
func objFontObj(t *testing.T, fontDict string) error { func objFontObj(t *testing.T, fontDict string) error {
@ -166,3 +215,66 @@ func objFontObj(t *testing.T, fontDict string) error {
return nil return nil
} }
// parsePdfObjects parses a fragment of a PDF `text` (e.g. ttToUnicode above) and returns a map of
// {object number: object} with indirect objects replaced by their values if they are in `text`.
func parsePdfObjects(text string) (map[int64]core.PdfObject, error) {
numObj := map[int64]core.PdfObject{}
parser := core.NewParserFromString(text)
for {
obj, err := parser.ParseIndirectObject()
if err != nil {
if err == io.EOF {
break
}
return numObj, err
}
switch t := obj.(type) {
case *core.PdfIndirectObject:
numObj[t.ObjectNumber] = obj
case *core.PdfObjectStream:
numObj[t.ObjectNumber] = obj
}
}
for _, obj := range numObj {
iobj, ok := obj.(*core.PdfIndirectObject)
if !ok {
continue
}
dict, ok := iobj.PdfObject.(*core.PdfObjectDictionary)
if !ok {
continue
}
for _, k := range dict.Keys() {
if ref, ok := dict.Get(k).(*core.PdfObjectReference); ok {
if o, ok := numObj[ref.ObjectNumber]; ok {
dict.Set(k, o)
}
}
}
}
return numObj, nil
}
// func isFontObject(obj core.PdfObject) bool {
// var dict *core.PdfObjectDictionary
// switch t := obj.(type) {
// case *core.PdfIndirectObject:
// dict = t.PdfObject.(*core.PdfObjectDictionary)
// case *core.PdfObjectDictionary:
// dict = t
// default:
// return false
// }
// name, err := core.GetName(dict.Get("Type"))
// return err == nil && name == "Font"
// }
// func showDict(dict *core.PdfObjectDictionary) string {
// parts := []string{}
// for _, k := range dict.Keys() {
// parts = append(parts, fmt.Sprintf("%s: %T", k, dict.Get(k)))
// }
// return fmt.Sprintf("{%s}", strings.Join(parts, ", "))
// }