Fixed encoding selection for standard 14 fonts.

This commit is contained in:
Peter Williams 2018-11-22 22:01:04 +11:00
parent 8b964f2008
commit 6e5e32dd92
5 changed files with 78 additions and 26 deletions

View File

@ -413,6 +413,9 @@ func (to *textObject) setTextRise(y float64) {
// setWordSpacing handles the "Tw" operator by recording `y` as the word spacing of the text state.
// It does nothing when `to` is nil.
func (to *textObject) setWordSpacing(y float64) {
	if to != nil {
		to.State.Tw = y
	}
}

View File

@ -115,6 +115,11 @@ func (se SimpleEncoder) String() string {
return fmt.Sprintf("SIMPLE_ENCODER{%s}", strings.Join(parts, ", "))
}
// BaseName returns `se`'s base name, i.e. the value stored in its baseName field.
// NOTE(review): presumably this is the name of the base encoding the encoder was built from;
// confirm against NewSimpleTextEncoder.
func (se SimpleEncoder) BaseName() string {
return se.baseName
}
// Encode converts a Go unicode string `raw` to a PDF encoded string.
func (se SimpleEncoder) Encode(raw string) []byte {
return doEncode(se, raw)
@ -1089,7 +1094,7 @@ var simpleEncodings = map[string]map[uint16]rune{
0xe9: 0x0152, // Œ "OE"
0xea: 0x00ba, // º "ordmasculine"
0xf0: 0x00e6, // æ "ae"
0xf4: 0x0131, // ı "dotlessi"
0xf5: 0x0131, // ı "dotlessi"
0xf7: 0x0142, // ł "lslash"
0xf8: 0x00f8, // ø "oslash"
0xf9: 0x0153, // œ "oe"

View File

@ -267,7 +267,7 @@ func newPdfFontFromPdfObject(fontObj core.PdfObject, allowType0 bool) (*PdfFont,
case "Type1", "Type3", "MMType1", "TrueType":
var simplefont *pdfFontSimple
std, ok := loadStandard14Font(Standard14Font(base.basefont))
builtin := ok /*&& base.subtype == "Type1"*/
builtin := ok
if builtin {
font.context = &std
@ -611,6 +611,7 @@ type fontCommon struct {
// All fonts have these fields.
basefont string // The font's "BaseFont" field.
subtype string // The font's "Subtype" field.
name string
// These are optional fields in the PDF font.
toUnicode core.PdfObject // The stream containing toUnicodeCmap. We keep it around for ToPdfObject.
@ -669,8 +670,16 @@ func (base fontCommon) coreString() string {
if base.fontDescriptor != nil {
descriptor = base.fontDescriptor.String()
}
return fmt.Sprintf("%#q %#q obj=%d ToUnicode=%t %s",
base.subtype, base.basefont, base.objectNumber, base.toUnicode != nil, descriptor)
return fmt.Sprintf("%#q %#q %q obj=%d ToUnicode=%t flags=0x%0x %s",
base.subtype, base.basefont, base.name, base.objectNumber, base.toUnicode != nil,
base.fontFlags(), descriptor)
}
// fontFlags returns the flags recorded in `base`'s font descriptor, or 0 if `base` has no font
// descriptor.
func (base fontCommon) fontFlags() int {
	if descriptor := base.fontDescriptor; descriptor != nil {
		return descriptor.flags
	}
	return 0
}
// isCIDFont returns true if `base` is a CID font.
@ -721,6 +730,11 @@ func newFontBaseFieldsFromPdfObject(fontObj core.PdfObject) (*core.PdfObjectDict
}
font.subtype = subtype
name, ok := core.GetNameVal(d.Get("Name"))
if ok {
font.name = name
}
if subtype == "Type3" {
common.Log.Debug("ERROR: Type 3 font not supprted. d=%s", d)
return nil, nil, ErrFontNotSupported
@ -815,6 +829,7 @@ type PdfFontDescriptor struct {
FontFile3 core.PdfObject // OTF / CFF
CharSet core.PdfObject
flags int
missingWidth float64
*fontFile
fontFile2 *fonts.TtfType
@ -923,6 +938,11 @@ func newPdfFontDescriptorFromPdfObject(obj core.PdfObject) (*PdfFontDescriptor,
descriptor.FD = d.Get("FD")
descriptor.CIDSet = d.Get("CIDSet")
if descriptor.Flags != nil {
if flags, ok := core.GetIntVal(descriptor.Flags); ok {
descriptor.flags = flags
}
}
if descriptor.MissingWidth != nil {
if missingWidth, err := core.GetNumberAsFloat(descriptor.MissingWidth); err == nil {
descriptor.missingWidth = missingWidth

View File

@ -9,6 +9,7 @@ import (
"errors"
"io/ioutil"
"sort"
"strings"
"github.com/unidoc/unidoc/common"
"github.com/unidoc/unidoc/pdf/core"
@ -236,19 +237,26 @@ func newSimpleFontFromPdfObject(d *core.PdfObjectDictionary, base *fontCommon,
func (font *pdfFontSimple) addEncoding() error {
var baseEncoder string
var differences map[byte]string
var err error
var encoder *textencoding.SimpleEncoder
if font.Encoder() != nil {
encoder, ok := font.Encoder().(*textencoding.SimpleEncoder)
if ok && encoder != nil {
baseEncoder = encoder.BaseName()
}
}
if font.Encoding != nil {
baseEncoder, differences, err = getFontEncoding(font.Encoding)
baseEncoderName, differences, err := font.getFontEncoding()
if err != nil {
common.Log.Debug("ERROR: BaseFont=%q Subtype=%q Encoding=%s (%T) err=%v", font.basefont,
font.subtype, font.Encoding, font.Encoding, err)
return err
}
base := font.baseFields()
common.Log.Trace("addEncoding: BaseFont=%q Subtype=%q Encoding=%s (%T) differences=%d %+v",
base.basefont, base.subtype, font.Encoding, font.Encoding, len(differences), differences)
if baseEncoderName != "" {
baseEncoder = baseEncoderName
}
encoder, err = textencoding.NewSimpleTextEncoder(baseEncoder, differences)
if err != nil {
return err
@ -298,17 +306,27 @@ func (font *pdfFontSimple) addEncoding() error {
// Except for Type 3 fonts, every font program shall have a built-in encoding. Under certain
// circumstances, a PDF font dictionary may change the encoding used with the font program to match
// the requirements of the conforming writer generating the text being shown.
func getFontEncoding(obj core.PdfObject) (baseName string, differences map[byte]string, err error) {
func (font *pdfFontSimple) getFontEncoding() (baseName string, differences map[byte]string, err error) {
baseName = "StandardEncoding"
if name, ok := builtinEncodings[font.basefont]; ok {
baseName = name
} else if font.fontFlags()&fontFlagSymbolic != 0 {
for base, name := range builtinEncodings {
if strings.Contains(font.basefont, base) {
baseName = name
break
}
}
}
if obj == nil {
if font.Encoding == nil {
// Fall back to the default base encoding selected above (StandardEncoding unless a builtin encoding matched).
// This works because the only way BaseEncoding can get overridden is by FontFile entries
// and the only encoding names we have seen in FontFile's are StandardEncoding or no entry.
return baseName, nil, nil
}
switch encoding := obj.(type) {
switch encoding := font.Encoding.(type) {
case *core.PdfObjectName:
return string(*encoding), nil, nil
case *core.PdfObjectDictionary:
@ -328,7 +346,7 @@ func getFontEncoding(obj core.PdfObject) (baseName string, differences map[byte]
}
return baseName, differences, err
default:
common.Log.Debug("ERROR: Encoding not a name or dict (%T) %s", obj, obj.String())
common.Log.Debug("ERROR: Encoding not a name or dict (%T) %s", font.Encoding, font.Encoding)
return "", nil, core.ErrTypeError
}
}
@ -687,6 +705,11 @@ var standard14Fonts = map[Standard14Font]pdfFontSimple{
},
}
// builtinEncodings maps a base font name to the name of the encoding that getFontEncoding uses as
// the default base encoding for that font, in place of "StandardEncoding".
var builtinEncodings = map[string]string{
"Symbol": "SymbolEncoding",
"ZapfDingbats": "ZapfDingbatsEncoding",
}
// builtinDescriptor returns the PdfFontDescriptor for the builtin font named `baseFont`, or nil if
// there is none.
func builtinDescriptor(baseFont string) *PdfFontDescriptor {

View File

@ -437,7 +437,7 @@ var charcodeBytesToUnicodeTest = []fontFragmentTest{
fontFragmentTest{"Type1 font with /Encoding with /Differences",
"./testdata/font/noise-invariant.txt", 102,
[]byte{96, 247, 39, 32, 147, 231, 148, 32, 232, 32, 193, 111, 180, 32, 105, 116,
169, 115, 32, 204, 195, 196, 197, 198, 199, 168, 202, 206, 226, 234, 172, 244, 173, 151,
169, 115, 32, 204, 195, 196, 197, 198, 199, 168, 202, 206, 226, 234, 172, 245, 173, 151,
177, 151, 178, 179, 183, 185, 188, 205, 184, 189},
"‘ł’ “Ł” Ø `o´ it's ˝ˆ˜¯˘˙¨˚ˇªº‹ı›—–—†‡•„…˛¸‰",
},
@ -533,6 +533,8 @@ func (f *fontFragmentTest) String() string {
// CharcodeBytesToUnicode on `data` and checks that output equals `expected`.
func (f *fontFragmentTest) check(t *testing.T) {
common.Log.Debug("fontFragmentTest: %s", f)
numObj, err := parsePdfFragment(f.filename)
if err != nil {
t.Errorf("Failed to parse. %s err=%v", f, err)
@ -551,12 +553,12 @@ func (f *fontFragmentTest) check(t *testing.T) {
actualText, numChars, numMisses := font.CharcodeBytesToUnicode(f.data)
if numMisses != 0 {
t.Errorf("Some codes not decoded. numMisses=%d", numMisses)
t.Errorf("Some codes not decoded %s. font=%s numMisses=%d", f, font, numMisses)
return
}
if actualText != f.expected {
t.Errorf("Incorrect decoding. %s\nexpected=%q\n actual=%q",
f, f.expected, actualText)
t.Errorf("Incorrect decoding. %s encoding=%s\nexpected=%q\n actual=%q",
f, font.Encoder(), f.expected, actualText)
act, exp := []rune(actualText), []rune(f.expected)
if len(act) != len(exp) {
t.Errorf("\texpected=%d actual=%d", len(exp), len(act))
@ -568,7 +570,6 @@ func (f *fontFragmentTest) check(t *testing.T) {
}
}
}
}
if numChars != len([]rune(actualText)) {
t.Errorf("Incorrect numChars. %s numChars=%d expected=%d\n%+v\n%c",
@ -620,21 +621,21 @@ func parsePdfFragment(filename string) (map[int]core.PdfObject, error) {
func parsePdfObjects(text string) (map[int]core.PdfObject, error) {
numObj := map[int]core.PdfObject{}
parser := core.NewParserFromString(text)
common.Log.Debug("parsePdfObjects")
common.Log.Trace("parsePdfObjects")
// Build the numObj {object number: object} map
nums := []int{}
for {
obj, err := parser.ParseIndirectObject()
common.Log.Debug("parsePdfObjects: %T %v", obj, err)
common.Log.Trace("parsePdfObjects: %T %v", obj, err)
if err != nil {
if err == io.EOF {
break
}
common.Log.Debug("parsePdfObjects: err=%v", err)
common.Log.Trace("parsePdfObjects: err=%v", err)
return numObj, err
}
common.Log.Debug("parsePdfObjects: %d %T", len(numObj), obj)
common.Log.Trace("parsePdfObjects: %d %T", len(numObj), obj)
switch t := obj.(type) {
case *core.PdfIndirectObject:
numObj[int(t.ObjectNumber)] = obj
@ -645,16 +646,16 @@ func parsePdfObjects(text string) (map[int]core.PdfObject, error) {
}
}
common.Log.Debug("parsePdfObjects: Parsed %d objects %+v", len(numObj), nums)
common.Log.Trace("parsePdfObjects: Parsed %d objects %+v", len(numObj), nums)
// Replace the indirect objects in all dicts and arrays with their values, if they are in numObj.
for n, obj := range numObj {
common.Log.Debug("-- 0 %d obj %T", n, obj)
common.Log.Trace("-- 0 %d obj %T", n, obj)
iobj, ok := obj.(*core.PdfIndirectObject)
if !ok {
continue
}
common.Log.Debug(" -- %T", iobj.PdfObject)
common.Log.Trace(" -- %T", iobj.PdfObject)
iobj.PdfObject, ok = replaceReferences(numObj, iobj.PdfObject)
if !ok {
common.Log.Debug("ERROR: unresolved reference")
@ -671,7 +672,7 @@ func replaceReferences(numObj map[int]core.PdfObject, obj core.PdfObject) (core.
switch t := obj.(type) {
case *core.PdfObjectReference:
o, ok := numObj[int(t.ObjectNumber)]
common.Log.Debug(" %d 0 R %t ", t.ObjectNumber, ok)
common.Log.Trace(" %d 0 R %t ", t.ObjectNumber, ok)
return o, ok
case *core.PdfObjectDictionary:
for _, k := range t.Keys() {