Fixed encoding selection for standard 14 fonts.

2025-04-27 13:48:51 +08:00 · 2018-11-22 22:01:04 +11:00 · 2018-11-22 22:01:04 +11:00 · 6e5e32dd92
commit 6e5e32dd92
parent 8b964f2008
5 changed files with 78 additions and 26 deletions
--- a/pdf/extractor/text.go
+++ b/pdf/extractor/text.go
@ -413,6 +413,9 @@ func (to *textObject) setTextRise(y float64) {

 // setWordSpacing "Tw" Set word spacing.
 func (to *textObject) setWordSpacing(y float64) {
+	if to == nil {
+		return // Nil receiver: nothing to update, and to.State below would dereference nil.
+	}
 	to.State.Tw = y
 }

--- a/pdf/internal/textencoding/simple.go
+++ b/pdf/internal/textencoding/simple.go
@ -115,6 +115,11 @@ func (se SimpleEncoder) String() string {
 	return fmt.Sprintf("SIMPLE_ENCODER{%s}", strings.Join(parts, ", "))
 }

+// BaseName returns `se`'s base name (its unexported baseName field) for use by other packages.
+func (se SimpleEncoder) BaseName() string {
+	return se.baseName
+}
+
 // Encode converts a Go unicode string `raw` to a PDF encoded string.
 func (se SimpleEncoder) Encode(raw string) []byte {
 	return doEncode(se, raw)
@ -1089,7 +1094,7 @@ var simpleEncodings = map[string]map[uint16]rune{
 		0xe9: 0x0152, //  Œ "OE"
 		0xea: 0x00ba, //  º "ordmasculine"
 		0xf0: 0x00e6, //  æ "ae"
-		0xf4: 0x0131, //  ı "dotlessi"
+		0xf5: 0x0131, //  ı "dotlessi"
 		0xf7: 0x0142, //  ł "lslash"
 		0xf8: 0x00f8, //  ø "oslash"
 		0xf9: 0x0153, //  œ "oe"
--- a/pdf/model/font.go
+++ b/pdf/model/font.go
@ -267,7 +267,7 @@ func newPdfFontFromPdfObject(fontObj core.PdfObject, allowType0 bool) (*PdfFont,
 	case "Type1", "Type3", "MMType1", "TrueType":
 		var simplefont *pdfFontSimple
 		std, ok := loadStandard14Font(Standard14Font(base.basefont))
-		builtin := ok /*&& base.subtype == "Type1"*/
+		builtin := ok
 		if builtin {
 			font.context = &std

@ -611,6 +611,7 @@ type fontCommon struct {
 	// All fonts have these fields.
 	basefont string // The font's "BaseFont" field.
 	subtype  string // The font's "Subtype" field.
+	name     string // The font's "Name" field; left empty when the entry is not a name object.

 	// These are optional fields in the PDF font.
 	toUnicode core.PdfObject // The stream containing toUnicodeCmap. We keep it around for ToPdfObject.
@ -669,8 +670,16 @@ func (base fontCommon) coreString() string {
 	if base.fontDescriptor != nil {
 		descriptor = base.fontDescriptor.String()
 	}
-	return fmt.Sprintf("%#q %#q obj=%d ToUnicode=%t %s",
-		base.subtype, base.basefont, base.objectNumber, base.toUnicode != nil, descriptor)
+	return fmt.Sprintf("%#q %#q %q obj=%d ToUnicode=%t flags=0x%0x %s",
+		base.subtype, base.basefont, base.name, base.objectNumber, base.toUnicode != nil,
+		base.fontFlags(), descriptor)
+}
+
+func (base fontCommon) fontFlags() int {
+	if base.fontDescriptor == nil {
+		return 0 // No font descriptor, so report no flags set.
+	}
+	return base.fontDescriptor.flags // Integer cached on the descriptor.
 }

 // isCIDFont returns true if `base` is a CID font.
@ -721,6 +730,11 @@ func newFontBaseFieldsFromPdfObject(fontObj core.PdfObject) (*core.PdfObjectDict
 	}
 	font.subtype = subtype

+	name, ok := core.GetNameVal(d.Get("Name"))
+	if ok {
+		font.name = name
+	}
+
 	if subtype == "Type3" {
 		common.Log.Debug("ERROR: Type 3 font not supprted. d=%s", d)
 		return nil, nil, ErrFontNotSupported
@ -815,6 +829,7 @@ type PdfFontDescriptor struct {
 	FontFile3    core.PdfObject // OTF / CFF
 	CharSet      core.PdfObject

+	flags        int // Integer value of Flags; stays 0 when Flags is nil or core.GetIntVal rejects it.
 	missingWidth float64
 	*fontFile
 	fontFile2 *fonts.TtfType
@ -923,6 +938,11 @@ func newPdfFontDescriptorFromPdfObject(obj core.PdfObject) (*PdfFontDescriptor,
 	descriptor.FD = d.Get("FD")
 	descriptor.CIDSet = d.Get("CIDSet")

+	if descriptor.Flags != nil {
+		if flags, ok := core.GetIntVal(descriptor.Flags); ok {
+			descriptor.flags = flags
+		}
+	}
 	if descriptor.MissingWidth != nil {
 		if missingWidth, err := core.GetNumberAsFloat(descriptor.MissingWidth); err == nil {
 			descriptor.missingWidth = missingWidth
--- a/pdf/model/font_simple.go
+++ b/pdf/model/font_simple.go
@ -9,6 +9,7 @@ import (
 	"errors"
 	"io/ioutil"
 	"sort"
+	"strings"

 	"github.com/unidoc/unidoc/common"
 	"github.com/unidoc/unidoc/pdf/core"
@ -236,19 +237,26 @@ func newSimpleFontFromPdfObject(d *core.PdfObjectDictionary, base *fontCommon,
 func (font *pdfFontSimple) addEncoding() error {
 	var baseEncoder string
 	var differences map[byte]string
-	var err error
 	var encoder *textencoding.SimpleEncoder

+	if font.Encoder() != nil {
+		encoder, ok := font.Encoder().(*textencoding.SimpleEncoder)
+		if ok && encoder != nil {
+			baseEncoder = encoder.BaseName()
+		}
+	}
+
 	if font.Encoding != nil {
-		baseEncoder, differences, err = getFontEncoding(font.Encoding)
+		baseEncoderName, differences, err := font.getFontEncoding()
 		if err != nil {
 			common.Log.Debug("ERROR: BaseFont=%q Subtype=%q Encoding=%s (%T) err=%v", font.basefont,
 				font.subtype, font.Encoding, font.Encoding, err)
 			return err
 		}
-		base := font.baseFields()
-		common.Log.Trace("addEncoding: BaseFont=%q Subtype=%q Encoding=%s (%T) differences=%d %+v",
-			base.basefont, base.subtype, font.Encoding, font.Encoding, len(differences), differences)
+		if baseEncoderName != "" {
+			baseEncoder = baseEncoderName
+		}
+
 		encoder, err = textencoding.NewSimpleTextEncoder(baseEncoder, differences)
 		if err != nil {
 			return err
@ -298,17 +306,27 @@ func (font *pdfFontSimple) addEncoding() error {
 // Except for Type 3 fonts, every font program shall have a built-in encoding. Under certain
 // circumstances, a PDF font dictionary may change the encoding used with the font program to match
 // the requirements of the conforming writer generating the text being shown.
-func getFontEncoding(obj core.PdfObject) (baseName string, differences map[byte]string, err error) {
+func (font *pdfFontSimple) getFontEncoding() (baseName string, differences map[byte]string, err error) {
 	baseName = "StandardEncoding"
+	if name, ok := builtinEncodings[font.basefont]; ok {
+		baseName = name
+	} else if font.fontFlags()&fontFlagSymbolic != 0 {
+		for base, name := range builtinEncodings {
+			if strings.Contains(font.basefont, base) {
+				baseName = name
+				break
+			}
+		}
+	}

-	if obj == nil {
+	if font.Encoding == nil {
 		// Fall back to StandardEncoding
 		// This works because the only way BaseEncoding can get overridden is by FontFile entries
 		// and the only encoding names we have seen in FontFile's are StandardEncoding or no entry.
 		return baseName, nil, nil
 	}

-	switch encoding := obj.(type) {
+	switch encoding := font.Encoding.(type) {
 	case *core.PdfObjectName:
 		return string(*encoding), nil, nil
 	case *core.PdfObjectDictionary:
@ -328,7 +346,7 @@ func getFontEncoding(obj core.PdfObject) (baseName string, differences map[byte]
 		}
 		return baseName, differences, err
 	default:
-		common.Log.Debug("ERROR: Encoding not a name or dict (%T) %s", obj, obj.String())
+		common.Log.Debug("ERROR: Encoding not a name or dict (%T) %s", font.Encoding, font.Encoding)
 		return "", nil, core.ErrTypeError
 	}
 }
@ -687,6 +705,11 @@ var standard14Fonts = map[Standard14Font]pdfFontSimple{
 	},
 }

+var builtinEncodings = map[string]string{ // Key: BaseFont name; value: base encoding name used for it.
+	"Symbol":       "SymbolEncoding",
+	"ZapfDingbats": "ZapfDingbatsEncoding",
+}
+
 // builtinDescriptor returns the PdfFontDescriptor for the builtin font named `baseFont`, or nil if
 // there is none.
 func builtinDescriptor(baseFont string) *PdfFontDescriptor {
--- a/pdf/model/font_test.go
+++ b/pdf/model/font_test.go
@ -437,7 +437,7 @@ var charcodeBytesToUnicodeTest = []fontFragmentTest{
 	fontFragmentTest{"Type1 font with /Encoding with /Differences",
 		"./testdata/font/noise-invariant.txt", 102,
 		[]byte{96, 247, 39, 32, 147, 231, 148, 32, 232, 32, 193, 111, 180, 32, 105, 116,
-			169, 115, 32, 204, 195, 196, 197, 198, 199, 168, 202, 206, 226, 234, 172, 244, 173, 151,
+			169, 115, 32, 204, 195, 196, 197, 198, 199, 168, 202, 206, 226, 234, 172, 245, 173, 151,
 			177, 151, 178, 179, 183, 185, 188, 205, 184, 189},
 		"‘ł’ “Ł” Ø `o´ it's ˝ˆ˜¯˘˙¨˚ˇªº‹ı›—–—†‡•„…˛¸‰",
 	},
@ -533,6 +533,8 @@ func (f *fontFragmentTest) String() string {
 // CharcodeBytesToUnicode on `data` and checks that output equals `expected`.
 func (f *fontFragmentTest) check(t *testing.T) {
 	common.Log.Debug("fontFragmentTest: %s", f)
+
+
 	numObj, err := parsePdfFragment(f.filename)
 	if err != nil {
 		t.Errorf("Failed to parse. %s err=%v", f, err)
@ -551,12 +553,12 @@ func (f *fontFragmentTest) check(t *testing.T) {

 	actualText, numChars, numMisses := font.CharcodeBytesToUnicode(f.data)
 	if numMisses != 0 {
-		t.Errorf("Some codes not decoded. numMisses=%d", numMisses)
+		t.Errorf("Some codes not decoded %s. font=%s numMisses=%d", f, font, numMisses)
 		return
 	}
 	if actualText != f.expected {
-		t.Errorf("Incorrect decoding. %s\nexpected=%q\n  actual=%q",
-			f, f.expected, actualText)
+		t.Errorf("Incorrect decoding. %s encoding=%s\nexpected=%q\n  actual=%q",
+			f, font.Encoder(), f.expected, actualText)
 		act, exp := []rune(actualText), []rune(f.expected)
 		if len(act) != len(exp) {
 			t.Errorf("\texpected=%d actual=%d", len(exp), len(act))
@ -568,7 +570,6 @@ func (f *fontFragmentTest) check(t *testing.T) {
 				}
 			}
 		}
-
 	}
 	if numChars != len([]rune(actualText)) {
 		t.Errorf("Incorrect numChars. %s numChars=%d expected=%d\n%+v\n%c",
@ -620,21 +621,21 @@ func parsePdfFragment(filename string) (map[int]core.PdfObject, error) {
 func parsePdfObjects(text string) (map[int]core.PdfObject, error) {
 	numObj := map[int]core.PdfObject{}
 	parser := core.NewParserFromString(text)
-	common.Log.Debug("parsePdfObjects")
+	common.Log.Trace("parsePdfObjects")

 	// Build the numObj {object number: object} map
 	nums := []int{}
 	for {
 		obj, err := parser.ParseIndirectObject()
-		common.Log.Debug("parsePdfObjects:  %T %v", obj, err)
+		common.Log.Trace("parsePdfObjects:  %T %v", obj, err)
 		if err != nil {
 			if err == io.EOF {
 				break
 			}
-			common.Log.Debug("parsePdfObjects:  err=%v", err)
+			common.Log.Trace("parsePdfObjects:  err=%v", err)
 			return numObj, err
 		}
-		common.Log.Debug("parsePdfObjects: %d %T", len(numObj), obj)
+		common.Log.Trace("parsePdfObjects: %d %T", len(numObj), obj)
 		switch t := obj.(type) {
 		case *core.PdfIndirectObject:
 			numObj[int(t.ObjectNumber)] = obj
@ -645,16 +646,16 @@ func parsePdfObjects(text string) (map[int]core.PdfObject, error) {
 		}
 	}

-	common.Log.Debug("parsePdfObjects: Parsed %d objects %+v", len(numObj), nums)
+	common.Log.Trace("parsePdfObjects: Parsed %d objects %+v", len(numObj), nums)

 	// Replace the indirect objects in all dicts and arrays with their values, if they are in numObj.
 	for n, obj := range numObj {
-		common.Log.Debug("-- 0 %d obj %T", n, obj)
+		common.Log.Trace("-- 0 %d obj %T", n, obj)
 		iobj, ok := obj.(*core.PdfIndirectObject)
 		if !ok {
 			continue
 		}
-		common.Log.Debug("   -- %T", iobj.PdfObject)
+		common.Log.Trace("   -- %T", iobj.PdfObject)
 		iobj.PdfObject, ok = replaceReferences(numObj, iobj.PdfObject)
 		if !ok {
 			common.Log.Debug("ERROR: unresolved reference")
@ -671,7 +672,7 @@ func replaceReferences(numObj map[int]core.PdfObject, obj core.PdfObject) (core.
 	switch t := obj.(type) {
 	case *core.PdfObjectReference:
 		o, ok := numObj[int(t.ObjectNumber)]
-		common.Log.Debug("    %d 0 R  %t ", t.ObjectNumber, ok)
+		common.Log.Trace("    %d 0 R  %t ", t.ObjectNumber, ok)
 		return o, ok
 	case *core.PdfObjectDictionary:
 		for _, k := range t.Keys() {