diff --git a/pdf/extractor/text.go b/pdf/extractor/text.go index 0e353098..fc36a026 100644 --- a/pdf/extractor/text.go +++ b/pdf/extractor/text.go @@ -413,6 +413,9 @@ func (to *textObject) setTextRise(y float64) { // setWordSpacing "Tw" Set word spacing. func (to *textObject) setWordSpacing(y float64) { + if to == nil { + return + } to.State.Tw = y } diff --git a/pdf/internal/textencoding/simple.go b/pdf/internal/textencoding/simple.go index 4d0891bb..697b1c7d 100644 --- a/pdf/internal/textencoding/simple.go +++ b/pdf/internal/textencoding/simple.go @@ -115,6 +115,11 @@ func (se SimpleEncoder) String() string { return fmt.Sprintf("SIMPLE_ENCODER{%s}", strings.Join(parts, ", ")) } +// BaseName returns `se`'s base name. +func (se SimpleEncoder) BaseName() string { + return se.baseName +} + // Encode converts a Go unicode string `raw` to a PDF encoded string. func (se SimpleEncoder) Encode(raw string) []byte { return doEncode(se, raw) @@ -1089,7 +1094,7 @@ var simpleEncodings = map[string]map[uint16]rune{ 0xe9: 0x0152, // Œ "OE" 0xea: 0x00ba, // º "ordmasculine" 0xf0: 0x00e6, // æ "ae" - 0xf4: 0x0131, // ı "dotlessi" + 0xf5: 0x0131, // ı "dotlessi" 0xf7: 0x0142, // ł "lslash" 0xf8: 0x00f8, // ø "oslash" 0xf9: 0x0153, // œ "oe" diff --git a/pdf/model/font.go b/pdf/model/font.go index 01ce2f2a..7c661de2 100644 --- a/pdf/model/font.go +++ b/pdf/model/font.go @@ -267,7 +267,7 @@ func newPdfFontFromPdfObject(fontObj core.PdfObject, allowType0 bool) (*PdfFont, case "Type1", "Type3", "MMType1", "TrueType": var simplefont *pdfFontSimple std, ok := loadStandard14Font(Standard14Font(base.basefont)) - builtin := ok /*&& base.subtype == "Type1"*/ + builtin := ok if builtin { font.context = &std @@ -611,6 +611,7 @@ type fontCommon struct { // All fonts have these fields. basefont string // The font's "BaseFont" field. subtype string // The font's "Subtype" field. + name string // These are optional fields in the PDF font. toUnicode core.PdfObject // The stream containing toUnicodeCmap. We keep it around for ToPdfObject. @@ -669,8 +670,16 @@ func (base fontCommon) coreString() string { if base.fontDescriptor != nil { descriptor = base.fontDescriptor.String() } - return fmt.Sprintf("%#q %#q obj=%d ToUnicode=%t %s", - base.subtype, base.basefont, base.objectNumber, base.toUnicode != nil, descriptor) + return fmt.Sprintf("%#q %#q %q obj=%d ToUnicode=%t flags=0x%0x %s", + base.subtype, base.basefont, base.name, base.objectNumber, base.toUnicode != nil, + base.fontFlags(), descriptor) +} + +func (base fontCommon) fontFlags() int { + if base.fontDescriptor == nil { + return 0 + } + return base.fontDescriptor.flags } // isCIDFont returns true if `base` is a CID font. @@ -721,6 +730,11 @@ func newFontBaseFieldsFromPdfObject(fontObj core.PdfObject) (*core.PdfObjectDict } font.subtype = subtype + name, ok := core.GetNameVal(d.Get("Name")) + if ok { + font.name = name + } + if subtype == "Type3" { common.Log.Debug("ERROR: Type 3 font not supprted. d=%s", d) return nil, nil, ErrFontNotSupported @@ -815,6 +829,7 @@ type PdfFontDescriptor struct { FontFile3 core.PdfObject // OTF / CFF CharSet core.PdfObject + flags int missingWidth float64 *fontFile fontFile2 *fonts.TtfType @@ -923,6 +938,11 @@ func newPdfFontDescriptorFromPdfObject(obj core.PdfObject) (*PdfFontDescriptor, descriptor.FD = d.Get("FD") descriptor.CIDSet = d.Get("CIDSet") + if descriptor.Flags != nil { + if flags, ok := core.GetIntVal(descriptor.Flags); ok { + descriptor.flags = flags + } + } if descriptor.MissingWidth != nil { if missingWidth, err := core.GetNumberAsFloat(descriptor.MissingWidth); err == nil { descriptor.missingWidth = missingWidth diff --git a/pdf/model/font_simple.go b/pdf/model/font_simple.go index 1464bdcf..7eeef986 100644 --- a/pdf/model/font_simple.go +++ b/pdf/model/font_simple.go @@ -9,6 +9,7 @@ import ( "errors" "io/ioutil" "sort" + "strings" "github.com/unidoc/unidoc/common" "github.com/unidoc/unidoc/pdf/core" @@ -236,19 +237,26 @@ func newSimpleFontFromPdfObject(d *core.PdfObjectDictionary, base *fontCommon, func (font *pdfFontSimple) addEncoding() error { var baseEncoder string var differences map[byte]string - var err error var encoder *textencoding.SimpleEncoder + if font.Encoder() != nil { + encoder, ok := font.Encoder().(*textencoding.SimpleEncoder) + if ok && encoder != nil { + baseEncoder = encoder.BaseName() + } + } + if font.Encoding != nil { - baseEncoder, differences, err = getFontEncoding(font.Encoding) + baseEncoderName, differences, err := font.getFontEncoding() if err != nil { common.Log.Debug("ERROR: BaseFont=%q Subtype=%q Encoding=%s (%T) err=%v", font.basefont, font.subtype, font.Encoding, font.Encoding, err) return err } - base := font.baseFields() - common.Log.Trace("addEncoding: BaseFont=%q Subtype=%q Encoding=%s (%T) differences=%d %+v", - base.basefont, base.subtype, font.Encoding, font.Encoding, len(differences), differences) + if baseEncoderName != "" { + baseEncoder = baseEncoderName + } + encoder, err = textencoding.NewSimpleTextEncoder(baseEncoder, differences) if err != nil { return err @@ -298,17 +306,27 @@ func (font *pdfFontSimple) addEncoding() error { // Except for Type 3 fonts, every font program shall have a built-in encoding. Under certain // circumstances, a PDF font dictionary may change the encoding used with the font program to match // the requirements of the conforming writer generating the text being shown. -func getFontEncoding(obj core.PdfObject) (baseName string, differences map[byte]string, err error) { +func (font *pdfFontSimple) getFontEncoding() (baseName string, differences map[byte]string, err error) { baseName = "StandardEncoding" + if name, ok := builtinEncodings[font.basefont]; ok { + baseName = name + } else if font.fontFlags()&fontFlagSymbolic != 0 { + for base, name := range builtinEncodings { + if strings.Contains(font.basefont, base) { + baseName = name + break + } + } + } - if obj == nil { + if font.Encoding == nil { // Fall back to StandardEncoding // This works because the only way BaseEncoding can get overridden is by FontFile entries // and the only encoding names we have seen in FontFile's are StandardEncoding or no entry. return baseName, nil, nil } - switch encoding := obj.(type) { + switch encoding := font.Encoding.(type) { case *core.PdfObjectName: return string(*encoding), nil, nil case *core.PdfObjectDictionary: @@ -328,7 +346,7 @@ func getFontEncoding(obj core.PdfObject) (baseName string, differences map[byte] } return baseName, differences, err default: - common.Log.Debug("ERROR: Encoding not a name or dict (%T) %s", obj, obj.String()) + common.Log.Debug("ERROR: Encoding not a name or dict (%T) %s", font.Encoding, font.Encoding) return "", nil, core.ErrTypeError } } @@ -687,6 +705,11 @@ var standard14Fonts = map[Standard14Font]pdfFontSimple{ }, } +var builtinEncodings = map[string]string{ + "Symbol": "SymbolEncoding", + "ZapfDingbats": "ZapfDingbatsEncoding", +} + // builtinDescriptor returns the PdfFontDescriptor for the builtin font named `baseFont`, or nil if // there is none. func builtinDescriptor(baseFont string) *PdfFontDescriptor { diff --git a/pdf/model/font_test.go b/pdf/model/font_test.go index 8ab2e52a..f4312e1e 100644 --- a/pdf/model/font_test.go +++ b/pdf/model/font_test.go @@ -437,7 +437,7 @@ var charcodeBytesToUnicodeTest = []fontFragmentTest{ fontFragmentTest{"Type1 font with /Encoding with /Differences", "./testdata/font/noise-invariant.txt", 102, []byte{96, 247, 39, 32, 147, 231, 148, 32, 232, 32, 193, 111, 180, 32, 105, 116, - 169, 115, 32, 204, 195, 196, 197, 198, 199, 168, 202, 206, 226, 234, 172, 244, 173, 151, + 169, 115, 32, 204, 195, 196, 197, 198, 199, 168, 202, 206, 226, 234, 172, 245, 173, 151, 177, 151, 178, 179, 183, 185, 188, 205, 184, 189}, "‘ł’ “Ł” Ø `o´ it's ˝ˆ˜¯˘˙¨˚ˇªº‹ı›—–—†‡•„…˛¸‰", }, @@ -533,6 +533,8 @@ func (f *fontFragmentTest) String() string { // CharcodeBytesToUnicode on `data` and checks that output equals `expected`. func (f *fontFragmentTest) check(t *testing.T) { common.Log.Debug("fontFragmentTest: %s", f) + + numObj, err := parsePdfFragment(f.filename) if err != nil { t.Errorf("Failed to parse. %s err=%v", f, err) @@ -551,12 +553,12 @@ func (f *fontFragmentTest) check(t *testing.T) { actualText, numChars, numMisses := font.CharcodeBytesToUnicode(f.data) if numMisses != 0 { - t.Errorf("Some codes not decoded. numMisses=%d", numMisses) + t.Errorf("Some codes not decoded %s. font=%s numMisses=%d", f, font, numMisses) return } if actualText != f.expected { - t.Errorf("Incorrect decoding. %s\nexpected=%q\n actual=%q", - f, f.expected, actualText) + t.Errorf("Incorrect decoding. %s encoding=%s\nexpected=%q\n actual=%q", + f, font.Encoder(), f.expected, actualText) act, exp := []rune(actualText), []rune(f.expected) if len(act) != len(exp) { t.Errorf("\texpected=%d actual=%d", len(exp), len(act)) @@ -568,7 +570,6 @@ func (f *fontFragmentTest) check(t *testing.T) { } } } - } if numChars != len([]rune(actualText)) { t.Errorf("Incorrect numChars. %s numChars=%d expected=%d\n%+v\n%c", @@ -620,21 +621,21 @@ func parsePdfFragment(filename string) (map[int]core.PdfObject, error) { func parsePdfObjects(text string) (map[int]core.PdfObject, error) { numObj := map[int]core.PdfObject{} parser := core.NewParserFromString(text) - common.Log.Debug("parsePdfObjects") + common.Log.Trace("parsePdfObjects") // Build the numObj {object number: object} map nums := []int{} for { obj, err := parser.ParseIndirectObject() - common.Log.Debug("parsePdfObjects: %T %v", obj, err) + common.Log.Trace("parsePdfObjects: %T %v", obj, err) if err != nil { if err == io.EOF { break } - common.Log.Debug("parsePdfObjects: err=%v", err) + common.Log.Trace("parsePdfObjects: err=%v", err) return numObj, err } - common.Log.Debug("parsePdfObjects: %d %T", len(numObj), obj) + common.Log.Trace("parsePdfObjects: %d %T", len(numObj), obj) switch t := obj.(type) { case *core.PdfIndirectObject: numObj[int(t.ObjectNumber)] = obj @@ -645,16 +646,16 @@ func parsePdfObjects(text string) (map[int]core.PdfObject, error) { } } - common.Log.Debug("parsePdfObjects: Parsed %d objects %+v", len(numObj), nums) + common.Log.Trace("parsePdfObjects: Parsed %d objects %+v", len(numObj), nums) // Replace the indirect objects in all dicts and arrays with their values, if they are in numObj. for n, obj := range numObj { - common.Log.Debug("-- 0 %d obj %T", n, obj) + common.Log.Trace("-- 0 %d obj %T", n, obj) iobj, ok := obj.(*core.PdfIndirectObject) if !ok { continue } - common.Log.Debug(" -- %T", iobj.PdfObject) + common.Log.Trace(" -- %T", iobj.PdfObject) iobj.PdfObject, ok = replaceReferences(numObj, iobj.PdfObject) if !ok { common.Log.Debug("ERROR: unresolved reference") @@ -671,7 +672,7 @@ func replaceReferences(numObj map[int]core.PdfObject, obj core.PdfObject) (core. switch t := obj.(type) { case *core.PdfObjectReference: o, ok := numObj[int(t.ObjectNumber)] - common.Log.Debug(" %d 0 R %t ", t.ObjectNumber, ok) + common.Log.Trace(" %d 0 R %t ", t.ObjectNumber, ok) return o, ok case *core.PdfObjectDictionary: for _, k := range t.Keys() {