From b18c8ca93d360e3238f480f869046aaeb7155e29 Mon Sep 17 00:00:00 2001 From: Peter Williams Date: Mon, 17 Sep 2018 17:57:52 +1000 Subject: [PATCH] Add ToUnicode map when embedding Type0 CIDType2 fonts in PDF files. --- pdf/internal/cmap/cmap.go | 162 ++++++++++++++++++++++++++--- pdf/internal/cmap/cmap_parser.go | 2 +- pdf/model/font.go | 18 +++- pdf/model/font_composite.go | 5 +- pdf/model/fonts/ttfparser.go | 31 +++++- pdf/model/textencoding/truetype.go | 2 +- 6 files changed, 197 insertions(+), 23 deletions(-) diff --git a/pdf/internal/cmap/cmap.go b/pdf/internal/cmap/cmap.go index 3a511fe8..763f0db1 100644 --- a/pdf/internal/cmap/cmap.go +++ b/pdf/internal/cmap/cmap.go @@ -47,15 +47,30 @@ type CMap struct { nbits int // 8 bits for simple fonts, 16 bits for CID fonts. ctype int version string - usecmap string // Base this cmap on `usecmap` if `usecmap` is not empty + usecmap string // Base this cmap on `usecmap` if `usecmap` is not empty. systemInfo CIDSystemInfo - // For regular cmaps + // For regular cmaps. codespaces []Codespace - // For ToUnicode (ctype 2) cmaps - codeToUnicode map[CharCode]string - toUnicodeIdentity bool + // For ToUnicode (ctype 2) cmaps. + codeToUnicode map[CharCode]string +} + +// NewToUnicodeCMap returns an identity CMap with codeToUnicode matching the `codeToUnicode` arg. +func NewToUnicodeCMap(codeToUnicode map[CharCode]string) *CMap { + return &CMap{ + name: "Adobe-Identity-UCS", + ctype: 2, + nbits: 16, + systemInfo: CIDSystemInfo{ + Registry: "Adobe", + Ordering: "UCS", + Supplement: 0, + }, + codespaces: []Codespace{Codespace{Low: 0, High: 0xffff}}, + codeToUnicode: codeToUnicode, + } } // String returns a human readable description of `cmap`. 
@@ -100,7 +115,7 @@ func (info *CIDSystemInfo) String() string { return fmt.Sprintf("%s-%s-%03d", info.Registry, info.Ordering, info.Supplement) } -// NewCIDSystemInfo returns the CIDSystemInfo encoded in PDFObject `obj` +// NewCIDSystemInfo returns the CIDSystemInfo encoded in PDFObject `obj`. func NewCIDSystemInfo(obj core.PdfObject) (info CIDSystemInfo, err error) { d, ok := core.GetDict(obj) if !ok { @@ -135,7 +150,7 @@ func (cmap *CMap) Type() int { return cmap.ctype } -// MissingCodeRune replaces runes that can't be decoded. '\ufffd' = �. Was '?' +// MissingCodeRune replaces runes that can't be decoded. '\ufffd' = �. Was '?'. const MissingCodeRune = textencoding.MissingCodeRune // MissingCodeString replaces strings that can't be decoded. @@ -182,8 +197,8 @@ func (cmap *CMap) CharcodeToUnicode(code CharCode) (string, bool) { if s, ok := cmap.codeToUnicode[code]; ok { return s, true } - common.Log.Debug("ERROR: CharcodeToUnicode could not convert code=0x%04x. cmap=%s. Returning %q", - code, cmap, MissingCodeString) + // common.Log.Debug("ERROR: CharcodeToUnicode could not convert code=0x%04x. cmap=%s. Returning %q", + // code, cmap, MissingCodeString) return MissingCodeString, false } @@ -213,7 +228,7 @@ func (cmap *CMap) bytesToCharcodes(data []byte) ([]CharCode, bool) { return charcodes, true } -// matchCode attempts to match the byte array `data` with a character code in `cmap`'s codespaces +// matchCode attempts to match the byte array `data` with a character code in `cmap`'s codespaces. // Returns: // character code (if there is a match) of // number of bytes read (if there is a match) @@ -252,7 +267,7 @@ func LoadCmapFromDataCID(data []byte) (*CMap, error) { } // LoadCmapFromData parses the in-memory cmap `data` and returns the resulting CMap. -// If isCID is true then it uses 1-byte encodings, otherwise it uses the codespaces in the cmap. +// If `isSimple` is true, it uses 1-byte encodings, otherwise it uses the codespaces in the cmap. 
// // 9.10.3 ToUnicode CMaps (page 293) func LoadCmapFromData(data []byte, isSimple bool) (*CMap, error) { @@ -274,9 +289,132 @@ func LoadCmapFromData(data []byte, isSimple bool) (*CMap, error) { common.Log.Debug("ERROR: No codespaces. cmap=%s", cmap) return nil, ErrBadCMap } - // We need to sort codespaces so that we check shorter codes first + // We need to sort codespaces so that we check shorter codes first. sort.Slice(cmap.codespaces, func(i, j int) bool { return cmap.codespaces[i].Low < cmap.codespaces[j].Low }) return cmap, nil } + +// Bytes returns the raw bytes of a PDF CMap corresponding to `cmap`. +func (cmap *CMap) Bytes() []byte { + common.Log.Trace("cmap.Bytes: cmap=%s", cmap.String()) + body := cmap.toBfData() + whole := strings.Join([]string{cmapHeader, body, cmapTrailer}, "\n") + return []byte(whole) +} + +type ( + charRange struct { + code0 CharCode + code1 CharCode + } + fbRange struct { + code0 CharCode + code1 CharCode + r0 rune + } +) + +// toBfData returns the bfchar and bfrange sections of a CMap text file. +// Both sections are computed from cmap.codeToUnicode. +func (cmap *CMap) toBfData() string { + if len(cmap.codeToUnicode) == 0 { + return "" + } + + // codes is a sorted list of the codeToUnicode keys. + codes := []CharCode{} + for code := range cmap.codeToUnicode { + codes = append(codes, code) + } + sort.Slice(codes, func(i, j int) bool { return codes[i] < codes[j] }) + + // charRanges is a list of the contiguous character code ranges in codes. + charRanges := []charRange{} + c0, c1 := codes[0], codes[0]+1 + for _, c := range codes[1:] { + if c != c1 { + charRanges = append(charRanges, charRange{c0, c1}) + c0 = c + } + c1 = c + 1 + } + // The last range is still pending when the loop ends. Flush it unconditionally: skipping it + // when it holds a single code would silently drop that code from the CMap. + charRanges = append(charRanges, charRange{c0, c1}) + + // fbChars is a list of single character ranges. fbRanges is a list of multiple character ranges. 
+ fbChars := []CharCode{} + fbRanges := []fbRange{} + for _, cr := range charRanges { + if cr.code0+1 == cr.code1 { + fbChars = append(fbChars, cr.code0) + } else { + fbRanges = append(fbRanges, fbRange{ + code0: cr.code0, + code1: cr.code1, + r0: []rune(cmap.codeToUnicode[cr.code0])[0], + }) + } + } + common.Log.Trace("charRanges=%d fbChars=%d fbRanges=%d", len(charRanges), len(fbChars), + len(fbRanges)) + + lines := []string{} + if len(fbChars) > 0 { + numRanges := (len(fbChars) + maxBfEntries - 1) / maxBfEntries + for i := 0; i < numRanges; i++ { + n := min(len(fbChars)-i*maxBfEntries, maxBfEntries) + lines = append(lines, fmt.Sprintf("%d beginbfchar", n)) + for j := 0; j < n; j++ { + code := fbChars[i*maxBfEntries+j] + s := cmap.codeToUnicode[code] + r := []rune(s)[0] + lines = append(lines, fmt.Sprintf("<%04x> <%04x>", code, r)) + } + lines = append(lines, "endbfchar") + } + } + if len(fbRanges) > 0 { + numRanges := (len(fbRanges) + maxBfEntries - 1) / maxBfEntries + for i := 0; i < numRanges; i++ { + n := min(len(fbRanges)-i*maxBfEntries, maxBfEntries) + lines = append(lines, fmt.Sprintf("%d beginbfrange", n)) + for j := 0; j < n; j++ { + rng := fbRanges[i*maxBfEntries+j] + r := rng.r0 + lines = append(lines, fmt.Sprintf("<%04x><%04x> <%04x>", rng.code0, rng.code1-1, r)) + } + lines = append(lines, "endbfrange") + } + } + return strings.Join(lines, "\n") +} + +const ( + maxBfEntries = 100 // Maximum number of entries in a bfchar or bfrange section. 
+ cmapHeader = ` +/CIDInit /ProcSet findresource begin +12 dict begin +begincmap +/CIDSystemInfo << /Registry (Adobe) /Ordering (UCS) /Supplement 0 >> def +/CMapName /Adobe-Identity-UCS def +/CMapType 2 def +1 begincodespacerange +<0000> <ffff> +endcodespacerange +` + cmapTrailer = `endcmap +CMapName currentdict /CMap defineresource pop +end +end +` +) + +func min(i, j int) int { + if i < j { + return i + } + return j +} diff --git a/pdf/internal/cmap/cmap_parser.go b/pdf/internal/cmap/cmap_parser.go index 52ac2893..d8489ac4 100644 --- a/pdf/internal/cmap/cmap_parser.go +++ b/pdf/internal/cmap/cmap_parser.go @@ -416,7 +416,7 @@ func (cmap *CMap) parseBfchar() error { return nil } -// parseBfrange parses a c section of a CMap file. +// parseBfrange parses a bfrange section of a CMap file. func (cmap *CMap) parseBfrange() error { for { // The specifications are in triplets. diff --git a/pdf/model/font.go b/pdf/model/font.go index eb198630..712b7238 100644 --- a/pdf/model/font.go +++ b/pdf/model/font.go @@ -448,12 +448,12 @@ type fontCommon struct { basefont string // The font's "BaseFont" field. subtype string // The font's "Subtype" field. - // These are optional fields in the PDF font + // These are optional fields in the PDF font. toUnicode core.PdfObject // The stream containing toUnicodeCmap. We keep it around for ToPdfObject. // These objects are computed from optional fields in the PDF font. - toUnicodeCmap *cmap.CMap // Computed from "ToUnicode" - fontDescriptor *PdfFontDescriptor // Computed from "FontDescriptor" + toUnicodeCmap *cmap.CMap // Computed from "ToUnicode". + fontDescriptor *PdfFontDescriptor // Computed from "FontDescriptor". // objectNumber helps us find the font in the PDF being processed. This helps with debugging. 
objectNumber int64 @@ -482,6 +482,14 @@ func (base fontCommon) asPdfObjectDictionary(subtype string) *core.PdfObjectDict } if base.toUnicode != nil { d.Set("ToUnicode", base.toUnicode) + } else if base.toUnicodeCmap != nil { + data := base.toUnicodeCmap.Bytes() + o, err := core.MakeStream(data, nil) + if err != nil { + common.Log.Debug("MakeStream failed. err=%v", err) + } else { + d.Set("ToUnicode", o) + } } return d } @@ -584,7 +592,7 @@ func newFontBaseFieldsFromPdfObject(fontObj core.PdfObject) (*core.PdfObjectDict return d, font, nil } -// toUnicodeToCmap returns a CMap of `toUnicode` if it exists +// toUnicodeToCmap returns a CMap of `toUnicode` if it exists. func toUnicodeToCmap(toUnicode core.PdfObject, font *fontCommon) (*cmap.CMap, error) { toUnicodeStream, ok := toUnicode.(*core.PdfObjectStream) if !ok { @@ -673,7 +681,7 @@ func (descriptor *PdfFontDescriptor) String() string { } parts = append(parts, fmt.Sprintf("FontFile3=%t", descriptor.FontFile3 != nil)) - return fmt.Sprintf("FONT_DESCRIPTON{%s}", strings.Join(parts, ", ")) + return fmt.Sprintf("FONT_DESCRIPTOR{%s}", strings.Join(parts, ", ")) } // newPdfFontDescriptorFromPdfObject loads the font descriptor from a core.PdfObject. Can either be a diff --git a/pdf/model/font_composite.go b/pdf/model/font_composite.go index 6ad4eb65..5aa5e75d 100644 --- a/pdf/model/font_composite.go +++ b/pdf/model/font_composite.go @@ -197,7 +197,8 @@ type pdfCIDFontType0 struct { encoder textencoding.TextEncoder // Table 117 – Entries in a CIDFont dictionary (page 269) - CIDSystemInfo *core.PdfObjectDictionary // (Required) Dictionary that defines the character collection of the CIDFont. See Table 116. + CIDSystemInfo *core.PdfObjectDictionary // (Required) Dictionary that defines the character + // collection of the CIDFont. See Table 116. } // pdfCIDFontType0FromSkeleton returns a pdfCIDFontType0 with its common fields initalized. 
@@ -528,6 +529,8 @@ func NewCompositePdfFontFromTTFFile(filePath string) (*PdfFont, error) { encoder: textencoding.NewTrueTypeFontEncoder(ttf.Chars), } + type0.toUnicodeCmap = ttf.MakeToUnicode() + // Build Font. font := PdfFont{ context: &type0, diff --git a/pdf/model/fonts/ttfparser.go b/pdf/model/fonts/ttfparser.go index fc7824b5..e6773f49 100644 --- a/pdf/model/fonts/ttfparser.go +++ b/pdf/model/fonts/ttfparser.go @@ -40,6 +40,7 @@ import ( "github.com/unidoc/unidoc/common" "github.com/unidoc/unidoc/pdf/core" + "github.com/unidoc/unidoc/pdf/internal/cmap" "github.com/unidoc/unidoc/pdf/model/textencoding" ) @@ -83,13 +84,32 @@ type TtfType struct { CapHeight int16 Widths []uint16 - // Chars maps rune values (unicode) to the indexes in GlyphNames. i.e GlyphNames[Chars[r]] is + // Chars maps rune values (unicode) to the indexes in GlyphNames. i.e. GlyphNames[Chars[r]] is // the glyph corresponding to rune r. Chars map[uint16]uint16 // GlyphNames is a list of glyphs from the "post" section of the TrueType file. GlyphNames []string } +// MakeToUnicode returns a ToUnicode CMap based on the encoding of `ttf`. +// XX: This currently gives a bad text mapping for creator_test.go but leads to an otherwise +// valid PDF file that Adobe Reader displays without error. +func (ttf *TtfType) MakeToUnicode() *cmap.CMap { + codeToUnicode := map[cmap.CharCode]string{} + for code, idx := range ttf.Chars { + glyph := ttf.GlyphNames[idx] + + r, ok := textencoding.GlyphToRune(glyph) + if !ok { + common.Log.Debug("No rune. code=0x%04x glyph=%q", code, glyph) + r = textencoding.MissingCodeRune + } + codeToUnicode[cmap.CharCode(code)] = string(r) + } + return cmap.NewToUnicodeCMap(codeToUnicode) +} + +// String returns a human readable representation of `ttf`. 
func (ttf *TtfType) String() string { return fmt.Sprintf("FONT_FILE2{%#q Embeddable=%t UnitsPerEm=%d Bold=%t ItalicAngle=%f "+ "CapHeight=%d Chars=%d GlyphNames=%d}", @@ -420,6 +440,8 @@ func (t *ttfParser) ParseCmap() error { if platformID == 3 && encodingID == 1 { // (3,1) subtable. Windows Unicode. offset31 = offset + } else if platformID == 1 && encodingID == 0 { + offset10 = offset } } @@ -436,6 +458,9 @@ func (t *ttfParser) ParseCmap() error { return err } } + if offset31 == 0 && offset10 == 0 { + common.Log.Debug("ttfParser.ParseCmap. No 31 or 10 table.") + } return nil } @@ -516,11 +541,11 @@ func (t *ttfParser) parseCmapFormat12() error { startGlyph := t.ReadULong() if firstCode > 0x0010FFFF || (0xD800 <= firstCode && firstCode <= 0xDFFF) { - return errors.New("Invalid characters codes") + return errors.New("invalid characters codes") } if endCode < firstCode || endCode > 0x0010FFFF || (0xD800 <= endCode && endCode <= 0xDFFF) { - return errors.New("Invalid characters codes") + return errors.New("invalid characters codes") } for j := uint32(0); j <= endCode-firstCode; j++ { diff --git a/pdf/model/textencoding/truetype.go b/pdf/model/textencoding/truetype.go index b4e57af2..cd109403 100644 --- a/pdf/model/textencoding/truetype.go +++ b/pdf/model/textencoding/truetype.go @@ -36,7 +36,7 @@ func NewTrueTypeFontEncoder(runeToGlyphIndexMap map[uint16]uint16) TrueTypeFontE } // ttEncoderNumEntries is the maximum number of encoding entries shown in SimpleEncoder.String() -const ttEncoderNumEntries = 1000 +const ttEncoderNumEntries = 10 // String returns a string that describes `enc`. func (enc TrueTypeFontEncoder) String() string {