Add ToUnicode map when embedding Type0 CIDType2 fonts in PDF files.

2025-04-27 13:48:51 +08:00 · 2018-09-17 17:57:52 +10:00 · 2018-09-17 17:57:52 +10:00 · b18c8ca93d
commit b18c8ca93d
parent 38da971f78
6 changed files with 197 additions and 23 deletions
--- a/pdf/internal/cmap/cmap.go
+++ b/pdf/internal/cmap/cmap.go
@ -47,15 +47,30 @@ type CMap struct {
 	nbits      int // 8 bits for simple fonts, 16 bits for CID fonts.
 	ctype      int
 	version    string
-	usecmap    string // Base this cmap on `usecmap` if `usecmap` is not empty
+	usecmap    string // Base this cmap on `usecmap` if `usecmap` is not empty.
 	systemInfo CIDSystemInfo
-	// For regular cmaps
+	// For regular cmaps.
 	codespaces []Codespace
-	// For ToUnicode (ctype 2) cmaps
+	// For ToUnicode (ctype 2) cmaps.
-	codeToUnicode     map[CharCode]string
+	codeToUnicode map[CharCode]string
-	toUnicodeIdentity bool
+}
 // NewToUnicodeCMap builds a ToUnicode (ctype 2) CMap named "Adobe-Identity-UCS" whose
 // code-to-text table is the `codeToUnicode` argument. The map is stored as passed in, not copied.
 // The returned CMap uses 16-bit codes and a single codespace spanning 0x0000-0xffff.
 func NewToUnicodeCMap(codeToUnicode map[CharCode]string) *CMap {
 	info := CIDSystemInfo{Registry: "Adobe", Ordering: "UCS", Supplement: 0}
 	fullRange := Codespace{Low: 0, High: 0xffff}
 	cm := CMap{
 		name:          "Adobe-Identity-UCS",
 		ctype:         2,
 		nbits:         16,
 		systemInfo:    info,
 		codespaces:    []Codespace{fullRange},
 		codeToUnicode: codeToUnicode,
 	}
 	return &cm
 }
 // String returns a human readable description of `cmap`.
@ -100,7 +115,7 @@ func (info *CIDSystemInfo) String() string {
 	return fmt.Sprintf("%s-%s-%03d", info.Registry, info.Ordering, info.Supplement)
 }
-// NewCIDSystemInfo returns the CIDSystemInfo encoded in PDFObject `obj`
+// NewCIDSystemInfo returns the CIDSystemInfo encoded in PDFObject `obj`.
 func NewCIDSystemInfo(obj core.PdfObject) (info CIDSystemInfo, err error) {
 	d, ok := core.GetDict(obj)
 	if !ok {
@ -135,7 +150,7 @@ func (cmap *CMap) Type() int {
 	return cmap.ctype
 }
-// MissingCodeRune replaces runes that can't be decoded. '\ufffd' = <20>. Was '?'
+// MissingCodeRune replaces runes that can't be decoded. '\ufffd' = <20>. Was '?'.
 const MissingCodeRune = textencoding.MissingCodeRune
 // MissingCodeString replaces strings that can't be decoded.
@ -182,8 +197,8 @@ func (cmap *CMap) CharcodeToUnicode(code CharCode) (string, bool) {
 	if s, ok := cmap.codeToUnicode[code]; ok {
 		return s, true
 	}
-	common.Log.Debug("ERROR: CharcodeToUnicode could not convert code=0x%04x. cmap=%s. Returning %q",
+	// common.Log.Debug("ERROR: CharcodeToUnicode could not convert code=0x%04x. cmap=%s. Returning %q",
-		code, cmap, MissingCodeString)
+	// 	code, cmap, MissingCodeString)
 	return MissingCodeString, false
 }
@ -213,7 +228,7 @@ func (cmap *CMap) bytesToCharcodes(data []byte) ([]CharCode, bool) {
 	return charcodes, true
 }
-// matchCode attempts to match the byte array `data` with a character code in `cmap`'s codespaces
+// matchCode attempts to match the byte array `data` with a character code in `cmap`'s codespaces.
 // Returns:
 //      character code (if there is a match) of
 //      number of bytes read (if there is a match)
@ -252,7 +267,7 @@ func LoadCmapFromDataCID(data []byte) (*CMap, error) {
 }
 // LoadCmapFromData parses the in-memory cmap `data` and returns the resulting CMap.
-// If isCID is true then it uses 1-byte encodings, otherwise it uses the codespaces in the cmap.
+// If `isSimple` is true, it uses 1-byte encodings, otherwise it uses the codespaces in the cmap.
 //
 // 9.10.3 ToUnicode CMaps (page 293)
 func LoadCmapFromData(data []byte, isSimple bool) (*CMap, error) {
@ -274,9 +289,132 @@ func LoadCmapFromData(data []byte, isSimple bool) (*CMap, error) {
 		common.Log.Debug("ERROR: No codespaces. cmap=%s", cmap)
 		return nil, ErrBadCMap
 	}
-	// We need to sort codespaces so that we check shorter codes first
+	// We need to sort codespaces so that we check shorter codes first.
 	sort.Slice(cmap.codespaces, func(i, j int) bool {
 		return cmap.codespaces[i].Low < cmap.codespaces[j].Low
 	})
 	return cmap, nil
 }
 // Bytes serializes `cmap` as the text of a PDF CMap: the fixed header, then the bfchar and
 // bfrange sections produced by toBfData, then the fixed trailer, each separated by a newline.
 func (cmap *CMap) Bytes() []byte {
 	common.Log.Trace("cmap.Bytes: cmap=%s", cmap.String())
 	text := cmapHeader + "\n" + cmap.toBfData() + "\n" + cmapTrailer
 	return []byte(text)
 }
 // charRange is a run of consecutive character codes. code0 is the first code of the run and
 // code1 is one past the last code.
 type charRange struct {
 	code0, code1 CharCode
 }
 // fbRange is a run of consecutive character codes together with r0, the rune that the first
 // code of the run maps to. code0 is the first code and code1 is one past the last code.
 type fbRange struct {
 	code0, code1 CharCode
 	r0           rune
 }
 // toBfData returns the bfchar and bfrange sections of a CMap text file.
 // Both sections are computed from cmap.codeToUnicode. Isolated codes are written as bfchar
 // entries and runs of codes are written as bfrange entries, in sections of at most maxBfEntries
 // entries each. Returns "" if there is nothing to write.
 //
 // A run only groups codes that are consecutive AND whose runes are consecutive, because a
 // bfrange entry records just the first destination and the reader derives the rest by
 // incrementing it.
 //
 // NOTE(review): only the first rune of each mapped string is written, and it is formatted as
 // plain hex (not UTF-16BE), as in the previous implementation. Multi-rune strings and runes
 // above 0xffff are therefore not represented faithfully.
 func (cmap *CMap) toBfData() string {
 	if len(cmap.codeToUnicode) == 0 {
 		return ""
 	}
 	// firstRune returns the first rune of the text mapped to `code`. It is only called for codes
 	// whose text is non-empty.
 	firstRune := func(code CharCode) rune {
 		return []rune(cmap.codeToUnicode[code])[0]
 	}
 	// codes is a sorted list of the codeToUnicode keys that map to non-empty text. An empty string
 	// has no first rune to write, so such codes are left out rather than causing a panic.
 	codes := make([]CharCode, 0, len(cmap.codeToUnicode))
 	for code, s := range cmap.codeToUnicode {
 		if s != "" {
 			codes = append(codes, code)
 		}
 	}
 	if len(codes) == 0 {
 		return ""
 	}
 	sort.Slice(codes, func(i, j int) bool { return codes[i] < codes[j] })
 	// charRanges is a list of the runs in codes. c1 and r1 are the code and rune that would extend
 	// the current run. A run is also ended when the low byte of the destination would wrap, since
 	// a bfrange destination may only be incremented in its last byte.
 	var charRanges []charRange
 	c0, c1 := codes[0], codes[0]+1
 	r1 := firstRune(c0) + 1
 	for _, c := range codes[1:] {
 		r := firstRune(c)
 		if c != c1 || r != r1 || r&0xff == 0 {
 			charRanges = append(charRanges, charRange{c0, c1})
 			c0 = c
 		}
 		c1, r1 = c+1, r+1
 	}
 	// The final run is always pending here, including when it holds a single code.
 	charRanges = append(charRanges, charRange{c0, c1})
 	// fbChars is a list of single character ranges. fbRanges is a list of multiple character ranges.
 	var fbChars []CharCode
 	var fbRanges []fbRange
 	for _, cr := range charRanges {
 		if cr.code0+1 == cr.code1 {
 			fbChars = append(fbChars, cr.code0)
 			continue
 		}
 		fbRanges = append(fbRanges, fbRange{
 			code0: cr.code0,
 			code1: cr.code1,
 			r0:    firstRune(cr.code0),
 		})
 	}
 	common.Log.Trace("charRanges=%d fbChars=%d fbRanges=%d", len(charRanges), len(fbChars),
 		len(fbRanges))
 	var lines []string
 	// Write the bfchar entries in sections of at most maxBfEntries.
 	for i := 0; i < len(fbChars); i += maxBfEntries {
 		n := min(len(fbChars)-i, maxBfEntries)
 		lines = append(lines, fmt.Sprintf("%d beginbfchar", n))
 		for _, code := range fbChars[i : i+n] {
 			lines = append(lines, fmt.Sprintf("<%04x> <%04x>", code, firstRune(code)))
 		}
 		lines = append(lines, "endbfchar")
 	}
 	// Write the bfrange entries in sections of at most maxBfEntries. code1 is one past the end of
 	// the run, so the last code written is code1-1.
 	for i := 0; i < len(fbRanges); i += maxBfEntries {
 		n := min(len(fbRanges)-i, maxBfEntries)
 		lines = append(lines, fmt.Sprintf("%d beginbfrange", n))
 		for _, rng := range fbRanges[i : i+n] {
 			lines = append(lines, fmt.Sprintf("<%04x><%04x> <%04x>", rng.code0, rng.code1-1, rng.r0))
 		}
 		lines = append(lines, "endbfrange")
 	}
 	return strings.Join(lines, "\n")
 }
 // Constants used when writing a CMap as text.
 const (
 	// maxBfEntries caps how many entries toBfData writes into one bfchar or bfrange section;
 	// longer lists are split across several sections.
 	maxBfEntries = 100 // Maximum number of entries in a bfchar or bfrange section.
 	// cmapHeader is the fixed text that Bytes writes before the bfchar/bfrange sections. It
 	// declares the Adobe/UCS/0 CIDSystemInfo, the name Adobe-Identity-UCS, CMapType 2 and a
 	// single codespace range <0000> <FFFF>.
 	cmapHeader   = `
 /CIDInit /ProcSet findresource begin
 12 dict begin
 begincmap
 /CIDSystemInfo << /Registry (Adobe) /Ordering (UCS) /Supplement 0 >> def
 /CMapName /Adobe-Identity-UCS def
 /CMapType 2 def
 1 begincodespacerange
 <0000> <FFFF>
 endcodespacerange
 `
 	// cmapTrailer is the fixed text that Bytes writes after the bfchar/bfrange sections. It ends
 	// the cmap and closes the two `begin`s opened in cmapHeader.
 	cmapTrailer = `endcmap
 CMapName currentdict /CMap defineresource pop
 end
 end
 `
 )
 // min returns the smaller of `i` and `j`.
 func min(i, j int) int {
 	if j <= i {
 		return j
 	}
 	return i
 }
--- a/pdf/internal/cmap/cmap_parser.go
+++ b/pdf/internal/cmap/cmap_parser.go
@ -416,7 +416,7 @@ func (cmap *CMap) parseBfchar() error {
 	return nil
 }
-// parseBfrange parses a c section of a CMap file.
+// parseBfrange parses a bfrange section of a CMap file.
 func (cmap *CMap) parseBfrange() error {
 	for {
 		// The specifications are in triplets.
--- a/pdf/model/font.go
+++ b/pdf/model/font.go
@ -448,12 +448,12 @@ type fontCommon struct {
 	basefont string // The font's "BaseFont" field.
 	subtype  string // The font's "Subtype" field.
-	// These are optional fields in the PDF font
+	// These are optional fields in the PDF font.
 	toUnicode core.PdfObject // The stream containing toUnicodeCmap. We keep it around for ToPdfObject.
 	// These objects are computed from optional fields in the PDF font.
-	toUnicodeCmap  *cmap.CMap         // Computed from "ToUnicode"
+	toUnicodeCmap  *cmap.CMap         // Computed from "ToUnicode".
-	fontDescriptor *PdfFontDescriptor // Computed from "FontDescriptor"
+	fontDescriptor *PdfFontDescriptor // Computed from "FontDescriptor".
 	// objectNumber helps us find the font in the PDF being processed. This helps with debugging.
 	objectNumber int64
@ -482,6 +482,14 @@ func (base fontCommon) asPdfObjectDictionary(subtype string) *core.PdfObjectDict
 	}
 	if base.toUnicode != nil {
 		d.Set("ToUnicode", base.toUnicode)
 	} else if base.toUnicodeCmap != nil {
 		data := base.toUnicodeCmap.Bytes()
 		o, err := core.MakeStream(data, nil)
 		if err != nil {
 			common.Log.Debug("MakeStream failed. err=%v", err)
 		} else {
 			d.Set("ToUnicode", o)
 		}
 	}
 	return d
 }
@ -584,7 +592,7 @@ func newFontBaseFieldsFromPdfObject(fontObj core.PdfObject) (*core.PdfObjectDict
 	return d, font, nil
 }
-// toUnicodeToCmap returns a CMap of `toUnicode` if it exists
+// toUnicodeToCmap returns a CMap of `toUnicode` if it exists.
 func toUnicodeToCmap(toUnicode core.PdfObject, font *fontCommon) (*cmap.CMap, error) {
 	toUnicodeStream, ok := toUnicode.(*core.PdfObjectStream)
 	if !ok {
@ -673,7 +681,7 @@ func (descriptor *PdfFontDescriptor) String() string {
 	}
 	parts = append(parts, fmt.Sprintf("FontFile3=%t", descriptor.FontFile3 != nil))
-	return fmt.Sprintf("FONT_DESCRIPTON{%s}", strings.Join(parts, ", "))
+	return fmt.Sprintf("FONT_DESCRIPTOR{%s}", strings.Join(parts, ", "))
 }
 // newPdfFontDescriptorFromPdfObject loads the font descriptor from a core.PdfObject.  Can either be a
--- a/pdf/model/font_composite.go
+++ b/pdf/model/font_composite.go
@ -197,7 +197,8 @@ type pdfCIDFontType0 struct {
 	encoder textencoding.TextEncoder
 	// Table 117 – Entries in a CIDFont dictionary (page 269)
-	CIDSystemInfo *core.PdfObjectDictionary // (Required) Dictionary that defines the character collection of the CIDFont. See Table 116.
+	CIDSystemInfo *core.PdfObjectDictionary // (Required) Dictionary that defines the character
 	// collection of the CIDFont. See Table 116.
 }
 // pdfCIDFontType0FromSkeleton returns a pdfCIDFontType0 with its common fields initialized.
@ -528,6 +529,8 @@ func NewCompositePdfFontFromTTFFile(filePath string) (*PdfFont, error) {
 		encoder:  textencoding.NewTrueTypeFontEncoder(ttf.Chars),
 	}
 	type0.toUnicodeCmap = ttf.MakeToUnicode()
 	// Build Font.
 	font := PdfFont{
 		context: &type0,
--- a/pdf/model/fonts/ttfparser.go
+++ b/pdf/model/fonts/ttfparser.go
@ -40,6 +40,7 @@ import (
 	"github.com/unidoc/unidoc/common"
 	"github.com/unidoc/unidoc/pdf/core"
 	"github.com/unidoc/unidoc/pdf/internal/cmap"
 	"github.com/unidoc/unidoc/pdf/model/textencoding"
 )
@ -83,13 +84,32 @@ type TtfType struct {
 	CapHeight              int16
 	Widths                 []uint16
-	// Chars maps rune values (unicode) to the indexes in GlyphNames. i.e GlyphNames[Chars[r]] is
+	// Chars maps rune values (unicode) to the indexes in GlyphNames. i.e. GlyphNames[Chars[r]] is
 	// the glyph corresponding to rune r.
 	Chars map[uint16]uint16
 	// GlyphNames is a list of glyphs from the "post" section of the TrueType file.
 	GlyphNames []string
 }
 // MakeToUnicode returns a ToUnicode CMap based on the encoding of `ttf`.
 // Each key of ttf.Chars becomes a character code. Its text is the rune that
 // textencoding.GlyphToRune gives for the glyph name at the matching ttf.GlyphNames index, or
 // textencoding.MissingCodeRune when the glyph name has no rune. Entries whose glyph index lies
 // outside ttf.GlyphNames are skipped, so a font with missing or short glyph names cannot cause
 // an index-out-of-range panic.
 // XXX: This currently gives a bad text mapping for creator_test.go but leads to an otherwise
 // valid PDF file that Adobe Reader displays without error.
 func (ttf *TtfType) MakeToUnicode() *cmap.CMap {
 	codeToUnicode := make(map[cmap.CharCode]string, len(ttf.Chars))
 	for code, idx := range ttf.Chars {
 		if int(idx) >= len(ttf.GlyphNames) {
 			// There is no glyph name to look up, so leave this code unmapped.
 			common.Log.Debug("No glyph name. code=0x%04x idx=%d GlyphNames=%d",
 				code, idx, len(ttf.GlyphNames))
 			continue
 		}
 		glyph := ttf.GlyphNames[idx]
 		r, ok := textencoding.GlyphToRune(glyph)
 		if !ok {
 			common.Log.Debug("No rune. code=0x%04x glyph=%q", code, glyph)
 			r = textencoding.MissingCodeRune
 		}
 		codeToUnicode[cmap.CharCode(code)] = string(r)
 	}
 	return cmap.NewToUnicodeCMap(codeToUnicode)
 }
 // String returns a human readable representation of `ttf`.
 func (ttf *TtfType) String() string {
 	return fmt.Sprintf("FONT_FILE2{%#q Embeddable=%t UnitsPerEm=%d Bold=%t ItalicAngle=%f "+
 		"CapHeight=%d Chars=%d GlyphNames=%d}",
@ -420,6 +440,8 @@ func (t *ttfParser) ParseCmap() error {
 		if platformID == 3 && encodingID == 1 {
 			// (3,1) subtable. Windows Unicode.
 			offset31 = offset
 		} else if platformID == 1 && encodingID == 0 {
 			offset10 = offset
 		}
 	}
@ -436,6 +458,9 @@ func (t *ttfParser) ParseCmap() error {
 			return err
 		}
 	}
 	if offset31 == 0 && offset10 == 0 {
 		common.Log.Debug("ttfParser.ParseCmap. No 31 or 10 table.")
 	}
 	return nil
 }
@ -516,11 +541,11 @@ func (t *ttfParser) parseCmapFormat12() error {
 		startGlyph := t.ReadULong()
 		if firstCode > 0x0010FFFF || (0xD800 <= firstCode && firstCode <= 0xDFFF) {
-			return errors.New("Invalid characters codes")
+			return errors.New("invalid characters codes")
 		}
 		if endCode < firstCode || endCode > 0x0010FFFF || (0xD800 <= endCode && endCode <= 0xDFFF) {
-			return errors.New("Invalid characters codes")
+			return errors.New("invalid characters codes")
 		}
 		for j := uint32(0); j <= endCode-firstCode; j++ {
--- a/pdf/model/textencoding/truetype.go
+++ b/pdf/model/textencoding/truetype.go
@ -36,7 +36,7 @@ func NewTrueTypeFontEncoder(runeToGlyphIndexMap map[uint16]uint16) TrueTypeFontE
 }
 // ttEncoderNumEntries is the maximum number of encoding entries shown in TrueTypeFontEncoder.String()
-const ttEncoderNumEntries = 1000
+const ttEncoderNumEntries = 10
 // String returns a string that describes `enc`.
 func (enc TrueTypeFontEncoder) String() string {