Add ToUnicode map when embedding Type0 CIDType2 fonts in PDF files.

2025-04-27 13:48:51 +08:00 · 2018-09-17 17:57:52 +10:00 · 2018-09-17 17:57:52 +10:00 · b18c8ca93d
commit b18c8ca93d
parent 38da971f78
6 changed files with 197 additions and 23 deletions
--- a/pdf/internal/cmap/cmap.go
+++ b/pdf/internal/cmap/cmap.go
@@ -47,15 +47,30 @@ type CMap struct {
 	nbits      int // 8 bits for simple fonts, 16 bits for CID fonts.
 	ctype      int
 	version    string
-	usecmap    string // Base this cmap on `usecmap` if `usecmap` is not empty
+	usecmap    string // Base this cmap on `usecmap` if `usecmap` is not empty.
 	systemInfo CIDSystemInfo

-	// For regular cmaps
+	// For regular cmaps.
 	codespaces []Codespace

-	// For ToUnicode (ctype 2) cmaps
+	// For ToUnicode (ctype 2) cmaps.
 	codeToUnicode map[CharCode]string
-	toUnicodeIdentity bool
+}
+
+// NewToUnicodeCMap builds a 16-bit ToUnicode (ctype 2) CMap named "Adobe-Identity-UCS" that has a
+// single <0000>-<ffff> codespace. The `codeToUnicode` arg is stored as is; it is not copied.
+func NewToUnicodeCMap(codeToUnicode map[CharCode]string) *CMap {
+	toUnicode := CMap{
+		name:          "Adobe-Identity-UCS",
+		ctype:         2,
+		nbits:         16,
+		systemInfo:    CIDSystemInfo{Registry: "Adobe", Ordering: "UCS", Supplement: 0},
+		codespaces:    []Codespace{{Low: 0, High: 0xffff}},
+		codeToUnicode: codeToUnicode,
+	}
+	return &toUnicode
 }

 // String returns a human readable description of `cmap`.
@@ -100,7 +115,7 @@ func (info *CIDSystemInfo) String() string {
 	return fmt.Sprintf("%s-%s-%03d", info.Registry, info.Ordering, info.Supplement)
 }

-// NewCIDSystemInfo returns the CIDSystemInfo encoded in PDFObject `obj`
+// NewCIDSystemInfo returns the CIDSystemInfo encoded in PDFObject `obj`.
 func NewCIDSystemInfo(obj core.PdfObject) (info CIDSystemInfo, err error) {
 	d, ok := core.GetDict(obj)
 	if !ok {
@@ -135,7 +150,7 @@ func (cmap *CMap) Type() int {
 	return cmap.ctype
 }

-// MissingCodeRune replaces runes that can't be decoded. '\ufffd' = <20>. Was '?'
+// MissingCodeRune replaces runes that can't be decoded. '\ufffd' = <20>. Was '?'.
 const MissingCodeRune = textencoding.MissingCodeRune

 // MissingCodeString replaces strings that can't be decoded.
@@ -182,8 +197,8 @@ func (cmap *CMap) CharcodeToUnicode(code CharCode) (string, bool) {
 	if s, ok := cmap.codeToUnicode[code]; ok {
 		return s, true
 	}
-	common.Log.Debug("ERROR: CharcodeToUnicode could not convert code=0x%04x. cmap=%s. Returning %q",
-		code, cmap, MissingCodeString)
+	// common.Log.Debug("ERROR: CharcodeToUnicode could not convert code=0x%04x. cmap=%s. Returning %q",
+	// 	code, cmap, MissingCodeString)
 	return MissingCodeString, false
 }

@@ -213,7 +228,7 @@ func (cmap *CMap) bytesToCharcodes(data []byte) ([]CharCode, bool) {
 	return charcodes, true
 }

-// matchCode attempts to match the byte array `data` with a character code in `cmap`'s codespaces
+// matchCode attempts to match the byte array `data` with a character code in `cmap`'s codespaces.
 // Returns:
 //      character code (if there is a match) of
 //      number of bytes read (if there is a match)
@@ -252,7 +267,7 @@ func LoadCmapFromDataCID(data []byte) (*CMap, error) {
 }

 // LoadCmapFromData parses the in-memory cmap `data` and returns the resulting CMap.
-// If isCID is true then it uses 1-byte encodings, otherwise it uses the codespaces in the cmap.
+// If `isSimple` is true, it uses 1-byte encodings, otherwise it uses the codespaces in the cmap.
 //
 // 9.10.3 ToUnicode CMaps (page 293)
 func LoadCmapFromData(data []byte, isSimple bool) (*CMap, error) {
@@ -274,9 +289,132 @@ func LoadCmapFromData(data []byte, isSimple bool) (*CMap, error) {
 		common.Log.Debug("ERROR: No codespaces. cmap=%s", cmap)
 		return nil, ErrBadCMap
 	}
-	// We need to sort codespaces so that we check shorter codes first
+	// We need to sort codespaces so that we check shorter codes first.
 	sort.Slice(cmap.codespaces, func(i, j int) bool {
 		return cmap.codespaces[i].Low < cmap.codespaces[j].Low
 	})
 	return cmap, nil
 }
+
+// Bytes serializes `cmap` as the text of a PDF CMap: cmapHeader, the bfchar/bfrange data built
+// from cmap.codeToUnicode and cmapTrailer, joined by newlines.
+func (cmap *CMap) Bytes() []byte {
+	common.Log.Trace("cmap.Bytes: cmap=%s", cmap.String())
+	sections := []string{cmapHeader, cmap.toBfData(), cmapTrailer}
+	return []byte(strings.Join(sections, "\n"))
+}
+
+type (
+	charRange struct { // charRange is a run of consecutive character codes, [code0, code1).
+		code0 CharCode // First code in the run.
+		code1 CharCode // One past the last code in the run.
+	}
+	fbRange struct { // fbRange is the data for one bfrange entry of a CMap text file.
+		code0 CharCode // First code in the run.
+		code1 CharCode // One past the last code in the run.
+		r0    rune     // First rune of the string that code0 maps to.
+	}
+)
+
+// toBfData returns the bfchar and bfrange sections of a CMap text file.
+// Both sections are computed from cmap.codeToUnicode. A code maps to the first rune of its
+// string, or to MissingCodeRune if that string is empty. Runs of 2 or more consecutive codes that
+// map to consecutive runes are written as bfrange entries; every other code is written as a
+// bfchar entry. Sections are split so that none has more than maxBfEntries entries.
+// Returns "" if cmap.codeToUnicode is empty.
+func (cmap *CMap) toBfData() string {
+	if len(cmap.codeToUnicode) == 0 {
+		return ""
+	}
+
+	// runeOf returns the rune that is written for `code`. It never indexes an empty string.
+	runeOf := func(code CharCode) rune {
+		if runes := []rune(cmap.codeToUnicode[code]); len(runes) > 0 {
+			return runes[0]
+		}
+		return MissingCodeRune
+	}
+
+	// hexRune returns the hex digits of the UTF-16BE encoding of `r`, which is the form of a
+	// ToUnicode destination. Runes above 0xffff are written as a surrogate pair.
+	hexRune := func(r rune) string {
+		if r < 0x10000 {
+			return fmt.Sprintf("%04x", r)
+		}
+		r -= 0x10000
+		return fmt.Sprintf("%04x%04x", 0xd800+(r>>10), 0xdc00+(r&0x3ff))
+	}
+
+	// codes is a sorted list of the codeToUnicode keys.
+	codes := make([]CharCode, 0, len(cmap.codeToUnicode))
+	for code := range cmap.codeToUnicode {
+		codes = append(codes, code)
+	}
+	sort.Slice(codes, func(i, j int) bool { return codes[i] < codes[j] })
+
+	// charRanges is a list of the half-open [code0, code1) runs in `codes`. A bfrange entry adds 1
+	// to the last byte of its destination for each successive code, so a run is only extended
+	// while the codes are consecutive, the runes are consecutive and the low byte of the rune
+	// does not wrap around to 0.
+	charRanges := []charRange{}
+	c0, c1 := codes[0], codes[0]+1
+	for _, c := range codes[1:] {
+		r := runeOf(c)
+		// c-1 is only looked up when c == c1, i.e. when c-1 is the previous entry of `codes`.
+		if c != c1 || r != runeOf(c-1)+1 || r&0xff == 0 {
+			charRanges = append(charRanges, charRange{c0, c1})
+			c0 = c
+		}
+		c1 = c + 1
+	}
+	// The run that is open when the loop ends has not been stored yet. It must be stored even if
+	// it is a single code, otherwise the last code (or the only code) is lost.
+	charRanges = append(charRanges, charRange{c0, c1})
+
+	// fbChars is a list of single character ranges. fbRanges is a list of multiple character ranges.
+	fbChars := []CharCode{}
+	fbRanges := []fbRange{}
+	for _, cr := range charRanges {
+		if cr.code0+1 == cr.code1 {
+			fbChars = append(fbChars, cr.code0)
+		} else {
+			fbRanges = append(fbRanges, fbRange{
+				code0: cr.code0,
+				code1: cr.code1,
+				r0:    runeOf(cr.code0),
+			})
+		}
+	}
+	common.Log.Trace("charRanges=%d fbChars=%d fbRanges=%d", len(charRanges), len(fbChars),
+		len(fbRanges))
+
+	lines := []string{}
+	for i := 0; i < len(fbChars); i += maxBfEntries {
+		n := min(len(fbChars)-i, maxBfEntries)
+		lines = append(lines, fmt.Sprintf("%d beginbfchar", n))
+		for _, code := range fbChars[i : i+n] {
+			lines = append(lines, fmt.Sprintf("<%04x> <%s>", code, hexRune(runeOf(code))))
+		}
+		lines = append(lines, "endbfchar")
+	}
+	for i := 0; i < len(fbRanges); i += maxBfEntries {
+		n := min(len(fbRanges)-i, maxBfEntries)
+		lines = append(lines, fmt.Sprintf("%d beginbfrange", n))
+		for _, rng := range fbRanges[i : i+n] {
+			// code1 is one past the end of the run but a bfrange entry names its last code.
+			lines = append(lines, fmt.Sprintf("<%04x><%04x> <%s>", rng.code0, rng.code1-1,
+				hexRune(rng.r0)))
+		}
+		lines = append(lines, "endbfrange")
+	}
+	return strings.Join(lines, "\n")
+}
+
+const ( // Limits and fixed text used by Bytes and toBfData to write a CMap.
+	maxBfEntries = 100 // Maximum number of entries written in a single bfchar or bfrange section.
+	cmapHeader   = `
+/CIDInit /ProcSet findresource begin
+12 dict begin
+begincmap
+/CIDSystemInfo << /Registry (Adobe) /Ordering (UCS) /Supplement 0 >> def
+/CMapName /Adobe-Identity-UCS def
+/CMapType 2 def
+1 begincodespacerange
+<0000> <FFFF>
+endcodespacerange
+` // cmapHeader is written before the bfchar/bfrange data. It declares the <0000>-<FFFF> codespace.
+	cmapTrailer = `endcmap
+CMapName currentdict /CMap defineresource pop
+end
+end
+` // cmapTrailer is written after the bfchar/bfrange data.
+)
+
+// min returns the smaller of `i` and `j`.
+func min(i, j int) int {
+	if j <= i {
+		return j
+	}
+	return i
+}
--- a/pdf/internal/cmap/cmap_parser.go
+++ b/pdf/internal/cmap/cmap_parser.go
@@ -416,7 +416,7 @@ func (cmap *CMap) parseBfchar() error {
 	return nil
 }

-// parseBfrange parses a c section of a CMap file.
+// parseBfrange parses a bfrange section of a CMap file.
 func (cmap *CMap) parseBfrange() error {
 	for {
 		// The specifications are in triplets.
--- a/pdf/model/font.go
+++ b/pdf/model/font.go
@@ -448,12 +448,12 @@ type fontCommon struct {
 	basefont string // The font's "BaseFont" field.
 	subtype  string // The font's "Subtype" field.

-	// These are optional fields in the PDF font
+	// These are optional fields in the PDF font.
 	toUnicode core.PdfObject // The stream containing toUnicodeCmap. We keep it around for ToPdfObject.

 	// These objects are computed from optional fields in the PDF font.
-	toUnicodeCmap  *cmap.CMap         // Computed from "ToUnicode"
-	fontDescriptor *PdfFontDescriptor // Computed from "FontDescriptor"
+	toUnicodeCmap  *cmap.CMap         // Computed from "ToUnicode".
+	fontDescriptor *PdfFontDescriptor // Computed from "FontDescriptor".

 	// objectNumber helps us find the font in the PDF being processed. This helps with debugging.
 	objectNumber int64
@@ -482,6 +482,14 @@ func (base fontCommon) asPdfObjectDictionary(subtype string) *core.PdfObjectDict
 	}
 	if base.toUnicode != nil {
 		d.Set("ToUnicode", base.toUnicode)
+	} else if base.toUnicodeCmap != nil {
+		data := base.toUnicodeCmap.Bytes()
+		o, err := core.MakeStream(data, nil)
+		if err != nil {
+			common.Log.Debug("MakeStream failed. err=%v", err)
+		} else {
+			d.Set("ToUnicode", o)
+		}
 	}
 	return d
 }
@@ -584,7 +592,7 @@ func newFontBaseFieldsFromPdfObject(fontObj core.PdfObject) (*core.PdfObjectDict
 	return d, font, nil
 }

-// toUnicodeToCmap returns a CMap of `toUnicode` if it exists
+// toUnicodeToCmap returns a CMap of `toUnicode` if it exists.
 func toUnicodeToCmap(toUnicode core.PdfObject, font *fontCommon) (*cmap.CMap, error) {
 	toUnicodeStream, ok := toUnicode.(*core.PdfObjectStream)
 	if !ok {
@@ -673,7 +681,7 @@ func (descriptor *PdfFontDescriptor) String() string {
 	}
 	parts = append(parts, fmt.Sprintf("FontFile3=%t", descriptor.FontFile3 != nil))

-	return fmt.Sprintf("FONT_DESCRIPTON{%s}", strings.Join(parts, ", "))
+	return fmt.Sprintf("FONT_DESCRIPTOR{%s}", strings.Join(parts, ", "))
 }

 // newPdfFontDescriptorFromPdfObject loads the font descriptor from a core.PdfObject.  Can either be a
--- a/pdf/model/font_composite.go
+++ b/pdf/model/font_composite.go
@@ -197,7 +197,8 @@ type pdfCIDFontType0 struct {
 	encoder textencoding.TextEncoder

 	// Table 117 – Entries in a CIDFont dictionary (page 269)
-	CIDSystemInfo *core.PdfObjectDictionary // (Required) Dictionary that defines the character collection of the CIDFont. See Table 116.
+	CIDSystemInfo *core.PdfObjectDictionary // (Required) Dictionary that defines the character
+	// collection of the CIDFont. See Table 116.
 }

 // pdfCIDFontType0FromSkeleton returns a pdfCIDFontType0 with its common fields initalized.
@@ -528,6 +529,8 @@ func NewCompositePdfFontFromTTFFile(filePath string) (*PdfFont, error) {
 		encoder:  textencoding.NewTrueTypeFontEncoder(ttf.Chars),
 	}

+	type0.toUnicodeCmap = ttf.MakeToUnicode()
+
 	// Build Font.
 	font := PdfFont{
 		context: &type0,
--- a/pdf/model/fonts/ttfparser.go
+++ b/pdf/model/fonts/ttfparser.go
@@ -40,6 +40,7 @@ import (

 	"github.com/unidoc/unidoc/common"
 	"github.com/unidoc/unidoc/pdf/core"
+	"github.com/unidoc/unidoc/pdf/internal/cmap"
 	"github.com/unidoc/unidoc/pdf/model/textencoding"
 )

@@ -83,13 +84,32 @@ type TtfType struct {
 	CapHeight              int16
 	Widths                 []uint16

-	// Chars maps rune values (unicode) to the indexes in GlyphNames. i.e GlyphNames[Chars[r]] is
+	// Chars maps rune values (unicode) to the indexes in GlyphNames. i.e. GlyphNames[Chars[r]] is
 	// the glyph corresponding to rune r.
 	Chars map[uint16]uint16
 	// GlyphNames is a list of glyphs from the "post" section of the TrueType file.
 	GlyphNames []string
 }

+// MakeToUnicode returns a ToUnicode CMap based on the encoding of `ttf`.
+// Each key of ttf.Chars becomes a character code that maps to the rune of the glyph named
+// ttf.GlyphNames[ttf.Chars[key]]. Codes whose glyph index has no entry in ttf.GlyphNames, or whose
+// glyph name has no rune, map to textencoding.MissingCodeRune.
+// XX: This currently gives a bad text mapping for creator_test.go but leads to an otherwise
+// valid PDF file that Adobe Reader displays without error.
+func (ttf *TtfType) MakeToUnicode() *cmap.CMap {
+	codeToUnicode := make(map[cmap.CharCode]string, len(ttf.Chars))
+	for code, idx := range ttf.Chars {
+		if int(idx) >= len(ttf.GlyphNames) {
+			// Nothing here guarantees that GlyphNames has an entry for every glyph index in
+			// Chars, so treat a missing name like an unknown glyph instead of indexing out of range.
+			common.Log.Debug("No glyph name. code=0x%04x idx=%d", code, idx)
+			codeToUnicode[cmap.CharCode(code)] = string(textencoding.MissingCodeRune)
+			continue
+		}
+		glyph := ttf.GlyphNames[idx]
+
+		r, ok := textencoding.GlyphToRune(glyph)
+		if !ok {
+			common.Log.Debug("No rune. code=0x%04x glyph=%q", code, glyph)
+			r = textencoding.MissingCodeRune
+		}
+		codeToUnicode[cmap.CharCode(code)] = string(r)
+	}
+	return cmap.NewToUnicodeCMap(codeToUnicode)
+}
+
+// String returns a human readable representation of `ttf`.
 func (ttf *TtfType) String() string {
 	return fmt.Sprintf("FONT_FILE2{%#q Embeddable=%t UnitsPerEm=%d Bold=%t ItalicAngle=%f "+
 		"CapHeight=%d Chars=%d GlyphNames=%d}",
@@ -420,6 +440,8 @@ func (t *ttfParser) ParseCmap() error {
 		if platformID == 3 && encodingID == 1 {
 			// (3,1) subtable. Windows Unicode.
 			offset31 = offset
+		} else if platformID == 1 && encodingID == 0 {
+			offset10 = offset
 		}
 	}

@@ -436,6 +458,9 @@ func (t *ttfParser) ParseCmap() error {
 			return err
 		}
 	}
+	if offset31 == 0 && offset10 == 0 {
+		common.Log.Debug("ttfParser.ParseCmap. No 31 or 10 table.")
+	}

 	return nil
 }
@@ -516,11 +541,11 @@ func (t *ttfParser) parseCmapFormat12() error {
 		startGlyph := t.ReadULong()

 		if firstCode > 0x0010FFFF || (0xD800 <= firstCode && firstCode <= 0xDFFF) {
-			return errors.New("Invalid characters codes")
+			return errors.New("invalid characters codes")
 		}

 		if endCode < firstCode || endCode > 0x0010FFFF || (0xD800 <= endCode && endCode <= 0xDFFF) {
-			return errors.New("Invalid characters codes")
+			return errors.New("invalid characters codes")
 		}

 		for j := uint32(0); j <= endCode-firstCode; j++ {
--- a/pdf/model/textencoding/truetype.go
+++ b/pdf/model/textencoding/truetype.go
@@ -36,7 +36,7 @@ func NewTrueTypeFontEncoder(runeToGlyphIndexMap map[uint16]uint16) TrueTypeFontE
 }

 // ttEncoderNumEntries is the maximum number of encoding entries shown in SimpleEncoder.String()
-const ttEncoderNumEntries = 1000
+const ttEncoderNumEntries = 10

 // String returns a string that describes `enc`.
 func (enc TrueTypeFontEncoder) String() string {