Add ToUnicode map when embedding Type0 CIDType2 fonts in PDF files.

This commit is contained in:
Peter Williams 2018-09-17 17:57:52 +10:00
parent 38da971f78
commit b18c8ca93d
6 changed files with 197 additions and 23 deletions

View File

@ -47,15 +47,30 @@ type CMap struct {
nbits int // 8 bits for simple fonts, 16 bits for CID fonts. nbits int // 8 bits for simple fonts, 16 bits for CID fonts.
ctype int ctype int
version string version string
usecmap string // Base this cmap on `usecmap` if `usecmap` is not empty usecmap string // Base this cmap on `usecmap` if `usecmap` is not empty.
systemInfo CIDSystemInfo systemInfo CIDSystemInfo
// For regular cmaps // For regular cmaps.
codespaces []Codespace codespaces []Codespace
// For ToUnicode (ctype 2) cmaps // For ToUnicode (ctype 2) cmaps.
codeToUnicode map[CharCode]string codeToUnicode map[CharCode]string
toUnicodeIdentity bool }
// NewToUnicodeCMap returns a ToUnicode CMap (ctype 2) named "Adobe-Identity-UCS" whose
// code-to-Unicode table is the `codeToUnicode` arg.
// The returned CMap uses 16-bit codes, the Adobe/UCS/0 CIDSystemInfo and a single codespace
// range covering 0x0000-0xffff.
// The `codeToUnicode` map is stored as given (it is not copied), so later changes made to it by
// the caller are visible through the returned CMap.
func NewToUnicodeCMap(codeToUnicode map[CharCode]string) *CMap {
return &CMap{
name: "Adobe-Identity-UCS",
ctype: 2,
nbits: 16,
systemInfo: CIDSystemInfo{
Registry: "Adobe",
Ordering: "UCS",
Supplement: 0,
},
codespaces: []Codespace{Codespace{Low: 0, High: 0xffff}},
codeToUnicode: codeToUnicode,
}
} }
// String returns a human readable description of `cmap`. // String returns a human readable description of `cmap`.
@ -100,7 +115,7 @@ func (info *CIDSystemInfo) String() string {
return fmt.Sprintf("%s-%s-%03d", info.Registry, info.Ordering, info.Supplement) return fmt.Sprintf("%s-%s-%03d", info.Registry, info.Ordering, info.Supplement)
} }
// NewCIDSystemInfo returns the CIDSystemInfo encoded in PDFObject `obj` // NewCIDSystemInfo returns the CIDSystemInfo encoded in PDFObject `obj`.
func NewCIDSystemInfo(obj core.PdfObject) (info CIDSystemInfo, err error) { func NewCIDSystemInfo(obj core.PdfObject) (info CIDSystemInfo, err error) {
d, ok := core.GetDict(obj) d, ok := core.GetDict(obj)
if !ok { if !ok {
@ -135,7 +150,7 @@ func (cmap *CMap) Type() int {
return cmap.ctype return cmap.ctype
} }
// MissingCodeRune replaces runes that can't be decoded. '\ufffd' = <20>. Was '?' // MissingCodeRune replaces runes that can't be decoded. '\ufffd' = <20>. Was '?'.
const MissingCodeRune = textencoding.MissingCodeRune const MissingCodeRune = textencoding.MissingCodeRune
// MissingCodeString replaces strings that can't be decoded. // MissingCodeString replaces strings that can't be decoded.
@ -182,8 +197,8 @@ func (cmap *CMap) CharcodeToUnicode(code CharCode) (string, bool) {
if s, ok := cmap.codeToUnicode[code]; ok { if s, ok := cmap.codeToUnicode[code]; ok {
return s, true return s, true
} }
common.Log.Debug("ERROR: CharcodeToUnicode could not convert code=0x%04x. cmap=%s. Returning %q", // common.Log.Debug("ERROR: CharcodeToUnicode could not convert code=0x%04x. cmap=%s. Returning %q",
code, cmap, MissingCodeString) // code, cmap, MissingCodeString)
return MissingCodeString, false return MissingCodeString, false
} }
@ -213,7 +228,7 @@ func (cmap *CMap) bytesToCharcodes(data []byte) ([]CharCode, bool) {
return charcodes, true return charcodes, true
} }
// matchCode attempts to match the byte array `data` with a character code in `cmap`'s codespaces // matchCode attempts to match the byte array `data` with a character code in `cmap`'s codespaces.
// Returns: // Returns:
// character code (if there is a match) of // character code (if there is a match) of
// number of bytes read (if there is a match) // number of bytes read (if there is a match)
@ -252,7 +267,7 @@ func LoadCmapFromDataCID(data []byte) (*CMap, error) {
} }
// LoadCmapFromData parses the in-memory cmap `data` and returns the resulting CMap. // LoadCmapFromData parses the in-memory cmap `data` and returns the resulting CMap.
// If isCID is true then it uses 1-byte encodings, otherwise it uses the codespaces in the cmap. // If `isSimple` is true, it uses 1-byte encodings, otherwise it uses the codespaces in the cmap.
// //
// 9.10.3 ToUnicode CMaps (page 293) // 9.10.3 ToUnicode CMaps (page 293)
func LoadCmapFromData(data []byte, isSimple bool) (*CMap, error) { func LoadCmapFromData(data []byte, isSimple bool) (*CMap, error) {
@ -274,9 +289,132 @@ func LoadCmapFromData(data []byte, isSimple bool) (*CMap, error) {
common.Log.Debug("ERROR: No codespaces. cmap=%s", cmap) common.Log.Debug("ERROR: No codespaces. cmap=%s", cmap)
return nil, ErrBadCMap return nil, ErrBadCMap
} }
// We need to sort codespaces so that we check shorter codes first // We need to sort codespaces so that we check shorter codes first.
sort.Slice(cmap.codespaces, func(i, j int) bool { sort.Slice(cmap.codespaces, func(i, j int) bool {
return cmap.codespaces[i].Low < cmap.codespaces[j].Low return cmap.codespaces[i].Low < cmap.codespaces[j].Low
}) })
return cmap, nil return cmap, nil
} }
// Bytes serializes `cmap` as the text of a PDF CMap file and returns it as raw bytes.
// The result is the fixed header, the bfchar/bfrange body built by toBfData and the fixed
// trailer, separated by newlines.
func (cmap *CMap) Bytes() []byte {
	common.Log.Trace("cmap.Bytes: cmap=%s", cmap.String())
	sections := []string{
		cmapHeader,
		cmap.toBfData(),
		cmapTrailer,
	}
	return []byte(strings.Join(sections, "\n"))
}
// charRange is a run of consecutive character codes.
type charRange struct {
	code0 CharCode // First code in the run.
	code1 CharCode // One past the last code in the run.
}

// fbRange is a run of consecutive character codes together with the rune mapped to the first
// code of the run. It corresponds to one entry of a bfrange section.
type fbRange struct {
	code0 CharCode // First code in the run.
	code1 CharCode // One past the last code in the run.
	r0    rune     // Rune that `code0` maps to.
}
// toBfData returns the bfchar and bfrange sections of a CMap text file.
// Both sections are computed from cmap.codeToUnicode.
//
// Codes are visited in ascending order. A run of two or more consecutive codes whose values are
// single BMP runes that are themselves consecutive is written as one bfrange entry. Every other
// code is written as a bfchar entry whose destination is the whole mapped string encoded as
// UTF-16BE hex, so multi-rune values and runes above U+FFFF are preserved.
// Codes that map to the empty string are omitted.
// bfchar sections are written before bfrange sections and each kind is split into blocks of at
// most maxBfEntries entries.
// Returns "" if there is nothing to write.
func (cmap *CMap) toBfData() string {
	if len(cmap.codeToUnicode) == 0 {
		return ""
	}

	// codes is a sorted list of the codeToUnicode keys that have a non-empty mapping.
	// Sorting makes the output deterministic despite Go's random map iteration order.
	codes := make([]CharCode, 0, len(cmap.codeToUnicode))
	for code, s := range cmap.codeToUnicode {
		if s == "" {
			continue
		}
		codes = append(codes, code)
	}
	sort.Slice(codes, func(i, j int) bool { return codes[i] < codes[j] })

	// fbChars is a list of codes written individually. fbRanges is a list of multi-code runs.
	var fbChars []CharCode
	var fbRanges []fbRange
	for i := 0; i < len(codes); {
		start := codes[i]
		r0, rangeable := bfRangeRune(cmap.codeToUnicode[start])
		j := i + 1
		if rangeable {
			// Extend the run while both the code and the rune advance by exactly one.
			for ; j < len(codes); j++ {
				n := j - i
				if codes[j] != start+CharCode(n) {
					break
				}
				r, ok := bfRangeRune(cmap.codeToUnicode[codes[j]])
				// A bfrange increments only the last byte of the destination, so the run must
				// stop before that byte would wrap from 0xff to 0x00.
				if !ok || r != r0+rune(n) || r&0xff == 0 {
					break
				}
			}
		}
		if j-i == 1 {
			fbChars = append(fbChars, start)
		} else {
			fbRanges = append(fbRanges, fbRange{code0: start, code1: codes[j-1] + 1, r0: r0})
		}
		i = j
	}
	common.Log.Trace("fbChars=%d fbRanges=%d", len(fbChars), len(fbRanges))

	var lines []string
	for i := 0; i < len(fbChars); i += maxBfEntries {
		end := i + maxBfEntries
		if end > len(fbChars) {
			end = len(fbChars)
		}
		lines = append(lines, fmt.Sprintf("%d beginbfchar", end-i))
		for _, code := range fbChars[i:end] {
			lines = append(lines, fmt.Sprintf("<%04x> <%s>", code,
				bfHexUTF16(cmap.codeToUnicode[code])))
		}
		lines = append(lines, "endbfchar")
	}
	for i := 0; i < len(fbRanges); i += maxBfEntries {
		end := i + maxBfEntries
		if end > len(fbRanges) {
			end = len(fbRanges)
		}
		lines = append(lines, fmt.Sprintf("%d beginbfrange", end-i))
		for _, rng := range fbRanges[i:end] {
			// code1 is one past the last code of the run; bfrange wants the last code itself.
			lines = append(lines, fmt.Sprintf("<%04x><%04x> <%04x>", rng.code0, rng.code1-1,
				rng.r0))
		}
		lines = append(lines, "endbfrange")
	}
	return strings.Join(lines, "\n")
}

// bfRangeRune returns the rune in `s` and true if `s` is exactly one rune in the Basic
// Multilingual Plane, which is the only kind of value toBfData places in a bfrange entry.
func bfRangeRune(s string) (rune, bool) {
	runes := []rune(s)
	if len(runes) != 1 || runes[0] > 0xffff {
		return 0, false
	}
	return runes[0], true
}

// bfHexUTF16 returns the UTF-16BE encoding of `s` as lower case hex digits.
// Runes above U+FFFF are written as a surrogate pair.
func bfHexUTF16(s string) string {
	hex := ""
	for _, r := range s {
		if r > 0xffff {
			r -= 0x10000
			hex += fmt.Sprintf("%04x%04x", 0xd800+(r>>10), 0xdc00+(r&0x3ff))
		} else {
			hex += fmt.Sprintf("%04x", r)
		}
	}
	return hex
}
// maxBfEntries is the maximum number of entries in a bfchar or bfrange section.
const maxBfEntries = 100

// cmapHeader is the fixed text written before the bfchar and bfrange sections of a CMap file.
// It declares the Adobe-Identity-UCS CMap (CMapType 2) with a single codespace range
// <0000> <FFFF>.
const cmapHeader = `
/CIDInit /ProcSet findresource begin
12 dict begin
begincmap
/CIDSystemInfo << /Registry (Adobe) /Ordering (UCS) /Supplement 0 >> def
/CMapName /Adobe-Identity-UCS def
/CMapType 2 def
1 begincodespacerange
<0000> <FFFF>
endcodespacerange
`

// cmapTrailer is the fixed text written after the bfchar and bfrange sections of a CMap file.
const cmapTrailer = `endcmap
CMapName currentdict /CMap defineresource pop
end
end
`
// min returns the smaller of `i` and `j`.
func min(i, j int) int {
	if j <= i {
		return j
	}
	return i
}

View File

@ -416,7 +416,7 @@ func (cmap *CMap) parseBfchar() error {
return nil return nil
} }
// parseBfrange parses a c section of a CMap file. // parseBfrange parses a bfrange section of a CMap file.
func (cmap *CMap) parseBfrange() error { func (cmap *CMap) parseBfrange() error {
for { for {
// The specifications are in triplets. // The specifications are in triplets.

View File

@ -448,12 +448,12 @@ type fontCommon struct {
basefont string // The font's "BaseFont" field. basefont string // The font's "BaseFont" field.
subtype string // The font's "Subtype" field. subtype string // The font's "Subtype" field.
// These are optional fields in the PDF font // These are optional fields in the PDF font.
toUnicode core.PdfObject // The stream containing toUnicodeCmap. We keep it around for ToPdfObject. toUnicode core.PdfObject // The stream containing toUnicodeCmap. We keep it around for ToPdfObject.
// These objects are computed from optional fields in the PDF font. // These objects are computed from optional fields in the PDF font.
toUnicodeCmap *cmap.CMap // Computed from "ToUnicode" toUnicodeCmap *cmap.CMap // Computed from "ToUnicode".
fontDescriptor *PdfFontDescriptor // Computed from "FontDescriptor" fontDescriptor *PdfFontDescriptor // Computed from "FontDescriptor".
// objectNumber helps us find the font in the PDF being processed. This helps with debugging. // objectNumber helps us find the font in the PDF being processed. This helps with debugging.
objectNumber int64 objectNumber int64
@ -482,6 +482,14 @@ func (base fontCommon) asPdfObjectDictionary(subtype string) *core.PdfObjectDict
} }
if base.toUnicode != nil { if base.toUnicode != nil {
d.Set("ToUnicode", base.toUnicode) d.Set("ToUnicode", base.toUnicode)
} else if base.toUnicodeCmap != nil {
data := base.toUnicodeCmap.Bytes()
o, err := core.MakeStream(data, nil)
if err != nil {
common.Log.Debug("MakeStream failed. err=%v", err)
} else {
d.Set("ToUnicode", o)
}
} }
return d return d
} }
@ -584,7 +592,7 @@ func newFontBaseFieldsFromPdfObject(fontObj core.PdfObject) (*core.PdfObjectDict
return d, font, nil return d, font, nil
} }
// toUnicodeToCmap returns a CMap of `toUnicode` if it exists // toUnicodeToCmap returns a CMap of `toUnicode` if it exists.
func toUnicodeToCmap(toUnicode core.PdfObject, font *fontCommon) (*cmap.CMap, error) { func toUnicodeToCmap(toUnicode core.PdfObject, font *fontCommon) (*cmap.CMap, error) {
toUnicodeStream, ok := toUnicode.(*core.PdfObjectStream) toUnicodeStream, ok := toUnicode.(*core.PdfObjectStream)
if !ok { if !ok {
@ -673,7 +681,7 @@ func (descriptor *PdfFontDescriptor) String() string {
} }
parts = append(parts, fmt.Sprintf("FontFile3=%t", descriptor.FontFile3 != nil)) parts = append(parts, fmt.Sprintf("FontFile3=%t", descriptor.FontFile3 != nil))
return fmt.Sprintf("FONT_DESCRIPTON{%s}", strings.Join(parts, ", ")) return fmt.Sprintf("FONT_DESCRIPTOR{%s}", strings.Join(parts, ", "))
} }
// newPdfFontDescriptorFromPdfObject loads the font descriptor from a core.PdfObject. Can either be a // newPdfFontDescriptorFromPdfObject loads the font descriptor from a core.PdfObject. Can either be a

View File

@ -197,7 +197,8 @@ type pdfCIDFontType0 struct {
encoder textencoding.TextEncoder encoder textencoding.TextEncoder
// Table 117 Entries in a CIDFont dictionary (page 269) // Table 117 Entries in a CIDFont dictionary (page 269)
CIDSystemInfo *core.PdfObjectDictionary // (Required) Dictionary that defines the character collection of the CIDFont. See Table 116. CIDSystemInfo *core.PdfObjectDictionary // (Required) Dictionary that defines the character
// collection of the CIDFont. See Table 116.
} }
// pdfCIDFontType0FromSkeleton returns a pdfCIDFontType0 with its common fields initialized. // pdfCIDFontType0FromSkeleton returns a pdfCIDFontType0 with its common fields initialized.
@ -528,6 +529,8 @@ func NewCompositePdfFontFromTTFFile(filePath string) (*PdfFont, error) {
encoder: textencoding.NewTrueTypeFontEncoder(ttf.Chars), encoder: textencoding.NewTrueTypeFontEncoder(ttf.Chars),
} }
type0.toUnicodeCmap = ttf.MakeToUnicode()
// Build Font. // Build Font.
font := PdfFont{ font := PdfFont{
context: &type0, context: &type0,

View File

@ -40,6 +40,7 @@ import (
"github.com/unidoc/unidoc/common" "github.com/unidoc/unidoc/common"
"github.com/unidoc/unidoc/pdf/core" "github.com/unidoc/unidoc/pdf/core"
"github.com/unidoc/unidoc/pdf/internal/cmap"
"github.com/unidoc/unidoc/pdf/model/textencoding" "github.com/unidoc/unidoc/pdf/model/textencoding"
) )
@ -83,13 +84,32 @@ type TtfType struct {
CapHeight int16 CapHeight int16
Widths []uint16 Widths []uint16
// Chars maps rune values (unicode) to the indexes in GlyphNames. i.e GlyphNames[Chars[r]] is // Chars maps rune values (unicode) to the indexes in GlyphNames. i.e. GlyphNames[Chars[r]] is
// the glyph corresponding to rune r. // the glyph corresponding to rune r.
Chars map[uint16]uint16 Chars map[uint16]uint16
// GlyphNames is a list of glyphs from the "post" section of the TrueType file. // GlyphNames is a list of glyphs from the "post" section of the TrueType file.
GlyphNames []string GlyphNames []string
} }
// MakeToUnicode returns a ToUnicode CMap based on the encoding of `ttf`.
// For every entry of ttf.Chars, the key is used as the character code and the value as an index
// into ttf.GlyphNames; the glyph name found there is converted to a rune with
// textencoding.GlyphToRune. Codes whose index has no entry in ttf.GlyphNames, or whose glyph name
// has no rune, are mapped to textencoding.MissingCodeRune.
// NOTE: This currently gives a bad text mapping for creator_test.go but leads to an otherwise
// valid PDF file that Adobe Reader displays without error.
func (ttf *TtfType) MakeToUnicode() *cmap.CMap {
	codeToUnicode := make(map[cmap.CharCode]string, len(ttf.Chars))
	for code, idx := range ttf.Chars {
		// Guard the lookup: an index with no entry in GlyphNames would otherwise panic.
		if int(idx) >= len(ttf.GlyphNames) {
			common.Log.Debug("No glyph name. code=0x%04x idx=%d GlyphNames=%d", code, idx,
				len(ttf.GlyphNames))
			codeToUnicode[cmap.CharCode(code)] = string(textencoding.MissingCodeRune)
			continue
		}
		glyph := ttf.GlyphNames[idx]
		r, ok := textencoding.GlyphToRune(glyph)
		if !ok {
			common.Log.Debug("No rune. code=0x%04x glyph=%q", code, glyph)
			r = textencoding.MissingCodeRune
		}
		codeToUnicode[cmap.CharCode(code)] = string(r)
	}
	return cmap.NewToUnicodeCMap(codeToUnicode)
}
// String returns a human readable representation of `ttf`.
func (ttf *TtfType) String() string { func (ttf *TtfType) String() string {
return fmt.Sprintf("FONT_FILE2{%#q Embeddable=%t UnitsPerEm=%d Bold=%t ItalicAngle=%f "+ return fmt.Sprintf("FONT_FILE2{%#q Embeddable=%t UnitsPerEm=%d Bold=%t ItalicAngle=%f "+
"CapHeight=%d Chars=%d GlyphNames=%d}", "CapHeight=%d Chars=%d GlyphNames=%d}",
@ -420,6 +440,8 @@ func (t *ttfParser) ParseCmap() error {
if platformID == 3 && encodingID == 1 { if platformID == 3 && encodingID == 1 {
// (3,1) subtable. Windows Unicode. // (3,1) subtable. Windows Unicode.
offset31 = offset offset31 = offset
} else if platformID == 1 && encodingID == 0 {
offset10 = offset
} }
} }
@ -436,6 +458,9 @@ func (t *ttfParser) ParseCmap() error {
return err return err
} }
} }
if offset31 == 0 && offset10 == 0 {
common.Log.Debug("ttfParser.ParseCmap. No 31 or 10 table.")
}
return nil return nil
} }
@ -516,11 +541,11 @@ func (t *ttfParser) parseCmapFormat12() error {
startGlyph := t.ReadULong() startGlyph := t.ReadULong()
if firstCode > 0x0010FFFF || (0xD800 <= firstCode && firstCode <= 0xDFFF) { if firstCode > 0x0010FFFF || (0xD800 <= firstCode && firstCode <= 0xDFFF) {
return errors.New("Invalid characters codes") return errors.New("invalid characters codes")
} }
if endCode < firstCode || endCode > 0x0010FFFF || (0xD800 <= endCode && endCode <= 0xDFFF) { if endCode < firstCode || endCode > 0x0010FFFF || (0xD800 <= endCode && endCode <= 0xDFFF) {
return errors.New("Invalid characters codes") return errors.New("invalid characters codes")
} }
for j := uint32(0); j <= endCode-firstCode; j++ { for j := uint32(0); j <= endCode-firstCode; j++ {

View File

@ -36,7 +36,7 @@ func NewTrueTypeFontEncoder(runeToGlyphIndexMap map[uint16]uint16) TrueTypeFontE
} }
// ttEncoderNumEntries is the maximum number of encoding entries shown in SimpleEncoder.String() // ttEncoderNumEntries is the maximum number of encoding entries shown in SimpleEncoder.String()
const ttEncoderNumEntries = 1000 const ttEncoderNumEntries = 10
// String returns a string that describes `enc`. // String returns a string that describes `enc`.
func (enc TrueTypeFontEncoder) String() string { func (enc TrueTypeFontEncoder) String() string {