mirror of
https://github.com/unidoc/unipdf.git
synced 2025-04-27 13:48:51 +08:00
Add ToUnicode map when embedding Type0 CIDType2 fonts in PDF files.
This commit is contained in:
parent
38da971f78
commit
b18c8ca93d
@ -47,15 +47,30 @@ type CMap struct {
|
||||
nbits int // 8 bits for simple fonts, 16 bits for CID fonts.
|
||||
ctype int
|
||||
version string
|
||||
usecmap string // Base this cmap on `usecmap` if `usecmap` is not empty
|
||||
usecmap string // Base this cmap on `usecmap` if `usecmap` is not empty.
|
||||
systemInfo CIDSystemInfo
|
||||
|
||||
// For regular cmaps
|
||||
// For regular cmaps.
|
||||
codespaces []Codespace
|
||||
|
||||
// For ToUnicode (ctype 2) cmaps
|
||||
// For ToUnicode (ctype 2) cmaps.
|
||||
codeToUnicode map[CharCode]string
|
||||
toUnicodeIdentity bool
|
||||
}
|
||||
|
||||
// NewToUnicodeCMap returns an identity CMap with codeToUnicode matching the `codeToUnicode` arg.
|
||||
func NewToUnicodeCMap(codeToUnicode map[CharCode]string) *CMap {
|
||||
return &CMap{
|
||||
name: "Adobe-Identity-UCS",
|
||||
ctype: 2,
|
||||
nbits: 16,
|
||||
systemInfo: CIDSystemInfo{
|
||||
Registry: "Adobe",
|
||||
Ordering: "UCS",
|
||||
Supplement: 0,
|
||||
},
|
||||
codespaces: []Codespace{Codespace{Low: 0, High: 0xffff}},
|
||||
codeToUnicode: codeToUnicode,
|
||||
}
|
||||
}
|
||||
|
||||
// String returns a human readable description of `cmap`.
|
||||
@ -100,7 +115,7 @@ func (info *CIDSystemInfo) String() string {
|
||||
return fmt.Sprintf("%s-%s-%03d", info.Registry, info.Ordering, info.Supplement)
|
||||
}
|
||||
|
||||
// NewCIDSystemInfo returns the CIDSystemInfo encoded in PDFObject `obj`
|
||||
// NewCIDSystemInfo returns the CIDSystemInfo encoded in PDFObject `obj`.
|
||||
func NewCIDSystemInfo(obj core.PdfObject) (info CIDSystemInfo, err error) {
|
||||
d, ok := core.GetDict(obj)
|
||||
if !ok {
|
||||
@ -135,7 +150,7 @@ func (cmap *CMap) Type() int {
|
||||
return cmap.ctype
|
||||
}
|
||||
|
||||
// MissingCodeRune replaces runes that can't be decoded. '\ufffd' = <20>. Was '?'
|
||||
// MissingCodeRune replaces runes that can't be decoded. '\ufffd' = <20>. Was '?'.
|
||||
const MissingCodeRune = textencoding.MissingCodeRune
|
||||
|
||||
// MissingCodeString replaces strings that can't be decoded.
|
||||
@ -182,8 +197,8 @@ func (cmap *CMap) CharcodeToUnicode(code CharCode) (string, bool) {
|
||||
if s, ok := cmap.codeToUnicode[code]; ok {
|
||||
return s, true
|
||||
}
|
||||
common.Log.Debug("ERROR: CharcodeToUnicode could not convert code=0x%04x. cmap=%s. Returning %q",
|
||||
code, cmap, MissingCodeString)
|
||||
// common.Log.Debug("ERROR: CharcodeToUnicode could not convert code=0x%04x. cmap=%s. Returning %q",
|
||||
// code, cmap, MissingCodeString)
|
||||
return MissingCodeString, false
|
||||
}
|
||||
|
||||
@ -213,7 +228,7 @@ func (cmap *CMap) bytesToCharcodes(data []byte) ([]CharCode, bool) {
|
||||
return charcodes, true
|
||||
}
|
||||
|
||||
// matchCode attempts to match the byte array `data` with a character code in `cmap`'s codespaces
|
||||
// matchCode attempts to match the byte array `data` with a character code in `cmap`'s codespaces.
|
||||
// Returns:
|
||||
// character code (if there is a match) of
|
||||
// number of bytes read (if there is a match)
|
||||
@ -252,7 +267,7 @@ func LoadCmapFromDataCID(data []byte) (*CMap, error) {
|
||||
}
|
||||
|
||||
// LoadCmapFromData parses the in-memory cmap `data` and returns the resulting CMap.
|
||||
// If isCID is true then it uses 1-byte encodings, otherwise it uses the codespaces in the cmap.
|
||||
// If `isSimple` is true, it uses 1-byte encodings, otherwise it uses the codespaces in the cmap.
|
||||
//
|
||||
// 9.10.3 ToUnicode CMaps (page 293)
|
||||
func LoadCmapFromData(data []byte, isSimple bool) (*CMap, error) {
|
||||
@ -274,9 +289,132 @@ func LoadCmapFromData(data []byte, isSimple bool) (*CMap, error) {
|
||||
common.Log.Debug("ERROR: No codespaces. cmap=%s", cmap)
|
||||
return nil, ErrBadCMap
|
||||
}
|
||||
// We need to sort codespaces so that we check shorter codes first
|
||||
// We need to sort codespaces so that we check shorter codes first.
|
||||
sort.Slice(cmap.codespaces, func(i, j int) bool {
|
||||
return cmap.codespaces[i].Low < cmap.codespaces[j].Low
|
||||
})
|
||||
return cmap, nil
|
||||
}
|
||||
|
||||
// Bytes returns the raw bytes of a PDF CMap corresponding to `cmap`.
|
||||
func (cmap *CMap) Bytes() []byte {
|
||||
common.Log.Trace("cmap.Bytes: cmap=%s", cmap.String())
|
||||
body := cmap.toBfData()
|
||||
whole := strings.Join([]string{cmapHeader, body, cmapTrailer}, "\n")
|
||||
return []byte(whole)
|
||||
}
|
||||
|
||||
type (
|
||||
charRange struct {
|
||||
code0 CharCode
|
||||
code1 CharCode
|
||||
}
|
||||
fbRange struct {
|
||||
code0 CharCode
|
||||
code1 CharCode
|
||||
r0 rune
|
||||
}
|
||||
)
|
||||
|
||||
// toBfData returns the bfchar and bfrange sections of a CMap text file.
|
||||
// Both sections are computed from cmap.codeToUnicode.
|
||||
func (cmap *CMap) toBfData() string {
|
||||
if len(cmap.codeToUnicode) == 0 {
|
||||
return ""
|
||||
}
|
||||
|
||||
// codes is a sorted list of the codeToUnicode keys.
|
||||
codes := []CharCode{}
|
||||
for code := range cmap.codeToUnicode {
|
||||
codes = append(codes, code)
|
||||
}
|
||||
sort.Slice(codes, func(i, j int) bool { return codes[i] < codes[j] })
|
||||
|
||||
// charRanges is a list of the contiguous character code ranges in codes.
|
||||
charRanges := []charRange{}
|
||||
c0, c1 := codes[0], codes[0]+1
|
||||
for _, c := range codes[1:] {
|
||||
if c != c1 {
|
||||
charRanges = append(charRanges, charRange{c0, c1})
|
||||
c0 = c
|
||||
}
|
||||
c1 = c + 1
|
||||
}
|
||||
if c1 != c0+1 {
|
||||
charRanges = append(charRanges, charRange{c0, c1})
|
||||
}
|
||||
|
||||
// fbChars is a list of single character ranges. fbRanges is a list of multiple character ranges.
|
||||
fbChars := []CharCode{}
|
||||
fbRanges := []fbRange{}
|
||||
for _, cr := range charRanges {
|
||||
if cr.code0+1 == cr.code1 {
|
||||
fbChars = append(fbChars, cr.code0)
|
||||
} else {
|
||||
fbRanges = append(fbRanges, fbRange{
|
||||
code0: cr.code0,
|
||||
code1: cr.code1,
|
||||
r0: []rune(cmap.codeToUnicode[cr.code0])[0],
|
||||
})
|
||||
}
|
||||
}
|
||||
common.Log.Trace("charRanges=%d fbChars=%d fbRanges=%d", len(charRanges), len(fbChars),
|
||||
len(fbRanges))
|
||||
|
||||
lines := []string{}
|
||||
if len(fbChars) > 0 {
|
||||
numRanges := (len(fbChars) + maxBfEntries - 1) / maxBfEntries
|
||||
for i := 0; i < numRanges; i++ {
|
||||
n := min(len(fbChars)-i*maxBfEntries, maxBfEntries)
|
||||
lines = append(lines, fmt.Sprintf("%d beginbfchar", n))
|
||||
for j := 0; j < n; j++ {
|
||||
code := fbChars[i*maxBfEntries+j]
|
||||
s := cmap.codeToUnicode[code]
|
||||
r := []rune(s)[0]
|
||||
lines = append(lines, fmt.Sprintf("<%04x> <%04x>", code, r))
|
||||
}
|
||||
lines = append(lines, "endbfchar")
|
||||
}
|
||||
}
|
||||
if len(fbRanges) > 0 {
|
||||
numRanges := (len(fbRanges) + maxBfEntries - 1) / maxBfEntries
|
||||
for i := 0; i < numRanges; i++ {
|
||||
n := min(len(fbRanges)-i*maxBfEntries, maxBfEntries)
|
||||
lines = append(lines, fmt.Sprintf("%d beginbfrange", n))
|
||||
for j := 0; j < n; j++ {
|
||||
rng := fbRanges[i*maxBfEntries+j]
|
||||
r := rng.r0
|
||||
lines = append(lines, fmt.Sprintf("<%04x><%04x> <%04x>", rng.code0, rng.code1-1, r))
|
||||
}
|
||||
lines = append(lines, "endbfrange")
|
||||
}
|
||||
}
|
||||
return strings.Join(lines, "\n")
|
||||
}
|
||||
|
||||
// maxBfEntries is the maximum number of entries in a bfchar or bfrange section.
const maxBfEntries = 100

// cmapHeader is the fixed opening text of a generated CMap, up to and including its
// codespacerange section. It names the CMap Adobe-Identity-UCS, sets CMapType 2 and declares a
// single codespace range <0000> <FFFF>.
const cmapHeader = `
/CIDInit /ProcSet findresource begin
12 dict begin
begincmap
/CIDSystemInfo << /Registry (Adobe) /Ordering (UCS) /Supplement 0 >> def
/CMapName /Adobe-Identity-UCS def
/CMapType 2 def
1 begincodespacerange
<0000> <FFFF>
endcodespacerange
`

// cmapTrailer is the fixed closing text of a generated CMap.
const cmapTrailer = `endcmap
CMapName currentdict /CMap defineresource pop
end
end
`
|
||||
|
||||
// min returns the smaller of `i` and `j`.
func min(i, j int) int {
	if j <= i {
		return j
	}
	return i
}
|
||||
|
@ -416,7 +416,7 @@ func (cmap *CMap) parseBfchar() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// parseBfrange parses a c section of a CMap file.
|
||||
// parseBfrange parses a bfrange section of a CMap file.
|
||||
func (cmap *CMap) parseBfrange() error {
|
||||
for {
|
||||
// The specifications are in triplets.
|
||||
|
@ -448,12 +448,12 @@ type fontCommon struct {
|
||||
basefont string // The font's "BaseFont" field.
|
||||
subtype string // The font's "Subtype" field.
|
||||
|
||||
// These are optional fields in the PDF font
|
||||
// These are optional fields in the PDF font.
|
||||
toUnicode core.PdfObject // The stream containing toUnicodeCmap. We keep it around for ToPdfObject.
|
||||
|
||||
// These objects are computed from optional fields in the PDF font.
|
||||
toUnicodeCmap *cmap.CMap // Computed from "ToUnicode"
|
||||
fontDescriptor *PdfFontDescriptor // Computed from "FontDescriptor"
|
||||
toUnicodeCmap *cmap.CMap // Computed from "ToUnicode".
|
||||
fontDescriptor *PdfFontDescriptor // Computed from "FontDescriptor".
|
||||
|
||||
// objectNumber helps us find the font in the PDF being processed. This helps with debugging.
|
||||
objectNumber int64
|
||||
@ -482,6 +482,14 @@ func (base fontCommon) asPdfObjectDictionary(subtype string) *core.PdfObjectDict
|
||||
}
|
||||
if base.toUnicode != nil {
|
||||
d.Set("ToUnicode", base.toUnicode)
|
||||
} else if base.toUnicodeCmap != nil {
|
||||
data := base.toUnicodeCmap.Bytes()
|
||||
o, err := core.MakeStream(data, nil)
|
||||
if err != nil {
|
||||
common.Log.Debug("MakeStream failed. err=%v", err)
|
||||
} else {
|
||||
d.Set("ToUnicode", o)
|
||||
}
|
||||
}
|
||||
return d
|
||||
}
|
||||
@ -584,7 +592,7 @@ func newFontBaseFieldsFromPdfObject(fontObj core.PdfObject) (*core.PdfObjectDict
|
||||
return d, font, nil
|
||||
}
|
||||
|
||||
// toUnicodeToCmap returns a CMap of `toUnicode` if it exists
|
||||
// toUnicodeToCmap returns a CMap of `toUnicode` if it exists.
|
||||
func toUnicodeToCmap(toUnicode core.PdfObject, font *fontCommon) (*cmap.CMap, error) {
|
||||
toUnicodeStream, ok := toUnicode.(*core.PdfObjectStream)
|
||||
if !ok {
|
||||
@ -673,7 +681,7 @@ func (descriptor *PdfFontDescriptor) String() string {
|
||||
}
|
||||
parts = append(parts, fmt.Sprintf("FontFile3=%t", descriptor.FontFile3 != nil))
|
||||
|
||||
return fmt.Sprintf("FONT_DESCRIPTON{%s}", strings.Join(parts, ", "))
|
||||
return fmt.Sprintf("FONT_DESCRIPTOR{%s}", strings.Join(parts, ", "))
|
||||
}
|
||||
|
||||
// newPdfFontDescriptorFromPdfObject loads the font descriptor from a core.PdfObject. Can either be a
|
||||
|
@ -197,7 +197,8 @@ type pdfCIDFontType0 struct {
|
||||
encoder textencoding.TextEncoder
|
||||
|
||||
// Table 117 – Entries in a CIDFont dictionary (page 269)
|
||||
CIDSystemInfo *core.PdfObjectDictionary // (Required) Dictionary that defines the character collection of the CIDFont. See Table 116.
|
||||
CIDSystemInfo *core.PdfObjectDictionary // (Required) Dictionary that defines the character
|
||||
// collection of the CIDFont. See Table 116.
|
||||
}
|
||||
|
||||
// pdfCIDFontType0FromSkeleton returns a pdfCIDFontType0 with its common fields initialized.
|
||||
@ -528,6 +529,8 @@ func NewCompositePdfFontFromTTFFile(filePath string) (*PdfFont, error) {
|
||||
encoder: textencoding.NewTrueTypeFontEncoder(ttf.Chars),
|
||||
}
|
||||
|
||||
type0.toUnicodeCmap = ttf.MakeToUnicode()
|
||||
|
||||
// Build Font.
|
||||
font := PdfFont{
|
||||
context: &type0,
|
||||
|
@ -40,6 +40,7 @@ import (
|
||||
|
||||
"github.com/unidoc/unidoc/common"
|
||||
"github.com/unidoc/unidoc/pdf/core"
|
||||
"github.com/unidoc/unidoc/pdf/internal/cmap"
|
||||
"github.com/unidoc/unidoc/pdf/model/textencoding"
|
||||
)
|
||||
|
||||
@ -83,13 +84,32 @@ type TtfType struct {
|
||||
CapHeight int16
|
||||
Widths []uint16
|
||||
|
||||
// Chars maps rune values (unicode) to the indexes in GlyphNames. i.e GlyphNames[Chars[r]] is
|
||||
// Chars maps rune values (unicode) to the indexes in GlyphNames. i.e. GlyphNames[Chars[r]] is
|
||||
// the glyph corresponding to rune r.
|
||||
Chars map[uint16]uint16
|
||||
// GlyphNames is a list of glyphs from the "post" section of the TrueType file.
|
||||
GlyphNames []string
|
||||
}
|
||||
|
||||
// MakeToUnicode returns a ToUnicode CMap based on the encoding of `ttf`.
|
||||
// XX: This currently gives a bad text mapping for creator_test.go but leads to an otherwise
|
||||
// valid PDF file that Adobe Reader displays without error.
|
||||
func (ttf *TtfType) MakeToUnicode() *cmap.CMap {
|
||||
codeToUnicode := map[cmap.CharCode]string{}
|
||||
for code, idx := range ttf.Chars {
|
||||
glyph := ttf.GlyphNames[idx]
|
||||
|
||||
r, ok := textencoding.GlyphToRune(glyph)
|
||||
if !ok {
|
||||
common.Log.Debug("No rune. code=0x%04x glyph=%q", code, glyph)
|
||||
r = textencoding.MissingCodeRune
|
||||
}
|
||||
codeToUnicode[cmap.CharCode(code)] = string(r)
|
||||
}
|
||||
return cmap.NewToUnicodeCMap(codeToUnicode)
|
||||
}
|
||||
|
||||
// String returns a human readable representation of `ttf`.
|
||||
func (ttf *TtfType) String() string {
|
||||
return fmt.Sprintf("FONT_FILE2{%#q Embeddable=%t UnitsPerEm=%d Bold=%t ItalicAngle=%f "+
|
||||
"CapHeight=%d Chars=%d GlyphNames=%d}",
|
||||
@ -420,6 +440,8 @@ func (t *ttfParser) ParseCmap() error {
|
||||
if platformID == 3 && encodingID == 1 {
|
||||
// (3,1) subtable. Windows Unicode.
|
||||
offset31 = offset
|
||||
} else if platformID == 1 && encodingID == 0 {
|
||||
offset10 = offset
|
||||
}
|
||||
}
|
||||
|
||||
@ -436,6 +458,9 @@ func (t *ttfParser) ParseCmap() error {
|
||||
return err
|
||||
}
|
||||
}
|
||||
if offset31 == 0 && offset10 == 0 {
|
||||
common.Log.Debug("ttfParser.ParseCmap. No 31 or 10 table.")
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
@ -516,11 +541,11 @@ func (t *ttfParser) parseCmapFormat12() error {
|
||||
startGlyph := t.ReadULong()
|
||||
|
||||
if firstCode > 0x0010FFFF || (0xD800 <= firstCode && firstCode <= 0xDFFF) {
|
||||
return errors.New("Invalid characters codes")
|
||||
return errors.New("invalid characters codes")
|
||||
}
|
||||
|
||||
if endCode < firstCode || endCode > 0x0010FFFF || (0xD800 <= endCode && endCode <= 0xDFFF) {
|
||||
return errors.New("Invalid characters codes")
|
||||
return errors.New("invalid characters codes")
|
||||
}
|
||||
|
||||
for j := uint32(0); j <= endCode-firstCode; j++ {
|
||||
|
@ -36,7 +36,7 @@ func NewTrueTypeFontEncoder(runeToGlyphIndexMap map[uint16]uint16) TrueTypeFontE
|
||||
}
|
||||
|
||||
// ttEncoderNumEntries is the maximum number of encoding entries shown in SimpleEncoder.String()
|
||||
const ttEncoderNumEntries = 1000
|
||||
const ttEncoderNumEntries = 10
|
||||
|
||||
// String returns a string that describes `enc`.
|
||||
func (enc TrueTypeFontEncoder) String() string {
|
||||
|
Loading…
x
Reference in New Issue
Block a user