mirror of
https://github.com/unidoc/unipdf.git
synced 2025-04-27 13:48:51 +08:00

* Fixed filename:page in logging * Got CMap working for multi-rune entries * Treat CMap entries as strings instead of runes to handle multi-byte encodings. * Added a test for multibyte encoding. * Changed rune->CharCode maps to string->CharCode. * Removed unintentional changes. * Updated comments to match new function definitions. * Changed some []rune APIs to string * Fixes for reviewer comments.
576 lines
16 KiB
Go
576 lines
16 KiB
Go
/*
|
||
* This file is subject to the terms and conditions defined in
|
||
* file 'LICENSE.md', which is part of this source code package.
|
||
*/
|
||
|
||
package cmap
|
||
|
||
import (
|
||
"fmt"
|
||
"sort"
|
||
"strings"
|
||
|
||
"github.com/unidoc/unipdf/v3/common"
|
||
"github.com/unidoc/unipdf/v3/core"
|
||
"github.com/unidoc/unipdf/v3/internal/cmap/bcmaps"
|
||
)
|
||
|
||
const (
|
||
// Maximum number of possible bytes per code.
|
||
maxCodeLen = 4
|
||
|
||
// MissingCodeRune replaces runes that can't be decoded. '\ufffd' = <20>. Was '?'.
|
||
MissingCodeRune = '\ufffd' // <20>
|
||
|
||
// MissingCodeString replaces strings that can't be decoded.
|
||
MissingCodeString = string(MissingCodeRune)
|
||
)
|
||
|
||
// CharCode is a character code or a Unicode value.
// It is an unsigned 32 bit integer, whereas a Go rune is an int32
// (see https://golang.org/doc/go1#rune).
type CharCode uint32
|
||
|
||
// Codespace represents a single codespace range used in the CMap.
|
||
type Codespace struct {
|
||
NumBytes int
|
||
Low CharCode
|
||
High CharCode
|
||
}
|
||
|
||
type charRange struct {
|
||
code0 CharCode
|
||
code1 CharCode
|
||
}
|
||
type fbRange struct {
|
||
code0 CharCode
|
||
code1 CharCode
|
||
r0 string
|
||
}
|
||
|
||
// CIDSystemInfo holds the values that identify the character collection used
// by a CID font.
// Example dictionary:
// CIDSystemInfo=Dict("Registry": Adobe, "Ordering": Korea1, "Supplement": 0, )
type CIDSystemInfo struct {
	Registry   string // Value of the "Registry" entry.
	Ordering   string // Value of the "Ordering" entry.
	Supplement int    // Value of the "Supplement" entry.
}
|
||
|
||
// NewCIDSystemInfo returns the CIDSystemInfo encoded in PDFObject `obj`.
|
||
func NewCIDSystemInfo(obj core.PdfObject) (info CIDSystemInfo, err error) {
|
||
d, ok := core.GetDict(obj)
|
||
if !ok {
|
||
return CIDSystemInfo{}, core.ErrTypeError
|
||
}
|
||
registry, ok := core.GetStringVal(d.Get("Registry"))
|
||
if !ok {
|
||
return CIDSystemInfo{}, core.ErrTypeError
|
||
}
|
||
ordering, ok := core.GetStringVal(d.Get("Ordering"))
|
||
if !ok {
|
||
return CIDSystemInfo{}, core.ErrTypeError
|
||
}
|
||
supplement, ok := core.GetIntVal(d.Get("Supplement"))
|
||
if !ok {
|
||
return CIDSystemInfo{}, core.ErrTypeError
|
||
}
|
||
return CIDSystemInfo{
|
||
Registry: registry,
|
||
Ordering: ordering,
|
||
Supplement: supplement,
|
||
}, nil
|
||
}
|
||
|
||
// String returns a human readable description of `info`.
|
||
// It looks like "Adobe-Japan2-000".
|
||
func (info *CIDSystemInfo) String() string {
|
||
return fmt.Sprintf("%s-%s-%03d", info.Registry, info.Ordering, info.Supplement)
|
||
}
|
||
|
||
// CMap represents a character code to unicode mapping used in PDF files.
|
||
// References:
|
||
// https://www.adobe.com/content/dam/acom/en/devnet/acrobat/pdfs/5411.ToUnicode.pdf
|
||
// https://github.com/adobe-type-tools/cmap-resources/releases
|
||
type CMap struct {
|
||
*cMapParser
|
||
|
||
name string
|
||
nbits int // 8 bits for simple fonts, 16 bits for CID fonts.
|
||
ctype int
|
||
version string
|
||
usecmap string // Base this cmap on `usecmap` if `usecmap` is not empty.
|
||
systemInfo CIDSystemInfo
|
||
|
||
// For regular cmaps.
|
||
codespaces []Codespace
|
||
|
||
// Used by ctype 1 CMaps.
|
||
codeToCID map[CharCode]CharCode // charcode -> CID
|
||
cidToCode map[CharCode]CharCode // CID -> charcode
|
||
|
||
// Used by ctype 2 CMaps.
|
||
codeToUnicode map[CharCode]string // CID -> Unicode string
|
||
unicodeToCode map[string]CharCode // Unicode rune -> CID
|
||
|
||
// cached contains the raw CMap data. It is used by the Bytes method in
|
||
// order to avoid generating the data for every call.
|
||
// NOTE: While it is not currently required, a cache invalidation mechanism
|
||
// might be needed in the future.
|
||
cached []byte
|
||
}
|
||
|
||
// NewToUnicodeCMap returns an identity CMap with codeToUnicode matching the `codeToRune` arg.
|
||
func NewToUnicodeCMap(codeToRune map[CharCode]rune) *CMap {
|
||
codeToUnicode := make(map[CharCode]string, len(codeToRune))
|
||
for code, r := range codeToRune {
|
||
codeToUnicode[code] = string(r)
|
||
}
|
||
|
||
cmap := &CMap{
|
||
name: "Adobe-Identity-UCS",
|
||
ctype: 2,
|
||
nbits: 16,
|
||
systemInfo: CIDSystemInfo{
|
||
Registry: "Adobe",
|
||
Ordering: "UCS",
|
||
Supplement: 0,
|
||
},
|
||
codespaces: []Codespace{{Low: 0, High: 0xffff}},
|
||
codeToUnicode: codeToUnicode,
|
||
unicodeToCode: make(map[string]CharCode, len(codeToRune)),
|
||
codeToCID: make(map[CharCode]CharCode, len(codeToRune)),
|
||
cidToCode: make(map[CharCode]CharCode, len(codeToRune)),
|
||
}
|
||
|
||
cmap.computeInverseMappings()
|
||
|
||
return cmap
|
||
}
|
||
|
||
// newCMap returns an initialized CMap.
|
||
func newCMap(isSimple bool) *CMap {
|
||
nbits := 16
|
||
if isSimple {
|
||
nbits = 8
|
||
}
|
||
return &CMap{
|
||
nbits: nbits,
|
||
codeToCID: make(map[CharCode]CharCode),
|
||
cidToCode: make(map[CharCode]CharCode),
|
||
codeToUnicode: make(map[CharCode]string),
|
||
unicodeToCode: make(map[string]CharCode),
|
||
}
|
||
}
|
||
|
||
// LoadCmapFromDataCID parses the in-memory cmap `data` and returns the resulting CMap.
|
||
// It is a convenience function.
|
||
func LoadCmapFromDataCID(data []byte) (*CMap, error) {
|
||
return LoadCmapFromData(data, false)
|
||
}
|
||
|
||
// LoadCmapFromData parses the in-memory cmap `data` and returns the resulting CMap.
|
||
// If `isSimple` is true, it uses 1-byte encodings, otherwise it uses the codespaces in the cmap.
|
||
//
|
||
// 9.10.3 ToUnicode CMaps (page 293).
|
||
func LoadCmapFromData(data []byte, isSimple bool) (*CMap, error) {
|
||
common.Log.Trace("LoadCmapFromData: isSimple=%t", isSimple)
|
||
|
||
cmap := newCMap(isSimple)
|
||
cmap.cMapParser = newCMapParser(data)
|
||
|
||
// In debugging it may help to see the data being parsed.
|
||
// fmt.Println("===============*******===========")
|
||
// fmt.Printf("%s\n", string(data))
|
||
// fmt.Println("===============&&&&&&&===========")
|
||
|
||
err := cmap.parse()
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
if len(cmap.codespaces) == 0 {
|
||
if cmap.usecmap != "" {
|
||
return cmap, nil
|
||
}
|
||
|
||
common.Log.Debug("ERROR: No codespaces. cmap=%s", cmap)
|
||
return nil, ErrBadCMap
|
||
}
|
||
|
||
cmap.computeInverseMappings()
|
||
return cmap, nil
|
||
}
|
||
|
||
// IsPredefinedCMap returns true if the specified CMap name is a predefined
|
||
// CJK CMap. The predefined CMaps are bundled with the package and can be loaded
|
||
// using the LoadPredefinedCMap function.
|
||
// See section 9.7.5.2 "Predefined CMaps" (page 273, Table 118).
|
||
func IsPredefinedCMap(name string) bool {
|
||
return bcmaps.AssetExists(name)
|
||
}
|
||
|
||
// LoadPredefinedCMap loads a predefined CJK CMap by name.
|
||
// See section 9.7.5.2 "Predefined CMaps" (page 273, Table 118).
|
||
func LoadPredefinedCMap(name string) (*CMap, error) {
|
||
// Load cmap.
|
||
cmap, err := loadPredefinedCMap(name)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
if cmap.usecmap == "" {
|
||
cmap.computeInverseMappings()
|
||
return cmap, nil
|
||
}
|
||
|
||
// Load base cmap.
|
||
base, err := loadPredefinedCMap(cmap.usecmap)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
|
||
// Add CID ranges.
|
||
for charcode, cid := range base.codeToCID {
|
||
if _, ok := cmap.codeToCID[charcode]; !ok {
|
||
cmap.codeToCID[charcode] = cid
|
||
}
|
||
}
|
||
|
||
// Add codespaces.
|
||
for _, codespace := range base.codespaces {
|
||
cmap.codespaces = append(cmap.codespaces, codespace)
|
||
}
|
||
|
||
cmap.computeInverseMappings()
|
||
return cmap, nil
|
||
}
|
||
|
||
// loadPredefinedCMap loads an embedded CMap from the bcmaps package, specified
|
||
// by name.
|
||
func loadPredefinedCMap(name string) (*CMap, error) {
|
||
cmapData, err := bcmaps.Asset(name)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
|
||
return LoadCmapFromDataCID(cmapData)
|
||
}
|
||
|
||
func (cmap *CMap) computeInverseMappings() {
|
||
// Generate CID -> charcode map.
|
||
for code, cid := range cmap.codeToCID {
|
||
if c, ok := cmap.cidToCode[cid]; !ok || (ok && c > code) {
|
||
cmap.cidToCode[cid] = code
|
||
}
|
||
}
|
||
|
||
// Generate Unicode -> CID map.
|
||
for cid, s := range cmap.codeToUnicode {
|
||
if c, ok := cmap.unicodeToCode[s]; !ok || (ok && c > cid) {
|
||
cmap.unicodeToCode[s] = cid
|
||
}
|
||
}
|
||
|
||
// Sort codespaces in order for shorter codes to be checked first.
|
||
sort.Slice(cmap.codespaces, func(i, j int) bool {
|
||
return cmap.codespaces[i].Low < cmap.codespaces[j].Low
|
||
})
|
||
}
|
||
|
||
// CharcodeBytesToUnicode converts a byte array of charcodes to a unicode string representation.
|
||
// It also returns a bool flag to tell if the conversion was successful.
|
||
// NOTE: This only works for ToUnicode cmaps.
|
||
func (cmap *CMap) CharcodeBytesToUnicode(data []byte) (string, int) {
|
||
charcodes, matched := cmap.BytesToCharcodes(data)
|
||
if !matched {
|
||
common.Log.Debug("ERROR: CharcodeBytesToUnicode. Not in codespaces. data=[% 02x] cmap=%s",
|
||
data, cmap)
|
||
return "", 0
|
||
}
|
||
|
||
parts := make([]string, len(charcodes))
|
||
var missing []CharCode
|
||
for i, code := range charcodes {
|
||
s, ok := cmap.codeToUnicode[code]
|
||
if !ok {
|
||
missing = append(missing, code)
|
||
s = MissingCodeString
|
||
}
|
||
parts[i] = s
|
||
}
|
||
unicode := strings.Join(parts, "")
|
||
|
||
if len(missing) > 0 {
|
||
common.Log.Debug("ERROR: CharcodeBytesToUnicode. Not in map.\n"+
|
||
"\tdata=[% 02x]=%#q\n"+
|
||
"\tcharcodes=%02x\n"+
|
||
"\tmissing=%d %02x\n"+
|
||
"\tunicode=`%s`\n"+
|
||
"\tcmap=%s",
|
||
data, string(data), charcodes, len(missing), missing, unicode, cmap)
|
||
}
|
||
return unicode, len(missing)
|
||
}
|
||
|
||
// CharcodeToUnicode converts a single character code `code` to a unicode string.
|
||
// If `code` is not in the unicode map, '<27>' is returned.
|
||
// NOTE: CharcodeBytesToUnicode is typically more efficient.
|
||
func (cmap *CMap) CharcodeToUnicode(code CharCode) (string, bool) {
|
||
if s, ok := cmap.codeToUnicode[code]; ok {
|
||
return s, true
|
||
}
|
||
return MissingCodeString, false
|
||
}
|
||
|
||
// StringToCID maps the specified string to a character identifier. If the provided
|
||
// string has no available mapping, the bool return value is false.
|
||
func (cmap *CMap) StringToCID(s string) (CharCode, bool) {
|
||
cid, ok := cmap.unicodeToCode[s]
|
||
return cid, ok
|
||
}
|
||
|
||
// CharcodeToCID maps the specified character code to a character identifier.
|
||
// If the provided charcode has no available mapping, the second return value
|
||
// is false. The returned CID can be mapped to a Unicode character using a
|
||
// Unicode conversion CMap.
|
||
func (cmap *CMap) CharcodeToCID(code CharCode) (CharCode, bool) {
|
||
cid, ok := cmap.codeToCID[code]
|
||
return cid, ok
|
||
}
|
||
|
||
// CIDToCharcode maps the specified character identified to a character code. If
|
||
// the provided CID has no available mapping, the second return value is false.
|
||
func (cmap *CMap) CIDToCharcode(cid CharCode) (CharCode, bool) {
|
||
code, ok := cmap.cidToCode[cid]
|
||
return code, ok
|
||
}
|
||
|
||
// BytesToCharcodes attempts to convert the entire byte array `data` to a list
|
||
// of character codes from the ranges specified by `cmap`'s codespaces.
|
||
// Returns:
|
||
// character code sequence (if there is a match complete match)
|
||
// matched?
|
||
// NOTE: A partial list of character codes will be returned if a complete match
|
||
// is not possible.
|
||
func (cmap *CMap) BytesToCharcodes(data []byte) ([]CharCode, bool) {
|
||
var charcodes []CharCode
|
||
if cmap.nbits == 8 {
|
||
for _, b := range data {
|
||
charcodes = append(charcodes, CharCode(b))
|
||
}
|
||
return charcodes, true
|
||
}
|
||
for i := 0; i < len(data); {
|
||
code, n, matched := cmap.matchCode(data[i:])
|
||
if !matched {
|
||
common.Log.Debug("ERROR: No code match at i=%d bytes=[% 02x]=%#q", i, data, string(data))
|
||
return charcodes, false
|
||
}
|
||
charcodes = append(charcodes, code)
|
||
i += n
|
||
}
|
||
return charcodes, true
|
||
}
|
||
|
||
// Name returns the name of the CMap.
|
||
func (cmap *CMap) Name() string {
|
||
return cmap.name
|
||
}
|
||
|
||
// Type returns the CMap type.
|
||
func (cmap *CMap) Type() int {
|
||
return cmap.ctype
|
||
}
|
||
|
||
// Nbits returns 8 bits for simple font CMaps and 16 bits for CID font CMaps.
|
||
func (cmap *CMap) NBits() int {
|
||
return cmap.nbits
|
||
}
|
||
|
||
// String returns a human readable description of `cmap`.
|
||
func (cmap *CMap) String() string {
|
||
si := cmap.systemInfo
|
||
parts := []string{
|
||
fmt.Sprintf("nbits:%d", cmap.nbits),
|
||
fmt.Sprintf("type:%d", cmap.ctype),
|
||
}
|
||
if cmap.version != "" {
|
||
parts = append(parts, fmt.Sprintf("version:%s", cmap.version))
|
||
}
|
||
if cmap.usecmap != "" {
|
||
parts = append(parts, fmt.Sprintf("usecmap:%#q", cmap.usecmap))
|
||
}
|
||
parts = append(parts, fmt.Sprintf("systemInfo:%s", si.String()))
|
||
if len(cmap.codespaces) > 0 {
|
||
parts = append(parts, fmt.Sprintf("codespaces:%d", len(cmap.codespaces)))
|
||
}
|
||
if len(cmap.codeToUnicode) > 0 {
|
||
parts = append(parts, fmt.Sprintf("codeToUnicode:%d", len(cmap.codeToUnicode)))
|
||
}
|
||
return fmt.Sprintf("CMAP{%#q %s}", cmap.name, strings.Join(parts, " "))
|
||
}
|
||
|
||
// Bytes returns the raw bytes of a PDF CMap corresponding to `cmap`.
|
||
func (cmap *CMap) Bytes() []byte {
|
||
common.Log.Trace("cmap.Bytes: cmap=%s", cmap.String())
|
||
if len(cmap.cached) > 0 {
|
||
return cmap.cached
|
||
}
|
||
|
||
cmap.cached = []byte(strings.Join([]string{
|
||
cmapHeader, cmap.toBfData(), cmapTrailer,
|
||
}, "\n"))
|
||
return cmap.cached
|
||
}
|
||
|
||
// matchCode attempts to match the byte array `data` with a character code in `cmap`'s codespaces.
|
||
// Returns:
|
||
// character code (if there is a match) of
|
||
// number of bytes read (if there is a match)
|
||
// matched?
|
||
func (cmap *CMap) matchCode(data []byte) (code CharCode, n int, matched bool) {
|
||
for j := 0; j < maxCodeLen; j++ {
|
||
if j < len(data) {
|
||
code = code<<8 | CharCode(data[j])
|
||
n++
|
||
}
|
||
matched = cmap.inCodespace(code, j+1)
|
||
if matched {
|
||
return code, n, true
|
||
}
|
||
}
|
||
// No codespace matched data. This is a serious problem.
|
||
common.Log.Debug("ERROR: No codespace matches bytes=[% 02x]=%#q cmap=%s",
|
||
data, string(data), cmap)
|
||
return 0, 0, false
|
||
}
|
||
|
||
// inCodespace returns true if `code` is in the `numBytes` byte codespace.
|
||
func (cmap *CMap) inCodespace(code CharCode, numBytes int) bool {
|
||
for _, cs := range cmap.codespaces {
|
||
if cs.Low <= code && code <= cs.High && numBytes == cs.NumBytes {
|
||
return true
|
||
}
|
||
}
|
||
return false
|
||
}
|
||
|
||
// toBfData returns the bfchar and bfrange sections of a CMap text file.
|
||
// Both sections are computed from cmap.codeToUnicode.
|
||
func (cmap *CMap) toBfData() string {
|
||
if len(cmap.codeToUnicode) == 0 {
|
||
return ""
|
||
}
|
||
|
||
// codes is a sorted list of the codeToUnicode keys.
|
||
codes := make([]CharCode, 0, len(cmap.codeToUnicode))
|
||
for code := range cmap.codeToUnicode {
|
||
codes = append(codes, code)
|
||
}
|
||
sort.Slice(codes, func(i, j int) bool { return codes[i] < codes[j] })
|
||
|
||
// Generate CMap character code ranges.
|
||
// The code ranges are intervals of consecutive charcodes (c1 = c0 + 1)
|
||
// mapping to consecutive runes.
|
||
// Start with a range consisting of the current character code for both ends
|
||
// of the interval. Check if the next character is consecutive to the upper
|
||
// end of the interval and if it maps to the next rune. If so, increase the
|
||
// interval to the right. Otherwise, append the current range to the
|
||
// character ranges slice and start over. Continue the process until all
|
||
// character codes have been mapped to code ranges.
|
||
var charRanges []charRange
|
||
currCharRange := charRange{codes[0], codes[0]}
|
||
prevRune := cmap.codeToUnicode[codes[0]]
|
||
for _, c := range codes[1:] {
|
||
currRune := cmap.codeToUnicode[c]
|
||
if c == currCharRange.code1+1 && lastRune(currRune) == lastRune(prevRune)+1 {
|
||
currCharRange.code1 = c
|
||
} else {
|
||
charRanges = append(charRanges, currCharRange)
|
||
currCharRange.code0, currCharRange.code1 = c, c
|
||
}
|
||
prevRune = currRune
|
||
}
|
||
charRanges = append(charRanges, currCharRange)
|
||
|
||
// fbChars is a list of single character ranges. fbRanges is a list of multiple character ranges.
|
||
var fbChars []CharCode
|
||
var fbRanges []fbRange
|
||
for _, cr := range charRanges {
|
||
if cr.code0 == cr.code1 {
|
||
fbChars = append(fbChars, cr.code0)
|
||
} else {
|
||
fbRanges = append(fbRanges, fbRange{
|
||
code0: cr.code0,
|
||
code1: cr.code1,
|
||
r0: cmap.codeToUnicode[cr.code0],
|
||
})
|
||
}
|
||
}
|
||
common.Log.Trace("charRanges=%d fbChars=%d fbRanges=%d", len(charRanges), len(fbChars),
|
||
len(fbRanges))
|
||
|
||
var lines []string
|
||
if len(fbChars) > 0 {
|
||
numRanges := (len(fbChars) + maxBfEntries - 1) / maxBfEntries
|
||
for i := 0; i < numRanges; i++ {
|
||
n := min(len(fbChars)-i*maxBfEntries, maxBfEntries)
|
||
lines = append(lines, fmt.Sprintf("%d beginbfchar", n))
|
||
for j := 0; j < n; j++ {
|
||
code := fbChars[i*maxBfEntries+j]
|
||
s := cmap.codeToUnicode[code]
|
||
lines = append(lines, fmt.Sprintf("<%04x> %s", code, hexCode(s)))
|
||
}
|
||
lines = append(lines, "endbfchar")
|
||
}
|
||
}
|
||
if len(fbRanges) > 0 {
|
||
numRanges := (len(fbRanges) + maxBfEntries - 1) / maxBfEntries
|
||
for i := 0; i < numRanges; i++ {
|
||
n := min(len(fbRanges)-i*maxBfEntries, maxBfEntries)
|
||
lines = append(lines, fmt.Sprintf("%d beginbfrange", n))
|
||
for j := 0; j < n; j++ {
|
||
rng := fbRanges[i*maxBfEntries+j]
|
||
lines = append(lines, fmt.Sprintf("<%04x><%04x> %s",
|
||
rng.code0, rng.code1, hexCode(rng.r0)))
|
||
}
|
||
lines = append(lines, "endbfrange")
|
||
}
|
||
}
|
||
return strings.Join(lines, "\n")
|
||
}
|
||
|
||
// lastRune returns the last rune in `s`.
// Invalid UTF-8 bytes decode to U+FFFD, as they do in a []rune conversion.
// For the empty string, which has no last rune, U+FFFD is returned instead of
// panicking.
func lastRune(s string) rune {
	last := '\ufffd'
	for _, r := range s {
		last = r
	}
	return last
}
|
||
|
||
// hexCode returns the CMap hex code for `s`: the UTF-16BE encoding of `s`
// written as lower case hexadecimal digits between '<' and '>'.
// Runes up to U+FFFF produce four digits. Runes above U+FFFF are written as a
// surrogate pair (eight digits) so that the hex string always consists of
// whole 16 bit units. Invalid UTF-8 bytes are encoded as U+FFFD.
func hexCode(s string) string {
	var b strings.Builder
	b.WriteByte('<')
	for _, r := range s {
		if r > 0xFFFF {
			// Split the supplementary code point into high and low surrogates.
			r -= 0x10000
			fmt.Fprintf(&b, "%04x%04x", 0xD800+(r>>10), 0xDC00+(r&0x3FF))
			continue
		}
		fmt.Fprintf(&b, "%04x", r)
	}
	b.WriteByte('>')
	return b.String()
}
|
||
|
||
const (
	// maxBfEntries is the maximum number of entries written to a single
	// bfchar or bfrange section.
	maxBfEntries = 100

	// cmapHeader is the fixed text that opens a generated CMap. It declares
	// the Adobe-Identity-UCS system info, CMap name and type, and a single
	// codespace range <0000>-<FFFF>.
	cmapHeader = `
/CIDInit /ProcSet findresource begin
12 dict begin
begincmap
/CIDSystemInfo << /Registry (Adobe) /Ordering (UCS) /Supplement 0 >> def
/CMapName /Adobe-Identity-UCS def
/CMapType 2 def
1 begincodespacerange
<0000> <FFFF>
endcodespacerange
`

	// cmapTrailer is the fixed text that closes a generated CMap.
	cmapTrailer = `endcmap
CMapName currentdict /CMap defineresource pop
end
end
`
)
|