mirror of
https://github.com/unidoc/unipdf.git
synced 2025-04-27 13:48:51 +08:00
376 lines
7.2 KiB
Go
376 lines
7.2 KiB
Go
/*
 * This file is subject to the terms and conditions defined in
 * file 'LICENSE.md', which is part of this source code package.
 */
|
|
|
|
package cmap
|
|
|
|
import (
|
|
"bytes"
|
|
"errors"
|
|
"io"
|
|
|
|
"github.com/unidoc/unidoc/common"
|
|
"github.com/unidoc/unidoc/pdf/model/textencoding"
|
|
)
|
|
|
|
// CMap represents a character code to unicode mapping used in PDF files.
|
|
type CMap struct {
|
|
*cMapParser
|
|
|
|
// Text encoder to look up runes from input glyph names.
|
|
encoder textencoding.TextEncoder
|
|
|
|
codeMap map[uint64]string
|
|
|
|
name string
|
|
ctype int
|
|
codespaces []codespace
|
|
}
|
|
|
|
// codespace is one codespace range of a CMap, stored as the numeric values
// of its low and high bounds.
type codespace struct {
	low, high uint64
}
|
|
|
|
// Name returns the name of the CMap.
|
|
func (cmap *CMap) Name() string {
|
|
return cmap.name
|
|
}
|
|
|
|
// Type returns the type of the CMap.
|
|
func (cmap *CMap) Type() int {
|
|
return cmap.ctype
|
|
}
|
|
|
|
// CharcodeBytesToUnicode converts a byte array of charcodes to a unicode string representation.
|
|
func (cmap *CMap) CharcodeBytesToUnicode(src []byte) string {
|
|
var buf bytes.Buffer
|
|
|
|
// Maximum number of possible bytes per code.
|
|
maxLen := 4
|
|
|
|
i := 0
|
|
for i < len(src) {
|
|
var code uint64
|
|
var j int
|
|
for j = 0; j < maxLen && i+j < len(src); j++ {
|
|
b := src[i+j]
|
|
|
|
code <<= 8
|
|
code |= uint64(b)
|
|
|
|
tgt, has := cmap.codeMap[code]
|
|
if has {
|
|
buf.WriteString(tgt)
|
|
break
|
|
}
|
|
}
|
|
i += j + 1
|
|
}
|
|
|
|
return buf.String()
|
|
}
|
|
|
|
// CharcodeToUnicode converts a single character code to unicode string.
|
|
func (cmap *CMap) CharcodeToUnicode(srcCode uint64) string {
|
|
if c, has := cmap.codeMap[srcCode]; has {
|
|
return c
|
|
}
|
|
|
|
// Not found.
|
|
return "?"
|
|
}
|
|
|
|
// newCMap returns an initialized CMap.
|
|
func newCMap() *CMap {
|
|
cmap := &CMap{}
|
|
cmap.codespaces = []codespace{}
|
|
cmap.codeMap = map[uint64]string{}
|
|
return cmap
|
|
}
|
|
|
|
// LoadCmapFromData parses CMap data in memory through a byte vector and returns a CMap which
|
|
// can be used for character code to unicode conversion.
|
|
func LoadCmapFromData(data []byte) (*CMap, error) {
|
|
cmap := newCMap()
|
|
cmap.cMapParser = newCMapParser(data)
|
|
|
|
err := cmap.parse()
|
|
if err != nil {
|
|
return cmap, err
|
|
}
|
|
|
|
return cmap, nil
|
|
}
|
|
|
|
// parse parses the CMap file and loads into the CMap structure.
|
|
func (cmap *CMap) parse() error {
|
|
for {
|
|
o, err := cmap.parseObject()
|
|
if err != nil {
|
|
if err == io.EOF {
|
|
break
|
|
}
|
|
|
|
common.Log.Debug("Error parsing CMap: %v", err)
|
|
return err
|
|
}
|
|
|
|
if op, isOp := o.(cmapOperand); isOp {
|
|
common.Log.Trace("Operand: %s", op.Operand)
|
|
|
|
if op.Operand == begincodespacerange {
|
|
err := cmap.parseCodespaceRange()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
} else if op.Operand == beginbfchar {
|
|
err := cmap.parseBfchar()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
} else if op.Operand == beginbfrange {
|
|
err := cmap.parseBfrange()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
}
|
|
} else if n, isName := o.(cmapName); isName {
|
|
if n.Name == cmapname {
|
|
o, err := cmap.parseObject()
|
|
if err != nil {
|
|
if err == io.EOF {
|
|
break
|
|
}
|
|
return err
|
|
}
|
|
name, ok := o.(cmapName)
|
|
if !ok {
|
|
return errors.New("CMap name not a name")
|
|
}
|
|
cmap.name = name.Name
|
|
} else if n.Name == cmaptype {
|
|
o, err := cmap.parseObject()
|
|
if err != nil {
|
|
if err == io.EOF {
|
|
break
|
|
}
|
|
return err
|
|
}
|
|
typeInt, ok := o.(cmapInt)
|
|
if !ok {
|
|
return errors.New("CMap type not an integer")
|
|
}
|
|
cmap.ctype = int(typeInt.val)
|
|
}
|
|
} else {
|
|
common.Log.Trace("Unhandled object: %T %#v", o, o)
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// parseCodespaceRange parses the codespace range section of a CMap.
|
|
func (cmap *CMap) parseCodespaceRange() error {
|
|
for {
|
|
o, err := cmap.parseObject()
|
|
if err != nil {
|
|
if err == io.EOF {
|
|
break
|
|
}
|
|
return err
|
|
}
|
|
|
|
hexLow, isHex := o.(cmapHexString)
|
|
if !isHex {
|
|
if op, isOperand := o.(cmapOperand); isOperand {
|
|
if op.Operand == endcodespacerange {
|
|
return nil
|
|
}
|
|
return errors.New("Unexpected operand")
|
|
}
|
|
}
|
|
|
|
o, err = cmap.parseObject()
|
|
if err != nil {
|
|
if err == io.EOF {
|
|
break
|
|
}
|
|
return err
|
|
}
|
|
hexHigh, ok := o.(cmapHexString)
|
|
if !ok {
|
|
return errors.New("Non-hex high")
|
|
}
|
|
|
|
low := hexToUint64(hexLow)
|
|
high := hexToUint64(hexHigh)
|
|
|
|
cspace := codespace{low, high}
|
|
cmap.codespaces = append(cmap.codespaces, cspace)
|
|
|
|
common.Log.Trace("Codespace low: 0x%X, high: 0x%X", low, high)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// parseBfchar parses a bfchar section of a CMap file.
|
|
func (cmap *CMap) parseBfchar() error {
|
|
for {
|
|
// Src code.
|
|
o, err := cmap.parseObject()
|
|
if err != nil {
|
|
if err == io.EOF {
|
|
break
|
|
}
|
|
return err
|
|
}
|
|
var srcCode uint64
|
|
|
|
switch v := o.(type) {
|
|
case cmapOperand:
|
|
if v.Operand == endbfchar {
|
|
return nil
|
|
}
|
|
return errors.New("Unexpected operand")
|
|
case cmapHexString:
|
|
srcCode = hexToUint64(v)
|
|
default:
|
|
return errors.New("Unexpected type")
|
|
}
|
|
|
|
// Target code.
|
|
o, err = cmap.parseObject()
|
|
if err != nil {
|
|
if err == io.EOF {
|
|
break
|
|
}
|
|
return err
|
|
}
|
|
var toCode string
|
|
|
|
switch v := o.(type) {
|
|
case cmapOperand:
|
|
if v.Operand == endbfchar {
|
|
return nil
|
|
}
|
|
return errors.New("Unexpected operand")
|
|
case cmapHexString:
|
|
toCode = hexToString(v)
|
|
case cmapName:
|
|
toCode = "?"
|
|
if cmap.encoder != nil {
|
|
if r, found := cmap.encoder.GlyphToRune(v.Name); found {
|
|
toCode = string(r)
|
|
}
|
|
}
|
|
default:
|
|
return errors.New("Unexpected type")
|
|
}
|
|
|
|
cmap.codeMap[srcCode] = toCode
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// parseBfrange parses a bfrange section of a CMap file.
|
|
func (cmap *CMap) parseBfrange() error {
|
|
for {
|
|
// The specifications are in pairs of 3.
|
|
// <srcCodeFrom> <srcCodeTo> <target>
|
|
// where target can be either <destFrom> as a hex code, or a list.
|
|
|
|
// Src code from.
|
|
var srcCodeFrom uint64
|
|
{
|
|
o, err := cmap.parseObject()
|
|
if err != nil {
|
|
if err == io.EOF {
|
|
break
|
|
}
|
|
return err
|
|
}
|
|
|
|
switch v := o.(type) {
|
|
case cmapOperand:
|
|
if v.Operand == endbfrange {
|
|
return nil
|
|
}
|
|
return errors.New("Unexpected operand")
|
|
case cmapHexString:
|
|
srcCodeFrom = hexToUint64(v)
|
|
default:
|
|
return errors.New("Unexpected type")
|
|
}
|
|
}
|
|
|
|
// Src code to.
|
|
var srcCodeTo uint64
|
|
{
|
|
o, err := cmap.parseObject()
|
|
if err != nil {
|
|
if err == io.EOF {
|
|
break
|
|
}
|
|
return err
|
|
}
|
|
|
|
switch v := o.(type) {
|
|
case cmapOperand:
|
|
if v.Operand == endbfrange {
|
|
return nil
|
|
}
|
|
return errors.New("Unexpected operand")
|
|
case cmapHexString:
|
|
srcCodeTo = hexToUint64(v)
|
|
default:
|
|
return errors.New("Unexpected type")
|
|
}
|
|
}
|
|
|
|
// target(s).
|
|
o, err := cmap.parseObject()
|
|
if err != nil {
|
|
if err == io.EOF {
|
|
break
|
|
}
|
|
return err
|
|
}
|
|
|
|
switch v := o.(type) {
|
|
case cmapArray:
|
|
sc := srcCodeFrom
|
|
for _, o := range v.Array {
|
|
hexs, ok := o.(cmapHexString)
|
|
if !ok {
|
|
return errors.New("Non-hex string in array")
|
|
}
|
|
cmap.codeMap[sc] = hexToString(hexs)
|
|
sc++
|
|
}
|
|
if sc != srcCodeTo+1 {
|
|
return errors.New("Invalid number of items in array")
|
|
}
|
|
case cmapHexString:
|
|
// <srcCodeFrom> <srcCodeTo> <dstCode>, maps [from,to] to [dstCode,dstCode+to-from].
|
|
// in hex format.
|
|
target := hexToUint64(v)
|
|
i := uint64(0)
|
|
for sc := srcCodeFrom; sc <= srcCodeTo; sc++ {
|
|
r := target + i
|
|
cmap.codeMap[sc] = string(r)
|
|
i++
|
|
}
|
|
default:
|
|
return errors.New("Unexpected type")
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|