mirror of
https://github.com/unidoc/unipdf.git
synced 2025-05-09 19:29:34 +08:00
Track number of bytes per character code for mappings. Fixes problem posed in PR #156 in a generic fashion.
This commit is contained in:
parent
0c9502ad8b
commit
a7abcd0160
@ -21,7 +21,8 @@ type CMap struct {
|
||||
// Text encoder to look up runes from input glyph names.
|
||||
encoder textencoding.TextEncoder
|
||||
|
||||
codeMap map[uint64]string
|
||||
// codeMap[n-1] maps n-byte character codes to strings (sequences of runes), with a separate map for each code length n = 1..4.
|
||||
codeMap [4]map[uint64]string
|
||||
|
||||
name string
|
||||
ctype int
|
||||
@ -30,8 +31,9 @@ type CMap struct {
|
||||
|
||||
// codespace represents a single codespace range used in the CMap.
|
||||
type codespace struct {
|
||||
low uint64
|
||||
high uint64
|
||||
numBytes int
|
||||
low uint64
|
||||
high uint64
|
||||
}
|
||||
|
||||
// Name returns the name of the CMap.
|
||||
@ -61,7 +63,7 @@ func (cmap *CMap) CharcodeBytesToUnicode(src []byte) string {
|
||||
code <<= 8
|
||||
code |= uint64(b)
|
||||
|
||||
tgt, has := cmap.codeMap[code]
|
||||
tgt, has := cmap.codeMap[j][code]
|
||||
if has {
|
||||
buf.WriteString(tgt)
|
||||
break
|
||||
@ -76,9 +78,13 @@ func (cmap *CMap) CharcodeBytesToUnicode(src []byte) string {
|
||||
}
|
||||
|
||||
// CharcodeToUnicode converts a single character code to unicode string.
|
||||
// Note that CharcodeBytesToUnicode is typically more efficient.
|
||||
func (cmap *CMap) CharcodeToUnicode(srcCode uint64) string {
|
||||
if c, has := cmap.codeMap[srcCode]; has {
|
||||
return c
|
||||
// Search through different code lengths.
|
||||
for numBytes := 1; numBytes <= 4; numBytes++ {
|
||||
if c, has := cmap.codeMap[numBytes-1][srcCode]; has {
|
||||
return c
|
||||
}
|
||||
}
|
||||
|
||||
// Not found.
|
||||
@ -89,7 +95,12 @@ func (cmap *CMap) CharcodeToUnicode(srcCode uint64) string {
|
||||
func newCMap() *CMap {
|
||||
cmap := &CMap{}
|
||||
cmap.codespaces = []codespace{}
|
||||
cmap.codeMap = map[uint64]string{}
|
||||
cmap.codeMap = [4]map[uint64]string{}
|
||||
// Maps for 1-4 bytes are initialized. Minimal overhead if not used (most commonly used are 1-2 bytes).
|
||||
cmap.codeMap[0] = map[uint64]string{}
|
||||
cmap.codeMap[1] = map[uint64]string{}
|
||||
cmap.codeMap[2] = map[uint64]string{}
|
||||
cmap.codeMap[3] = map[uint64]string{}
|
||||
return cmap
|
||||
}
|
||||
|
||||
@ -208,10 +219,15 @@ func (cmap *CMap) parseCodespaceRange() error {
|
||||
return errors.New("Non-hex high")
|
||||
}
|
||||
|
||||
if hexLow.numBytes != hexHigh.numBytes {
|
||||
return errors.New("Unequal number of bytes in range")
|
||||
}
|
||||
|
||||
low := hexToUint64(hexLow)
|
||||
high := hexToUint64(hexHigh)
|
||||
numBytes := hexLow.numBytes
|
||||
|
||||
cspace := codespace{low, high}
|
||||
cspace := codespace{numBytes: numBytes, low: low, high: high}
|
||||
cmap.codespaces = append(cmap.codespaces, cspace)
|
||||
|
||||
common.Log.Trace("Codespace low: 0x%X, high: 0x%X", low, high)
|
||||
@ -232,6 +248,7 @@ func (cmap *CMap) parseBfchar() error {
|
||||
return err
|
||||
}
|
||||
var srcCode uint64
|
||||
var numBytes int
|
||||
|
||||
switch v := o.(type) {
|
||||
case cmapOperand:
|
||||
@ -241,6 +258,7 @@ func (cmap *CMap) parseBfchar() error {
|
||||
return errors.New("Unexpected operand")
|
||||
case cmapHexString:
|
||||
srcCode = hexToUint64(v)
|
||||
numBytes = v.numBytes
|
||||
default:
|
||||
return errors.New("Unexpected type")
|
||||
}
|
||||
@ -274,7 +292,11 @@ func (cmap *CMap) parseBfchar() error {
|
||||
return errors.New("Unexpected type")
|
||||
}
|
||||
|
||||
cmap.codeMap[srcCode] = toCode
|
||||
if numBytes <= 0 || numBytes > 4 {
|
||||
return errors.New("Invalid code length")
|
||||
}
|
||||
|
||||
cmap.codeMap[numBytes-1][srcCode] = toCode
|
||||
}
|
||||
|
||||
return nil
|
||||
@ -289,6 +311,7 @@ func (cmap *CMap) parseBfrange() error {
|
||||
|
||||
// Src code from.
|
||||
var srcCodeFrom uint64
|
||||
var numBytes int
|
||||
{
|
||||
o, err := cmap.parseObject()
|
||||
if err != nil {
|
||||
@ -306,6 +329,7 @@ func (cmap *CMap) parseBfrange() error {
|
||||
return errors.New("Unexpected operand")
|
||||
case cmapHexString:
|
||||
srcCodeFrom = hexToUint64(v)
|
||||
numBytes = v.numBytes
|
||||
default:
|
||||
return errors.New("Unexpected type")
|
||||
}
|
||||
@ -344,6 +368,10 @@ func (cmap *CMap) parseBfrange() error {
|
||||
return err
|
||||
}
|
||||
|
||||
if numBytes <= 0 || numBytes > 4 {
|
||||
return errors.New("Invalid code length")
|
||||
}
|
||||
|
||||
switch v := o.(type) {
|
||||
case cmapArray:
|
||||
sc := srcCodeFrom
|
||||
@ -352,7 +380,7 @@ func (cmap *CMap) parseBfrange() error {
|
||||
if !ok {
|
||||
return errors.New("Non-hex string in array")
|
||||
}
|
||||
cmap.codeMap[sc] = hexToString(hexs)
|
||||
cmap.codeMap[numBytes-1][sc] = hexToString(hexs)
|
||||
sc++
|
||||
}
|
||||
if sc != srcCodeTo+1 {
|
||||
@ -365,7 +393,7 @@ func (cmap *CMap) parseBfrange() error {
|
||||
i := uint64(0)
|
||||
for sc := srcCodeFrom; sc <= srcCodeTo; sc++ {
|
||||
r := target + i
|
||||
cmap.codeMap[sc] = string(r)
|
||||
cmap.codeMap[numBytes-1][sc] = string(r)
|
||||
i++
|
||||
}
|
||||
default:
|
||||
|
@ -16,6 +16,7 @@ func init() {
|
||||
common.SetLogger(common.NewConsoleLogger(common.LogLevelTrace))
|
||||
}
|
||||
|
||||
// cmap1Data represents a basic CMap.
|
||||
const cmap1Data = `
|
||||
/CIDInit /ProcSet findresource begin
|
||||
12 dict begin
|
||||
@ -55,6 +56,7 @@ end
|
||||
end
|
||||
`
|
||||
|
||||
// TestCMapParser1 tests basic loading of a simple CMap.
|
||||
func TestCMapParser1(t *testing.T) {
|
||||
common.SetLogger(common.NewConsoleLogger(common.LogLevelTrace))
|
||||
|
||||
@ -122,3 +124,206 @@ func TestCMapParser1(t *testing.T) {
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
const cmap2Data = `
|
||||
/CIDInit /ProcSet findresource begin
|
||||
12 dict begin
|
||||
begincmap
|
||||
/CIDSystemInfo
|
||||
<< /Registry (Adobe)
|
||||
/Ordering (UCS)
|
||||
/Supplement 0
|
||||
>> def
|
||||
/CMapName /Adobe-Identity-UCS def
|
||||
/CMapType 2 def
|
||||
1 begincodespacerange
|
||||
<0000> <FFFF>
|
||||
endcodespacerange
|
||||
7 beginbfrange
|
||||
<0080> <00FF> <002C>
|
||||
<802F> <902F> <0038>
|
||||
endbfrange
|
||||
endcmap
|
||||
CMapName currentdict /CMap defineresource pop
|
||||
end
|
||||
end
|
||||
`
|
||||
|
||||
// TestCMapParser2 tests a bug that came up when 2-byte character codes had the higher byte set to 0,
|
||||
// e.g. 0x0080, and the character map was not taking the number of bytes of the input codemap into account.
|
||||
func TestCMapParser2(t *testing.T) {
|
||||
common.SetLogger(common.NewConsoleLogger(common.LogLevelTrace))
|
||||
|
||||
cmap, err := LoadCmapFromData([]byte(cmap2Data))
|
||||
if err != nil {
|
||||
t.Error("Failed: ", err)
|
||||
return
|
||||
}
|
||||
|
||||
if cmap.Name() != "Adobe-Identity-UCS" {
|
||||
t.Errorf("CMap name incorrect (%s)", cmap.Name())
|
||||
return
|
||||
}
|
||||
|
||||
if cmap.Type() != 2 {
|
||||
t.Errorf("CMap type incorrect")
|
||||
return
|
||||
}
|
||||
|
||||
if len(cmap.codespaces) != 1 {
|
||||
t.Errorf("len codespace != 1 (%d)", len(cmap.codespaces))
|
||||
return
|
||||
}
|
||||
|
||||
if cmap.codespaces[0].low != 0 {
|
||||
t.Errorf("code space low range != 0 (%d)", cmap.codespaces[0].low)
|
||||
return
|
||||
}
|
||||
|
||||
if cmap.codespaces[0].high != 0xFFFF {
|
||||
t.Errorf("code space high range != 0xffff (%d)", cmap.codespaces[0].high)
|
||||
return
|
||||
}
|
||||
|
||||
expectedMappings := map[uint64]rune{
|
||||
0x0080: 0x002C,
|
||||
0x802F: 0x0038,
|
||||
}
|
||||
|
||||
for k, expected := range expectedMappings {
|
||||
if v := cmap.CharcodeToUnicode(k); v != string(expected) {
|
||||
t.Errorf("incorrect mapping, expecting 0x%X -> 0x%X (got 0x%X)", k, expected, v)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
// Check byte sequence mappings.
|
||||
excpectedSequenceMappings := []struct {
|
||||
bytes []byte
|
||||
expected string
|
||||
}{
|
||||
{[]byte{0x80, 0x2F, 0x00, 0x80}, string([]rune{0x0038, 0x002C})},
|
||||
}
|
||||
|
||||
for _, exp := range excpectedSequenceMappings {
|
||||
str := cmap.CharcodeBytesToUnicode(exp.bytes)
|
||||
if str != exp.expected {
|
||||
t.Errorf("Incorrect byte sequence mapping -> % X -> % X (got % X)", exp.bytes, []rune(exp.expected), []rune(str))
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// cmapData3 is a CMap with a mixture of 1 and 2 byte codespaces.
|
||||
const cmapData3 = `
|
||||
/CIDInit /ProcSet findresource begin
|
||||
12 dict begin begincmap
|
||||
/CIDSystemInfo
|
||||
3 dict dup begin
|
||||
/Registry (Adobe) def
|
||||
/Supplement 2 def
|
||||
end def
|
||||
|
||||
/CMapName /test-1 def
|
||||
/CMapType 1 def
|
||||
|
||||
4 begincodespacerange
|
||||
<00> <80>
|
||||
<8100> <9fff>
|
||||
<a0> <df>
|
||||
<d040> <fbfc>
|
||||
endcodespacerange
|
||||
7 beginbfrange
|
||||
<00> <80> <10>
|
||||
<8100> <9f00> <1000>
|
||||
<a0> <d0> <90>
|
||||
<d140> <f000> <a000>
|
||||
endbfrange
|
||||
endcmap
|
||||
`
|
||||
|
||||
// TestCMapParser3 tests a CMap whose codespace ranges contain a mixture of 1-byte and 2-byte codes.
|
||||
func TestCMapParser3(t *testing.T) {
|
||||
common.SetLogger(common.NewConsoleLogger(common.LogLevelTrace))
|
||||
|
||||
cmap, err := LoadCmapFromData([]byte(cmapData3))
|
||||
if err != nil {
|
||||
t.Error("Failed: ", err)
|
||||
return
|
||||
}
|
||||
|
||||
if cmap.Name() != "test-1" {
|
||||
t.Errorf("CMap name incorrect (%s)", cmap.Name())
|
||||
return
|
||||
}
|
||||
|
||||
if cmap.Type() != 1 {
|
||||
t.Errorf("CMap type incorrect")
|
||||
return
|
||||
}
|
||||
|
||||
// Check codespaces.
|
||||
expectedCodespaces := []struct {
|
||||
numBytes int
|
||||
low uint64
|
||||
high uint64
|
||||
}{
|
||||
{1, 0x00, 0x80},
|
||||
{2, 0x8100, 0x9fff},
|
||||
{1, 0xa0, 0xdf},
|
||||
{2, 0xd040, 0xfbfc},
|
||||
}
|
||||
|
||||
if len(cmap.codespaces) != len(expectedCodespaces) {
|
||||
t.Errorf("len codespace != %d (%d)", len(expectedCodespaces), len(cmap.codespaces))
|
||||
return
|
||||
}
|
||||
|
||||
for i, cs := range cmap.codespaces {
|
||||
exp := expectedCodespaces[i]
|
||||
if cs.numBytes != exp.numBytes {
|
||||
t.Errorf("code space number of bytes != %d (%d)", exp.numBytes, cs.numBytes)
|
||||
return
|
||||
}
|
||||
|
||||
if cs.low != exp.low {
|
||||
t.Errorf("code space low range != %d (%d)", exp.low, cs.low)
|
||||
return
|
||||
}
|
||||
|
||||
if cs.high != exp.high {
|
||||
t.Errorf("code space high range != 0x%X (0x%X)", exp.high, cs.high)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
// Check mappings.
|
||||
expectedMappings := map[uint64]rune{
|
||||
0x0080: 0x10 + 0x80,
|
||||
0x8100: 0x1000,
|
||||
0x00a0: 0x90,
|
||||
0xd140: 0xa000,
|
||||
}
|
||||
for k, expected := range expectedMappings {
|
||||
if v := cmap.CharcodeToUnicode(k); v != string(expected) {
|
||||
t.Errorf("incorrect mapping, expecting 0x%X -> 0x%X (got 0x%X)", k, expected, v)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
// Check byte sequence mappings.
|
||||
excpectedSequenceMappings := []struct {
|
||||
bytes []byte
|
||||
expected string
|
||||
}{
|
||||
{[]byte{0x80, 0x81, 0x00, 0xa1, 0xd1, 0x80, 0x00}, string([]rune{0x90, 0x1000, 0x91, 0xa000 + 0x40, 0x10})},
|
||||
}
|
||||
|
||||
for _, exp := range excpectedSequenceMappings {
|
||||
str := cmap.CharcodeBytesToUnicode(exp.bytes)
|
||||
if str != exp.expected {
|
||||
t.Errorf("Incorrect byte sequence mapping -> % X -> % X (got % X)", exp.bytes, []rune(exp.expected), []rune(str))
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -279,7 +279,7 @@ func (p *cMapParser) parseHexString() (cmapHexString, error) {
|
||||
|
||||
bb, err := p.reader.Peek(1)
|
||||
if err != nil {
|
||||
return cmapHexString{[]byte("")}, err
|
||||
return cmapHexString{numBytes: 0, b: []byte("")}, err
|
||||
}
|
||||
|
||||
if bb[0] == '>' {
|
||||
@ -296,10 +296,10 @@ func (p *cMapParser) parseHexString() (cmapHexString, error) {
|
||||
if buf.Len()%2 == 1 {
|
||||
buf.WriteByte('0')
|
||||
}
|
||||
numBytes := buf.Len() / 2
|
||||
|
||||
hexb, _ := hex.DecodeString(buf.String())
|
||||
|
||||
return cmapHexString{hexb}, nil
|
||||
return cmapHexString{numBytes: numBytes, b: hexb}, nil
|
||||
}
|
||||
|
||||
// Starts with '[' ends with ']'. Can contain any kinds of direct objects.
|
||||
|
@ -17,7 +17,8 @@ type cmapOperand struct {
|
||||
}
|
||||
|
||||
type cmapHexString struct {
|
||||
b []byte
|
||||
numBytes int // original number of bytes in the raw representation
|
||||
b []byte
|
||||
}
|
||||
|
||||
type cmapString struct {
|
||||
|
Loading…
x
Reference in New Issue
Block a user