Merge branch 'cmap' into columns

This commit is contained in:
Peter Williams 2020-05-24 20:45:31 +10:00
commit e9c46fa3b9
9 changed files with 111 additions and 61 deletions

View File

@ -221,7 +221,7 @@ func (l WriterLogger) logToWriter(f io.Writer, prefix string, format string, arg
} }
func logToWriter(f io.Writer, prefix string, format string, args ...interface{}) { func logToWriter(f io.Writer, prefix string, format string, args ...interface{}) {
_, file, line, ok := runtime.Caller(2) _, file, line, ok := runtime.Caller(3)
if !ok { if !ok {
file = "???" file = "???"
line = 0 line = 0

View File

@ -698,7 +698,7 @@ func (to *textObject) reset() {
func (to *textObject) renderText(data []byte) error { func (to *textObject) renderText(data []byte) error {
font := to.getCurrentFont() font := to.getCurrentFont()
charcodes := font.BytesToCharcodes(data) charcodes := font.BytesToCharcodes(data)
runes, numChars, numMisses := font.CharcodesToUnicodeWithStats(charcodes) runeSlices, numChars, numMisses := font.CharcodesToRuneSlices(charcodes)
if numMisses > 0 { if numMisses > 0 {
common.Log.Debug("renderText: numChars=%d numMisses=%d", numChars, numMisses) common.Log.Debug("renderText: numChars=%d numMisses=%d", numChars, numMisses)
} }
@ -717,18 +717,17 @@ func (to *textObject) renderText(data []byte) error {
spaceMetrics, _ = model.DefaultFont().GetRuneMetrics(' ') spaceMetrics, _ = model.DefaultFont().GetRuneMetrics(' ')
} }
spaceWidth := spaceMetrics.Wx * glyphTextRatio spaceWidth := spaceMetrics.Wx * glyphTextRatio
common.Log.Trace("spaceWidth=%.2f text=%q font=%s fontSize=%.1f", spaceWidth, runes, font, tfs) // common.Log.Trace("spaceWidth=%.2f text=%q font=%s fontSize=%.1f", spaceWidth, runeSlices, font, tfs)
stateMatrix := transform.NewMatrix( stateMatrix := transform.NewMatrix(
tfs*th, 0, tfs*th, 0,
0, tfs, 0, tfs,
0, state.trise) 0, state.trise)
common.Log.Trace("renderText: %d codes=%+v runes=%q", len(charcodes), charcodes, runes) // common.Log.Trace("renderText: %d codes=%+v runes=%q", len(charcodes), charcodes, drunes)
for i, r := range runes { for i, r := range runeSlices {
// TODO(peterwilliams97): Need to find and fix cases where this happens. if len(r) == 1 && r[0] == '\x00' {
if r == '\x00' {
continue continue
} }
@ -742,14 +741,14 @@ func (to *textObject) renderText(data []byte) error {
// w is the unscaled movement at the end of a word. // w is the unscaled movement at the end of a word.
w := 0.0 w := 0.0
if r == ' ' { if string(r) == " " {
w = state.tw w = state.tw
} }
m, ok := font.GetCharMetrics(code) m, ok := font.GetCharMetrics(code)
if !ok { if !ok {
common.Log.Debug("ERROR: No metric for code=%d r=0x%04x=%+q %s", code, r, r, font) common.Log.Debug("ERROR: No metric for code=%d r=0x%04x=%+q %s", code, r, r, font)
return errors.New("no char metrics") return fmt.Errorf("no char metrics: font=%s code=%d", font.String(), code)
} }
// c is the character size in unscaled text units. // c is the character size in unscaled text units.

View File

@ -316,6 +316,11 @@ var fileExtractionTests = []struct {
`The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD",`}, `The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD",`},
}, },
}, },
{filename: "Saudi.pdf",
pageTerms: map[int][]string{
10: []string{"الله"},
},
},
// TODO(peterwilliams97): Reinstate these 2 tests when diacritic combination is fixed. // TODO(peterwilliams97): Reinstate these 2 tests when diacritic combination is fixed.
// {filename: "Ito_Formula.pdf", // {filename: "Ito_Formula.pdf",
// pageTerms: map[int][]string{ // pageTerms: map[int][]string{

View File

@ -21,6 +21,9 @@ const (
// MissingCodeRune replaces runes that can't be decoded. '\ufffd' = <20>. Was '?'. // MissingCodeRune replaces runes that can't be decoded. '\ufffd' = <20>. Was '?'.
MissingCodeRune = '\ufffd' // <20> MissingCodeRune = '\ufffd' // <20>
// MissingCodeString replaces strings that can't be decoded.
MissingCodeString = string(MissingCodeRune)
) )
// CharCode is a character code or Unicode // CharCode is a character code or Unicode
@ -41,7 +44,7 @@ type charRange struct {
type fbRange struct { type fbRange struct {
code0 CharCode code0 CharCode
code1 CharCode code1 CharCode
r0 rune r0 rune // TODO (peterwilliams97): Change to string for compound codes.
} }
// CIDSystemInfo contains information for identifying the character collection // CIDSystemInfo contains information for identifying the character collection
@ -106,8 +109,9 @@ type CMap struct {
cidToCode map[CharCode]CharCode // CID -> charcode cidToCode map[CharCode]CharCode // CID -> charcode
// Used by ctype 2 CMaps. // Used by ctype 2 CMaps.
codeToUnicode map[CharCode]rune // CID -> Unicode codeToUnicode map[CharCode]string // CID -> Unicode string
unicodeToCode map[rune]CharCode // Unicode -> CID // XXXX(peterwilliams97): Should unicodeToCode be the inverse of codeToUnicode?
unicodeToCode map[rune]CharCode // Unicode rune -> CID
// cached contains the raw CMap data. It is used by the Bytes method in // cached contains the raw CMap data. It is used by the Bytes method in
// order to avoid generating the data for every call. // order to avoid generating the data for every call.
@ -116,8 +120,13 @@ type CMap struct {
cached []byte cached []byte
} }
// NewToUnicodeCMap returns an identity CMap with codeToUnicode matching the `codeToUnicode` arg. // NewToUnicodeCMap returns an identity CMap with codeToUnicode matching the `codeToRune` arg.
func NewToUnicodeCMap(codeToUnicode map[CharCode]rune) *CMap { func NewToUnicodeCMap(codeToRune map[CharCode]rune) *CMap {
codeToUnicode := make(map[CharCode]string, len(codeToRune))
for code, r := range codeToRune {
codeToUnicode[code] = string(r)
}
cmap := &CMap{ cmap := &CMap{
name: "Adobe-Identity-UCS", name: "Adobe-Identity-UCS",
ctype: 2, ctype: 2,
@ -135,6 +144,7 @@ func NewToUnicodeCMap(codeToUnicode map[CharCode]rune) *CMap {
} }
cmap.computeInverseMappings() cmap.computeInverseMappings()
return cmap return cmap
} }
@ -148,7 +158,7 @@ func newCMap(isSimple bool) *CMap {
nbits: nbits, nbits: nbits,
codeToCID: make(map[CharCode]CharCode), codeToCID: make(map[CharCode]CharCode),
cidToCode: make(map[CharCode]CharCode), cidToCode: make(map[CharCode]CharCode),
codeToUnicode: make(map[CharCode]rune), codeToUnicode: make(map[CharCode]string),
unicodeToCode: make(map[rune]CharCode), unicodeToCode: make(map[rune]CharCode),
} }
} }
@ -254,7 +264,12 @@ func (cmap *CMap) computeInverseMappings() {
} }
// Generate Unicode -> CID map. // Generate Unicode -> CID map.
for cid, r := range cmap.codeToUnicode { for cid, s := range cmap.codeToUnicode {
// The CMap entries can be empty e.g. adobe_supplement_iso32000_1.pdf
if len(s) == 0 {
continue
}
r := rune0(s)
if c, ok := cmap.unicodeToCode[r]; !ok || (ok && c > cid) { if c, ok := cmap.unicodeToCode[r]; !ok || (ok && c > cid) {
cmap.unicodeToCode[r] = cid cmap.unicodeToCode[r] = cid
} }
@ -277,19 +292,18 @@ func (cmap *CMap) CharcodeBytesToUnicode(data []byte) (string, int) {
return "", 0 return "", 0
} }
var ( parts := make([]string, len(charcodes))
parts []rune var missing []CharCode
missing []CharCode for i, code := range charcodes {
)
for _, code := range charcodes {
s, ok := cmap.codeToUnicode[code] s, ok := cmap.codeToUnicode[code]
if !ok { if !ok {
missing = append(missing, code) missing = append(missing, code)
s = MissingCodeRune s = MissingCodeString
} }
parts = append(parts, s) parts[i] = s
} }
unicode := string(parts) unicode := strings.Join(parts, "")
if len(missing) > 0 { if len(missing) > 0 {
common.Log.Debug("ERROR: CharcodeBytesToUnicode. Not in map.\n"+ common.Log.Debug("ERROR: CharcodeBytesToUnicode. Not in map.\n"+
"\tdata=[% 02x]=%#q\n"+ "\tdata=[% 02x]=%#q\n"+
@ -305,11 +319,11 @@ func (cmap *CMap) CharcodeBytesToUnicode(data []byte) (string, int) {
// CharcodeToUnicode converts a single character code `code` to a unicode string. // CharcodeToUnicode converts a single character code `code` to a unicode string.
// If `code` is not in the unicode map, '\ufffd' is returned. // If `code` is not in the unicode map, '\ufffd' is returned.
// NOTE: CharcodeBytesToUnicode is typically more efficient. // NOTE: CharcodeBytesToUnicode is typically more efficient.
func (cmap *CMap) CharcodeToUnicode(code CharCode) (rune, bool) { func (cmap *CMap) CharcodeToUnicode(code CharCode) (string, bool) {
if s, ok := cmap.codeToUnicode[code]; ok { if s, ok := cmap.codeToUnicode[code]; ok {
return s, true return s, true
} }
return MissingCodeRune, false return MissingCodeString, false
} }
// RuneToCID maps the specified rune to a character identifier. If the provided // RuneToCID maps the specified rune to a character identifier. If the provided
@ -453,7 +467,7 @@ func (cmap *CMap) toBfData() string {
} }
// codes is a sorted list of the codeToUnicode keys. // codes is a sorted list of the codeToUnicode keys.
var codes []CharCode codes := make([]CharCode, 0, len(cmap.codeToUnicode))
for code := range cmap.codeToUnicode { for code := range cmap.codeToUnicode {
codes = append(codes, code) codes = append(codes, code)
} }
@ -470,9 +484,9 @@ func (cmap *CMap) toBfData() string {
// character codes have been mapped to code ranges. // character codes have been mapped to code ranges.
var charRanges []charRange var charRanges []charRange
currCharRange := charRange{codes[0], codes[0]} currCharRange := charRange{codes[0], codes[0]}
prevRune := cmap.codeToUnicode[codes[0]] prevRune := rune0(cmap.codeToUnicode[codes[0]])
for _, c := range codes[1:] { for _, c := range codes[1:] {
currRune := cmap.codeToUnicode[c] currRune := rune0(cmap.codeToUnicode[c])
if c == currCharRange.code1+1 && currRune == prevRune+1 { if c == currCharRange.code1+1 && currRune == prevRune+1 {
currCharRange.code1 = c currCharRange.code1 = c
} else { } else {
@ -493,7 +507,7 @@ func (cmap *CMap) toBfData() string {
fbRanges = append(fbRanges, fbRange{ fbRanges = append(fbRanges, fbRange{
code0: cr.code0, code0: cr.code0,
code1: cr.code1, code1: cr.code1,
r0: cmap.codeToUnicode[cr.code0], r0: rune0(cmap.codeToUnicode[cr.code0]),
}) })
} }
} }
@ -508,7 +522,7 @@ func (cmap *CMap) toBfData() string {
lines = append(lines, fmt.Sprintf("%d beginbfchar", n)) lines = append(lines, fmt.Sprintf("%d beginbfchar", n))
for j := 0; j < n; j++ { for j := 0; j < n; j++ {
code := fbChars[i*maxBfEntries+j] code := fbChars[i*maxBfEntries+j]
r := cmap.codeToUnicode[code] r := rune0(cmap.codeToUnicode[code])
lines = append(lines, fmt.Sprintf("<%04x> <%04x>", code, r)) lines = append(lines, fmt.Sprintf("<%04x> <%04x>", code, r))
} }
lines = append(lines, "endbfchar") lines = append(lines, "endbfchar")
@ -549,3 +563,9 @@ end
end end
` `
) )
// rune0 returns the first rune in `s`.
// Only the leading rune is decoded; the remainder of `s` is never converted or copied.
// An empty `s` yields '\ufffd' (the Unicode replacement character) instead of panicking.
// Invalid leading UTF-8 also decodes to '\ufffd', exactly as a []rune conversion would.
func rune0(s string) rune {
	// Ranging over a string decodes one rune per iteration, so returning from the first
	// iteration gives the leading rune without building a []rune for the whole string.
	for _, r := range s {
		return r
	}
	return '\ufffd'
}

View File

@ -105,7 +105,8 @@ func (cmap *CMap) parse() error {
func (cmap *CMap) parseName() error { func (cmap *CMap) parseName() error {
name := "" name := ""
done := false done := false
for i := 0; i < 10 && !done; i++ { // /Users/peter/testdata/programming/pdf_text/columns/Berg.pdf
for i := 0; i < 20 && !done; i++ {
o, err := cmap.parseObject() o, err := cmap.parseObject()
if err != nil { if err != nil {
return err return err
@ -141,7 +142,6 @@ func (cmap *CMap) parseName() error {
// parseType parses a cmap type and adds it to `cmap`. // parseType parses a cmap type and adds it to `cmap`.
// cmap names are defined like this: /CMapType 1 def // cmap names are defined like this: /CMapType 1 def
func (cmap *CMap) parseType() error { func (cmap *CMap) parseType() error {
ctype := 0 ctype := 0
done := false done := false
for i := 0; i < 3 && !done; i++ { for i := 0; i < 3 && !done; i++ {
@ -171,7 +171,6 @@ func (cmap *CMap) parseType() error {
// We don't need the version. We do this to eat up the version code in the cmap definition // We don't need the version. We do this to eat up the version code in the cmap definition
// to reduce unhandled parse object warnings. // to reduce unhandled parse object warnings.
func (cmap *CMap) parseVersion() error { func (cmap *CMap) parseVersion() error {
version := "" version := ""
done := false done := false
for i := 0; i < 3 && !done; i++ { for i := 0; i < 3 && !done; i++ {
@ -471,7 +470,7 @@ func (cmap *CMap) parseBfchar() error {
} }
return err return err
} }
var target rune var target []rune
switch v := o.(type) { switch v := o.(type) {
case cmapOperand: case cmapOperand:
if v.Operand == endbfchar { if v.Operand == endbfchar {
@ -480,16 +479,16 @@ func (cmap *CMap) parseBfchar() error {
common.Log.Debug("ERROR: Unexpected operand. %#v", v) common.Log.Debug("ERROR: Unexpected operand. %#v", v)
return ErrBadCMap return ErrBadCMap
case cmapHexString: case cmapHexString:
target = hexToRune(v) target = hexToRunes(v)
case cmapName: case cmapName:
common.Log.Debug("ERROR: Unexpected name. %#v", v) common.Log.Debug("ERROR: Unexpected name. %#v", v)
target = MissingCodeRune target = []rune{MissingCodeRune}
default: default:
common.Log.Debug("ERROR: Unexpected type. %#v", o) common.Log.Debug("ERROR: Unexpected type. %#v", o)
return ErrBadCMap return ErrBadCMap
} }
cmap.codeToUnicode[code] = target cmap.codeToUnicode[code] = string(target)
} }
return nil return nil
@ -563,16 +562,17 @@ func (cmap *CMap) parseBfrange() error {
if !ok { if !ok {
return errors.New("non-hex string in array") return errors.New("non-hex string in array")
} }
r := hexToRune(hexs) runes := hexToRunes(hexs)
cmap.codeToUnicode[code] = r cmap.codeToUnicode[code] = string(runes)
} }
case cmapHexString: case cmapHexString:
// <codeFrom> <codeTo> <dst>, maps [from,to] to [dst,dst+to-from]. // <codeFrom> <codeTo> <dst>, maps [from,to] to [dst,dst+to-from].
r := hexToRune(v) runes := hexToRunes(v)
n := len(runes)
for code := srcCodeFrom; code <= srcCodeTo; code++ { for code := srcCodeFrom; code <= srcCodeTo; code++ {
cmap.codeToUnicode[code] = r cmap.codeToUnicode[code] = string(runes)
r++ runes[n-1]++
} }
default: default:
common.Log.Debug("ERROR: Unexpected type %T", o) common.Log.Debug("ERROR: Unexpected type %T", o)

View File

@ -104,14 +104,14 @@ func TestCMapParser1(t *testing.T) {
} }
for k, expected := range expectedMappings { for k, expected := range expectedMappings {
if v, ok := cmap.CharcodeToUnicode(k); !ok || v != expected { if v, ok := cmap.CharcodeToUnicode(k); !ok || v != string(expected) {
t.Errorf("incorrect mapping, expecting 0x%X ➞ 0x%X (%#v)", k, expected, v) t.Errorf("incorrect mapping, expecting 0x%X ➞ 0x%X (%#v)", k, expected, v)
return return
} }
} }
v, _ := cmap.CharcodeToUnicode(0x99) v, _ := cmap.CharcodeToUnicode(0x99)
if v != MissingCodeRune { //!= "notdef" { if v != MissingCodeString { //!= "notdef" {
t.Errorf("Unmapped code, expected to map to undefined") t.Errorf("Unmapped code, expected to map to undefined")
return return
} }
@ -188,7 +188,7 @@ func TestCMapParser2(t *testing.T) {
} }
for k, expected := range expectedMappings { for k, expected := range expectedMappings {
if v, ok := cmap.CharcodeToUnicode(k); !ok || v != expected { if v, ok := cmap.CharcodeToUnicode(k); !ok || v != string(expected) {
t.Errorf("incorrect mapping, expecting 0x%X ➞ 0x%X (got 0x%X)", k, expected, v) t.Errorf("incorrect mapping, expecting 0x%X ➞ 0x%X (got 0x%X)", k, expected, v)
return return
} }
@ -297,7 +297,7 @@ func TestCMapParser3(t *testing.T) {
0xd140: 0xa000, 0xd140: 0xa000,
} }
for k, expected := range expectedMappings { for k, expected := range expectedMappings {
if v, ok := cmap.CharcodeToUnicode(k); !ok || v != expected { if v, ok := cmap.CharcodeToUnicode(k); !ok || v != string(expected) {
t.Errorf("incorrect mapping: expecting 0x%02X ➞ 0x%02X (got 0x%02X)", k, expected, v) t.Errorf("incorrect mapping: expecting 0x%02X ➞ 0x%02X (got 0x%02X)", k, expected, v)
return return
} }
@ -407,7 +407,7 @@ func TestCMapParser4(t *testing.T) {
} }
for k, expected := range expectedMappings { for k, expected := range expectedMappings {
if v, ok := cmap.CharcodeToUnicode(k); !ok || v != expected { if v, ok := cmap.CharcodeToUnicode(k); !ok || v != string(expected) {
t.Errorf("incorrect mapping, expecting 0x%04X ➞ %+q (got %+q)", k, expected, v) t.Errorf("incorrect mapping, expecting 0x%04X ➞ %+q (got %+q)", k, expected, v)
return return
} }
@ -520,6 +520,7 @@ var (
0x017b: 'Ż', 0x017b: 'Ż',
0x017d: 'Ž', 0x017d: 'Ž',
} }
codeToUnicode3 = map[CharCode]rune{ // 93 entries codeToUnicode3 = map[CharCode]rune{ // 93 entries
0x0124: 'Ĥ', 0x0124: 'Ĥ',
0x0125: 'ĥ', 0x0125: 'ĥ',
@ -695,7 +696,7 @@ func checkCmapWriteRead(t *testing.T, codeToUnicode map[CharCode]rune) {
} }
u0 := codeToUnicode[code] u0 := codeToUnicode[code]
u := cmap.codeToUnicode[code] u := cmap.codeToUnicode[code]
if u != u0 { if u != string(u0) {
t.Errorf("Unicode mismatch: i=%d code0=0x%04x expected=%q test=%q", i, code, u0, u) t.Errorf("Unicode mismatch: i=%d code0=0x%04x expected=%q test=%q", i, code, u0, u)
return return
} }

View File

@ -48,8 +48,8 @@ func (enc CMapEncoder) Decode(raw []byte) string {
if codes, ok := enc.codeToCID.BytesToCharcodes(raw); ok { if codes, ok := enc.codeToCID.BytesToCharcodes(raw); ok {
var buf bytes.Buffer var buf bytes.Buffer
for _, code := range codes { for _, code := range codes {
r, _ := enc.CharcodeToRune(CharCode(code)) s, _ := enc.charcodeToString(CharCode(code))
buf.WriteRune(r) buf.WriteString(s)
} }
return buf.String() return buf.String()
@ -87,8 +87,13 @@ func (enc CMapEncoder) RuneToCharcode(r rune) (CharCode, bool) {
// CharcodeToRune converts PDF character code `code` to a rune. // CharcodeToRune converts PDF character code `code` to a rune.
// The bool return flag is true if there was a match, and false otherwise. // The bool return flag is true if there was a match, and false otherwise.
func (enc CMapEncoder) CharcodeToRune(code CharCode) (rune, bool) { func (enc CMapEncoder) CharcodeToRune(code CharCode) (rune, bool) {
s, ok := enc.charcodeToString(code)
return ([]rune(s))[0], ok
}
func (enc CMapEncoder) charcodeToString(code CharCode) (string, bool) {
if enc.cidToUnicode == nil { if enc.cidToUnicode == nil {
return MissingCodeRune, false return MissingCodeString, false
} }
// Map charcode to CID. If charcode to CID CMap is nil, assume Identity encoding. // Map charcode to CID. If charcode to CID CMap is nil, assume Identity encoding.
@ -96,7 +101,7 @@ func (enc CMapEncoder) CharcodeToRune(code CharCode) (rune, bool) {
if enc.codeToCID != nil { if enc.codeToCID != nil {
var ok bool var ok bool
if cid, ok = enc.codeToCID.CharcodeToCID(cmap.CharCode(code)); !ok { if cid, ok = enc.codeToCID.CharcodeToCID(cmap.CharCode(code)); !ok {
return MissingCodeRune, false return MissingCodeString, false
} }
} }

View File

@ -18,7 +18,13 @@ import (
) )
// MissingCodeRune is the rune returned when there is no matching glyph. It was previously '?'. // MissingCodeRune is the rune returned when there is no matching glyph. It was previously '?'.
const MissingCodeRune = '\ufffd' // <20> const (
// MissingCodeRune replaces runes that can't be decoded.
MissingCodeRune = '\ufffd' // <20>
// MissingCodeString replaces strings that can't be decoded.
MissingCodeString = string(MissingCodeRune)
)
// GlyphToRune returns the rune corresponding to glyph `glyph` if there is one. // GlyphToRune returns the rune corresponding to glyph `glyph` if there is one.
// TODO: Can we return a string here? e.g. When we are extracting text, we want to get "ffi" // TODO: Can we return a string here? e.g. When we are extracting text, we want to get "ffi"

View File

@ -422,14 +422,28 @@ func (font *PdfFont) BytesToCharcodes(data []byte) []textencoding.CharCode {
// CharcodesToUnicodeWithStats is identical to CharcodesToUnicode except returns more statistical // CharcodesToUnicodeWithStats is identical to CharcodesToUnicode except returns more statistical
// information about hits and misses from the reverse mapping process. // information about hits and misses from the reverse mapping process.
// NOTE: The number of runes returned may be greater than the number of charcodes.
// TODO(peterwilliams97): Deprecate?
func (font *PdfFont) CharcodesToUnicodeWithStats(charcodes []textencoding.CharCode) (runelist []rune, numHits, numMisses int) { func (font *PdfFont) CharcodesToUnicodeWithStats(charcodes []textencoding.CharCode) (runelist []rune, numHits, numMisses int) {
runeSlices, numHits, numMisses := font.CharcodesToRuneSlices(charcodes)
var runes []rune
for _, r := range runeSlices {
runes = append(runes, r...)
}
return runes, numHits, numMisses
}
// CharcodesToRuneSlices returns the unicode strings corresponding to `charcodes` as rune slices.
// The int returns are the number of rune slices returned and the number of unconverted codes.
// NOTE: The number of rune slices returned is equal to the number of charcodes.
func (font *PdfFont) CharcodesToRuneSlices(charcodes []textencoding.CharCode) ([][]rune, int, int) {
fontBase := font.baseFields() fontBase := font.baseFields()
runes := make([]rune, 0, len(charcodes)) runeSlices := make([][]rune, 0, len(charcodes))
numMisses = 0 numMisses := 0
for _, code := range charcodes { for _, code := range charcodes {
if fontBase.toUnicodeCmap != nil { if fontBase.toUnicodeCmap != nil {
if r, ok := fontBase.toUnicodeCmap.CharcodeToUnicode(cmap.CharCode(code)); ok { if s, ok := fontBase.toUnicodeCmap.CharcodeToUnicode(cmap.CharCode(code)); ok {
runes = append(runes, r) runeSlices = append(runeSlices, []rune(s))
continue continue
} }
} }
@ -438,7 +452,7 @@ func (font *PdfFont) CharcodesToUnicodeWithStats(charcodes []textencoding.CharCo
encoder := font.Encoder() encoder := font.Encoder()
if encoder != nil { if encoder != nil {
if r, ok := encoder.CharcodeToRune(code); ok { if r, ok := encoder.CharcodeToRune(code); ok {
runes = append(runes, r) runeSlices = append(runeSlices, []rune{r})
continue continue
} }
} }
@ -447,7 +461,7 @@ func (font *PdfFont) CharcodesToUnicodeWithStats(charcodes []textencoding.CharCo
"\tfont=%s\n\tencoding=%s", "\tfont=%s\n\tencoding=%s",
code, charcodes, fontBase.isCIDFont(), font, encoder) code, charcodes, fontBase.isCIDFont(), font, encoder)
numMisses++ numMisses++
runes = append(runes, cmap.MissingCodeRune) runeSlices = append(runeSlices, []rune{cmap.MissingCodeRune})
} }
if numMisses != 0 { if numMisses != 0 {
@ -457,7 +471,7 @@ func (font *PdfFont) CharcodesToUnicodeWithStats(charcodes []textencoding.CharCo
len(charcodes), numMisses, font) len(charcodes), numMisses, font)
} }
return runes, len(runes), numMisses return runeSlices, len(runeSlices), numMisses
} }
// CharcodeBytesToUnicode converts PDF character codes `data` to a Go unicode string. // CharcodeBytesToUnicode converts PDF character codes `data` to a Go unicode string.