diff --git a/pdf/internal/textencoding/encoder.go b/pdf/internal/textencoding/encoder.go index 447a153b..467e2b64 100644 --- a/pdf/internal/textencoding/encoder.go +++ b/pdf/internal/textencoding/encoder.go @@ -6,6 +6,8 @@ package textencoding import ( + "encoding/binary" + "github.com/unidoc/unidoc/common" "github.com/unidoc/unidoc/pdf/core" ) @@ -50,12 +52,13 @@ type TextEncoder interface { // Convenience functions -// doEncode converts a Go unicode string `raw` to a PDF encoded string using the encoder `enc`. -func doEncode(enc TextEncoder, raw string) []byte { - encoded := []byte{} +// encodeString8bit converts a Go unicode string `raw` to a PDF encoded string using the encoder `enc`. +// It expects that character codes will fit into a single byte. +func encodeString8bit(enc TextEncoder, raw string) []byte { + encoded := make([]byte, 0, len(raw)) for _, r := range raw { code, found := enc.RuneToCharcode(r) - if !found { + if !found || code > 0xff { common.Log.Debug("Failed to map rune to charcode for rune 0x%04x", r) continue } @@ -64,6 +67,27 @@ func doEncode(enc TextEncoder, raw string) []byte { return encoded } +// encodeString16bit converts a Go unicode string `raw` to a PDF encoded string using the encoder `enc`. +// Each character will be encoded as two bytes. +func encodeString16bit(enc TextEncoder, raw string) []byte { + // runes -> character codes -> bytes + runes := []rune(raw) + encoded := make([]byte, 0, len(runes)*2) + for _, r := range runes { + code, ok := enc.RuneToCharcode(r) + if !ok { + common.Log.Debug("Failed to map rune to charcode. rune=%+q", r) + continue + } + + // Each entry represented by 2 bytes. + var v [2]byte + binary.BigEndian.PutUint16(v[:], code) + encoded = append(encoded, v[:]...) + } + return encoded +} + // doRuneToCharcode converts rune `r` to a PDF character code. // The bool return flag is true if there was a match, and false otherwise. func doRuneToCharcode(enc TextEncoder, r rune) (uint16, bool) { diff --git a/pdf/internal/textencoding/identity.go b/pdf/internal/textencoding/identity.go index 64a28c1e..9759c338 100644 --- a/pdf/internal/textencoding/identity.go +++ b/pdf/internal/textencoding/identity.go @@ -6,10 +6,10 @@ package textencoding import ( - "bytes" "fmt" + "strconv" + "strings" - "github.com/unidoc/unidoc/common" "github.com/unidoc/unidoc/pdf/core" ) @@ -31,20 +31,7 @@ func (enc IdentityEncoder) String() string { // Encode converts the Go unicode string `raw` to a PDF encoded string. func (enc IdentityEncoder) Encode(raw string) []byte { - // runes -> character codes -> bytes - var encoded bytes.Buffer - for _, r := range raw { - code, ok := enc.RuneToCharcode(r) - if !ok { - common.Log.Debug("Failed to map rune to charcode. rune=%+q", r) - continue - } - - // Each entry represented by 2 bytes. - encoded.WriteByte(byte((code & 0xff00) >> 8)) - encoded.WriteByte(byte(code & 0xff)) - } - return encoded.Bytes() + return encodeString16bit(enc, raw) } // CharcodeToGlyph returns the glyph name matching character code `code`. @@ -63,17 +50,11 @@ func (enc IdentityEncoder) CharcodeToGlyph(code uint16) (string, bool) { // GlyphToCharcode returns the character code matching glyph `glyph`. // The bool return flag is true if there was a match, and false otherwise. func (enc IdentityEncoder) GlyphToCharcode(glyph string) (uint16, bool) { - // String with "uniXXXX" format where XXXX is the hexcode. - if len(glyph) == 7 && glyph[0:3] == "uni" { - var unicode uint16 - n, err := fmt.Sscanf(glyph, "uni%X", &unicode) - if n == 1 && err == nil { - return enc.RuneToCharcode(rune(unicode)) - } + r, ok := enc.GlyphToRune(glyph) + if !ok { + return 0, false } - - common.Log.Debug("Symbol encoding error: unable to find glyph->charcode entry (%s)", glyph) - return 0, false + return enc.RuneToCharcode(r) } // RuneToCharcode converts rune `r` to a PDF character code. @@ -91,7 +72,7 @@ func (enc IdentityEncoder) CharcodeToRune(code uint16) (rune, bool) { // RuneToGlyph returns the glyph name for rune `r`. // The bool return flag is true if there was a match, and false otherwise. func (enc IdentityEncoder) RuneToGlyph(r rune) (string, bool) { - if r == 0x20 { + if r == ' ' { return "space", true } glyph := fmt.Sprintf("uni%.4X", r) @@ -102,14 +83,16 @@ func (enc IdentityEncoder) RuneToGlyph(r rune) (string, bool) { // The bool return flag is true if there was a match, and false otherwise. func (enc IdentityEncoder) GlyphToRune(glyph string) (rune, bool) { // String with "uniXXXX" format where XXXX is the hexcode. - if len(glyph) == 7 && glyph[0:3] == "uni" { - unicode := uint16(0) - n, err := fmt.Sscanf(glyph, "uni%X", &unicode) - if n == 1 && err == nil { - return rune(unicode), true - } + if glyph == "space" { + return ' ', true + } else if !strings.HasPrefix(glyph, "uni") || len(glyph) != 7 { + return 0, false } - return 0, false + r, err := strconv.ParseUint(glyph[3:], 16, 16) + if err != nil { + return 0, false + } + return rune(r), true } // ToPdfObject returns a nil as it is not truly a PDF object and should not be attempted to store in file. diff --git a/pdf/internal/textencoding/simple.go b/pdf/internal/textencoding/simple.go index 4d0891bb..e11d76c6 100644 --- a/pdf/internal/textencoding/simple.go +++ b/pdf/internal/textencoding/simple.go @@ -117,7 +117,7 @@ func (se SimpleEncoder) String() string { // Encode converts a Go unicode string `raw` to a PDF encoded string. func (se SimpleEncoder) Encode(raw string) []byte { - return doEncode(se, raw) + return encodeString8bit(se, raw) } // CharcodeToGlyph returns the glyph name for character code `code`. diff --git a/pdf/internal/textencoding/truetype.go b/pdf/internal/textencoding/truetype.go index 680d1bfb..d520efe1 100644 --- a/pdf/internal/textencoding/truetype.go +++ b/pdf/internal/textencoding/truetype.go @@ -6,7 +6,6 @@ package textencoding import ( - "bytes" "fmt" "sort" "strings" @@ -64,20 +63,7 @@ func (enc TrueTypeFontEncoder) String() string { // Encode converts the Go unicode string `raw` to a PDF encoded string. func (enc TrueTypeFontEncoder) Encode(raw string) []byte { - // runes -> character codes -> bytes - var encoded bytes.Buffer - for _, r := range raw { - code, ok := enc.RuneToCharcode(r) - if !ok { - common.Log.Debug("Failed to map rune to charcode. rune=%+q", r) - continue - } - - // Each entry represented by 2 bytes. - encoded.WriteByte(byte((code & 0xff00) >> 8)) - encoded.WriteByte(byte(code & 0xff)) - } - return encoded.Bytes() + return encodeString16bit(enc, raw) } // CharcodeToGlyph returns the glyph name matching character code `code`.