2017-07-05 23:10:57 +00:00
|
|
|
/*
|
|
|
|
* This file is subject to the terms and conditions defined in
|
|
|
|
* file 'LICENSE.md', which is part of this source code package.
|
|
|
|
*/
|
|
|
|
|
|
|
|
package textencoding
|
|
|
|
|
2018-06-27 12:25:59 +10:00
|
|
|
import (
|
2018-10-16 02:32:17 +03:00
|
|
|
"encoding/binary"
|
|
|
|
|
2019-05-16 23:08:40 +03:00
|
|
|
"github.com/unidoc/unipdf/v3/common"
|
2019-05-16 23:44:51 +03:00
|
|
|
"github.com/unidoc/unipdf/v3/core"
|
2018-06-27 12:25:59 +10:00
|
|
|
)
|
2017-07-05 23:10:57 +00:00
|
|
|
|
2018-11-29 23:24:40 +02:00
|
|
|
// CharCode is a character code within a particular text encoding.
// It is stored as an unsigned 16-bit value.
type CharCode uint16
|
|
|
|
|
2018-11-29 23:24:40 +02:00
|
|
|
// GlyphName is the name of a glyph, represented as a string.
type GlyphName string
|
|
|
|
|
2018-08-03 10:18:44 +00:00
|
|
|
// TextEncoder defines the common methods that a text encoder implementation must have in UniDoc.
|
2017-07-05 23:10:57 +00:00
|
|
|
type TextEncoder interface {
|
2018-06-27 12:25:59 +10:00
|
|
|
// String returns a string that describes the TextEncoder instance.
|
|
|
|
String() string
|
|
|
|
|
2019-01-01 23:24:11 +02:00
|
|
|
// Encode converts the Go unicode string to a PDF encoded string.
|
|
|
|
Encode(str string) []byte
|
2017-07-10 15:17:46 +00:00
|
|
|
|
2019-01-01 23:24:11 +02:00
|
|
|
// Decode converts PDF encoded string to a Go unicode string.
|
|
|
|
Decode(raw []byte) string
|
2017-07-10 15:17:46 +00:00
|
|
|
|
2018-06-27 12:25:59 +10:00
|
|
|
// RuneToCharcode returns the PDF character code corresponding to rune `r`.
|
2017-07-10 15:17:46 +00:00
|
|
|
// The bool return flag is true if there was a match, and false otherwise.
|
2018-06-27 12:25:59 +10:00
|
|
|
// This is usually implemented as RuneToGlyph->GlyphToCharcode
|
2018-11-29 04:02:20 +02:00
|
|
|
RuneToCharcode(r rune) (CharCode, bool)
|
2017-07-10 15:17:46 +00:00
|
|
|
|
2018-06-27 12:25:59 +10:00
|
|
|
// CharcodeToRune returns the rune corresponding to character code `code`.
|
2017-07-10 15:17:46 +00:00
|
|
|
// The bool return flag is true if there was a match, and false otherwise.
|
2018-06-27 12:25:59 +10:00
|
|
|
// This is usually implemented as CharcodeToGlyph->GlyphToRune
|
2018-11-29 04:02:20 +02:00
|
|
|
CharcodeToRune(code CharCode) (rune, bool)
|
2017-07-10 15:17:46 +00:00
|
|
|
|
2018-06-27 12:25:59 +10:00
|
|
|
// ToPdfObject returns a PDF Object that represents the encoding.
|
2018-07-24 21:32:02 +10:00
|
|
|
ToPdfObject() core.PdfObject
|
2018-06-27 12:25:59 +10:00
|
|
|
}
|
|
|
|
|
|
|
|
// Convenience functions
|
|
|
|
|
2018-10-16 02:32:17 +03:00
|
|
|
// encodeString8bit converts a Go unicode string `raw` to a PDF encoded string using the encoder `enc`.
|
|
|
|
// It expects that character codes will fit into a single byte.
|
|
|
|
func encodeString8bit(enc TextEncoder, raw string) []byte {
|
|
|
|
encoded := make([]byte, 0, len(raw))
|
2018-06-27 12:25:59 +10:00
|
|
|
for _, r := range raw {
|
|
|
|
code, found := enc.RuneToCharcode(r)
|
2018-10-16 02:32:17 +03:00
|
|
|
if !found || code > 0xff {
|
2018-08-17 08:41:35 +10:00
|
|
|
common.Log.Debug("Failed to map rune to charcode for rune 0x%04x", r)
|
2018-06-27 12:25:59 +10:00
|
|
|
continue
|
|
|
|
}
|
|
|
|
encoded = append(encoded, byte(code))
|
|
|
|
}
|
2018-08-20 20:13:10 +10:00
|
|
|
return encoded
|
2018-06-27 12:25:59 +10:00
|
|
|
}
|
|
|
|
|
2018-10-16 02:32:17 +03:00
|
|
|
// encodeString16bit converts a Go unicode string `raw` to a PDF encoded string using the encoder `enc`.
|
|
|
|
// Each character will be encoded as two bytes.
|
|
|
|
func encodeString16bit(enc TextEncoder, raw string) []byte {
|
|
|
|
// runes -> character codes -> bytes
|
|
|
|
runes := []rune(raw)
|
|
|
|
encoded := make([]byte, 0, len(runes)*2)
|
|
|
|
for _, r := range runes {
|
|
|
|
code, ok := enc.RuneToCharcode(r)
|
|
|
|
if !ok {
|
|
|
|
common.Log.Debug("Failed to map rune to charcode. rune=%+q", r)
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
// Each entry represented by 2 bytes.
|
|
|
|
var v [2]byte
|
2018-11-29 04:02:20 +02:00
|
|
|
binary.BigEndian.PutUint16(v[:], uint16(code))
|
2018-10-16 02:32:17 +03:00
|
|
|
encoded = append(encoded, v[:]...)
|
|
|
|
}
|
|
|
|
return encoded
|
|
|
|
}
|
2019-01-01 23:24:11 +02:00
|
|
|
|
|
|
|
// decodeString16bit converts PDF encoded string to a Go unicode string using the encoder `enc`.
|
|
|
|
// Each character will be decoded from two bytes.
|
|
|
|
func decodeString16bit(enc TextEncoder, raw []byte) string {
|
|
|
|
// bytes -> character codes -> runes
|
|
|
|
runes := make([]rune, 0, len(raw)/2+len(raw)%2)
|
|
|
|
|
|
|
|
for len(raw) > 0 {
|
|
|
|
if len(raw) == 1 {
|
|
|
|
raw = []byte{raw[0], 0}
|
|
|
|
}
|
|
|
|
// Each entry represented by 2 bytes.
|
|
|
|
code := CharCode(binary.BigEndian.Uint16(raw[:]))
|
|
|
|
raw = raw[2:]
|
|
|
|
|
|
|
|
r, ok := enc.CharcodeToRune(code)
|
|
|
|
if !ok {
|
|
|
|
common.Log.Debug("Failed to map charcode to rune. charcode=%#x", code)
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
runes = append(runes, r)
|
|
|
|
}
|
|
|
|
return string(runes)
|
|
|
|
}
|