From 23aec77478cabfe01eb813b3376255ccbf6ac710 Mon Sep 17 00:00:00 2001
From: Adrian-George Bostan
Date: Thu, 28 Nov 2019 02:47:00 +0200
Subject: [PATCH] Add basic support for UTF-16 text encodings (#203)

* Add UTF-16 text encoder
---
 internal/textencoding/utf16.go | 57 ++++++++++++++++++++++++++++++++++
 model/font_composite.go        | 18 +++++++++--
 2 files changed, 73 insertions(+), 2 deletions(-)
 create mode 100644 internal/textencoding/utf16.go

diff --git a/internal/textencoding/utf16.go b/internal/textencoding/utf16.go
new file mode 100644
index 00000000..bd90dac0
--- /dev/null
+++ b/internal/textencoding/utf16.go
@@ -0,0 +1,57 @@
+/*
+ * This file is subject to the terms and conditions defined in
+ * file 'LICENSE.md', which is part of this source code package.
+ */
+
+package textencoding
+
+import (
+	"github.com/unidoc/unipdf/v3/core"
+	"github.com/unidoc/unipdf/v3/internal/strutils"
+)
+
+// UTF16Encoder represents UTF-16 encoding.
+type UTF16Encoder struct {
+	baseName string
+}
+
+// NewUTF16TextEncoder returns a new UTF16Encoder based on the predefined
+// encoding `baseName`.
+func NewUTF16TextEncoder(baseName string) UTF16Encoder {
+	return UTF16Encoder{baseName: baseName}
+}
+
+// String returns the name of the predefined encoding `enc` is based on.
+func (enc UTF16Encoder) String() string {
+	return enc.baseName
+}
+
+// Encode converts the Go unicode string `str` to a PDF encoded byte slice.
+func (enc UTF16Encoder) Encode(str string) []byte {
+	return []byte(strutils.StringToUTF16(str))
+}
+
+// Decode converts the PDF encoded bytes `raw` to a Go unicode string.
+func (enc UTF16Encoder) Decode(raw []byte) string {
+	return strutils.UTF16ToString(raw)
+}
+
+// RuneToCharcode converts rune `r` to a PDF character code.
+// It always reports a match. NOTE(review): confirm CharCode can hold any rune `r`.
+func (enc UTF16Encoder) RuneToCharcode(r rune) (CharCode, bool) {
+	return CharCode(r), true
+}
+
+// CharcodeToRune converts PDF character code `code` to a rune.
+// It always reports a match; the code value is converted directly to the rune.
+func (enc UTF16Encoder) CharcodeToRune(code CharCode) (rune, bool) {
+	return rune(code), true
+}
+
+// ToPdfObject returns the encoding name as a PDF name, or PDF null if the name is empty.
+func (enc UTF16Encoder) ToPdfObject() core.PdfObject {
+	if enc.baseName != "" {
+		return core.MakeName(enc.baseName)
+	}
+	return core.MakeNull()
+}
diff --git a/model/font_composite.go b/model/font_composite.go
index fb5bbefe..085d73ab 100644
--- a/model/font_composite.go
+++ b/model/font_composite.go
@@ -194,9 +194,23 @@ func newPdfFontType0FromPdfObject(d *core.PdfObjectDictionary, base *fontCommon)
 
 	encoderName, ok := core.GetNameVal(d.Get("Encoding"))
 	if ok {
-		if encoderName == "Identity-H" || encoderName == "Identity-V" {
+		switch encoderName {
+		case "Identity-H", "Identity-V":
 			font.encoder = textencoding.NewIdentityTextEncoder(encoderName)
-		} else {
+		case
+			// Reference: https://www.adobe.com/content/dam/acom/en/devnet/font/pdfs/5094.CJK_CID.pdf
+			// Adobe-GB1-4, Adobe-GB1-5
+			"UniGB-UTF16-H", "UniGB-UTF16-V",
+			// Adobe-CNS1-4, Adobe-CNS1-5
+			"UniCNS-UTF16-H", "UniCNS-UTF16-V",
+			// Adobe-Japan1-4, Adobe-Japan1-5, Adobe-Japan1-6
+			"UniJIS-UTF16-H", "UniJIS-UTF16-V", "UniJIS2004-UTF16-H", "UniJIS2004-UTF16-V",
+			// Adobe-Japan2-0
+			"UniHojo-UTF16-H", "UniHojo-UTF16-V",
+			// Adobe-Korea1-2
+			"UniKS-UTF16-H", "UniKS-UTF16-V":
+			font.encoder = textencoding.NewUTF16TextEncoder(encoderName)
+		default:
 			common.Log.Debug("Unhandled cmap %q", encoderName)
 		}
 	}