Add basic support for UTF-16 text encodings (#203)

* Add UTF-16 text encoder
2025-04-26 13:48:55 +08:00 · 2019-11-28 02:47:00 +02:00 · 2019-11-28 02:47:00 +02:00 · 23aec77478
commit 23aec77478
parent 1e26aa81f6
2 changed files with 73 additions and 2 deletions
--- a/internal/textencoding/utf16.go
+++ b/internal/textencoding/utf16.go
@ -0,0 +1,57 @@
+/*
+ * This file is subject to the terms and conditions defined in
+ * file 'LICENSE.md', which is part of this source code package.
+ */
+
+package textencoding
+
+import (
+	"github.com/unidoc/unipdf/v3/core"
+	"github.com/unidoc/unipdf/v3/internal/strutils"
+)
+
+// UTF16Encoder is a text encoder for UTF-16 based encodings, identified by a base encoding name.
+type UTF16Encoder struct {
+	baseName string
+}
+
+// NewUTF16TextEncoder returns a UTF16Encoder value that records `baseName`
+// as the name of the predefined encoding it is based on.
+func NewUTF16TextEncoder(baseName string) UTF16Encoder {
+	return UTF16Encoder{baseName}
+}
+
+// String returns the base encoding name of `enc` as its description.
+func (enc UTF16Encoder) String() string {
+	return enc.baseName
+}
+
+// Encode converts the Go Unicode string `str` to PDF encoded bytes via strutils.StringToUTF16.
+func (enc UTF16Encoder) Encode(str string) []byte {
+	return []byte(strutils.StringToUTF16(str))
+}
+
+// Decode converts the PDF encoded bytes `raw` to a Go Unicode string via strutils.UTF16ToString.
+func (enc UTF16Encoder) Decode(raw []byte) string {
+	return strutils.UTF16ToString(raw)
+}
+
+// RuneToCharcode converts rune `r` to a PDF character code by direct numeric conversion.
+// The bool return flag is always true; this encoder never reports a missing match.
+func (enc UTF16Encoder) RuneToCharcode(r rune) (CharCode, bool) {
+	return CharCode(r), true
+}
+
+// CharcodeToRune converts PDF character code `code` to a rune by direct numeric conversion.
+// The bool return flag is always true; this encoder never reports a missing match.
+func (enc UTF16Encoder) CharcodeToRune(code CharCode) (rune, bool) {
+	return rune(code), true
+}
+
+// ToPdfObject returns core.MakeName of the base encoding name, or core.MakeNull if the name is empty.
+func (enc UTF16Encoder) ToPdfObject() core.PdfObject {
+	if enc.baseName != "" {
+		return core.MakeName(enc.baseName)
+	}
+	return core.MakeNull()
+}
--- a/model/font_composite.go
+++ b/model/font_composite.go
@ -194,9 +194,23 @@ func newPdfFontType0FromPdfObject(d *core.PdfObjectDictionary, base *fontCommon)

 	encoderName, ok := core.GetNameVal(d.Get("Encoding"))
 	if ok {
-		if encoderName == "Identity-H" || encoderName == "Identity-V" {
+		switch encoderName {
+		case "Identity-H", "Identity-V":
 			font.encoder = textencoding.NewIdentityTextEncoder(encoderName)
-		} else {
+		case
+			// Reference: https://www.adobe.com/content/dam/acom/en/devnet/font/pdfs/5094.CJK_CID.pdf
+			// Adobe-GB1-4, Adobe-GB1-5
+			"UniGB-UTF16-H", "UniGB-UTF16-V",
+			// Adobe-CNS1-4, Adobe-CNS1-5
+			"UniCNS-UTF16-H", "UniCNS-UTF16-V",
+			// Adobe-Japan1-4, Adobe-Japan1-5, Adobe-Japan1-6
+			"UniJIS-UTF16-H", "UniJIS-UTF16-V", "UniJIS2004-UTF16-H",
+			// Adobe-Japan2-0
+			"UniHojo-UTF16-H", "UniHojo-UTF16-V",
+			// Adobe-Korea1-2
+			"UniKS-UTF16-H", "UniKS-UTF16-V":
+			font.encoder = textencoding.NewUTF16TextEncoder(encoderName)
+		default:
 			common.Log.Debug("Unhandled cmap %q", encoderName)
 		}
 	}