From ac7696693b1c829c60aa04916e425e6871ab3d50 Mon Sep 17 00:00:00 2001
From: Denys Smirnov <denys@sourced.tech>
Date: Sat, 29 Dec 2018 19:01:05 +0200
Subject: [PATCH] fonts: describe a few issues with the code; remove unused cmap
 type

---
 pdf/internal/textencoding/cmap.go         | 36 ------------------
 pdf/internal/textencoding/cmap_test.go    | 45 -----------------------
 pdf/internal/textencoding/truetype.go     | 13 ++++---
 pdf/internal/textencoding/winansi_test.go |  5 +++
 pdf/model/font_simple.go                  |  4 +-
 pdf/model/fonts/ttfparser.go              | 10 ++++-
 6 files changed, 24 insertions(+), 89 deletions(-)
 delete mode 100644 pdf/internal/textencoding/cmap.go
 delete mode 100644 pdf/internal/textencoding/cmap_test.go

diff --git a/pdf/internal/textencoding/cmap.go b/pdf/internal/textencoding/cmap.go
deleted file mode 100644
index 63423c07..00000000
--- a/pdf/internal/textencoding/cmap.go
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * This file is subject to the terms and conditions defined in
- * file 'LICENSE.md', which is part of this source code package.
- */
-
-package textencoding
-
-import "github.com/unidoc/unidoc/pdf/core"
-
-// CID represents a character identifier.
-type CID uint16
-
-// CMap maps character codes to CIDs.
-type CMap interface {
-	CharacterCodesToCID(charcodes []byte) ([]CID, error)
-}
-
-// CMapIdentityH is a representation of the /Identity-H cmap.
-type CMapIdentityH struct {
-}
-
-// CharacterCodesToCID converts charcodes to CIDs for the Identity CMap, which maps
-// 2-byte character codes (from the raw data) from 0-65535 to the same 2-byte CID value.
-func (cmap CMapIdentityH) CharacterCodesToCID(raw []byte) ([]CID, error) {
-	if len(raw)%2 != 0 {
-		return nil, core.ErrRangeError
-	}
-
-	var cids []CID
-	for i := 0; i < len(raw); i += 2 {
-		b1 := CID(raw[i])
-		b2 := CID(raw[i+1])
-		cids = append(cids, (b1<<8)|b2)
-	}
-	return cids, nil
-}
diff --git a/pdf/internal/textencoding/cmap_test.go b/pdf/internal/textencoding/cmap_test.go
deleted file mode 100644
index 3698b6c2..00000000
--- a/pdf/internal/textencoding/cmap_test.go
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * This file is subject to the terms and conditions defined in
- * file 'LICENSE.md', which is part of this source code package.
- */
-
-package textencoding
-
-import "testing"
-
-func TestCMapIdentityH_CharacterCodesToCID(t *testing.T) {
-	identityCMap := CMapIdentityH{}
-
-	type dataPair struct {
-		raw      []byte
-		expected []CID
-		errs     bool
-	}
-
-	dataPairs := []dataPair{
-		{[]byte{0x00, 0x00, 0x04, 0xff}, []CID{0x0000, 0x04ff}, false},
-		{[]byte{0x00, 0x00, 0x04}, []CID{0x0000, 0x04ff}, true},
-	}
-
-	for _, data := range dataPairs {
-		cids, err := identityCMap.CharacterCodesToCID(data.raw)
-		if err != nil {
-			if data.errs {
-				continue
-			}
-			t.Errorf("Failed: %v", err)
-			return
-		}
-
-		if len(data.expected) != len(cids) {
-			t.Errorf("Length mismatch")
-			return
-		}
-
-		for i := 0; i < len(data.expected); i++ {
-			if cids[i] != data.expected[i] {
-				t.Errorf("Not equal")
-			}
-		}
-	}
-}
diff --git a/pdf/internal/textencoding/truetype.go b/pdf/internal/textencoding/truetype.go
index 6ba84cf3..b55ef087 100644
--- a/pdf/internal/textencoding/truetype.go
+++ b/pdf/internal/textencoding/truetype.go
@@ -17,13 +17,14 @@ import (
 // GID is a glyph index.
 type GID uint16
 
+// TODO(dennwc): should not mix Identity-H CMap and Encoding in the same object
+
 // TrueTypeFontEncoder handles text encoding for composite TrueType fonts.
 // It performs mapping between character ids and glyph ids.
 // It has a preloaded rune (unicode code point) to glyph index map that has been loaded from a font.
-// Corresponds to Identity-H.
+// Corresponds to Identity-H CMap and Identity encoding.
 type TrueTypeFontEncoder struct {
 	runeToGIDMap map[rune]GID
-	cmap         CMap
 }
 
 // NewTrueTypeFontEncoder creates a new text encoder for TTF fonts with a runeToGlyphIndexMap that
@@ -33,7 +34,6 @@ type TrueTypeFontEncoder struct {
 func NewTrueTypeFontEncoder(runeToGIDMap map[rune]GID) TrueTypeFontEncoder {
 	return TrueTypeFontEncoder{
 		runeToGIDMap: runeToGIDMap,
-		cmap:         CMapIdentityH{},
 	}
 }
 
@@ -75,7 +75,7 @@ func (enc TrueTypeFontEncoder) Encode(raw string) []byte {
 // The bool return flag is true if there was a match, and false otherwise.
 func (enc TrueTypeFontEncoder) CharcodeToGlyph(code CharCode) (GlyphName, bool) {
 	r, found := enc.CharcodeToRune(code)
-	if found && r == 0x20 {
+	if found && r == ' ' {
 		return "space", true
 	}
 
@@ -139,9 +139,10 @@ func (enc TrueTypeFontEncoder) CharcodeToRune(code CharCode) (rune, bool) {
 // RuneToGlyph returns the glyph name for rune `r`.
 // The bool return flag is true if there was a match, and false otherwise.
 func (enc TrueTypeFontEncoder) RuneToGlyph(r rune) (GlyphName, bool) {
-	if r == 0x20 {
+	if r == ' ' {
 		return "space", true
 	}
+	// TODO(dennwc): this is wrong; the font may override this with a "post" table that specifies glyph names
 	glyph := GlyphName(fmt.Sprintf("uni%.4X", r))
 	return glyph, true
 }
@@ -149,6 +150,7 @@ func (enc TrueTypeFontEncoder) RuneToGlyph(r rune) (GlyphName, bool) {
 // GlyphToRune returns the rune corresponding to glyph name `glyph`.
 // The bool return flag is true if there was a match, and false otherwise.
 func (enc TrueTypeFontEncoder) GlyphToRune(glyph GlyphName) (rune, bool) {
+	// TODO(dennwc): this is wrong; the font may override this with a "post" table that specifies glyph names
 	// String with "uniXXXX" format where XXXX is the hexcode.
 	if len(glyph) == 7 && glyph[0:3] == "uni" {
 		unicode := uint16(0)
@@ -168,5 +170,6 @@ func (enc TrueTypeFontEncoder) GlyphToRune(glyph GlyphName) (rune, bool) {
 
 // ToPdfObject returns a nil as it is not truly a PDF object and should not be attempted to store in file.
 func (enc TrueTypeFontEncoder) ToPdfObject() core.PdfObject {
+	// TODO(dennwc): reasonable question: why does it have to implement this interface then?
 	return core.MakeNull()
 }
diff --git a/pdf/internal/textencoding/winansi_test.go b/pdf/internal/textencoding/winansi_test.go
index 8eac31b8..3f2daa6c 100644
--- a/pdf/internal/textencoding/winansi_test.go
+++ b/pdf/internal/textencoding/winansi_test.go
@@ -15,6 +15,11 @@ func TestWinAnsiEncoder(t *testing.T) {
 		t.Errorf("Glyph != space")
 		return
 	}
+	code, found := enc.RuneToCharcode('þ')
+	if !found || code != 254 {
+		t.Errorf("code != 254")
+		return
+	}
 
 	glyph, found = enc.RuneToGlyph('þ')
 	if !found || glyph != "thorn" {
diff --git a/pdf/model/font_simple.go b/pdf/model/font_simple.go
index 417321a3..627c5c15 100644
--- a/pdf/model/font_simple.go
+++ b/pdf/model/font_simple.go
@@ -430,14 +430,14 @@ func NewPdfFontFromTTFFile(filePath string) (*PdfFont, error) {
 			continue
 		}
 
-		pos, ok := ttf.Chars[r]
+		gid, ok := ttf.Chars[r]
 		if !ok {
 			common.Log.Debug("Rune not in TTF Chars")
 			vals = append(vals, missingWidth)
 			continue
 		}
 
-		w := k * float64(ttf.Widths[pos])
+		w := k * float64(ttf.Widths[gid])
 
 		vals = append(vals, w)
 	}
diff --git a/pdf/model/fonts/ttfparser.go b/pdf/model/fonts/ttfparser.go
index 0622528f..d463e12d 100644
--- a/pdf/model/fonts/ttfparser.go
+++ b/pdf/model/fonts/ttfparser.go
@@ -47,6 +47,8 @@ import (
 // MakeEncoder returns an encoder built from the tables in `rec`.
 func (ttf *TtfType) MakeEncoder() (*textencoding.SimpleEncoder, error) {
 	encoding := make(map[textencoding.CharCode]GlyphName)
+	// TODO(dennwc): this is a bit strange, since a TTF may contain more than 256 characters;
+	//				 we should probably make a different encoder here
 	for code := textencoding.CharCode(0); code <= 256; code++ {
 		r := rune(code) // TODO(dennwc): make sure this conversion is valid
 		gid, ok := ttf.Chars[r]
@@ -93,11 +95,14 @@ type TtfType struct {
 	UnderlineThickness     int16
 	Xmin, Ymin, Xmax, Ymax int16
 	CapHeight              int16
-	Widths                 []uint16
+	// Widths is a list of glyph widths indexed by GID.
+	Widths []uint16
 
 	// Chars maps rune values (unicode) to GIDs (the indexes in GlyphNames). i.e. GlyphNames[Chars[r]] is
 	// the glyph corresponding to rune r.
 	//
+	// TODO(dennwc): CharCode is currently defined as uint16, but some tables may store 32-bit charcodes;
+	//				 that is not the case right now, but make sure to update it once we support those tables
 	// TODO(dennwc,peterwilliams97): it should map char codes to GIDs
 	Chars map[rune]GID
 	// GlyphNames is a list of glyphs from the "post" section of the TrueType file.
@@ -117,6 +122,9 @@ func (ttf *TtfType) MakeToUnicode() *cmap.CMap {
 		glyph := ttf.GlyphNames[gid]
 
 		// TODO(dennwc): 'code' is already a rune; do we need this extra lookup?
+		// TODO(dennwc): this cannot be done here: GlyphNames might be empty and
+		//				 the parent font may specify a different encoding,
+		//				 so we should remap at a higher level
 		r, ok := textencoding.GlyphToRune(glyph)
 		if !ok {
 			common.Log.Debug("No rune. code=0x%04x glyph=%q", code, glyph)