From ac7696693b1c829c60aa04916e425e6871ab3d50 Mon Sep 17 00:00:00 2001 From: Denys Smirnov Date: Sat, 29 Dec 2018 19:01:05 +0200 Subject: [PATCH] fonts: describe few issues with the code; remove unused cmap type --- pdf/internal/textencoding/cmap.go | 36 ------------------ pdf/internal/textencoding/cmap_test.go | 45 ----------------------- pdf/internal/textencoding/truetype.go | 13 ++++--- pdf/internal/textencoding/winansi_test.go | 5 +++ pdf/model/font_simple.go | 4 +- pdf/model/fonts/ttfparser.go | 10 ++++- 6 files changed, 24 insertions(+), 89 deletions(-) delete mode 100644 pdf/internal/textencoding/cmap.go delete mode 100644 pdf/internal/textencoding/cmap_test.go diff --git a/pdf/internal/textencoding/cmap.go b/pdf/internal/textencoding/cmap.go deleted file mode 100644 index 63423c07..00000000 --- a/pdf/internal/textencoding/cmap.go +++ /dev/null @@ -1,36 +0,0 @@ -/* - * This file is subject to the terms and conditions defined in - * file 'LICENSE.md', which is part of this source code package. - */ - -package textencoding - -import "github.com/unidoc/unidoc/pdf/core" - -// CID represents a character identifier. -type CID uint16 - -// CMap maps character codes to CIDs. -type CMap interface { - CharacterCodesToCID(charcodes []byte) ([]CID, error) -} - -// CMapIdentityH is a representation of the /Identity-H cmap. -type CMapIdentityH struct { -} - -// CharacterCodesToCID converts charcodes to CIDs for the Identity CMap, which maps -// 2-byte character codes (from the raw data) from 0-65535 to the same 2-byte CID value. -func (cmap CMapIdentityH) CharacterCodesToCID(raw []byte) ([]CID, error) { - if len(raw)%2 != 0 { - return nil, core.ErrRangeError - } - - var cids []CID - for i := 0; i < len(raw); i += 2 { - b1 := CID(raw[i]) - b2 := CID(raw[i+1]) - cids = append(cids, (b1<<8)|b2) - } - return cids, nil -} diff --git a/pdf/internal/textencoding/cmap_test.go b/pdf/internal/textencoding/cmap_test.go deleted file mode 100644 index 3698b6c2..00000000 --- a/pdf/internal/textencoding/cmap_test.go +++ /dev/null @@ -1,45 +0,0 @@ -/* - * This file is subject to the terms and conditions defined in - * file 'LICENSE.md', which is part of this source code package. - */ - -package textencoding - -import "testing" - -func TestCMapIdentityH_CharacterCodesToCID(t *testing.T) { - identityCMap := CMapIdentityH{} - - type dataPair struct { - raw []byte - expected []CID - errs bool - } - - dataPairs := []dataPair{ - {[]byte{0x00, 0x00, 0x04, 0xff}, []CID{0x0000, 0x04ff}, false}, - {[]byte{0x00, 0x00, 0x04}, []CID{0x0000, 0x04ff}, true}, - } - - for _, data := range dataPairs { - cids, err := identityCMap.CharacterCodesToCID(data.raw) - if err != nil { - if data.errs { - continue - } - t.Errorf("Failed: %v", err) - return - } - - if len(data.expected) != len(cids) { - t.Errorf("Length mismatch") - return - } - - for i := 0; i < len(data.expected); i++ { - if cids[i] != data.expected[i] { - t.Errorf("Not equal") - } - } - } -} diff --git a/pdf/internal/textencoding/truetype.go b/pdf/internal/textencoding/truetype.go index 6ba84cf3..b55ef087 100644 --- a/pdf/internal/textencoding/truetype.go +++ b/pdf/internal/textencoding/truetype.go @@ -17,13 +17,14 @@ import ( // GID is a glyph index. type GID uint16 +// TODO(dennwc): should not mix Identity-H CMap and Encoding in the same object + // TrueTypeFontEncoder handles text encoding for composite TrueType fonts. // It performs mapping between character ids and glyph ids. // It has a preloaded rune (unicode code point) to glyph index map that has been loaded from a font. -// Corresponds to Identity-H. +// Corresponds to Identity-H CMap and Identity encoding. type TrueTypeFontEncoder struct { runeToGIDMap map[rune]GID - cmap CMap } // NewTrueTypeFontEncoder creates a new text encoder for TTF fonts with a runeToGlyphIndexMap that @@ -33,7 +34,6 @@ type TrueTypeFontEncoder struct { func NewTrueTypeFontEncoder(runeToGIDMap map[rune]GID) TrueTypeFontEncoder { return TrueTypeFontEncoder{ runeToGIDMap: runeToGIDMap, - cmap: CMapIdentityH{}, } } @@ -75,7 +75,7 @@ func (enc TrueTypeFontEncoder) Encode(raw string) []byte { // The bool return flag is true if there was a match, and false otherwise. func (enc TrueTypeFontEncoder) CharcodeToGlyph(code CharCode) (GlyphName, bool) { r, found := enc.CharcodeToRune(code) - if found && r == 0x20 { + if found && r == ' ' { return "space", true } @@ -139,9 +139,10 @@ func (enc TrueTypeFontEncoder) CharcodeToRune(code CharCode) (rune, bool) { // RuneToGlyph returns the glyph name for rune `r`. // The bool return flag is true if there was a match, and false otherwise. func (enc TrueTypeFontEncoder) RuneToGlyph(r rune) (GlyphName, bool) { - if r == 0x20 { + if r == ' ' { return "space", true } + // TODO(dennwc): this is wrong; font may override this with a "post" table that specifies glyph names glyph := GlyphName(fmt.Sprintf("uni%.4X", r)) return glyph, true } @@ -149,6 +150,7 @@ func (enc TrueTypeFontEncoder) RuneToGlyph(r rune) (GlyphName, bool) { // GlyphToRune returns the rune corresponding to glyph name `glyph`. // The bool return flag is true if there was a match, and false otherwise. func (enc TrueTypeFontEncoder) GlyphToRune(glyph GlyphName) (rune, bool) { + // TODO(dennwc): this is wrong; font may override this with a "post" table that specifies glyph names // String with "uniXXXX" format where XXXX is the hexcode. if len(glyph) == 7 && glyph[0:3] == "uni" { unicode := uint16(0) @@ -168,5 +170,6 @@ func (enc TrueTypeFontEncoder) GlyphToRune(glyph GlyphName) (rune, bool) { // ToPdfObject returns a nil as it is not truly a PDF object and should not be attempted to store in file. func (enc TrueTypeFontEncoder) ToPdfObject() core.PdfObject { + // TODO(dennwc): reasonable question: why it have to implement this interface then? return core.MakeNull() } diff --git a/pdf/internal/textencoding/winansi_test.go b/pdf/internal/textencoding/winansi_test.go index 8eac31b8..3f2daa6c 100644 --- a/pdf/internal/textencoding/winansi_test.go +++ b/pdf/internal/textencoding/winansi_test.go @@ -15,6 +15,11 @@ func TestWinAnsiEncoder(t *testing.T) { t.Errorf("Glyph != space") return } + code, found := enc.RuneToCharcode('þ') + if !found || code != 254 { + t.Errorf("code != 254") + return + } glyph, found = enc.RuneToGlyph('þ') if !found || glyph != "thorn" { diff --git a/pdf/model/font_simple.go b/pdf/model/font_simple.go index 417321a3..627c5c15 100644 --- a/pdf/model/font_simple.go +++ b/pdf/model/font_simple.go @@ -430,14 +430,14 @@ func NewPdfFontFromTTFFile(filePath string) (*PdfFont, error) { continue } - pos, ok := ttf.Chars[r] + gid, ok := ttf.Chars[r] if !ok { common.Log.Debug("Rune not in TTF Chars") vals = append(vals, missingWidth) continue } - w := k * float64(ttf.Widths[pos]) + w := k * float64(ttf.Widths[gid]) vals = append(vals, w) } diff --git a/pdf/model/fonts/ttfparser.go b/pdf/model/fonts/ttfparser.go index 0622528f..d463e12d 100644 --- a/pdf/model/fonts/ttfparser.go +++ b/pdf/model/fonts/ttfparser.go @@ -47,6 +47,8 @@ import ( // MakeEncoder returns an encoder built from the tables in `rec`. func (ttf *TtfType) MakeEncoder() (*textencoding.SimpleEncoder, error) { encoding := make(map[textencoding.CharCode]GlyphName) + // TODO(dennwc): this is a bit strange, since TTF may contain more than 256 characters + // should probably make a different encoder here for code := textencoding.CharCode(0); code <= 256; code++ { r := rune(code) // TODO(dennwc): make sure this conversion is valid gid, ok := ttf.Chars[r] @@ -93,11 +95,14 @@ type TtfType struct { UnderlineThickness int16 Xmin, Ymin, Xmax, Ymax int16 CapHeight int16 - Widths []uint16 + // Widths is a list of glyph widths indexed by GID. + Widths []uint16 // Chars maps rune values (unicode) to GIDs (the indexes in GlyphNames). i.e. GlyphNames[Chars[r]] is // the glyph corresponding to rune r. // + // TODO(dennwc): CharCode is currently defined as uint16, but some tables may store 32 bit charcodes + // not the case right now, but make sure to update it once we support those tables // TODO(dennwc,peterwilliams97): it should map char codes to GIDs Chars map[rune]GID // GlyphNames is a list of glyphs from the "post" section of the TrueType file. @@ -117,6 +122,9 @@ func (ttf *TtfType) MakeToUnicode() *cmap.CMap { glyph := ttf.GlyphNames[gid] // TODO(dennwc): 'code' is already a rune; do we need this extra lookup? + // TODO(dennwc): this cannot be done here; glyphNames might be empty + // the parent font may specify a different encoding + // so we should remap on a higher level r, ok := textencoding.GlyphToRune(glyph) if !ok { common.Log.Debug("No rune. code=0x%04x glyph=%q", code, glyph)