Gunnsteinn Hall ad2a1e9c9d
Subsetting fixes (#346)
* Update unitype lib which improves subsetting

* Add text extraction check to creator font subsetting example

Helps ensure ToUnicode map is set correctly.

* Clean up import

* Fix spelling
2020-05-12 07:15:09 +00:00

165 lines
5.2 KiB
Go

/*
* This file is subject to the terms and conditions defined in
* file 'LICENSE.md', which is part of this source code package.
*/
package textencoding
import (
"fmt"
"sort"
"strings"
"github.com/unidoc/unipdf/v3/common"
"github.com/unidoc/unipdf/v3/core"
)
// GID is a glyph index, i.e. the index of a glyph within a font.
// It is stored as an unsigned 16-bit integer.
type GID uint16
// TODO(dennwc): should not mix Identity-H CMap and Encoding in the same object

// TrueTypeFontEncoder handles text encoding for composite TrueType fonts.
// It performs mapping between character ids and glyph ids.
// It has a preloaded rune (unicode code point) to glyph index map that has been loaded from a font.
// Corresponds to Identity-H CMap and Identity encoding.
type TrueTypeFontEncoder struct {
// runeToGIDMap maps a rune (unicode code point) to its glyph index.
// SubsetRegistered removes entries from this map.
runeToGIDMap map[rune]GID
// registeredMap is the set of runes registered by the encoder, used for tracking which
// runes are needed when subsetting. It stays nil until RuneToCharcode registers a rune.
registeredMap map[rune]struct{}
}
// SubsetRegistered prunes the rune to glyph index map of `enc` so that only registered runes
// remain. Runes get registered when they are successfully looked up via RuneToCharcode.
// If no rune has been registered, every entry is removed.
// NOTE: Call this shortly before writing, once all needed runes have been registered.
func (enc *TrueTypeFontEncoder) SubsetRegistered() {
	common.Log.Info("TTF Subset: Pruning")
	for candidate := range enc.runeToGIDMap {
		if _, used := enc.registeredMap[candidate]; used {
			// Registered runes are kept.
			continue
		}
		// Deleting entries while ranging over a map is safe in Go.
		delete(enc.runeToGIDMap, candidate)
	}
}
// RegisteredRunes returns a newly allocated slice holding every rune that has been registered
// as used by the encoder. The runes come from a map, so their order is unspecified.
// The returned slice is empty (never nil) when nothing has been registered.
func (enc *TrueTypeFontEncoder) RegisteredRunes() []rune {
	registered := make([]rune, 0, len(enc.registeredMap))
	for used := range enc.registeredMap {
		registered = append(registered, used)
	}
	return registered
}
// NewTrueTypeFontEncoder creates a new text encoder for TTF fonts backed by `runeToGIDMap`,
// a rune to glyph index map that has been preloaded from the font file.
// The map is stored as is (it is not copied), so a later call to SubsetRegistered also removes
// entries from the map passed in here.
// No runes are registered on the new instance.
func NewTrueTypeFontEncoder(runeToGIDMap map[rune]GID) *TrueTypeFontEncoder {
	enc := new(TrueTypeFontEncoder)
	enc.runeToGIDMap = runeToGIDMap
	return enc
}
// ttEncoderMaxNumEntries is the maximum number of encoding entries shown in
// TrueTypeFontEncoder.String().
const ttEncoderMaxNumEntries = 10
// String returns a string that describes `enc`.
// The description holds the total number of rune to glyph index entries, followed by at most
// ttEncoderMaxNumEntries entries in ascending rune order. Each entry is formatted as
// "<rune, decimal>=0x<rune, hex>: <glyph index, decimal>".
func (enc *TrueTypeFontEncoder) String() string {
	parts := []string{
		fmt.Sprintf("%d entries", len(enc.runeToGIDMap)),
	}

	// Map iteration order is random; sort the runes so the output is deterministic.
	runes := make([]rune, 0, len(enc.runeToGIDMap))
	for r := range enc.runeToGIDMap {
		runes = append(runes, r)
	}
	sort.Slice(runes, func(i, j int) bool {
		return runes[i] < runes[j]
	})

	n := len(runes)
	if n > ttEncoderMaxNumEntries {
		n = ttEncoderMaxNumEntries
	}

	for _, r := range runes[:n] {
		// A glyph index is a number, not a character: print it with %d. With %q the index
		// would be rendered as the quoted character that happens to have that code point.
		parts = append(parts, fmt.Sprintf("%d=0x%02x: %d",
			r, r, enc.runeToGIDMap[r]))
	}
	return fmt.Sprintf("TRUETYPE_ENCODER{%s}", strings.Join(parts, ", "))
}
// Encode converts the Go unicode string `str` to a PDF encoded string.
// The work is delegated to encodeString16bit with `enc` as the encoder.
func (enc *TrueTypeFontEncoder) Encode(str string) []byte {
	encoded := encodeString16bit(enc, str)
	return encoded
}
// Decode converts the PDF encoded string `raw` to a Go unicode string.
// The work is delegated to decodeString16bit with `enc` as the encoder.
func (enc *TrueTypeFontEncoder) Decode(raw []byte) string {
	decoded := decodeString16bit(enc, raw)
	return decoded
}
// GlyphToCharcode returns character code matching the glyph name `glyph`.
// The bool return flag is true if there was a match, and false otherwise.
//
// Two kinds of glyph names are resolved:
//   - names of the form "uniXXXX", where XXXX is exactly four hex digits giving the code point;
//   - names present in the glyph list (glyphlistGlyphToRuneMap).
//
// The resulting rune is converted with RuneToCharcode, which also registers it as used.
func (enc *TrueTypeFontEncoder) GlyphToCharcode(glyph GlyphName) (CharCode, bool) {
	// String with "uniXXXX" format where XXXX is the hexcode.
	if len(glyph) == 7 && glyph[0:3] == "uni" {
		var code uint16
		n, err := fmt.Sscanf(string(glyph), "uni%X", &code)
		// Sscanf skips leading spaces and stops at the first non-hex character without
		// reporting an error, so e.g. "uni12G4" would scan as 0x12. Formatting the value
		// back and comparing confirms that all four characters really are hex digits.
		if n == 1 && err == nil &&
			fmt.Sprintf("%04X", code) == strings.ToUpper(string(glyph[3:])) {
			return enc.RuneToCharcode(rune(code))
		}
	}

	// Look in glyphlist.
	if r, found := glyphlistGlyphToRuneMap[glyph]; found {
		return enc.RuneToCharcode(r)
	}

	common.Log.Debug("Symbol encoding error: unable to find glyph->charcode entry (%s)", glyph)
	return 0, false
}
// RuneToCharcode converts rune `r` to a PDF character code.
// The bool return flag is true if there was a match, and false otherwise.
// On a match the rune is also recorded as registered, which is what SubsetRegistered and
// RegisteredRunes later rely on. A rune missing from the encoding is not registered.
func (enc *TrueTypeFontEncoder) RuneToCharcode(r rune) (CharCode, bool) {
	gid, found := enc.runeToGIDMap[r]
	if !found {
		common.Log.Debug("Missing rune %d (%+q) from encoding", r, r)
		return 0, false
	}

	// Register use (subsetting). The registry is allocated on first use.
	if enc.registeredMap == nil {
		enc.registeredMap = make(map[rune]struct{})
	}
	enc.registeredMap[r] = struct{}{}

	// Identity : charcode <-> glyphIndex
	// TODO(dennwc): Here charcode is probably the same as CID.
	// TODO(dennwc): Find out what are the alternative mappings (enc.cmap?).
	return CharCode(gid), true
}
// CharcodeToRune converts PDF character code `code` to a rune.
// The bool return flag is true if there was a match, and false otherwise.
// When several runes map to the same glyph index, the smallest of them is returned, so
// that the result does not depend on the (random) map iteration order.
func (enc *TrueTypeFontEncoder) CharcodeToRune(code CharCode) (rune, bool) {
	// TODO: Make a reverse map stored.
	var (
		match rune
		found bool
	)
	for r, gid := range enc.runeToGIDMap {
		// Identity : glyphIndex <-> charcode
		if CharCode(gid) != code {
			continue
		}
		// Keep scanning after the first hit: picking the lowest rune keeps the answer
		// stable between calls.
		if !found || r < match {
			match, found = r, true
		}
	}
	if !found {
		common.Log.Debug("CharcodeToRune: No match. code=0x%04x enc=%s", code, enc)
		return 0, false
	}
	return match, true
}
// ToPdfObject returns the result of core.MakeNull(). The encoder is not truly a PDF object
// and no attempt should be made to store it in a file.
func (enc *TrueTypeFontEncoder) ToPdfObject() core.PdfObject {
	// TODO(dennwc): reasonable question: why it have to implement this interface then?
	null := core.MakeNull()
	return null
}