Gunnsteinn Hall 11f692bc3a
Font subsetting and font optimization improvements (#362)
* Track runes in IdentityEncoder (for subsetting), track decoded runes

* Working with the identity encoder in font_composite.go

* Add GetFilterArray to multi encoder.  Add comments.

* Add NewFromContents constructor to extractor only requiring contents and resources

* golint fixes

* Optimizer compress streams - improved detection of raw streams

* Optimize - CleanContentStream optimizer that removes redundant operands

* WIP Optimize - clean fonts

Will support both font file reduction and subsetting. (WIP)

* Optimize - image processing - try combined DCT and Flate

* Update options.go

* Update optimizer.go

* Create utils.go for optimize with common methods needed for optimization

* Optimizer - add font subsetting method

Covers XObject Forms, annotations etc.  Uses extractor package to extract text marks covering what fonts and glyphs are used.  Package truetype used for subsetting.

* Add some comments

* Fix cmap parsing rune conversion

* Error checking for extractor.  Add some comments.

* Update Jenkinsfile

* Update modules
2020-06-16 21:19:10 +00:00

121 lines
3.8 KiB
Go

/*
* This file is subject to the terms and conditions defined in
* file 'LICENSE.md', which is part of this source code package.
*/
package textencoding
import (
"fmt"
"strconv"
"strings"
"github.com/unidoc/unipdf/v3/core"
)
// IdentityEncoder represents a 2-byte identity encoding.
// NOTE: In many cases this is just used to encode/decode to glyph index and does not have a unicode
// meaning, except via the ToUnicode maps.
// TODO: The use of runes as indicators for glyph indices and not-utf8 runes is not good and confusing.
// Might be better to combine the Identity encoder with a ToUnicode map and keep track of the actual
// runes and character codes, CMaps together.
type IdentityEncoder struct {
// baseName is the name of the encoding; String returns it and ToPdfObject turns it into a
// PDF name object when it is non-empty.
baseName string
// registeredMap is the set of runes seen by RuneToCharcode and CharcodeToRune, kept for
// tracking what runes are used for subsetting. It is allocated lazily by those methods,
// so it is nil until the first conversion.
registeredMap map[rune]struct{}
}
// NewIdentityTextEncoder creates an IdentityEncoder whose name is `baseName`.
// The set of registered runes is left unallocated; it is created on demand
// the first time a rune or character code is converted.
func NewIdentityTextEncoder(baseName string) *IdentityEncoder {
	enc := new(IdentityEncoder)
	enc.baseName = baseName
	return enc
}
// RegisteredRunes returns every rune that has been registered as used by the
// encoder. The result is a freshly allocated slice (empty, but non-nil, when
// nothing has been registered). It is built by ranging over a map, so the
// order of its elements is unspecified and may differ between calls.
func (enc *IdentityEncoder) RegisteredRunes() []rune {
	registered := make([]rune, 0, len(enc.registeredMap))
	for used := range enc.registeredMap {
		registered = append(registered, used)
	}
	return registered
}
// String returns a string that describes `enc`: the encoder's base name, unmodified.
func (enc *IdentityEncoder) String() string {
return enc.baseName
}
// Encode converts the Go unicode string to a PDF encoded string.
// All of the work is delegated to encodeString16bit, with `enc` passed as the encoder.
// NOTE(review): presumably that helper maps each rune through RuneToCharcode and emits two
// bytes per character code - confirm against its definition.
func (enc *IdentityEncoder) Encode(str string) []byte {
return encodeString16bit(enc, str)
}
// Decode converts PDF encoded string to a Go unicode string.
// All of the work is delegated to decodeString16bit, with `enc` passed as the encoder.
// NOTE(review): presumably that helper reads `raw` two bytes at a time and maps each code
// through CharcodeToRune - confirm against its definition.
func (enc *IdentityEncoder) Decode(raw []byte) string {
return decodeString16bit(enc, raw)
}
// RuneToCharcode converts rune `r` to a PDF character code.
// Under the identity encoding the code is just `r` converted to CharCode, so
// the bool return flag is always true.
// As a side effect `r` is recorded as used, which makes it show up in the
// result of RegisteredRunes.
// TODO: Here the `r` is an actual rune.
func (enc *IdentityEncoder) RuneToCharcode(r rune) (CharCode, bool) {
	used := enc.registeredMap
	if used == nil {
		// First conversion: create the set, since a nil map cannot be written to.
		used = make(map[rune]struct{})
		enc.registeredMap = used
	}
	used[r] = struct{}{} // Register use (subsetting).
	return CharCode(r), true
}
// CharcodeToRune converts PDF character code `code` to a rune.
// Under the identity encoding the rune is just `code` converted to rune, so
// the bool return flag is always true.
// As a side effect the resulting rune is recorded as used, which makes it
// show up in the result of RegisteredRunes.
// TODO: Here the `r` is not necessarily an actual rune but a glyph index (unless both).
func (enc *IdentityEncoder) CharcodeToRune(code CharCode) (rune, bool) {
	// TODO: The rune(code) is confusing and is not an actual utf8 rune.
	r := rune(code)
	if enc.registeredMap == nil {
		// First conversion: create the set, since a nil map cannot be written to.
		enc.registeredMap = make(map[rune]struct{})
	}
	enc.registeredMap[r] = struct{}{}
	return r, true
}
// RuneToGlyph returns the glyph name for rune `r`.
// A space maps to "space"; every other rune maps to "uni" followed by its
// value in upper-case hexadecimal, zero-padded to at least four digits.
// The bool return flag is always true.
func (enc *IdentityEncoder) RuneToGlyph(r rune) (GlyphName, bool) {
	switch r {
	case ' ':
		return "space", true
	default:
		return GlyphName(fmt.Sprintf("uni%.4X", r)), true
	}
}
// GlyphToRune returns the rune corresponding to glyph name `glyph`.
// Two forms are recognised: "space", and the 7-character form "uniXXXX" where
// XXXX is the rune's value written as four hexadecimal digits.
// The bool return flag is true if there was a match, and false otherwise.
func (enc *IdentityEncoder) GlyphToRune(glyph GlyphName) (rune, bool) {
	name := string(glyph)
	if name == "space" {
		return ' ', true
	}
	// Anything else has to be exactly "uni" plus four more characters.
	if len(name) != 7 || !strings.HasPrefix(name, "uni") {
		return 0, false
	}
	// Base 16, limited to 16 bits: the four characters must all be hex digits.
	code, err := strconv.ParseUint(name[3:], 16, 16)
	if err != nil {
		return 0, false
	}
	return rune(code), true
}
// ToPdfObject returns the PDF object for the encoder: a name object holding
// the base name, or a null object when the base name is empty.
func (enc *IdentityEncoder) ToPdfObject() core.PdfObject {
	if enc.baseName == "" {
		return core.MakeNull()
	}
	return core.MakeName(enc.baseName)
}