unipdf/pdf/model/fontfile.go
Peter Williams 08c3211590 Refactored simple textencoding
Made GlyphToCode work for all tables
Moved more aliases into glyphAliases rather than leaving the duplicates in the base maps.
Use SimpleEncoder explictly for simple fonts
2018-07-31 11:52:24 +10:00

283 lines
7.3 KiB
Go

/*
* This file is subject to the terms and conditions defined in
* file 'LICENSE.md', which is part of this source code package.
*/
/*
* A font file is a stream containing a Type 1 font program. It appears in PDF files as a
* /FontFile entry in a /FontDescriptor dictionary.
*
* 9.9 Embedded Font Programs (page 289)
*
* TODO: Add Type1C support
*/
package model
import (
"errors"
"fmt"
"regexp"
"strconv"
"strings"
"unicode"
"github.com/unidoc/unidoc/common"
"github.com/unidoc/unidoc/pdf/core"
"github.com/unidoc/unidoc/pdf/model/textencoding"
)
// fontFile represents a font file.
// Currently this is just the identifying information and the text encoder created from the font
// file's encoding section.
type fontFile struct {
// name is the value of the /FontName key read from the ASCII part of the font program.
name string
// subtype is taken from the /Subtype entry of the font file stream dictionary.
subtype string
// encoder is the text encoder built from the font program's encoding section. It is nil when
// no encoder was built.
encoder *textencoding.SimpleEncoder
}
// String returns a human readable description of `fontfile`: its name followed by the
// description of its encoder, or "[None]" when no encoder is set.
func (fontfile *fontFile) String() string {
	const format = "FONTFILE{%#q encoder=%s}"
	if fontfile.encoder == nil {
		return fmt.Sprintf(format, fontfile.name, "[None]")
	}
	return fmt.Sprintf(format, fontfile.name, fontfile.encoder.String())
}
// newFontFileFromPdfObject loads a fontFile from the PdfObject `obj`.
//
// `obj` must resolve (via core.TraceToDirectObject) to a *core.PdfObjectStream; anything else
// yields core.ErrTypeError. The stream is decoded and split into two segments using the
// /Length1 and /Length2 entries of the stream dictionary. The font program is only parsed when
// both segments are non-empty; otherwise a fontFile without name or encoder is returned.
//
// Returns ErrType1CFontNotSupported when the stream's /Subtype is "Type1C".
func newFontFileFromPdfObject(obj core.PdfObject) (*fontFile, error) {
	common.Log.Trace("newFontFileFromPdfObject: obj=%s", obj)
	fontfile := &fontFile{}

	obj = core.TraceToDirectObject(obj)

	streamObj, ok := obj.(*core.PdfObjectStream)
	if !ok {
		common.Log.Debug("ERROR: FontFile must be a stream (%T)", obj)
		return nil, core.ErrTypeError
	}
	d := streamObj.PdfObjectDictionary
	data, err := core.DecodeStream(streamObj)
	if err != nil {
		return nil, err
	}

	// /Subtype is optional. Only record and check it when it is actually present.
	if subtype, ok := core.GetNameVal(d.Get("Subtype")); ok {
		fontfile.subtype = subtype
		if subtype == "Type1C" {
			// XXX: TODO Add Type1C support
			common.Log.Debug("Type1C fonts are currently not supported")
			return nil, ErrType1CFontNotSupported
		}
	}

	// Missing or non-integer lengths are treated as zero, which leaves the font program unparsed.
	length1, _ := core.GetIntVal(d.Get("Length1"))
	length2, _ := core.GetIntVal(d.Get("Length2"))

	// Clamp both lengths to the decoded data so that malformed dictionaries (negative or
	// oversized values) cannot cause an out-of-range slice.
	if length1 < 0 {
		length1 = 0
	}
	if length1 > len(data) {
		length1 = len(data)
	}
	if length2 < 0 {
		length2 = 0
	}
	// Written as a subtraction so that a huge length2 cannot overflow the sum.
	if length2 > len(data)-length1 {
		length2 = len(data) - length1
	}

	segment1 := data[:length1]
	segment2 := data[length1 : length1+length2]

	// empty streams are ignored
	if length1 > 0 && length2 > 0 {
		if err := fontfile.loadFromSegments(segment1, segment2); err != nil {
			return nil, err
		}
	}

	return fontfile, nil
}
// loadFromSegments populates `fontfile` from two header-less .pfb segments.
// Based on pdfbox.
//
// Only `segment1` is parsed (as the ASCII part); `segment2` is currently only inspected for
// its length and its contents are not used.
func (fontfile *fontFile) loadFromSegments(segment1, segment2 []byte) error {
	common.Log.Trace("loadFromSegments: %d %d", len(segment1), len(segment2))

	if err := fontfile.parseAsciiPart(segment1); err != nil {
		return err
	}
	common.Log.Trace("fontfile=%s", fontfile)

	if len(segment2) > 0 {
		// Nothing is read from the second segment yet; just trace the state again.
		common.Log.Trace("fontfile=%s", fontfile)
	}
	return nil
}
// parseAsciiPart parses `data`, the ASCII part of the FontFile, and fills in the name and,
// when an encoding section is present, the encoder of `fontfile`.
//
// It returns an error if `data` does not start with "%!", if the sections cannot be located,
// if there is no /FontName, or if the encoding section cannot be parsed. A failure to build
// the encoder from the parsed encodings is logged and otherwise ignored.
func (fontfile *fontFile) parseAsciiPart(data []byte) error {
	// For debugging, the contents of the font file can be dumped here with
	//   fmt.Printf("data=%s\n", string(data))

	// The start of a FontFile looks like
	//     %!PS-AdobeFont-1.0: MyArial 003.002
	//     %%Title: MyArial
	// or
	//     %!FontType1-1.0
	if len(data) < 2 || data[0] != '%' || data[1] != '!' {
		return errors.New("Invalid start of ASCII segment")
	}

	keySection, encodingSection, err := getAsciiSections(data)
	if err != nil {
		return err
	}

	name := getKeyValues(keySection)["FontName"]
	fontfile.name = name
	if name == "" {
		common.Log.Debug("ERROR: FontFile has no /FontName")
		return ErrRequiredAttributeMissing
	}

	// No encoding section means there is no encoder to build.
	if encodingSection == "" {
		return nil
	}

	encodings, err := getEncodings(encodingSection)
	if err != nil {
		return err
	}
	encoder, err := textencoding.NewCustomSimpleTextEncoder(encodings, nil)
	if err != nil {
		// XXX: Logging an error because we need to fix all these misses.
		common.Log.Error("UNKNOWN GLYPH: err=%v", err)
		return nil
	}
	fontfile.encoder = encoder
	return nil
}
// Patterns and markers used to pick apart the ASCII part of a font file.
var (
// reDictBegin matches the opening of a dictionary, e.g. "12 dict begin" or "12 dict dup begin".
reDictBegin = regexp.MustCompile(`\d+ dict\s+(dup\s+)?begin`)
// reKeyVal matches a whole "/Key value def" line. Group 1 is the key, group 2 is the value.
reKeyVal = regexp.MustCompile(`^\s*/(\S+?)\s+(.+?)\s+def\s*$`)
// reEncoding matches a whole "dup <code> /<glyph> put" line. Group 1 is the decimal code and
// group 2 is the glyph name; an optional ".<digits>" suffix after the name is not captured.
reEncoding = regexp.MustCompile(`^\s*dup\s+(\d+)\s*/(\w+?)(?:\.\d+)?\s+put$`)
// encodingBegin is the text that getAsciiSections treats as the start of the encoding section.
encodingBegin = "/Encoding 256 array"
// encodingEnd is the text that getAsciiSections treats as the end of the encoding section.
encodingEnd = "readonly def"
// binaryStart is not referenced in the code visible here.
// NOTE(review): presumably marks the start of the eexec-encrypted part — confirm.
binaryStart = "currentfile eexec"
)
// getAsciiSections returns two sections of `data`, the ASCII part of the FontFile
//   - the general key values in `keySection`
//   - the encoding in `encodingSection`
//
// Both sections are taken from the text following the first dictionary opening matched by
// reDictBegin. `keySection` runs up to `encodingBegin`, or to the end of `data` when there is
// no encoding, in which case `encodingSection` is empty. `encodingSection` runs from
// `encodingBegin` up to, but not including, the next `encodingEnd`.
//
// core.ErrTypeError is returned when no dictionary opening is found, or when the encoding is
// started but never terminated.
func getAsciiSections(data []byte) (keySection, encodingSection string, err error) {
	common.Log.Trace("getAsciiSections: %d ", len(data))
	loc := reDictBegin.FindIndex(data)
	if loc == nil {
		common.Log.Debug("ERROR: getAsciiSections. No dict.")
		return "", "", core.ErrTypeError
	}

	// Convert once and work with substrings from here on.
	body := string(data[loc[1]:])

	begin := strings.Index(body, encodingBegin)
	if begin < 0 {
		return body, "", nil
	}
	keySection = body[:begin]

	encoding := body[begin:]
	end := strings.Index(encoding, encodingEnd)
	if end < 0 {
		common.Log.Debug("ERROR: getAsciiSections. No %q after %q.", encodingEnd, encodingBegin)
		return "", "", core.ErrTypeError
	}
	return keySection, encoding[:end], nil
}
// reEndline matches a run of line-ending characters. Both \n and \r are accepted because some
// font files use \r line endings (e.g. ~/testdata/private/invoice61781040.pdf).
var reEndline = regexp.MustCompile(`[\n\r]+`)
// getKeyValues returns the map encoded in `data`.
// Every line of `data` that has the form "/Key value def" contributes one entry; all other
// lines are ignored. When a key occurs more than once the last occurrence wins.
func getKeyValues(data string) map[string]string {
	keyValues := map[string]string{}
	for _, line := range reEndline.Split(data, -1) {
		if m := reKeyVal.FindStringSubmatch(line); m != nil {
			keyValues[m[1]] = m[2]
		}
	}
	return keyValues
}
// getEncodings returns the encodings encoded in `data`.
func getEncodings(data string) (map[uint16]string, error) {
lines := strings.Split(data, "\n")
keyValues := map[uint16]string{}
for _, line := range lines {
matches := reEncoding.FindStringSubmatch(line)
if matches == nil {
continue
}
k, glyph := matches[1], matches[2]
code, err := strconv.Atoi(k)
if err != nil {
common.Log.Debug("ERROR: Bad encoding line. %q", line)
return nil, core.ErrTypeError
}
keyValues[uint16(code)] = glyph
}
common.Log.Trace("getEncodings: keyValues=%#v", keyValues)
return keyValues, nil
}
// decodeEexec returns the decoding of the eexec bytes `data`.
//
// The first 4 bytes of `data` only advance the cipher state and produce no output, so the
// result is 4 bytes shorter than `data`. nil is returned when `data` has fewer than 4 bytes.
func decodeEexec(data []byte) []byte {
	const (
		c1         = 52845
		c2         = 22719
		numLeading = 4 // leading bytes that are consumed but not emitted
	)
	if len(data) < numLeading {
		return nil
	}

	// The cipher state is 16 bits wide; uint16 arithmetic wraps exactly as required.
	seed := uint16(55665) // eexec key

	// Run the seed through the encoder 4 times
	for _, b := range data[:numLeading] {
		seed = (uint16(b)+seed)*c1 + c2
	}

	decoded := make([]byte, len(data)-numLeading)
	for i, b := range data[numLeading:] {
		// Each plain byte is the cipher byte XORed with the high byte of the state. The state
		// is then advanced with the cipher byte.
		decoded[i] = b ^ byte(seed>>8)
		seed = (uint16(b)+seed)*c1 + c2
	}
	return decoded
}
// isBinary returns true if `data` is binary. See Adobe Type 1 Font Format specification
// 7.2 eexec encryption
//
// `data` is reported as binary when it has fewer than 4 bytes, or when any of its first 4
// bytes is something other than an ASCII hexadecimal digit or ASCII white space.
func isBinary(data []byte) bool {
	if len(data) < 4 {
		return true
	}
	// Range over the byte values, not the indices.
	for _, b := range data[:4] {
		// Bytes outside ASCII are never hex digits or ASCII white space. Checking this first
		// stops 0x85 and 0xA0 from being accepted as Unicode spaces.
		if b > unicode.MaxASCII {
			return true
		}
		r := rune(b)
		if !unicode.Is(unicode.ASCII_Hex_Digit, r) && !unicode.IsSpace(r) {
			return true
		}
	}
	return false
}
// truncate returns the first `n` bytes of string `s`, or all of `s` when it is shorter than
// `n` bytes.
func truncate(s string, n int) string {
	if n <= len(s) {
		return s[:n]
	}
	return s
}