unipdf/pdf/model/fontfile.go

275 lines
7.3 KiB
Go
Raw Normal View History

2018-07-24 21:32:02 +10:00
/*
* This file is subject to the terms and conditions defined in
* file 'LICENSE.md', which is part of this source code package.
*/
/*
* A font file is a stream containing a Type 1 font program. It appears in PDF files as a
* /FontFile entry in a /FontDescriptor dictionary.
*
* 9.9 Embedded Font Programs (page 289)
*
* TODO: Add Type1C support
*/
2018-07-02 16:46:43 +10:00
package model
import (
"errors"
"fmt"
"regexp"
"strconv"
"strings"
"unicode"
"github.com/unidoc/unidoc/common"
"github.com/unidoc/unidoc/pdf/core"
2018-08-03 21:15:21 +00:00
"github.com/unidoc/unidoc/pdf/internal/textencoding"
2018-07-02 16:46:43 +10:00
)
2018-07-24 21:32:02 +10:00
// fontFile represents a font file.
// Currently this is just the identifying information and the text encoder created from the font
// file's encoding section.
2018-07-02 16:46:43 +10:00
type fontFile struct {
name string
2018-07-03 14:26:42 +10:00
subtype string
encoder *textencoding.SimpleEncoder
2018-07-02 16:46:43 +10:00
}
// String returns a human readable description of `fontfile`.
2018-07-03 14:26:42 +10:00
func (fontfile *fontFile) String() string {
encoding := "[None]"
if fontfile.encoder != nil {
encoding = fontfile.encoder.String()
}
return fmt.Sprintf("FONTFILE{%#q encoder=%s}", fontfile.name, encoding)
2018-07-03 14:26:42 +10:00
}
2018-07-02 16:46:43 +10:00
// newFontFileFromPdfObject loads a FontFile from a PdfObject. Can either be a
// *PdfIndirectObject or a *PdfObjectDictionary.
func newFontFileFromPdfObject(obj core.PdfObject) (*fontFile, error) {
2018-07-21 08:53:59 +10:00
common.Log.Trace("newFontFileFromPdfObject: obj=%s", obj)
2018-07-02 16:46:43 +10:00
fontfile := &fontFile{}
obj = core.TraceToDirectObject(obj)
2018-07-02 16:46:43 +10:00
streamObj, ok := obj.(*core.PdfObjectStream)
2018-07-02 16:46:43 +10:00
if !ok {
common.Log.Debug("ERROR: FontFile must be a stream (%T)", obj)
return nil, core.ErrTypeError
2018-07-02 16:46:43 +10:00
}
d := streamObj.PdfObjectDictionary
data, err := core.DecodeStream(streamObj)
2018-07-02 16:46:43 +10:00
if err != nil {
return nil, err
}
2018-07-24 21:32:02 +10:00
subtype, ok := core.GetNameVal(d.Get("Subtype"))
if !ok {
2018-07-03 14:26:42 +10:00
fontfile.subtype = subtype
if subtype == "Type1C" {
2018-12-09 21:37:27 +02:00
// TODO: Add Type1C support
common.Log.Debug("Type1C fonts are currently not supported")
return nil, ErrType1CFontNotSupported
2018-07-03 14:26:42 +10:00
}
}
2018-07-02 16:46:43 +10:00
2018-07-24 21:32:02 +10:00
length1, _ := core.GetIntVal(d.Get("Length1"))
length2, _ := core.GetIntVal(d.Get("Length2"))
2018-07-03 14:26:42 +10:00
if length1 > len(data) {
length1 = len(data)
}
if length1+length2 > len(data) {
length2 = len(data) - length1
}
2018-07-02 16:46:43 +10:00
2018-07-03 14:26:42 +10:00
segment1 := data[:length1]
var segment2 []byte
2018-07-03 14:26:42 +10:00
if length2 > 0 {
segment2 = data[length1 : length1+length2]
}
2018-07-02 16:46:43 +10:00
// empty streams are ignored
2018-07-03 14:26:42 +10:00
if length1 > 0 && length2 > 0 {
err := fontfile.loadFromSegments(segment1, segment2)
if err != nil {
return nil, err
2018-07-02 16:46:43 +10:00
}
}
2018-07-03 14:26:42 +10:00
2018-07-02 16:46:43 +10:00
return fontfile, nil
}
2018-07-03 14:26:42 +10:00
// loadFromSegments loads a Type1Font object from two header-less .pfb segments.
// Based on pdfbox
2018-07-02 16:46:43 +10:00
func (fontfile *fontFile) loadFromSegments(segment1, segment2 []byte) error {
2018-07-21 08:53:59 +10:00
common.Log.Trace("loadFromSegments: %d %d", len(segment1), len(segment2))
2018-07-03 14:26:42 +10:00
err := fontfile.parseAsciiPart(segment1)
2018-07-02 16:46:43 +10:00
if err != nil {
return err
}
2018-07-21 08:53:59 +10:00
common.Log.Trace("fontfile=%s", fontfile)
2018-07-02 16:46:43 +10:00
if len(segment2) == 0 {
return nil
}
2018-07-21 08:53:59 +10:00
common.Log.Trace("fontfile=%s", fontfile)
2018-07-02 16:46:43 +10:00
return nil
}
2018-07-03 14:26:42 +10:00
// parseAsciiPart parses the ASCII part of the FontFile.
func (fontfile *fontFile) parseAsciiPart(data []byte) error {
2018-07-24 21:32:02 +10:00
// Uncomment these lines to see the contents of the font file. For debugging.
// fmt.Println("~~~~~~~~~~~~~~~~~~~~~~~^^^~~~~~~~~~~~~~~~~~~~~~~~")
// fmt.Printf("data=%s\n", string(data))
// fmt.Println("~~~~~~~~~~~~~~~~~~~~~~~!!!~~~~~~~~~~~~~~~~~~~~~~~")
2018-07-03 14:26:42 +10:00
// The start of a FontFile looks like
// %!PS-AdobeFont-1.0: MyArial 003.002
// %%Title: MyArial
// or
// %!FontType1-1.0
2018-07-02 16:46:43 +10:00
if len(data) < 2 || string(data[:2]) != "%!" {
return errors.New("Invalid start of ASCII segment")
}
2018-07-03 14:26:42 +10:00
keySection, encodingSection, err := getAsciiSections(data)
2018-07-02 16:46:43 +10:00
if err != nil {
return err
}
keyValues := getKeyValues(keySection)
fontfile.name = keyValues["FontName"]
if fontfile.name == "" {
2018-07-03 14:26:42 +10:00
common.Log.Debug("ERROR: FontFile has no /FontName")
return ErrRequiredAttributeMissing
2018-07-02 16:46:43 +10:00
}
if encodingSection != "" {
encodings, err := getEncodings(encodingSection)
if err != nil {
return err
}
encoder, err := textencoding.NewCustomSimpleTextEncoder(encodings, nil)
if err != nil {
2018-12-09 21:37:27 +02:00
// TODO: Logging an error because we need to fix all these misses.
common.Log.Error("UNKNOWN GLYPH: err=%v", err)
2018-07-03 14:26:42 +10:00
return nil
2018-07-02 16:46:43 +10:00
}
fontfile.encoder = encoder
}
return nil
}
// Patterns and markers used to pick apart the ASCII part of a font file.
var (
	// reDictBegin matches the "<n> dict begin" (or "<n> dict dup begin") that precedes the
	// key/value definitions.
	reDictBegin = regexp.MustCompile(`\d+ dict\s+(dup\s+)?begin`)
	// reKeyVal matches a "/key value def" line. Group 1 is the key, group 2 the value.
	reKeyVal = regexp.MustCompile(`^\s*/(\S+?)\s+(.+?)\s+def\s*$`)
	// reEncoding matches a "dup <code> /<glyph> put" line. Group 1 is the character code and
	// group 2 the glyph name without any ".<digits>" suffix. Like reKeyVal it tolerates
	// trailing whitespace, so a line left with a trailing "\r" is still recognised.
	reEncoding = regexp.MustCompile(`^\s*dup\s+(\d+)\s*/(\w+?)(?:\.\d+)?\s+put\s*$`)
	// encodingBegin and encodingEnd delimit the encoding section.
	encodingBegin = "/Encoding 256 array"
	encodingEnd   = "readonly def"
	// binaryStart is the text "currentfile eexec".
	binaryStart = "currentfile eexec"
)
2018-07-03 14:26:42 +10:00
// getAsciiSections returns two sections of `data`, the ASCII part of the FontFile
// - the general key values in `keySection`
// - the encoding in `encodingSection`
func getAsciiSections(data []byte) (keySection, encodingSection string, err error) {
2018-07-21 08:53:59 +10:00
common.Log.Trace("getAsciiSections: %d ", len(data))
2018-07-02 16:46:43 +10:00
loc := reDictBegin.FindIndex(data)
if loc == nil {
2018-07-21 08:53:59 +10:00
common.Log.Debug("ERROR: getAsciiSections. No dict.")
return "", "", core.ErrTypeError
2018-07-02 16:46:43 +10:00
}
i0 := loc[1]
2018-07-03 14:26:42 +10:00
i := strings.Index(string(data[i0:]), encodingBegin)
2018-07-02 16:46:43 +10:00
if i < 0 {
2018-07-03 14:26:42 +10:00
keySection = string(data[i0:])
return keySection, "", nil
2018-07-02 16:46:43 +10:00
}
i1 := i0 + i
keySection = string(data[i0:i1])
2018-07-03 14:26:42 +10:00
i2 := i1
2018-07-02 16:46:43 +10:00
i = strings.Index(string(data[i2:]), encodingEnd)
if i < 0 {
2018-07-21 08:53:59 +10:00
common.Log.Debug("ERROR: getAsciiSections. err=%v", err)
return "", "", core.ErrTypeError
2018-07-02 16:46:43 +10:00
}
i3 := i2 + i
encodingSection = string(data[i2:i3])
return keySection, encodingSection, nil
2018-07-02 16:46:43 +10:00
}
2018-07-24 21:32:02 +10:00
// reEndline matches a run of one or more "\n" and "\r" characters. It is used to split text into
// lines whatever the line ending convention is; consecutive line endings count as one separator.
// ~/testdata/private/invoice61781040.pdf has \r line endings
var reEndline = regexp.MustCompile(`[\n\r]+`)
2018-07-03 14:26:42 +10:00
// getKeyValues returns the map encoded in `data`.
2018-07-02 16:46:43 +10:00
func getKeyValues(data string) map[string]string {
lines := reEndline.Split(data, -1)
2018-07-02 16:46:43 +10:00
keyValues := map[string]string{}
for _, line := range lines {
matches := reKeyVal.FindStringSubmatch(line)
if matches == nil {
continue
}
k, v := matches[1], matches[2]
keyValues[k] = v
}
return keyValues
}
2018-07-03 14:26:42 +10:00
// getEncodings returns the encodings encoded in `data`.
2018-11-29 23:24:40 +02:00
func getEncodings(data string) (map[textencoding.CharCode]textencoding.GlyphName, error) {
2018-07-02 16:46:43 +10:00
lines := strings.Split(data, "\n")
2018-11-29 23:24:40 +02:00
keyValues := make(map[textencoding.CharCode]textencoding.GlyphName)
2018-07-02 16:46:43 +10:00
for _, line := range lines {
matches := reEncoding.FindStringSubmatch(line)
if matches == nil {
continue
}
k, glyph := matches[1], matches[2]
code, err := strconv.Atoi(k)
if err != nil {
common.Log.Debug("ERROR: Bad encoding line. %q", line)
return nil, core.ErrTypeError
2018-07-02 16:46:43 +10:00
}
2018-11-29 23:24:40 +02:00
keyValues[textencoding.CharCode(code)] = textencoding.GlyphName(glyph)
2018-07-02 16:46:43 +10:00
}
2018-07-21 08:53:59 +10:00
common.Log.Trace("getEncodings: keyValues=%#v", keyValues)
2018-07-02 16:46:43 +10:00
return keyValues, nil
}
2018-07-03 14:26:42 +10:00
// decodeEexec returns the decoding of the eexec bytes `data`.
// The first 4 bytes of `data` only advance the decryption key and produce no output, so the
// result is 4 bytes shorter than `data`. nil is returned when `data` has fewer than 4 bytes.
func decodeEexec(data []byte) []byte {
	const (
		c1   = 52845
		c2   = 22719
		skip = 4 // leading bytes that are run through the key but not emitted
	)
	if len(data) < skip {
		return nil
	}

	// Only bits 8-15 of the key are ever used, so the key is kept as a 16 bit value and the
	// arithmetic wraps modulo 65536.
	seed := uint16(55665) // eexec key

	// Run the seed through the encoder 4 times
	for _, b := range data[:skip] {
		seed = (uint16(b)+seed)*c1 + c2
	}

	decoded := make([]byte, len(data)-skip)
	for i, b := range data[skip:] {
		decoded[i] = b ^ byte(seed>>8)
		seed = (uint16(b)+seed)*c1 + c2
	}
	return decoded
}
2018-07-03 14:26:42 +10:00
// isBinary returns true if `data` is binary. See Adobe Type 1 Font Format specification
// 7.2 eexec encryption
// `data` is reported as binary when it has fewer than 4 bytes, or when any of its first 4 bytes
// is neither an ASCII hexadecimal digit nor ASCII white space. Later bytes are not examined.
func isBinary(data []byte) bool {
	if len(data) < 4 {
		return true
	}
	// Range over the byte values; the index alone says nothing about the content.
	for _, b := range data[:4] {
		// Bytes above 0x7F are never ASCII. They are rejected before the unicode tests because
		// unicode.IsSpace would accept the runes U+0085 and U+00A0.
		if b > unicode.MaxASCII {
			return true
		}
		r := rune(b)
		if !unicode.Is(unicode.ASCII_Hex_Digit, r) && !unicode.IsSpace(r) {
			return true
		}
	}
	return false
}