unipdf/pdf/model/fontfile.go

/*
 * This file is subject to the terms and conditions defined in
 * file 'LICENSE.md', which is part of this source code package.
 *

 /*
  * A font file is a stream containing a Type 1 font program. It appears in PDF files as a
  * /FontFile entry in a /FontDescriptor dictionary.
  *
  * 9.9 Embedded Font Programs (page 289)
  *
  * TODO: Add Type1C support
*/

package model

import (
	"errors"
	"fmt"
	"regexp"
	"strconv"
	"strings"
	"unicode"

	"github.com/unidoc/unidoc/common"
	"github.com/unidoc/unidoc/pdf/core"
	"github.com/unidoc/unidoc/pdf/model/textencoding"
)

// fontFile represents a font file.
// Currently this is just the identifying information and the text encoder created from the font
// file's encoding section.
type fontFile struct {
	name    string
	subtype string
	encoder *textencoding.SimpleEncoder
}

// String returns a human readable description of `fontfile`.
func (fontfile *fontFile) String() string {
	encoding := "[None]"
	if fontfile.encoder != nil {
		encoding = fontfile.encoder.String()
	}
	return fmt.Sprintf("FONTFILE{%#q encoder=%s}", fontfile.name, encoding)
}

// newFontFileFromPdfObject loads a FontFile from a PdfObject.  Can either be a
// *PdfIndirectObject or a *PdfObjectDictionary.
func newFontFileFromPdfObject(obj core.PdfObject) (*fontFile, error) {
	common.Log.Trace("newFontFileFromPdfObject: obj=%s", obj)
	fontfile := &fontFile{}

	obj = core.TraceToDirectObject(obj)

	streamObj, ok := obj.(*core.PdfObjectStream)
	if !ok {
		common.Log.Debug("ERROR: FontFile must be a stream (%T)", obj)
		return nil, core.ErrTypeError
	}
	d := streamObj.PdfObjectDictionary
	data, err := core.DecodeStream(streamObj)
	if err != nil {
		return nil, err
	}

	subtype, ok := core.GetNameVal(d.Get("Subtype"))
	if !ok {
		fontfile.subtype = subtype
		if subtype == "Type1C" {
			// XXX: TODO Add Type1C support
			common.Log.Debug("Type1C fonts are currently not supported")
			return nil, ErrType1CFontNotSupported
		}
	}

	length1, _ := core.GetIntVal(d.Get("Length1"))
	length2, _ := core.GetIntVal(d.Get("Length2"))

	if length1 > len(data) {
		length1 = len(data)
	}
	if length1+length2 > len(data) {
		length2 = len(data) - length1
	}

	segment1 := data[:length1]
	segment2 := []byte{}
	if length2 > 0 {
		segment2 = data[length1 : length1+length2]
	}

	// empty streams are ignored
	if length1 > 0 && length2 > 0 {
		err := fontfile.loadFromSegments(segment1, segment2)
		if err != nil {
			return nil, err
		}
	}

	return fontfile, nil
}

// loadFromSegments loads a Type1Font object from two header-less .pfb segments.
// Based on pdfbox
func (fontfile *fontFile) loadFromSegments(segment1, segment2 []byte) error {
	common.Log.Trace("loadFromSegments: %d %d", len(segment1), len(segment2))
	err := fontfile.parseAsciiPart(segment1)
	if err != nil {
		return err
	}
	common.Log.Trace("fontfile=%s", fontfile)
	if len(segment2) == 0 {
		return nil
	}
	common.Log.Trace("fontfile=%s", fontfile)
	return nil
}

// parseAsciiPart parses the ASCII part of the FontFile.
func (fontfile *fontFile) parseAsciiPart(data []byte) error {

	// Uncomment these lines to see the contents of the font file. For debugging.
	// fmt.Println("~~~~~~~~~~~~~~~~~~~~~~~^^^~~~~~~~~~~~~~~~~~~~~~~~")
	// fmt.Printf("data=%s\n", string(data))
	// fmt.Println("~~~~~~~~~~~~~~~~~~~~~~~!!!~~~~~~~~~~~~~~~~~~~~~~~")

	// The start of a FontFile looks like
	//     %!PS-AdobeFont-1.0: MyArial 003.002
	//     %%Title: MyArial
	// or
	//     %!FontType1-1.0
	if len(data) < 2 || string(data[:2]) != "%!" {
		return errors.New("Invalid start of ASCII segment")
	}

	keySection, encodingSection, err := getAsciiSections(data)
	if err != nil {
		return err
	}
	keyValues := getKeyValues(keySection)

	fontfile.name = keyValues["FontName"]
	if fontfile.name == "" {
		common.Log.Debug("ERROR: FontFile has no /FontName")
		return ErrRequiredAttributeMissing
	}

	if encodingSection != "" {
		encodings, err := getEncodings(encodingSection)
		if err != nil {
			return err
		}
		encoder, err := textencoding.NewCustomSimpleTextEncoder(encodings, nil)
		if err != nil {
			// XXX: Logging an error because we need to fix all these misses.
			common.Log.Error("UNKNOWN GLYPH: err=%v", err)
			return nil
		}
		fontfile.encoder = encoder
	}
	return nil
}

var (
	reDictBegin   = regexp.MustCompile(`\d+ dict\s+(dup\s+)?begin`)
	reKeyVal      = regexp.MustCompile(`^\s*/(\S+?)\s+(.+?)\s+def\s*$`)
	reEncoding    = regexp.MustCompile(`^\s*dup\s+(\d+)\s*/(\w+?)(?:\.\d+)?\s+put$`)
	encodingBegin = "/Encoding 256 array"
	encodingEnd   = "readonly def"
	binaryStart   = "currentfile eexec"
)

// getAsciiSections returns two sections of `data`, the ASCII part of the FontFile
//   - the general key values in `keySection`
//   - the encoding in `encodingSection`
func getAsciiSections(data []byte) (keySection, encodingSection string, err error) {
	common.Log.Trace("getAsciiSections: %d ", len(data))
	loc := reDictBegin.FindIndex(data)
	if loc == nil {
		common.Log.Debug("ERROR: getAsciiSections. No dict.")
		return "", "", core.ErrTypeError
	}
	i0 := loc[1]
	i := strings.Index(string(data[i0:]), encodingBegin)
	if i < 0 {
		keySection = string(data[i0:])
		return keySection, "", nil
	}
	i1 := i0 + i
	keySection = string(data[i0:i1])

	i2 := i1
	i = strings.Index(string(data[i2:]), encodingEnd)
	if i < 0 {
		common.Log.Debug("ERROR: getAsciiSections. err=%v", err)
		return "", "", core.ErrTypeError
	}
	i3 := i2 + i
	encodingSection = string(data[i2:i3])
	return keySection, encodingSection, nil
}

// ~/testdata/private/invoice61781040.pdf has \r line endings
var reEndline = regexp.MustCompile(`[\n\r]+`)

// getKeyValues returns the map encoded in `data`.
func getKeyValues(data string) map[string]string {
	lines := reEndline.Split(data, -1)
	keyValues := map[string]string{}
	for _, line := range lines {
		matches := reKeyVal.FindStringSubmatch(line)
		if matches == nil {
			continue
		}
		k, v := matches[1], matches[2]
		keyValues[k] = v
	}
	return keyValues
}

// getEncodings returns the encodings encoded in `data`.
func getEncodings(data string) (map[uint16]string, error) {
	lines := strings.Split(data, "\n")
	keyValues := map[uint16]string{}
	for _, line := range lines {
		matches := reEncoding.FindStringSubmatch(line)
		if matches == nil {
			continue
		}
		k, glyph := matches[1], matches[2]
		code, err := strconv.Atoi(k)
		if err != nil {
			common.Log.Debug("ERROR: Bad encoding line. %q", line)
			return nil, core.ErrTypeError
		}
		keyValues[uint16(code)] = glyph
	}
	common.Log.Trace("getEncodings: keyValues=%#v", keyValues)
	return keyValues, nil
}

// decodeEexec returns the decoding of the eexec bytes `data`
func decodeEexec(data []byte) []byte {
	const c1 = 52845
	const c2 = 22719

	seed := 55665 // eexec key
	// Run the seed through the encoder 4 times
	for _, b := range data[:4] {
		seed = (int(b)+seed)*c1 + c2
	}
	decoded := make([]byte, len(data)-4)
	for i, b := range data[4:] {
		decoded[i] = byte(int(b) ^ seed>>8)
		seed = (int(b)+seed)*c1 + c2
	}
	return decoded
}

// isBinary returns true if `data` is binary. See Adobe Type 1 Font Format specification
// 7.2 eexec encryption
func isBinary(data []byte) bool {
	if len(data) < 4 {
		return true
	}
	for b := range data[:4] {
		r := rune(b)
		if !unicode.Is(unicode.ASCII_Hex_Digit, r) && !unicode.IsSpace(r) {
			return true
		}
	}
	return false
}

// truncate returns the first `n` characters f string `s`.
func truncate(s string, n int) string {
	if len(s) < n {
		return s
	}
	return s[:n]
}