unipdf/extractor/text.go

1300 lines
39 KiB
Go
Raw Normal View History

/*
* This file is subject to the terms and conditions defined in
* file 'LICENSE.md', which is part of this source code package.
*/
package extractor
import (
"errors"
"fmt"
"math"
"sort"
"strings"
2018-11-28 18:06:03 +11:00
"unicode"
"github.com/unidoc/unipdf/v3/common"
"github.com/unidoc/unipdf/v3/contentstream"
"github.com/unidoc/unipdf/v3/core"
"github.com/unidoc/unipdf/v3/internal/transform"
"github.com/unidoc/unipdf/v3/model"
2018-11-28 18:06:03 +11:00
"golang.org/x/text/unicode/norm"
)
// ExtractText processes and extracts all text data in content streams and returns as a string.
// It takes into account character encodings in the PDF file, which are decoded by
// CharcodeBytesToUnicode.
// Characters that can't be decoded are replaced with MissingCodeRune ('\ufffd' = <20>).
func (e *Extractor) ExtractText() (string, error) {
2018-11-28 23:25:17 +00:00
text, _, _, err := e.ExtractTextWithStats()
return text, err
}
2019-01-04 16:07:03 +11:00
// ExtractTextWithStats works like ExtractText but returns the number of characters in the output
// (`numChars`) and the number of characters that were not decoded (`numMisses`).
func (e *Extractor) ExtractTextWithStats() (extracted string, numChars int, numMisses int, err error) {
2019-01-04 16:07:03 +11:00
pageText, numChars, numMisses, err := e.ExtractPageText()
if err != nil {
return "", numChars, numMisses, err
}
2019-01-04 16:07:03 +11:00
return pageText.ToText(), numChars, numMisses, nil
}
2019-01-04 16:07:03 +11:00
// ExtractPageText returns the text contents of `e` (an Extractor for a page) as a PageText,
// followed by the values that extractPageText reports for the character and miss counts.
func (e *Extractor) ExtractPageText() (*PageText, int, int, error) {
	// The page's own content stream is processed at nesting level 0.
	const pageLevel = 0
	return e.extractPageText(e.contents, e.resources, pageLevel)
}
2019-01-04 16:07:03 +11:00
// extractPageText returns the text contents of content stream `e` and resouces `resources` as a
// PageText.
2018-12-27 21:33:31 +11:00
// This can be called on a page or a form XObject.
2019-01-04 16:07:03 +11:00
func (e *Extractor) extractPageText(contents string, resources *model.PdfPageResources, level int) (*PageText, int, int, error) {
2019-01-04 16:07:03 +11:00
common.Log.Trace("extractPageText: level=%d", level)
pageText := &PageText{}
state := newTextState()
fontStack := fontStacker{}
var to *textObject
cstreamParser := contentstream.NewContentStreamParser(contents)
operations, err := cstreamParser.Parse()
if err != nil {
2019-01-04 16:07:03 +11:00
common.Log.Debug("ERROR: extractPageText parse failed. err=%v", err)
return pageText, state.numChars, state.numMisses, err
}
processor := contentstream.NewContentStreamProcessor(*operations)
processor.AddHandler(contentstream.HandlerConditionEnumAllOperands, "",
func(op *contentstream.ContentStreamOperation, gs contentstream.GraphicsState,
resources *model.PdfPageResources) error {
operand := op.Operand
switch operand {
case "q":
if !fontStack.empty() {
common.Log.Trace("Save font state: %s\n%s",
fontStack.peek(), fontStack.String())
fontStack.push(fontStack.peek())
}
2019-01-02 10:39:30 +11:00
if state.tfont != nil {
common.Log.Trace("Save font state: %s\n->%s\n%s",
2019-01-02 10:39:30 +11:00
fontStack.peek(), state.tfont, fontStack.String())
fontStack.push(state.tfont)
}
case "Q":
if !fontStack.empty() {
common.Log.Trace("Restore font state: %s\n->%s\n%s",
fontStack.peek(), fontStack.get(-2), fontStack.String())
fontStack.pop()
}
if len(fontStack) >= 2 {
common.Log.Trace("Restore font state: %s\n->%s\n%s",
2019-01-02 10:39:30 +11:00
state.tfont, fontStack.peek(), fontStack.String())
state.tfont = fontStack.pop()
}
case "BT": // Begin text
// Begin a text object, initializing the text matrix, Tm, and the text line matrix,
// Tlm, to the identity matrix. Text objects shall not be nested; a second BT shall
// not appear before an ET.
if to != nil {
common.Log.Debug("BT called while in a text object")
}
to = newTextObject(e, resources, gs, &state, &fontStack)
case "ET": // End Text
2019-01-05 14:10:54 +11:00
pageText.marks = append(pageText.marks, to.marks...)
to = nil
case "T*": // Move to start of next text line
to.nextLine()
case "Td": // Move text location
if ok, err := to.checkOp(op, 2, true); !ok {
common.Log.Debug("ERROR: err=%v", err)
return err
}
x, y, err := toFloatXY(op.Params)
if err != nil {
return err
}
to.moveText(x, y)
case "TD": // Move text location and set leading.
if ok, err := to.checkOp(op, 2, true); !ok {
common.Log.Debug("ERROR: err=%v", err)
return err
}
x, y, err := toFloatXY(op.Params)
if err != nil {
common.Log.Debug("ERROR: err=%v", err)
return err
}
to.moveTextSetLeading(x, y)
2018-12-27 21:33:31 +11:00
case "Tj": // Show text.
if ok, err := to.checkOp(op, 1, true); !ok {
common.Log.Debug("ERROR: Tj op=%s err=%v", op, err)
return err
}
charcodes, ok := core.GetStringBytes(op.Params[0])
if !ok {
common.Log.Debug("ERROR: Tj op=%s GetStringBytes failed", op)
return core.ErrTypeError
}
return to.showText(charcodes)
2018-12-27 21:33:31 +11:00
case "TJ": // Show text with adjustable spacing.
if ok, err := to.checkOp(op, 1, true); !ok {
common.Log.Debug("ERROR: TJ err=%v", err)
return err
}
2018-07-25 13:19:09 +10:00
args, ok := core.GetArray(op.Params[0])
if !ok {
common.Log.Debug("ERROR: TJ op=%s GetArrayVal failed", op)
return err
}
return to.showTextAdjusted(args)
2018-12-27 21:33:31 +11:00
case "'": // Move to next line and show text.
if ok, err := to.checkOp(op, 1, true); !ok {
common.Log.Debug("ERROR: ' err=%v", err)
return err
}
charcodes, ok := core.GetStringBytes(op.Params[0])
if !ok {
common.Log.Debug("ERROR: ' op=%s GetStringBytes failed", op)
return core.ErrTypeError
}
to.nextLine()
return to.showText(charcodes)
2018-12-27 21:33:31 +11:00
case `"`: // Set word and character spacing, move to next line, and show text.
if ok, err := to.checkOp(op, 1, true); !ok {
common.Log.Debug("ERROR: \" err=%v", err)
return err
}
x, y, err := toFloatXY(op.Params[:2])
if err != nil {
return err
}
charcodes, ok := core.GetStringBytes(op.Params[2])
if !ok {
common.Log.Debug("ERROR: \" op=%s GetStringBytes failed", op)
return core.ErrTypeError
}
to.setCharSpacing(x)
to.setWordSpacing(y)
to.nextLine()
return to.showText(charcodes)
2018-12-27 21:33:31 +11:00
case "TL": // Set text leading.
y, err := floatParam(op)
if err != nil {
common.Log.Debug("ERROR: TL err=%v", err)
return err
}
to.setTextLeading(y)
2018-12-27 21:33:31 +11:00
case "Tc": // Set character spacing.
y, err := floatParam(op)
if err != nil {
common.Log.Debug("ERROR: Tc err=%v", err)
return err
}
to.setCharSpacing(y)
2018-12-27 21:33:31 +11:00
case "Tf": // Set font.
if to == nil {
2018-11-30 23:01:04 +00:00
// This is needed for 26-Hazard-Thermal-environment.pdf
to = newTextObject(e, resources, gs, &state, &fontStack)
}
if ok, err := to.checkOp(op, 2, true); !ok {
common.Log.Debug("ERROR: Tf err=%v", err)
return err
}
name, ok := core.GetNameVal(op.Params[0])
if !ok {
common.Log.Debug("ERROR: Tf op=%s GetNameVal failed", op)
return core.ErrTypeError
}
size, err := core.GetNumberAsFloat(op.Params[1])
if !ok {
common.Log.Debug("ERROR: Tf op=%s GetFloatVal failed. err=%v", op, err)
return err
}
err = to.setFont(name, size)
if err != nil {
return err
}
2018-12-27 21:33:31 +11:00
case "Tm": // Set text matrix.
if ok, err := to.checkOp(op, 6, true); !ok {
common.Log.Debug("ERROR: Tm err=%v", err)
return err
}
floats, err := core.GetNumbersAsFloat(op.Params)
if err != nil {
common.Log.Debug("ERROR: err=%v", err)
return err
}
to.setTextMatrix(floats)
2018-12-27 21:33:31 +11:00
case "Tr": // Set text rendering mode.
if ok, err := to.checkOp(op, 1, true); !ok {
common.Log.Debug("ERROR: Tr err=%v", err)
return err
}
mode, ok := core.GetIntVal(op.Params[0])
if !ok {
common.Log.Debug("ERROR: Tr op=%s GetIntVal failed", op)
return core.ErrTypeError
}
to.setTextRenderMode(mode)
2018-12-27 21:33:31 +11:00
case "Ts": // Set text rise.
if ok, err := to.checkOp(op, 1, true); !ok {
common.Log.Debug("ERROR: Ts err=%v", err)
return err
}
y, err := core.GetNumberAsFloat(op.Params[0])
if err != nil {
common.Log.Debug("ERROR: err=%v", err)
return err
}
to.setTextRise(y)
2018-12-27 21:33:31 +11:00
case "Tw": // Set word spacing.
if ok, err := to.checkOp(op, 1, true); !ok {
common.Log.Debug("ERROR: err=%v", err)
return err
}
y, err := core.GetNumberAsFloat(op.Params[0])
if err != nil {
common.Log.Debug("ERROR: err=%v", err)
return err
}
to.setWordSpacing(y)
2018-12-27 21:33:31 +11:00
case "Tz": // Set horizontal scaling.
if ok, err := to.checkOp(op, 1, true); !ok {
common.Log.Debug("ERROR: err=%v", err)
return err
}
y, err := core.GetNumberAsFloat(op.Params[0])
if err != nil {
common.Log.Debug("ERROR: err=%v", err)
return err
}
to.setHorizScaling(y)
case "Do":
2018-12-27 21:33:31 +11:00
// Handle XObjects by recursing through form XObjects.
name := *op.Params[0].(*core.PdfObjectName)
_, xtype := resources.GetXObjectByName(name)
if xtype != model.XObjectTypeForm {
break
}
2018-12-27 20:53:37 +11:00
// Only process each form once.
formResult, ok := e.formResults[string(name)]
if !ok {
xform, err := resources.GetXObjectFormByName(name)
if err != nil {
common.Log.Debug("ERROR: %v", err)
return err
}
formContent, err := xform.GetContentStream()
if err != nil {
common.Log.Debug("ERROR: %v", err)
return err
}
formResources := xform.Resources
if formResources == nil {
formResources = resources
}
2019-01-04 16:07:03 +11:00
tList, numChars, numMisses, err := e.extractPageText(string(formContent),
formResources, level+1)
if err != nil {
common.Log.Debug("ERROR: %v", err)
return err
}
formResult = textResult{*tList, numChars, numMisses}
e.formResults[string(name)] = formResult
}
2019-01-05 14:10:54 +11:00
pageText.marks = append(pageText.marks, formResult.pageText.marks...)
state.numChars += formResult.numChars
state.numMisses += formResult.numMisses
}
return nil
})
err = processor.Process(resources)
if err != nil {
common.Log.Debug("ERROR: Processing: err=%v", err)
}
2019-01-04 16:07:03 +11:00
return pageText, state.numChars, state.numMisses, err
}
type textResult struct {
2019-01-04 16:07:03 +11:00
pageText PageText
numChars int
numMisses int
}
//
// Text operators
//
// moveText implements the "Td" operator: it moves the start of text by `tx`,`ty`.
// The start of the next line is offset from the start of the current line by (tx, ty), and the
// text position is set to that point (see moveTo).
// tx and ty are in unscaled text space units.
func (to *textObject) moveText(tx, ty float64) {
to.moveTo(tx, ty)
}
// moveTextSetLeading "TD" Move text location and set leading.
// Move to the start of the next line, offset from the start of the current line by (tx, ty). As a
// side effect, this operator shall set the leading parameter in the text state. This operator shall
// have the same effect as this code:
// ty TL
// tx ty Td
func (to *textObject) moveTextSetLeading(tx, ty float64) {
2019-01-02 10:39:30 +11:00
to.state.tl = -ty
to.moveTo(tx, ty)
}
2019-01-03 15:41:36 +11:00
// nextLine "T*"" Moves start of text line to next text line
// Move to the start of the next line. This operator has the same effect as the code
// 0 -Tl Td
// where Tl denotes the current leading parameter in the text state. The negative of Tl is used
// here because Tl is the text leading expressed as a positive number. Going to the next line
// entails decreasing the y coordinate. (page 250)
func (to *textObject) nextLine() {
2019-01-02 10:39:30 +11:00
to.moveTo(0, -to.state.tl)
}
// setTextMatrix "Tm".
// Set the text matrix, Tm, and the text line matrix, Tlm to the Matrix specified by the 6 numbers
2018-11-27 13:37:12 +11:00
// in `f` (page 250).
func (to *textObject) setTextMatrix(f []float64) {
2018-11-28 23:25:17 +00:00
if len(f) != 6 {
common.Log.Debug("ERROR: len(f) != 6 (%d)", len(f))
return
}
a, b, c, d, tx, ty := f[0], f[1], f[2], f[3], f[4], f[5]
2019-01-02 10:39:30 +11:00
to.tm = transform.NewMatrix(a, b, c, d, tx, ty)
to.tlm = to.tm
}
2018-11-27 13:37:12 +11:00
// showText "Tj". Show a text string.
func (to *textObject) showText(charcodes []byte) error {
2018-07-02 16:46:43 +10:00
return to.renderText(charcodes)
}
2018-11-27 13:37:12 +11:00
// showTextAdjusted "TJ". Show text with adjustable spacing.
2018-07-25 13:19:09 +10:00
func (to *textObject) showTextAdjusted(args *core.PdfObjectArray) error {
vertical := false
2018-07-25 13:19:09 +10:00
for _, o := range args.Elements() {
switch o.(type) {
case *core.PdfObjectFloat, *core.PdfObjectInteger:
x, err := core.GetNumberAsFloat(o)
if err != nil {
2018-11-27 13:37:12 +11:00
common.Log.Debug("ERROR: showTextAdjusted. Bad numerical arg. o=%s args=%+v", o, args)
return err
}
2019-01-02 10:39:30 +11:00
dx, dy := -x*0.001*to.state.tfs, 0.0
if vertical {
dy, dx = dx, dy
}
td := translationMatrix(transform.Point{X: dx, Y: dy})
to.tm.Concat(td)
2019-01-02 10:39:30 +11:00
common.Log.Trace("showTextAdjusted: dx,dy=%3f,%.3f Tm=%s", dx, dy, to.tm)
case *core.PdfObjectString:
charcodes, ok := core.GetStringBytes(o)
if !ok {
2018-11-27 13:37:12 +11:00
common.Log.Trace("showTextAdjusted: Bad string arg. o=%s args=%+v", o, args)
return core.ErrTypeError
}
to.renderText(charcodes)
default:
2018-11-27 13:37:12 +11:00
common.Log.Debug("ERROR: showTextAdjusted. Unexpected type (%T) args=%+v", o, args)
return core.ErrTypeError
}
}
return nil
}
2018-11-27 13:37:12 +11:00
// setTextLeading "TL". Set text leading.
func (to *textObject) setTextLeading(y float64) {
2019-01-02 10:39:30 +11:00
if to == nil || to.state == nil {
return
}
2019-01-02 10:39:30 +11:00
to.state.tl = y
}
2018-11-27 13:37:12 +11:00
// setCharSpacing "Tc". Set character spacing.
func (to *textObject) setCharSpacing(x float64) {
if to == nil {
return
}
2019-01-02 10:39:30 +11:00
to.state.tc = x
}
2018-11-27 13:37:12 +11:00
// setFont "Tf". Set font.
func (to *textObject) setFont(name string, size float64) error {
if to == nil {
return nil
}
font, err := to.getFont(name)
2018-07-03 14:26:42 +10:00
if err == nil {
2019-01-02 10:39:30 +11:00
to.state.tfont = font
if len(*to.fontStack) == 0 {
to.fontStack.push(font)
} else {
(*to.fontStack)[len(*to.fontStack)-1] = font
}
} else if err == model.ErrFontNotSupported {
// TODO(peterwilliams97): Do we need to handle this case in a special way?
return err
2018-07-03 14:26:42 +10:00
} else {
return err
}
2019-01-02 10:39:30 +11:00
to.state.tfs = size
return nil
}
2018-11-27 13:37:12 +11:00
// setTextRenderMode "Tr". Set text rendering mode.
func (to *textObject) setTextRenderMode(mode int) {
if to == nil {
return
}
2019-01-02 10:39:30 +11:00
to.state.tmode = RenderMode(mode)
}
2018-11-27 13:37:12 +11:00
// setTextRise "Ts". Set text rise.
func (to *textObject) setTextRise(y float64) {
if to == nil {
return
}
2019-01-02 10:39:30 +11:00
to.state.trise = y
}
2018-11-27 13:37:12 +11:00
// setWordSpacing "Tw". Set word spacing.
func (to *textObject) setWordSpacing(y float64) {
if to == nil {
return
}
2019-01-02 10:39:30 +11:00
to.state.tw = y
}
2018-11-27 13:37:12 +11:00
// setHorizScaling "Tz". Set horizontal scaling.
func (to *textObject) setHorizScaling(y float64) {
if to == nil {
return
}
2019-01-02 10:39:30 +11:00
to.state.th = y
}
2018-11-28 23:25:17 +00:00
// floatParam returns the single numeric parameter of operator `op` as a float, or an error if
// `op` does not have exactly one parameter or that parameter cannot be converted to a float.
// It does not check whether we are inside a text object.
func floatParam(op *contentstream.ContentStreamOperation) (float64, error) {
	if n := len(op.Params); n != 1 {
		common.Log.Debug("ERROR: %#q should have %d input params, got %d %+v",
			op.Operand, 1, n, op.Params)
		return 0.0, errors.New("incorrect parameter count")
	}
	return core.GetNumberAsFloat(op.Params[0])
}
// checkOp returns true if we are in a text stream and `op` has `numParams` params.
// If `hard` is true and the number of params don't match, an error is returned.
func (to *textObject) checkOp(op *contentstream.ContentStreamOperation, numParams int,
hard bool) (ok bool, err error) {
if to == nil {
2018-11-28 23:25:17 +00:00
var params []core.PdfObject
if numParams > 0 {
params = op.Params
if len(params) > numParams {
params = params[:numParams]
}
}
common.Log.Debug("%#q operand outside text. params=%+v", op.Operand, params)
}
if numParams >= 0 {
if len(op.Params) != numParams {
if hard {
err = errors.New("incorrect parameter count")
}
common.Log.Debug("ERROR: %#q should have %d input params, got %d %+v",
op.Operand, numParams, len(op.Params), op.Params)
return false, err
}
}
return true, nil
}
// fontStacker is the PDF font stack implementation.
// It is a slice of fonts whose last element is the top of the stack; entries may be nil.
type fontStacker []*model.PdfFont
// String returns a multi-line description of the font stack: a header line followed by one line
// per entry, bottom of the stack first. Nil entries are shown as "<nil>".
func (fontStack *fontStacker) String() string {
	lines := make([]string, 0, len(*fontStack)+1)
	lines = append(lines, "---- font stack")
	for i, font := range *fontStack {
		desc := "<nil>"
		if font != nil {
			desc = font.String()
		}
		lines = append(lines, fmt.Sprintf("\t%2d: %s", i, desc))
	}
	return strings.Join(lines, "\n")
}
// push pushes `font` onto the top of the font stack. `font` may be nil.
func (fontStack *fontStacker) push(font *model.PdfFont) {
*fontStack = append(*fontStack, font)
}
// pop removes the element on the top of the font stack and returns it.
// It returns nil and leaves the stack unchanged if the stack is empty.
func (fontStack *fontStacker) pop() *model.PdfFont {
	n := len(*fontStack)
	if n == 0 {
		return nil
	}
	top := (*fontStack)[n-1]
	*fontStack = (*fontStack)[:n-1]
	return top
}
// peek returns the element on the top of the font stack without removing it, or nil if the stack
// is empty.
func (fontStack *fontStacker) peek() *model.PdfFont {
	if n := len(*fontStack); n > 0 {
		return (*fontStack)[n-1]
	}
	return nil
}
// get returns the `idx`'th element of the font stack, or nil if `idx` is out of range.
// idx = 0: bottom of font stack
// idx = len(fontstack) - 1: top of font stack
// idx = -n is the same as idx = len(fontstack) - n, so fontstack.get(-1) is the same as
// fontstack.peek()
func (fontStack *fontStacker) get(idx int) *model.PdfFont {
	n := fontStack.size()
	if idx < 0 {
		idx += n
	}
	if idx >= 0 && idx < n {
		return (*fontStack)[idx]
	}
	return nil
}
// empty reports whether the font stack has no elements.
func (fontStack *fontStacker) empty() bool {
	return fontStack.size() == 0
}
// size returns the number of elements in the font stack, counting nil entries.
func (fontStack *fontStacker) size() int {
return len(*fontStack)
}
// 9.3 Text State Parameters and Operators (page 243)
// Some of these parameters are expressed in unscaled text space units. This means that they shall
// be specified in a coordinate system that shall be defined by the text matrix, Tm but shall not be
// scaled by the font size parameter, Tfs.
// textState represents the text state.
type textState struct {
2019-01-02 10:39:30 +11:00
tc float64 // Character spacing. Unscaled text space units.
tw float64 // Word spacing. Unscaled text space units.
th float64 // Horizontal scaling.
tl float64 // Leading. Unscaled text space units. Used by TD,T*,'," see Table 108.
tfs float64 // Text font size.
tmode RenderMode // Text rendering mode.
trise float64 // Text rise. Unscaled text space units. Set by Ts.
tfont *model.PdfFont // Text font.
// For debugging
numChars int
numMisses int
}
// 9.4.1 General (page 248)
// A PDF text object consists of operators that may show text strings, move the text position, and
// set text state and certain other parameters. In addition, two parameters may be specified only
// within a text object and shall not persist from one text object to the next:
// • Tm, the text matrix
// • Tlm, the text line matrix
//
// Text space is converted to device space by this transform (page 252)
2018-09-20 11:49:44 +10:00
// Trm is the text rendering matrix
//        | Tfs x Th   0      0 |
//  Trm = | 0          Tfs    0 | × Tm × CTM
//        | 0          Trise  1 |
// This corresponds to the following code in renderText()
//     trm := to.gs.CTM.Mult(to.tm).Mult(stateMatrix)
// textObject represents a PDF text object.
type textObject struct {
e *Extractor
resources *model.PdfPageResources
gs contentstream.GraphicsState
fontStack *fontStacker
2019-01-02 10:39:30 +11:00
state *textState
tm transform.Matrix // Text matrix. For the character pointer.
tlm transform.Matrix // Text line matrix. For the start of line pointer.
marks []textMark // Text marks get written here.
}
// newTextState returns a default textState.
func newTextState() textState {
return textState{
2019-01-02 10:39:30 +11:00
th: 100,
tmode: RenderModeFill,
}
}
// newTextObject returns a default textObject.
func newTextObject(e *Extractor, resources *model.PdfPageResources, gs contentstream.GraphicsState,
state *textState,
fontStack *fontStacker) *textObject {
return &textObject{
e: e,
resources: resources,
gs: gs,
fontStack: fontStack,
2019-01-02 10:39:30 +11:00
state: state,
tm: transform.IdentityMatrix(),
tlm: transform.IdentityMatrix(),
}
}
2018-11-28 23:25:17 +00:00
// renderText processes and renders byte array `data` for extraction purposes.
func (to *textObject) renderText(data []byte) error {
2018-09-20 11:49:44 +10:00
font := to.getCurrentFont()
2018-09-18 12:18:04 +10:00
2018-10-30 21:55:30 +11:00
charcodes := font.BytesToCharcodes(data)
runes, numChars, numMisses := font.CharcodesToUnicodeWithStats(charcodes)
2018-11-28 18:06:03 +11:00
if numMisses > 0 {
common.Log.Debug("renderText: numChars=%d numMisses=%d", numChars, numMisses)
}
2018-10-30 21:55:30 +11:00
2019-01-02 10:39:30 +11:00
to.state.numChars += numChars
to.state.numMisses += numMisses
2019-01-02 10:39:30 +11:00
state := to.state
tfs := state.tfs
th := state.th / 100.0
spaceMetrics, ok := font.GetRuneMetrics(' ')
if !ok {
spaceMetrics, ok = font.GetCharMetrics(32)
}
if !ok {
spaceMetrics, _ = model.DefaultFont().GetRuneMetrics(' ')
}
spaceWidth := spaceMetrics.Wx * glyphTextRatio
common.Log.Trace("spaceWidth=%.2f text=%q font=%s fontSize=%.1f", spaceWidth, runes, font, tfs)
2018-09-20 11:49:44 +10:00
stateMatrix := transform.NewMatrix(
2018-09-20 11:49:44 +10:00
tfs*th, 0,
0, tfs,
2019-01-02 10:39:30 +11:00
0, state.trise)
2018-09-20 11:49:44 +10:00
common.Log.Trace("renderText: %d codes=%+v runes=%q", len(charcodes), charcodes, runes)
for i, r := range runes {
2018-11-30 23:01:04 +00:00
// TODO(peterwilliams97): Need to find and fix cases where this happens.
2018-12-07 18:43:24 +02:00
if r == '\x00' {
2018-11-28 18:06:03 +11:00
continue
}
2018-10-30 21:55:30 +11:00
code := charcodes[i]
2018-09-20 11:49:44 +10:00
// The location of the text on the page in device coordinates is given by trm, the text
// rendering matrix.
trm := to.gs.CTM.Mult(to.tm).Mult(stateMatrix)
2018-09-20 11:49:44 +10:00
// calculate the text location displacement due to writing `r`. We will use this to update
2019-01-02 10:39:30 +11:00
// to.tm
2018-09-20 11:49:44 +10:00
// w is the unscaled movement at the end of a word.
w := 0.0
2018-12-07 18:43:24 +02:00
if r == ' ' {
2019-01-02 10:39:30 +11:00
w = state.tw
2018-09-20 11:49:44 +10:00
}
2018-10-30 21:55:30 +11:00
m, ok := font.GetCharMetrics(code)
if !ok {
common.Log.Debug("ERROR: No metric for code=%d r=0x%04x=%+q %s", code, r, r, font)
2018-10-30 21:55:30 +11:00
return errors.New("no char metrics")
2018-09-20 11:49:44 +10:00
}
2018-10-30 21:55:30 +11:00
// c is the character size in unscaled text units.
c := transform.Point{X: m.Wx * glyphTextRatio, Y: m.Wy * glyphTextRatio}
2018-10-30 21:55:30 +11:00
2018-11-27 13:37:12 +11:00
// t0 is the end of this character.
// t is the displacement of the text cursor when the character is rendered.
t0 := transform.Point{X: (c.X*tfs + w) * th}
2019-01-02 10:39:30 +11:00
t := transform.Point{X: (c.X*tfs + state.tc + w) * th}
2018-09-20 11:49:44 +10:00
// td, td0 are t, t0 in matrix form.
2018-11-27 13:37:12 +11:00
// td0 is where this character ends. td is where the next character starts.
td0 := translationMatrix(t0)
td := translationMatrix(t)
2019-01-02 10:39:30 +11:00
common.Log.Trace("\"%c\" stateMatrix=%s CTM=%s Tm=%s", r, stateMatrix, to.gs.CTM, to.tm)
common.Log.Trace("tfs=%.3f th=%.3f Tc=%.3f w=%.3f (Tw=%.3f)", tfs, th, state.tc, w, state.tw)
common.Log.Trace("m=%s c=%+v t0=%+v td0=%s trm0=%s", m, c, t0, td0, td0.Mult(to.tm).Mult(to.gs.CTM))
mark := to.newTextMark(
string(r),
trm,
translation(to.gs.CTM.Mult(to.tm).Mult(td0)),
spaceWidth*trm.ScalingFactorX())
common.Log.Trace("i=%d code=%d mark=%s trm=%s", i, code, mark, trm)
to.marks = append(to.marks, mark)
// update the text matrix by the displacement of the text location.
to.tm.Concat(td)
2019-01-02 10:39:30 +11:00
common.Log.Trace("to.tm=%s", to.tm)
2018-09-20 11:49:44 +10:00
}
return nil
}
2018-09-20 11:49:44 +10:00
// glyphTextRatio converts Glyph metrics units to unscaled text space units.
// renderText multiplies glyph widths (Wx, Wy) by it, i.e. glyph metrics are in 1/1000 of a unit.
const glyphTextRatio = 1.0 / 1000.0
// translation returns the translation part of `m`.
func translation(m transform.Matrix) transform.Point {
2018-09-20 11:49:44 +10:00
tx, ty := m.Translation()
return transform.Point{X: tx, Y: ty}
2018-09-20 11:49:44 +10:00
}
// translationMatrix returns a matrix that translates by `p`, i.e. by p.X horizontally and p.Y
// vertically.
func translationMatrix(p transform.Point) transform.Matrix {
return transform.TranslationMatrix(p.X, p.Y)
}
// moveTo moves the start of line pointer by `tx`,`ty` and sets the text pointer to the
// start of line pointer.
// Move to the start of the next line, offset from the start of the current line by (tx, ty).
// `tx` and `ty` are in unscaled text space units.
func (to *textObject) moveTo(tx, ty float64) {
to.tlm.Concat(transform.NewMatrix(1, 0, 0, 1, tx, ty))
2019-01-02 10:39:30 +11:00
to.tm = to.tlm
}
2019-01-02 10:39:30 +11:00
// textMark represents text drawn on a page and its position in device coordinates.
2018-11-27 13:37:12 +11:00
// All dimensions are in device coordinates.
2019-01-02 10:39:30 +11:00
type textMark struct {
text string // The text.
orient int // The text orientation in degrees. This is the current TRM rounded to 10°.
orientedStart transform.Point // Left of text in orientation where text is horizontal.
orientedEnd transform.Point // Right of text in orientation where text is horizontal.
height float64 // Text height.
spaceWidth float64 // Best guess at the width of a space in the font the text was rendered with.
count int64 // To help with reading debug logs.
2018-11-27 13:37:12 +11:00
}
2019-01-03 15:41:36 +11:00
// newTextMark returns an textMark for text `text` rendered with text rendering matrix (TRM) `trm` and end
2018-11-27 13:37:12 +11:00
// of character device coordinates `end`. `spaceWidth` is our best guess at the width of a space in
// the font the text is rendered in device coordinates.
2019-01-03 15:41:36 +11:00
func (to *textObject) newTextMark(text string, trm transform.Matrix, end transform.Point, spaceWidth float64) textMark {
2018-11-27 13:37:12 +11:00
to.e.textCount++
theta := trm.Angle()
orient := nearestMultiple(theta, 10)
var height float64
if orient%180 != 90 {
2018-11-28 18:06:03 +11:00
height = trm.ScalingFactorY()
} else {
height = trm.ScalingFactorX()
}
2019-01-02 10:39:30 +11:00
return textMark{
text: text,
orient: orient,
orientedStart: translation(trm).Rotate(theta),
orientedEnd: end.Rotate(theta),
height: height,
spaceWidth: spaceWidth,
2018-11-27 13:37:12 +11:00
count: to.e.textCount,
}
}
// nearestMultiple returns the integer multiple of `m` that is closest to `x`.
// Halfway cases are rounded away from zero (see math.Round). An `m` of 0 is treated as 1.
func nearestMultiple(x float64, m int) int {
	step := float64(m)
	if m == 0 {
		step = 1
	}
	return int(math.Round(x/step) * step)
}
// String returns a string describing `t`.
2019-01-02 10:39:30 +11:00
func (t textMark) String() string {
return fmt.Sprintf("textMark{@%03d [%.3f,%.3f] %.1f %d° %q}",
t.count, t.orientedStart.X, t.orientedStart.Y, t.Width(), t.orient, truncate(t.text, 100))
}
2019-01-02 10:39:30 +11:00
// Width returns the width of `t`.text in the text direction: the absolute distance between the
// x coordinates of the oriented start and end points.
func (t textMark) Width() float64 {
	dx := t.orientedEnd.X - t.orientedStart.X
	return math.Abs(dx)
}
2019-01-04 16:07:03 +11:00
// PageText represents the layout of text on a device page.
// It's implementation is opaque to allow for future optimizations.
2019-01-04 16:07:03 +11:00
type PageText struct {
// PageText is currently implemented as a list of texts and their positions on a PDF page.
marks []textMark
}
2019-01-04 16:07:03 +11:00
// String returns a multi-line description of `pt`: a header line with the number of elements
// followed by one line per text mark.
func (pt PageText) String() string {
	lines := make([]string, 0, len(pt.marks)+1)
	lines = append(lines, fmt.Sprintf("PageText: %d elements", pt.length()))
	for _, mark := range pt.marks {
		lines = append(lines, mark.String())
	}
	return strings.Join(lines, "\n")
}
2019-01-04 16:07:03 +11:00
// length returns the number of elements (text marks) in `pt.marks`.
func (pt PageText) length() int {
return len(pt.marks)
}
2019-01-04 16:07:03 +11:00
// height returns the max height of the elements in `pt.marks`.
func (pt PageText) height() float64 {
2018-11-28 18:06:03 +11:00
fontHeight := 0.0
2019-01-04 16:07:03 +11:00
for _, t := range pt.marks {
2019-01-02 10:39:30 +11:00
if t.height > fontHeight {
fontHeight = t.height
2018-11-28 18:06:03 +11:00
}
}
return fontHeight
}
2019-01-04 16:07:03 +11:00
// ToText returns the contents of `pt` as a single string.
func (pt PageText) ToText() string {
fontHeight := pt.height()
// We sort with a y tolerance to allow for subscripts, diacritics etc.
2018-11-30 23:01:04 +00:00
tol := minFloat(fontHeight*0.2, 5.0)
2019-01-04 16:07:03 +11:00
common.Log.Trace("ToText: %d elements fontHeight=%.1f tol=%.1f", len(pt.marks), fontHeight, tol)
// Uncomment the 2 following Trace statements to see the effects of sorting/
2019-01-04 16:07:03 +11:00
// common.Log.Trace("ToText: Before sorting %s", pt)
pt.sortPosition(tol)
// common.Log.Trace("ToText: After sorting %s", pt)
2019-01-04 16:07:03 +11:00
lines := pt.toLines(tol)
2018-11-28 23:25:17 +00:00
texts := make([]string, 0, len(lines))
for _, l := range lines {
2019-01-02 10:39:30 +11:00
texts = append(texts, l.text)
}
return strings.Join(texts, "\n")
}
2019-01-02 10:39:30 +11:00
// sortPosition sorts a text list by its elements' position on a page.
// Sorting is by orientation then top to bottom, left to right when page is orientated so that text
// is horizontal.
2019-01-04 16:07:03 +11:00
func (pt *PageText) sortPosition(tol float64) {
2019-01-05 09:14:10 +11:00
sort.SliceStable(pt.marks, func(i, j int) bool {
ti, tj := pt.marks[i], pt.marks[j]
2019-01-02 10:39:30 +11:00
if ti.orient != tj.orient {
return ti.orient < tj.orient
}
2019-01-02 10:39:30 +11:00
if math.Abs(ti.orientedStart.Y-tj.orientedStart.Y) > tol {
return ti.orientedStart.Y > tj.orientedStart.Y
}
2019-01-02 10:39:30 +11:00
return ti.orientedStart.X < tj.orientedStart.X
})
}
2019-01-02 10:39:30 +11:00
// textLine represents a line of text on a page.
type textLine struct {
	y      float64   // y position of line.
	dxList []float64 // x distance between successive words in line.
	text   string    // text in the line.
	words  []string  // words in the line.
}
2019-01-04 16:07:03 +11:00
// toLines returns the text and positions in `pt.marks` as a slice of textLine.
2018-12-02 18:41:48 +11:00
// NOTE: Caller must sort the text list top-to-bottom, left-to-right (for orientation adjusted so
// that text is horizontal) before calling this function.
2019-01-04 16:07:03 +11:00
func (pt PageText) toLines(tol float64) []textLine {
// We divide `pt.marks` into slices which contain texts with the same orientation, extract the lines
2018-11-27 13:37:12 +11:00
// for each orientation then return the concatention of these lines sorted by orientation.
2019-01-04 16:07:03 +11:00
tlOrient := make(map[int][]textMark, len(pt.marks))
for _, t := range pt.marks {
2019-01-02 10:39:30 +11:00
tlOrient[t.orient] = append(tlOrient[t.orient], t)
}
2019-01-02 10:39:30 +11:00
var lines []textLine
2018-11-27 13:37:12 +11:00
for _, o := range orientKeys(tlOrient) {
2019-01-04 16:07:03 +11:00
lines = append(lines, PageText{tlOrient[o]}.toLinesOrient(tol)...)
}
return lines
}
2019-01-04 16:07:03 +11:00
// toLinesOrient returns the text and positions in `pt.marks` as a slice of textLine.
2018-11-27 13:37:12 +11:00
// NOTE: This function only works on text lists where all text is the same orientation so it should
// only be called from toLines.
2018-12-02 18:41:48 +11:00
// Caller must sort the text list top-to-bottom, left-to-right (for orientation adjusted so
2018-11-27 13:37:12 +11:00
// that text is horizontal) before calling this function.
2019-01-04 16:07:03 +11:00
func (pt PageText) toLinesOrient(tol float64) []textLine {
if len(pt.marks) == 0 {
2019-01-02 10:39:30 +11:00
return []textLine{}
}
2019-01-02 10:39:30 +11:00
var lines []textLine
2018-11-28 23:25:17 +00:00
var words []string
var x []float64
2019-01-04 16:07:03 +11:00
y := pt.marks[0].orientedStart.Y
scanning := false
2018-11-28 23:25:17 +00:00
averageCharWidth := exponAve{}
wordSpacing := exponAve{}
2019-01-04 16:07:03 +11:00
lastEndX := 0.0 // lastEndX is pt.marks[i-1].orientedEnd.X
2019-01-04 16:07:03 +11:00
for _, t := range pt.marks {
2019-01-02 10:39:30 +11:00
if t.orientedStart.Y+tol < y {
if len(words) > 0 {
2018-10-09 13:47:43 +11:00
line := newLine(y, x, words)
if averageCharWidth.running {
// FIXME(peterwilliams97): Fix and reinstate combineDiacritics.
// line = combineDiacritics(line, averageCharWidth.ave)
2018-10-09 13:47:43 +11:00
line = removeDuplicates(line, averageCharWidth.ave)
}
lines = append(lines, line)
}
2018-10-09 13:47:43 +11:00
words = []string{}
x = []float64{}
2019-01-02 10:39:30 +11:00
y = t.orientedStart.Y
scanning = false
}
// Detect text movements that represent spaces on the printed page.
// We use a heuristic from PdfBox: If the next character starts to the right of where a
// character after a space at "normal spacing" would start, then there is a space before it.
// The tricky thing to guess here is the width of a space at normal spacing.
2018-11-30 23:01:04 +00:00
// We follow PdfBox and use minFloat(deltaSpace, deltaCharWidth).
deltaSpace := 0.0
2019-01-02 10:39:30 +11:00
if t.spaceWidth == 0 {
deltaSpace = math.MaxFloat64
} else {
2019-01-02 10:39:30 +11:00
wordSpacing.update(t.spaceWidth)
deltaSpace = wordSpacing.ave * 0.5
}
2018-10-09 19:05:38 +11:00
averageCharWidth.update(t.Width())
deltaCharWidth := averageCharWidth.ave * 0.3
isSpace := false
2018-11-28 23:25:17 +00:00
nextWordX := lastEndX + minFloat(deltaSpace, deltaCharWidth)
2019-01-02 10:39:30 +11:00
if scanning && t.text != " " {
isSpace = nextWordX < t.orientedStart.X
}
common.Log.Trace("t=%s", t)
common.Log.Trace("width=%.2f delta=%.2f deltaSpace=%.2g deltaCharWidth=%.2g",
2018-11-28 23:25:17 +00:00
t.Width(), minFloat(deltaSpace, deltaCharWidth), deltaSpace, deltaCharWidth)
common.Log.Trace("%+q [%.1f, %.1f] lastEndX=%.2f nextWordX=%.2f (%.2f) isSpace=%t",
2019-01-02 10:39:30 +11:00
t.text, t.orientedStart.X, t.orientedStart.Y, lastEndX, nextWordX,
nextWordX-t.orientedStart.X, isSpace)
if isSpace {
words = append(words, " ")
2019-01-02 10:39:30 +11:00
x = append(x, (lastEndX+t.orientedStart.X)*0.5)
}
// Add the text to the line.
2019-01-02 10:39:30 +11:00
lastEndX = t.orientedEnd.X
words = append(words, t.text)
x = append(x, t.orientedStart.X)
scanning = true
common.Log.Trace("lastEndX=%.2f", lastEndX)
}
if len(words) > 0 {
2018-10-09 13:47:43 +11:00
line := newLine(y, x, words)
if averageCharWidth.running {
line = removeDuplicates(line, averageCharWidth.ave)
}
lines = append(lines, line)
}
return lines
}
2018-11-27 13:37:12 +11:00
// orientKeys returns the keys of `tlOrient` as a sorted slice.
func orientKeys(tlOrient map[int][]textMark) []int {
2018-11-27 13:37:12 +11:00
keys := []int{}
for k := range tlOrient {
keys = append(keys, k)
}
sort.Ints(keys)
return keys
}
2018-11-28 23:25:17 +00:00
// exponAve implements an exponential average.
// The zero value is ready to use: the first call to update seeds `ave` with the sample and each
// later call replaces `ave` with the mean of the old average and the new sample.
type exponAve struct {
	ave     float64 // Current average value.
	running bool    // Has `ave` been set?
}
// update updates the exponential average `exp.ave` and returns it.
2018-11-28 23:25:17 +00:00
func (exp *exponAve) update(x float64) float64 {
if !exp.running {
exp.ave = x
2018-10-09 13:47:43 +11:00
exp.running = true
} else {
// NOTE(peterwilliams97): 0.5 is a guess. It may be possible to improve average character
// and space width estimation by tuning this value. It may be that different exponents
// would work better for character and space estimation.
exp.ave = (exp.ave + x) * 0.5
}
return exp.ave
}
2019-01-02 10:39:30 +11:00
// newLine returns the textLine representation of strings `words` with y coordinate `y` and x
2018-09-03 16:38:58 +10:00
// coordinates `x`.
2019-01-02 10:39:30 +11:00
func newLine(y float64, x []float64, words []string) textLine {
dxList := make([]float64, 0, len(x))
for i := 1; i < len(x); i++ {
2019-01-02 10:39:30 +11:00
dxList = append(dxList, x[i]-x[i-1])
}
2019-01-02 10:39:30 +11:00
return textLine{y: y, dxList: dxList, text: strings.Join(words, ""), words: words}
2018-10-09 13:47:43 +11:00
}
// removeDuplicates returns `line` with duplicate characters removed. `charWidth` is the average
// character width for the line.
2019-01-02 10:39:30 +11:00
func removeDuplicates(line textLine, charWidth float64) textLine {
if len(line.dxList) == 0 {
2018-10-09 13:47:43 +11:00
return line
}
// NOTE(peterwilliams97) 0.3 is a guess. It may be possible to tune this to a better value.
2018-10-09 13:47:43 +11:00
tol := charWidth * 0.3
2019-01-02 10:39:30 +11:00
words := []string{line.words[0]}
var dxList []float64
2018-10-09 19:05:38 +11:00
2019-01-02 10:39:30 +11:00
w0 := line.words[0]
for i, dx := range line.dxList {
w := line.words[i+1]
2018-10-09 13:47:43 +11:00
if w != w0 || dx > tol {
words = append(words, w)
dxList = append(dxList, dx)
}
w0 = w
}
2019-01-02 10:39:30 +11:00
return textLine{y: line.y, dxList: dxList, text: strings.Join(words, ""), words: words}
}
2018-11-28 18:06:03 +11:00
// combineDiacritics returns `line` with diacritics close to characters combined with the characters.
// `charWidth` is the average character width for the line.
// We have to do this because PDF can render diacritics separately to the characters they attach to
// in extracted text.
2019-01-02 10:39:30 +11:00
func combineDiacritics(line textLine, charWidth float64) textLine {
if len(line.dxList) == 0 {
2018-11-28 18:06:03 +11:00
return line
}
// NOTE(peterwilliams97) 0.2 is a guess. It may be possible to tune this to a better value.
2018-11-28 18:06:03 +11:00
tol := charWidth * 0.2
common.Log.Trace("combineDiacritics: charWidth=%.2f tol=%.2f", charWidth, tol)
2018-11-30 23:01:04 +00:00
var words []string
var dxList []float64
2019-01-02 10:39:30 +11:00
w := line.words[0]
2018-11-28 18:06:03 +11:00
w, c := countDiacritic(w)
delta := 0.0
dx0 := 0.0
parts := []string{w}
numChars := c
2019-01-02 10:39:30 +11:00
for i := 0; i < len(line.dxList); i++ {
w = line.words[i+1]
2018-11-28 18:06:03 +11:00
w, c := countDiacritic(w)
2019-01-02 10:39:30 +11:00
dx := line.dxList[i]
2018-11-28 18:06:03 +11:00
if numChars+c <= 1 && delta+dx <= tol {
if len(parts) == 0 {
dx0 = dx
} else {
delta += dx
}
parts = append(parts, w)
numChars += c
} else {
if len(parts) > 0 {
if len(words) > 0 {
dxList = append(dxList, dx0)
}
words = append(words, combine(parts))
}
parts = []string{w}
numChars = c
dx0 = dx
delta = 0.0
}
}
if len(parts) > 0 {
if len(words) > 0 {
dxList = append(dxList, dx0)
}
words = append(words, combine(parts))
}
if len(words) != len(dxList)+1 {
common.Log.Error("Inconsistent: \nwords=%d %q\ndxList=%d %.2f",
len(words), words, len(dxList), dxList)
return line
}
2019-01-02 10:39:30 +11:00
return textLine{y: line.y, dxList: dxList, text: strings.Join(words, ""), words: words}
2018-11-28 18:06:03 +11:00
}
// combine combines any diacritics in `parts` with the single non-diacritic character in `parts`
// and returns the result as one string.
// A part counts as a diacritic when its first rune is in unicode.Mn or unicode.Sk; an empty part
// is treated as a non-diacritic.
// NOTE: `parts` is reordered and overwritten in place.
func combine(parts []string) string {
	if len(parts) == 1 {
		// Must be a non-diacritic.
		return parts[0]
	}

	// Classify each part by its first rune. Empty parts are guarded explicitly because indexing
	// the first rune of "" would panic.
	diacritic := make(map[string]bool, len(parts))
	for _, w := range parts {
		if w == "" {
			diacritic[w] = false
			continue
		}
		r := []rune(w)[0]
		diacritic[w] = unicode.Is(unicode.Mn, r) || unicode.Is(unicode.Sk, r)
	}

	// Stable-sort so the non-diacritic (base) part comes first and the diacritics follow it,
	// which is the order in which combining marks attach to a base character.
	sort.SliceStable(parts, func(i, j int) bool { return !diacritic[parts[i]] && diacritic[parts[j]] })

	// NFKC-normalize each part on its own and strip surrounding white space, then concatenate.
	// NOTE(review): the parts are normalized individually, so the joined result may stay
	// decomposed (base + combining mark); confirm whether a final composing pass is wanted.
	for i, w := range parts {
		parts[i] = strings.TrimSpace(norm.NFKC.String(w))
	}
	return strings.Join(parts, "")
}
// countDiacritic returns the combining diacritic version of `w` (usually itself) and the number of
// non-diacritics in `w` (0 or 1).
2018-11-28 18:06:03 +11:00
func countDiacritic(w string) (string, int) {
runes := []rune(w)
if len(runes) != 1 {
return w, 1
}
r := runes[0]
c := 1
if (unicode.Is(unicode.Mn, r) || unicode.Is(unicode.Sk, r)) &&
r != '\'' && r != '"' && r != '`' {
2018-11-28 18:06:03 +11:00
c = 0
}
if w2, ok := diacritics[r]; ok {
c = 0
w = w2
}
return w, c
}
// diacritics maps stand-alone accent-like characters to the combining mark that countDiacritic
// substitutes for them. This map was copied from PdfBox.
// (https://svn.apache.org/repos/asf/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java)
var diacritics = map[rune]string{
	'\u0060': "\u0300",
	'\u02CB': "\u0300",
	'\u0027': "\u0301",
	'\u02B9': "\u0301",
	'\u02CA': "\u0301",
	'\u005E': "\u0302",
	'\u02C6': "\u0302",
	'\u007E': "\u0303",
	'\u02C9': "\u0304",
	'\u00B0': "\u030A",
	'\u02BA': "\u030B",
	'\u02C7': "\u030C",
	'\u02C8': "\u030D",
	'\u0022': "\u030E",
	'\u02BB': "\u0312",
	'\u02BC': "\u0313",
	'\u0486': "\u0313",
	'\u055A': "\u0313",
	'\u02BD': "\u0314",
	'\u0485': "\u0314",
	'\u0559': "\u0314",
	'\u02D4': "\u031D",
	'\u02D5': "\u031E",
	'\u02D6': "\u031F",
	'\u02D7': "\u0320",
	'\u02B2': "\u0321",
	'\u02CC': "\u0329",
	'\u02B7': "\u032B",
	'\u02CD': "\u0331",
	'\u005F': "\u0332",
	'\u204E': "\u0359",
}
2018-09-20 11:49:44 +10:00
// getCurrentFont returns the font on top of the font stack, or DefaultFont if the font stack is
// empty. The fallback is reported through the debug log.
func (to *textObject) getCurrentFont() *model.PdfFont {
	if !to.fontStack.empty() {
		return to.fontStack.peek()
	}
	common.Log.Debug("ERROR: No font defined. Using default.")
	return model.DefaultFont()
}
// getFont returns the font named `name` if it exists in the page's resources or an error if it
2018-09-17 12:12:06 +10:00
// doesn't. It caches the returned fonts.
func (to *textObject) getFont(name string) (*model.PdfFont, error) {
if to.e.fontCache != nil {
to.e.accessCount++
entry, ok := to.e.fontCache[name]
if ok {
entry.access = to.e.accessCount
return entry.font, nil
}
2018-09-17 12:12:06 +10:00
}
// Font not in cache. Load it.
font, err := to.getFontDirect(name)
if err != nil {
return nil, err
}
if to.e.fontCache != nil {
entry := fontEntry{font, to.e.accessCount}
// Eject a victim if the cache is full.
if len(to.e.fontCache) >= maxFontCache {
var names []string
for name := range to.e.fontCache {
names = append(names, name)
}
sort.Slice(names, func(i, j int) bool {
return to.e.fontCache[names[i]].access < to.e.fontCache[names[j]].access
})
delete(to.e.fontCache, names[0])
2018-09-17 12:12:06 +10:00
}
to.e.fontCache[name] = entry
2018-09-17 12:12:06 +10:00
}
return font, nil
}
2018-09-21 16:43:10 +10:00
// fontEntry is an entry in the font cache.
2018-09-17 12:12:06 +10:00
type fontEntry struct {
	font   *model.PdfFont // The font being cached.
	access int64          // Last access. Used to determine LRU cache victims. Taken from the extractor's accessCount.
}
// maxFontCache is the maximum number of PdfFont's in fontCache.
// getFont evicts the entry with the lowest access value before adding a font once the cache
// already holds this many entries.
const maxFontCache = 10
// getFontDirect returns the font named `name` if it exists in the page's resources or an error if
2018-11-28 23:25:17 +00:00
// it doesn't. Accesses page resources directly (not cached).
2018-09-17 12:12:06 +10:00
func (to *textObject) getFontDirect(name string) (*model.PdfFont, error) {
fontObj, err := to.getFontDict(name)
if err != nil {
return nil, err
}
font, err := model.NewPdfFontFromPdfObject(fontObj)
if err != nil {
2018-09-17 12:12:06 +10:00
common.Log.Debug("getFontDirect: NewPdfFontFromPdfObject failed. name=%#q err=%v", name, err)
}
return font, err
}
2018-12-27 21:33:31 +11:00
// getFontDict returns the font dict with key `name` if it exists in the page's or form's Font
// resources or an error if it doesn't.
// NOTE: When the text object has no resources at all this returns (nil, nil), i.e. no font object
// and no error, so callers must be prepared for a nil object.
func (to *textObject) getFontDict(name string) (fontObj core.PdfObject, err error) {
	if to.resources == nil {
		common.Log.Debug("getFontDict. No resources. name=%#q", name)
		return nil, nil
	}

	obj, ok := to.resources.GetFontByName(core.PdfObjectName(name))
	if ok {
		return obj, nil
	}
	common.Log.Debug("ERROR: getFontDict: Font not found: name=%#q", name)
	return nil, errors.New("font not in resources")
}