2018-03-22 14:03:47 +00:00
|
|
|
|
/*
|
|
|
|
|
* This file is subject to the terms and conditions defined in
|
|
|
|
|
* file 'LICENSE.md', which is part of this source code package.
|
|
|
|
|
*/
|
|
|
|
|
|
2018-03-22 13:01:04 +00:00
|
|
|
|
package extractor
|
|
|
|
|
|
|
|
|
|
import (
|
|
|
|
|
"errors"
|
2018-07-13 17:40:27 +10:00
|
|
|
|
"fmt"
|
2018-10-09 11:49:59 +11:00
|
|
|
|
"math"
|
2018-08-22 12:29:34 +10:00
|
|
|
|
"sort"
|
2018-07-13 17:40:27 +10:00
|
|
|
|
"strings"
|
2018-03-22 13:01:04 +00:00
|
|
|
|
|
|
|
|
|
"github.com/unidoc/unidoc/common"
|
|
|
|
|
"github.com/unidoc/unidoc/pdf/contentstream"
|
2018-07-15 16:28:56 +10:00
|
|
|
|
"github.com/unidoc/unidoc/pdf/core"
|
2018-03-22 13:01:04 +00:00
|
|
|
|
"github.com/unidoc/unidoc/pdf/model"
|
|
|
|
|
)
|
|
|
|
|
|
2018-06-27 12:25:59 +10:00
|
|
|
|
// ExtractText processes and extracts all text data in content streams and returns as a string.
|
2018-08-22 12:29:34 +10:00
|
|
|
|
// It takes into account character encodings in the PDF file, which are decoded by
|
2018-07-15 16:28:56 +10:00
|
|
|
|
// CharcodeBytesToUnicode.
|
2018-08-22 12:29:34 +10:00
|
|
|
|
// Characters that can't be decoded are replaced with MissingCodeRune ('\ufffd' = <20>).
|
2018-03-22 13:01:04 +00:00
|
|
|
|
func (e *Extractor) ExtractText() (string, error) {
|
2018-07-13 17:40:27 +10:00
|
|
|
|
text, _, _, err := e.ExtractText2()
|
|
|
|
|
return text, err
|
|
|
|
|
}
|
|
|
|
|
|
2018-08-22 12:29:34 +10:00
|
|
|
|
// ExtractText2 works like ExtractText but returns the number of characters in the output and the
|
|
|
|
|
// the number of characters that were not decoded.
|
2018-07-13 17:40:27 +10:00
|
|
|
|
func (e *Extractor) ExtractText2() (string, int, int, error) {
|
|
|
|
|
textList, numChars, numMisses, err := e.ExtractXYText()
|
2018-06-27 16:31:28 +10:00
|
|
|
|
if err != nil {
|
2018-07-13 17:40:27 +10:00
|
|
|
|
return "", numChars, numMisses, err
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
2018-07-13 17:40:27 +10:00
|
|
|
|
return textList.ToText(), numChars, numMisses, nil
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// ExtractXYText returns the text contents of `e` as a TextList.
|
2018-07-13 17:40:27 +10:00
|
|
|
|
func (e *Extractor) ExtractXYText() (*TextList, int, int, error) {
|
2018-06-27 16:31:28 +10:00
|
|
|
|
textList := &TextList{}
|
|
|
|
|
state := newTextState()
|
2018-07-13 17:40:27 +10:00
|
|
|
|
fontStack := fontStacker{}
|
2018-07-25 12:00:49 +10:00
|
|
|
|
var to *textObject
|
2018-03-22 13:01:04 +00:00
|
|
|
|
|
|
|
|
|
cstreamParser := contentstream.NewContentStreamParser(e.contents)
|
|
|
|
|
operations, err := cstreamParser.Parse()
|
|
|
|
|
if err != nil {
|
2018-06-27 16:31:28 +10:00
|
|
|
|
common.Log.Debug("ExtractXYText: parse failed. err=%v", err)
|
2018-07-13 17:40:27 +10:00
|
|
|
|
return textList, state.numChars, state.numMisses, err
|
2018-03-22 13:01:04 +00:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
processor := contentstream.NewContentStreamProcessor(*operations)
|
|
|
|
|
|
|
|
|
|
processor.AddHandler(contentstream.HandlerConditionEnumAllOperands, "",
|
2018-06-27 16:31:28 +10:00
|
|
|
|
func(op *contentstream.ContentStreamOperation, gs contentstream.GraphicsState,
|
|
|
|
|
resources *model.PdfPageResources) error {
|
|
|
|
|
|
2018-03-22 13:01:04 +00:00
|
|
|
|
operand := op.Operand
|
2018-06-27 16:31:28 +10:00
|
|
|
|
|
2018-03-22 13:01:04 +00:00
|
|
|
|
switch operand {
|
2018-07-13 17:40:27 +10:00
|
|
|
|
case "q":
|
|
|
|
|
if !fontStack.empty() {
|
2018-07-15 16:28:56 +10:00
|
|
|
|
common.Log.Trace("Save font state: %s\n%s",
|
2018-07-13 17:40:27 +10:00
|
|
|
|
fontStack.peek(), fontStack.String())
|
|
|
|
|
fontStack.push(fontStack.peek())
|
|
|
|
|
}
|
|
|
|
|
if state.Tf != nil {
|
2018-07-15 16:28:56 +10:00
|
|
|
|
common.Log.Trace("Save font state: %s\n->%s\n%s",
|
2018-07-13 17:40:27 +10:00
|
|
|
|
fontStack.peek(), state.Tf, fontStack.String())
|
|
|
|
|
fontStack.push(state.Tf)
|
|
|
|
|
}
|
|
|
|
|
case "Q":
|
|
|
|
|
if !fontStack.empty() {
|
2018-07-15 16:28:56 +10:00
|
|
|
|
common.Log.Trace("Restore font state: %s\n->%s\n%s",
|
2018-07-13 17:40:27 +10:00
|
|
|
|
fontStack.peek(), fontStack.get(-2), fontStack.String())
|
|
|
|
|
fontStack.pop()
|
|
|
|
|
}
|
|
|
|
|
if len(fontStack) >= 2 {
|
2018-07-15 16:28:56 +10:00
|
|
|
|
common.Log.Trace("Restore font state: %s\n->%s\n%s",
|
2018-07-13 17:40:27 +10:00
|
|
|
|
state.Tf, fontStack.peek(), fontStack.String())
|
|
|
|
|
state.Tf = fontStack.pop()
|
|
|
|
|
}
|
2018-06-27 16:31:28 +10:00
|
|
|
|
case "BT": // Begin text
|
2018-07-15 16:28:56 +10:00
|
|
|
|
// Begin a text object, initializing the text matrix, Tm, and the text line matrix,
|
|
|
|
|
// Tlm, to the identity matrix. Text objects shall not be nested; a second BT shall
|
|
|
|
|
// not appear before an ET.
|
2018-06-27 16:31:28 +10:00
|
|
|
|
if to != nil {
|
|
|
|
|
common.Log.Debug("BT called while in a text object")
|
2018-03-22 13:01:04 +00:00
|
|
|
|
}
|
2018-07-13 17:40:27 +10:00
|
|
|
|
to = newTextObject(e, gs, &state, &fontStack)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
case "ET": // End Text
|
|
|
|
|
*textList = append(*textList, to.Texts...)
|
|
|
|
|
to = nil
|
|
|
|
|
case "T*": // Move to start of next text line
|
|
|
|
|
to.nextLine()
|
|
|
|
|
case "Td": // Move text location
|
2018-07-15 16:28:56 +10:00
|
|
|
|
if ok, err := to.checkOp(op, 2, true); !ok {
|
2018-07-07 09:45:55 +10:00
|
|
|
|
common.Log.Debug("ERROR: err=%v", err)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
return err
|
2018-03-22 13:01:04 +00:00
|
|
|
|
}
|
2018-08-22 12:29:34 +10:00
|
|
|
|
x, y, err := toFloatXY(op.Params)
|
|
|
|
|
if err != nil {
|
|
|
|
|
return err
|
|
|
|
|
}
|
|
|
|
|
to.moveText(x, y)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
case "TD": // Move text location and set leading
|
2018-07-15 16:28:56 +10:00
|
|
|
|
if ok, err := to.checkOp(op, 2, true); !ok {
|
2018-07-07 09:45:55 +10:00
|
|
|
|
common.Log.Debug("ERROR: err=%v", err)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
return err
|
2018-03-22 13:01:04 +00:00
|
|
|
|
}
|
2018-06-27 16:31:28 +10:00
|
|
|
|
x, y, err := toFloatXY(op.Params)
|
|
|
|
|
if err != nil {
|
2018-07-07 09:45:55 +10:00
|
|
|
|
common.Log.Debug("ERROR: err=%v", err)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
return err
|
2018-03-22 13:01:04 +00:00
|
|
|
|
}
|
2018-06-27 16:31:28 +10:00
|
|
|
|
to.moveTextSetLeading(x, y)
|
|
|
|
|
case "Tj": // Show text
|
2018-07-15 16:28:56 +10:00
|
|
|
|
if ok, err := to.checkOp(op, 1, true); !ok {
|
2018-07-21 21:20:39 +10:00
|
|
|
|
common.Log.Debug("ERROR: Tj op=%s err=%v", op, err)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
return err
|
2018-03-22 13:01:04 +00:00
|
|
|
|
}
|
2018-07-21 21:20:39 +10:00
|
|
|
|
charcodes, ok := core.GetStringBytes(op.Params[0])
|
|
|
|
|
if !ok {
|
|
|
|
|
common.Log.Debug("ERROR: Tj op=%s GetStringBytes failed", op)
|
|
|
|
|
return core.ErrTypeError
|
2018-03-22 13:01:04 +00:00
|
|
|
|
}
|
2018-06-27 16:31:28 +10:00
|
|
|
|
return to.showText(charcodes)
|
|
|
|
|
case "TJ": // Show text with adjustable spacing
|
2018-07-15 16:28:56 +10:00
|
|
|
|
if ok, err := to.checkOp(op, 1, true); !ok {
|
2018-07-21 21:20:39 +10:00
|
|
|
|
common.Log.Debug("ERROR: TJ err=%v", err)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
return err
|
2018-03-22 13:01:04 +00:00
|
|
|
|
}
|
2018-07-25 13:19:09 +10:00
|
|
|
|
args, ok := core.GetArray(op.Params[0])
|
2018-07-21 21:20:39 +10:00
|
|
|
|
if !ok {
|
|
|
|
|
common.Log.Debug("ERROR: Tj op=%s GetArrayVal failed", op)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
return err
|
2018-03-22 13:01:04 +00:00
|
|
|
|
}
|
2018-06-27 16:31:28 +10:00
|
|
|
|
return to.showTextAdjusted(args)
|
|
|
|
|
case "'": // Move to next line and show text
|
2018-07-15 16:28:56 +10:00
|
|
|
|
if ok, err := to.checkOp(op, 1, true); !ok {
|
2018-07-21 21:20:39 +10:00
|
|
|
|
common.Log.Debug("ERROR: ' err=%v", err)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
return err
|
|
|
|
|
}
|
2018-07-21 21:20:39 +10:00
|
|
|
|
charcodes, ok := core.GetStringBytes(op.Params[0])
|
2018-03-22 13:01:04 +00:00
|
|
|
|
if !ok {
|
2018-07-21 21:20:39 +10:00
|
|
|
|
common.Log.Debug("ERROR: ' op=%s GetStringBytes failed", op)
|
|
|
|
|
return core.ErrTypeError
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
|
|
|
|
to.nextLine()
|
|
|
|
|
return to.showText(charcodes)
|
|
|
|
|
case `"`: // Set word and character spacing, move to next line, and show text
|
2018-07-15 16:28:56 +10:00
|
|
|
|
if ok, err := to.checkOp(op, 1, true); !ok {
|
2018-07-21 21:20:39 +10:00
|
|
|
|
common.Log.Debug("ERROR: \" err=%v", err)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
return err
|
|
|
|
|
}
|
2018-08-22 12:29:34 +10:00
|
|
|
|
x, y, err := toFloatXY(op.Params[:2])
|
|
|
|
|
if err != nil {
|
|
|
|
|
return err
|
|
|
|
|
}
|
|
|
|
|
charcodes, ok := core.GetStringBytes(op.Params[2])
|
2018-03-22 13:01:04 +00:00
|
|
|
|
if !ok {
|
2018-07-21 21:20:39 +10:00
|
|
|
|
common.Log.Debug("ERROR: \" op=%s GetStringBytes failed", op)
|
|
|
|
|
return core.ErrTypeError
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
2018-08-22 12:29:34 +10:00
|
|
|
|
to.setCharSpacing(x)
|
|
|
|
|
to.setWordSpacing(y)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
to.nextLine()
|
|
|
|
|
return to.showText(charcodes)
|
|
|
|
|
case "TL": // Set text leading
|
2018-07-15 16:45:47 +10:00
|
|
|
|
y, err := floatParam(op)
|
2018-07-15 16:28:56 +10:00
|
|
|
|
if err != nil {
|
2018-07-21 21:20:39 +10:00
|
|
|
|
common.Log.Debug("ERROR: TL err=%v", err)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
return err
|
|
|
|
|
}
|
|
|
|
|
to.setTextLeading(y)
|
|
|
|
|
case "Tc": // Set character spacing
|
2018-07-15 16:45:47 +10:00
|
|
|
|
y, err := floatParam(op)
|
2018-07-15 16:28:56 +10:00
|
|
|
|
if err != nil {
|
2018-07-21 21:20:39 +10:00
|
|
|
|
common.Log.Debug("ERROR: Tc err=%v", err)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
return err
|
|
|
|
|
}
|
|
|
|
|
to.setCharSpacing(y)
|
|
|
|
|
case "Tf": // Set font
|
2018-07-15 16:28:56 +10:00
|
|
|
|
if ok, err := to.checkOp(op, 2, true); !ok {
|
2018-07-21 21:20:39 +10:00
|
|
|
|
common.Log.Debug("ERROR: Tf err=%v", err)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
return err
|
|
|
|
|
}
|
2018-07-21 21:20:39 +10:00
|
|
|
|
name, ok := core.GetNameVal(op.Params[0])
|
2018-03-22 13:01:04 +00:00
|
|
|
|
if !ok {
|
2018-07-21 21:20:39 +10:00
|
|
|
|
common.Log.Debug("ERROR: Tf op=%s GetNameVal failed", op)
|
|
|
|
|
return core.ErrTypeError
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
2018-07-15 16:28:56 +10:00
|
|
|
|
size, err := core.GetNumberAsFloat(op.Params[1])
|
2018-07-21 21:20:39 +10:00
|
|
|
|
if !ok {
|
|
|
|
|
common.Log.Debug("ERROR: Tf op=%s GetFloatVal failed. err=%v", op, err)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
return err
|
|
|
|
|
}
|
|
|
|
|
err = to.setFont(name, size)
|
|
|
|
|
if err != nil {
|
|
|
|
|
return err
|
|
|
|
|
}
|
|
|
|
|
case "Tm": // Set text matrix
|
2018-07-15 16:28:56 +10:00
|
|
|
|
if ok, err := to.checkOp(op, 6, true); !ok {
|
2018-07-21 21:20:39 +10:00
|
|
|
|
common.Log.Debug("ERROR: Tm err=%v", err)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
return err
|
|
|
|
|
}
|
2018-07-25 12:00:49 +10:00
|
|
|
|
floats, err := core.GetNumbersAsFloat(op.Params)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
if err != nil {
|
2018-07-07 09:45:55 +10:00
|
|
|
|
common.Log.Debug("ERROR: err=%v", err)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
return err
|
|
|
|
|
}
|
|
|
|
|
to.setTextMatrix(floats)
|
|
|
|
|
case "Tr": // Set text rendering mode
|
2018-07-15 16:28:56 +10:00
|
|
|
|
if ok, err := to.checkOp(op, 1, true); !ok {
|
2018-07-21 21:20:39 +10:00
|
|
|
|
common.Log.Debug("ERROR: Tr err=%v", err)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
return err
|
2018-03-22 13:01:04 +00:00
|
|
|
|
}
|
2018-07-21 21:20:39 +10:00
|
|
|
|
mode, ok := core.GetIntVal(op.Params[0])
|
2018-03-22 13:01:04 +00:00
|
|
|
|
if !ok {
|
2018-07-21 21:20:39 +10:00
|
|
|
|
common.Log.Debug("ERROR: Tr op=%s GetIntVal failed", op)
|
|
|
|
|
return core.ErrTypeError
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
|
|
|
|
to.setTextRenderMode(mode)
|
|
|
|
|
case "Ts": // Set text rise
|
2018-07-15 16:28:56 +10:00
|
|
|
|
if ok, err := to.checkOp(op, 1, true); !ok {
|
2018-07-21 21:20:39 +10:00
|
|
|
|
common.Log.Debug("ERROR: Ts err=%v", err)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
return err
|
|
|
|
|
}
|
2018-07-15 16:28:56 +10:00
|
|
|
|
y, err := core.GetNumberAsFloat(op.Params[0])
|
2018-06-27 16:31:28 +10:00
|
|
|
|
if err != nil {
|
2018-07-07 09:45:55 +10:00
|
|
|
|
common.Log.Debug("ERROR: err=%v", err)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
return err
|
|
|
|
|
}
|
|
|
|
|
to.setTextRise(y)
|
|
|
|
|
case "Tw": // Set word spacing
|
2018-07-15 16:28:56 +10:00
|
|
|
|
if ok, err := to.checkOp(op, 1, true); !ok {
|
2018-07-07 09:45:55 +10:00
|
|
|
|
common.Log.Debug("ERROR: err=%v", err)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
return err
|
2018-03-22 13:01:04 +00:00
|
|
|
|
}
|
2018-07-15 16:28:56 +10:00
|
|
|
|
y, err := core.GetNumberAsFloat(op.Params[0])
|
2018-03-22 13:01:04 +00:00
|
|
|
|
if err != nil {
|
2018-07-07 09:45:55 +10:00
|
|
|
|
common.Log.Debug("ERROR: err=%v", err)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
return err
|
2018-07-21 21:20:39 +10:00
|
|
|
|
|
2018-03-22 13:01:04 +00:00
|
|
|
|
}
|
2018-06-27 16:31:28 +10:00
|
|
|
|
to.setWordSpacing(y)
|
|
|
|
|
case "Tz": // Set horizontal scaling
|
2018-07-15 16:28:56 +10:00
|
|
|
|
if ok, err := to.checkOp(op, 1, true); !ok {
|
2018-07-07 09:45:55 +10:00
|
|
|
|
common.Log.Debug("ERROR: err=%v", err)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
return err
|
|
|
|
|
}
|
2018-07-15 16:28:56 +10:00
|
|
|
|
y, err := core.GetNumberAsFloat(op.Params[0])
|
2018-06-27 16:31:28 +10:00
|
|
|
|
if err != nil {
|
2018-07-07 09:45:55 +10:00
|
|
|
|
common.Log.Debug("ERROR: err=%v", err)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
return err
|
|
|
|
|
}
|
|
|
|
|
to.setHorizScaling(y)
|
2018-03-22 13:01:04 +00:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return nil
|
|
|
|
|
})
|
|
|
|
|
|
|
|
|
|
err = processor.Process(e.resources)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
if err != nil {
|
|
|
|
|
common.Log.Error("ERROR: Processing: err=%v", err)
|
|
|
|
|
}
|
2018-07-13 17:40:27 +10:00
|
|
|
|
return textList, state.numChars, state.numMisses, err
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
//
|
|
|
|
|
// Text operators
|
|
|
|
|
//
|
|
|
|
|
|
2018-08-22 12:29:34 +10:00
|
|
|
|
// moveText "Td" Moves start of text by `tx`,`ty`.
|
2018-06-27 16:31:28 +10:00
|
|
|
|
// Move to the start of the next line, offset from the start of the current line by (tx, ty).
|
|
|
|
|
// tx and ty are in unscaled text space units.
|
2018-07-25 12:00:49 +10:00
|
|
|
|
func (to *textObject) moveText(tx, ty float64) {
|
2018-08-22 12:29:34 +10:00
|
|
|
|
to.moveTo(tx, ty)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
|
|
|
|
|
2018-08-22 12:29:34 +10:00
|
|
|
|
// moveTextSetLeading "TD" Move text location and set leading.
|
2018-06-27 16:31:28 +10:00
|
|
|
|
// Move to the start of the next line, offset from the start of the current line by (tx, ty). As a
|
|
|
|
|
// side effect, this operator shall set the leading parameter in the text state. This operator shall
|
|
|
|
|
// have the same effect as this code:
|
|
|
|
|
// −ty TL
|
|
|
|
|
// tx ty Td
|
2018-07-25 12:00:49 +10:00
|
|
|
|
func (to *textObject) moveTextSetLeading(tx, ty float64) {
|
2018-08-22 12:29:34 +10:00
|
|
|
|
to.State.Tl = -ty
|
|
|
|
|
to.moveTo(tx, ty)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// nextLine "T*"" Moves start of text `Line` to next text line
|
|
|
|
|
// Move to the start of the next line. This operator has the same effect as the code
|
|
|
|
|
// 0 -Tl Td
|
|
|
|
|
// where Tl denotes the current leading parameter in the text state. The negative of Tl is used
|
|
|
|
|
// here because Tl is the text leading expressed as a positive number. Going to the next line
|
|
|
|
|
// entails decreasing the y coordinate. (page 250)
|
2018-07-25 12:00:49 +10:00
|
|
|
|
func (to *textObject) nextLine() {
|
2018-08-22 12:29:34 +10:00
|
|
|
|
to.moveTo(0, -to.State.Tl)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// setTextMatrix "Tm"
|
|
|
|
|
// Set the text matrix, Tm, and the text line matrix, Tlm to the Matrix specified by the 6 numbers
|
|
|
|
|
// in `f` (page 250)
|
2018-07-25 12:00:49 +10:00
|
|
|
|
func (to *textObject) setTextMatrix(f []float64) {
|
2018-08-22 12:29:34 +10:00
|
|
|
|
a, b, c, d, tx, ty := f[0], f[1], f[2], f[3], f[4], f[5]
|
|
|
|
|
to.Tm = contentstream.NewMatrix(a, b, c, d, tx, ty)
|
|
|
|
|
to.Tlm = contentstream.NewMatrix(a, b, c, d, tx, ty)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
|
|
|
|
|
2018-09-17 12:12:06 +10:00
|
|
|
|
// showText "Tj" Show a text string.
|
2018-07-25 12:00:49 +10:00
|
|
|
|
func (to *textObject) showText(charcodes []byte) error {
|
2018-07-02 16:46:43 +10:00
|
|
|
|
return to.renderText(charcodes)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
|
|
|
|
|
2018-09-17 12:12:06 +10:00
|
|
|
|
// showTextAdjusted "TJ" Show text with adjustable spacing.
|
2018-07-25 13:19:09 +10:00
|
|
|
|
func (to *textObject) showTextAdjusted(args *core.PdfObjectArray) error {
|
2018-08-22 12:29:34 +10:00
|
|
|
|
vertical := false
|
2018-07-25 13:19:09 +10:00
|
|
|
|
for _, o := range args.Elements() {
|
2018-06-27 16:31:28 +10:00
|
|
|
|
switch o.(type) {
|
2018-07-15 16:28:56 +10:00
|
|
|
|
case *core.PdfObjectFloat, *core.PdfObjectInteger:
|
2018-06-27 16:31:28 +10:00
|
|
|
|
// The following is supposed to be equivalent to the existing Unidoc implementation.
|
2018-08-22 12:29:34 +10:00
|
|
|
|
x, err := core.GetNumberAsFloat(o)
|
|
|
|
|
if err != nil {
|
|
|
|
|
common.Log.Debug("showTextAdjusted: Bad numerical arg. o=%s args=%+v", o, args)
|
|
|
|
|
return err
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
2018-08-22 12:29:34 +10:00
|
|
|
|
dx, dy := -x*0.001*to.State.Tfs, 0.0
|
|
|
|
|
if vertical {
|
|
|
|
|
dy, dx = dx, dy
|
|
|
|
|
}
|
|
|
|
|
to.Tm.Translate(dx, dy)
|
2018-07-15 16:28:56 +10:00
|
|
|
|
case *core.PdfObjectString:
|
2018-07-21 21:20:39 +10:00
|
|
|
|
charcodes, ok := core.GetStringBytes(o)
|
|
|
|
|
if !ok {
|
2018-08-22 12:29:34 +10:00
|
|
|
|
common.Log.Debug("showTextAdjusted: Bad string arg. o=%s args=%+v", o, args)
|
2018-07-21 21:20:39 +10:00
|
|
|
|
return core.ErrTypeError
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
2018-08-22 12:29:34 +10:00
|
|
|
|
to.renderText(charcodes)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
default:
|
2018-08-22 12:29:34 +10:00
|
|
|
|
common.Log.Debug("showTextAdjusted. Unexpected type (%T) args=%+v", o, args)
|
2018-07-15 16:28:56 +10:00
|
|
|
|
return core.ErrTypeError
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return nil
|
|
|
|
|
}
|
|
|
|
|
|
2018-08-22 12:29:34 +10:00
|
|
|
|
// setTextLeading "TL" Set text leading.
|
2018-07-25 12:00:49 +10:00
|
|
|
|
func (to *textObject) setTextLeading(y float64) {
|
2018-08-22 12:29:34 +10:00
|
|
|
|
if to == nil {
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
to.State.Tl = y
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
|
|
|
|
|
2018-08-22 12:29:34 +10:00
|
|
|
|
// setCharSpacing "Tc" Set character spacing.
|
2018-07-25 12:00:49 +10:00
|
|
|
|
func (to *textObject) setCharSpacing(x float64) {
|
2018-08-22 12:29:34 +10:00
|
|
|
|
if to == nil {
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
to.State.Tc = x
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
|
|
|
|
|
2018-08-22 12:29:34 +10:00
|
|
|
|
// setFont "Tf" Set font.
|
2018-07-25 12:00:49 +10:00
|
|
|
|
func (to *textObject) setFont(name string, size float64) error {
|
2018-08-22 12:29:34 +10:00
|
|
|
|
if to == nil {
|
|
|
|
|
return nil
|
|
|
|
|
}
|
2018-06-27 16:31:28 +10:00
|
|
|
|
font, err := to.getFont(name)
|
2018-07-03 14:26:42 +10:00
|
|
|
|
if err == nil {
|
|
|
|
|
to.State.Tf = font
|
2018-07-13 17:40:27 +10:00
|
|
|
|
if len(*to.fontStack) == 0 {
|
|
|
|
|
to.fontStack.push(font)
|
|
|
|
|
} else {
|
|
|
|
|
(*to.fontStack)[len(*to.fontStack)-1] = font
|
|
|
|
|
}
|
2018-07-15 16:28:56 +10:00
|
|
|
|
} else if err == model.ErrFontNotSupported {
|
2018-07-24 21:32:02 +10:00
|
|
|
|
// XXX: Do we need to handle this case in a special way?
|
2018-07-04 18:00:37 +10:00
|
|
|
|
return err
|
2018-07-03 14:26:42 +10:00
|
|
|
|
} else {
|
2018-06-27 16:31:28 +10:00
|
|
|
|
return err
|
|
|
|
|
}
|
2018-08-22 12:29:34 +10:00
|
|
|
|
to.State.Tfs = size
|
2018-06-27 16:31:28 +10:00
|
|
|
|
return nil
|
|
|
|
|
}
|
|
|
|
|
|
2018-08-22 12:29:34 +10:00
|
|
|
|
// setTextRenderMode "Tr" Set text rendering mode.
|
2018-07-25 12:00:49 +10:00
|
|
|
|
func (to *textObject) setTextRenderMode(mode int) {
|
2018-08-22 12:29:34 +10:00
|
|
|
|
if to == nil {
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
to.State.Tmode = RenderMode(mode)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
|
|
|
|
|
2018-08-22 12:29:34 +10:00
|
|
|
|
// setTextRise "Ts" Set text rise.
|
2018-07-25 12:00:49 +10:00
|
|
|
|
func (to *textObject) setTextRise(y float64) {
|
2018-08-22 12:29:34 +10:00
|
|
|
|
if to == nil {
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
to.State.Trise = y
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
|
|
|
|
|
2018-08-22 12:29:34 +10:00
|
|
|
|
// setWordSpacing "Tw" Set word spacing.
|
2018-07-25 12:00:49 +10:00
|
|
|
|
func (to *textObject) setWordSpacing(y float64) {
|
2018-06-27 16:31:28 +10:00
|
|
|
|
// Not implemented yet
|
|
|
|
|
}
|
|
|
|
|
|
2018-08-22 12:29:34 +10:00
|
|
|
|
// setHorizScaling "Tz" Set horizontal scaling.
|
2018-07-25 12:00:49 +10:00
|
|
|
|
func (to *textObject) setHorizScaling(y float64) {
|
2018-08-22 12:29:34 +10:00
|
|
|
|
if to == nil {
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
to.State.Th = y
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
|
|
|
|
|
2018-07-15 16:28:56 +10:00
|
|
|
|
// floatParam returns the single float parameter of operatr `op`, or an error if it doesn't have
|
|
|
|
|
// a single float parameter or we aren't in a text stream.
|
2018-07-15 16:45:47 +10:00
|
|
|
|
func floatParam(op *contentstream.ContentStreamOperation) (float64, error) {
|
|
|
|
|
if len(op.Params) != 1 {
|
|
|
|
|
err := errors.New("Incorrect parameter count")
|
|
|
|
|
common.Log.Debug("ERROR: %#q should have %d input params, got %d %+v",
|
|
|
|
|
op.Operand, 1, len(op.Params), op.Params)
|
2018-07-15 16:28:56 +10:00
|
|
|
|
return 0.0, err
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
2018-07-15 16:28:56 +10:00
|
|
|
|
return core.GetNumberAsFloat(op.Params[0])
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
|
|
|
|
|
2018-07-15 16:28:56 +10:00
|
|
|
|
// checkOp returns true if we are in a text stream and `op` has `numParams` params.
|
|
|
|
|
// If `hard` is true and the number of params don't match, an error is returned.
|
2018-07-25 12:00:49 +10:00
|
|
|
|
func (to *textObject) checkOp(op *contentstream.ContentStreamOperation, numParams int,
|
2018-07-15 16:28:56 +10:00
|
|
|
|
hard bool) (ok bool, err error) {
|
2018-06-27 16:31:28 +10:00
|
|
|
|
if to == nil {
|
|
|
|
|
common.Log.Debug("%#q operand outside text", op.Operand)
|
2018-07-25 12:00:49 +10:00
|
|
|
|
return false, nil
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
|
|
|
|
if numParams >= 0 {
|
|
|
|
|
if len(op.Params) != numParams {
|
|
|
|
|
if hard {
|
|
|
|
|
err = errors.New("Incorrect parameter count")
|
|
|
|
|
}
|
2018-07-15 16:28:56 +10:00
|
|
|
|
common.Log.Debug("ERROR: %#q should have %d input params, got %d %+v",
|
2018-06-27 16:31:28 +10:00
|
|
|
|
op.Operand, numParams, len(op.Params), op.Params)
|
2018-07-25 12:00:49 +10:00
|
|
|
|
return false, err
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
|
|
|
|
}
|
2018-07-25 12:00:49 +10:00
|
|
|
|
return true, nil
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
|
|
|
|
|
2018-07-15 16:28:56 +10:00
|
|
|
|
// fontStacker is the PDF font stack implementation.
|
2018-07-13 17:40:27 +10:00
|
|
|
|
type fontStacker []*model.PdfFont
|
|
|
|
|
|
2018-07-15 16:28:56 +10:00
|
|
|
|
// String returns a string describing the current state of the font stack.
|
2018-07-13 17:40:27 +10:00
|
|
|
|
func (fontStack *fontStacker) String() string {
|
|
|
|
|
parts := []string{"---- font stack"}
|
|
|
|
|
for i, font := range *fontStack {
|
|
|
|
|
s := "<nil>"
|
|
|
|
|
if font != nil {
|
|
|
|
|
s = font.String()
|
|
|
|
|
}
|
|
|
|
|
parts = append(parts, fmt.Sprintf("\t%2d: %s", i, s))
|
|
|
|
|
}
|
|
|
|
|
return strings.Join(parts, "\n")
|
|
|
|
|
}
|
2018-07-15 16:28:56 +10:00
|
|
|
|
|
|
|
|
|
// push pushes `font` onto the font stack.
|
2018-07-13 17:40:27 +10:00
|
|
|
|
func (fontStack *fontStacker) push(font *model.PdfFont) {
|
|
|
|
|
*fontStack = append(*fontStack, font)
|
|
|
|
|
}
|
2018-07-15 16:28:56 +10:00
|
|
|
|
|
2018-08-22 12:29:34 +10:00
|
|
|
|
// pop pops and returns the element on the top of the font stack if there is one or nil if there isn't.
|
2018-07-15 16:28:56 +10:00
|
|
|
|
func (fontStack *fontStacker) pop() *model.PdfFont {
|
2018-07-13 17:40:27 +10:00
|
|
|
|
if fontStack.empty() {
|
2018-07-15 16:28:56 +10:00
|
|
|
|
return nil
|
2018-07-13 17:40:27 +10:00
|
|
|
|
}
|
2018-07-15 16:28:56 +10:00
|
|
|
|
font := (*fontStack)[len(*fontStack)-1]
|
2018-07-13 17:40:27 +10:00
|
|
|
|
*fontStack = (*fontStack)[:len(*fontStack)-1]
|
2018-07-15 16:28:56 +10:00
|
|
|
|
return font
|
2018-07-13 17:40:27 +10:00
|
|
|
|
}
|
2018-07-15 16:28:56 +10:00
|
|
|
|
|
2018-08-22 12:29:34 +10:00
|
|
|
|
// peek returns the element on the top of the font stack if there is one or nil if there isn't.
|
2018-07-25 12:00:49 +10:00
|
|
|
|
func (fontStack *fontStacker) peek() *model.PdfFont {
|
2018-07-13 17:40:27 +10:00
|
|
|
|
if fontStack.empty() {
|
2018-07-25 12:00:49 +10:00
|
|
|
|
return nil
|
2018-07-13 17:40:27 +10:00
|
|
|
|
}
|
2018-07-25 12:00:49 +10:00
|
|
|
|
return (*fontStack)[len(*fontStack)-1]
|
2018-07-13 17:40:27 +10:00
|
|
|
|
}
|
2018-07-15 16:28:56 +10:00
|
|
|
|
|
2018-08-22 12:29:34 +10:00
|
|
|
|
// get returns the `idx`'th element of the font stack if there is one or nil if there isn't.
|
2018-07-15 16:28:56 +10:00
|
|
|
|
// idx = 0: bottom of font stack
|
|
|
|
|
// idx = len(fontstack) - 1: top of font stack
|
|
|
|
|
// idx = -n is same as dx = len(fontstack) - n, so fontstack.get(-1) is same as fontstack.peek()
|
2018-07-25 12:00:49 +10:00
|
|
|
|
func (fontStack *fontStacker) get(idx int) *model.PdfFont {
|
2018-07-13 17:40:27 +10:00
|
|
|
|
if idx < 0 {
|
|
|
|
|
idx += fontStack.size()
|
|
|
|
|
}
|
|
|
|
|
if idx < 0 || idx > fontStack.size()-1 {
|
2018-07-25 12:00:49 +10:00
|
|
|
|
return nil
|
2018-07-13 17:40:27 +10:00
|
|
|
|
}
|
2018-07-25 12:00:49 +10:00
|
|
|
|
return (*fontStack)[idx]
|
2018-07-13 17:40:27 +10:00
|
|
|
|
}
|
2018-07-15 16:28:56 +10:00
|
|
|
|
|
|
|
|
|
// empty returns true if the font stack is empty.
|
2018-07-13 17:40:27 +10:00
|
|
|
|
func (fontStack *fontStacker) empty() bool {
|
|
|
|
|
return len(*fontStack) == 0
|
|
|
|
|
}
|
2018-07-15 16:28:56 +10:00
|
|
|
|
|
|
|
|
|
// size returns the number of elements in the font stack.
|
2018-07-13 17:40:27 +10:00
|
|
|
|
func (fontStack *fontStacker) size() int {
|
|
|
|
|
return len(*fontStack)
|
|
|
|
|
}
|
|
|
|
|
|
2018-06-27 16:31:28 +10:00
|
|
|
|
// 9.3 Text State Parameters and Operators (page 243)
|
|
|
|
|
// Some of these parameters are expressed in unscaled text space units. This means that they shall
|
|
|
|
|
// be specified in a coordinate system that shall be defined by the text matrix, Tm but shall not be
|
|
|
|
|
// scaled by the font size parameter, Tfs.
|
2018-07-25 12:00:49 +10:00
|
|
|
|
|
|
|
|
|
// textState represents the text state.
|
|
|
|
|
type textState struct {
|
2018-08-22 12:29:34 +10:00
|
|
|
|
Tc float64 // Character spacing. Unscaled text space units.
|
|
|
|
|
Tw float64 // Word spacing. Unscaled text space units.
|
2018-09-20 11:49:44 +10:00
|
|
|
|
Th float64 // Horizontal scaling.
|
|
|
|
|
Tl float64 // Leading. Unscaled text space units. Used by TD,T*,'," see Table 108.
|
|
|
|
|
Tfs float64 // Text font size.
|
|
|
|
|
Tmode RenderMode // Text rendering mode.
|
|
|
|
|
Trise float64 // Text rise. Unscaled text space units. Set by Ts.
|
|
|
|
|
Tf *model.PdfFont // Text font.
|
2018-07-13 17:40:27 +10:00
|
|
|
|
// For debugging
|
|
|
|
|
numChars int
|
|
|
|
|
numMisses int
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// 9.4.1 General (page 248)
|
|
|
|
|
// A PDF text object consists of operators that may show text strings, move the text position, and
|
|
|
|
|
// set text state and certain other parameters. In addition, two parameters may be specified only
|
|
|
|
|
// within a text object and shall not persist from one text object to the next:
|
2018-06-28 11:11:43 +10:00
|
|
|
|
// • Tm, the text matrix
|
|
|
|
|
// • Tlm, the text line matrix
|
2018-06-27 16:31:28 +10:00
|
|
|
|
//
|
|
|
|
|
// Text space is converted to device space by this transform (page 252)
|
2018-09-20 11:49:44 +10:00
|
|
|
|
// Trm is the text rendering matrix
|
2018-06-27 16:31:28 +10:00
|
|
|
|
//        | Tfs x Th   0       0 |
//  Trm = | 0          Tfs     0 | × Tm × CTM
//        | 0          Trise   1 |
|
2018-09-20 11:49:44 +10:00
|
|
|
|
// This corresponds to the following code in renderText()
|
|
|
|
|
//    trm := stateMatrix.Mult(to.Tm).Mult(to.gs.CTM)
|
2018-07-25 12:00:49 +10:00
|
|
|
|
|
|
|
|
|
// textObject represents a PDF text object.
|
|
|
|
|
type textObject struct {
|
2018-07-13 17:40:27 +10:00
|
|
|
|
e *Extractor
|
|
|
|
|
gs contentstream.GraphicsState
|
|
|
|
|
fontStack *fontStacker
|
2018-07-25 12:00:49 +10:00
|
|
|
|
State *textState
|
2018-08-22 12:29:34 +10:00
|
|
|
|
Tm contentstream.Matrix // Text matrix. For the character pointer.
|
|
|
|
|
Tlm contentstream.Matrix // Text line matrix. For the start of line pointer.
|
|
|
|
|
Texts []XYText // Text gets written here.
|
2018-06-27 16:31:28 +10:00
|
|
|
|
|
|
|
|
|
// These fields are used to implement existing UniDoc behaviour.
|
|
|
|
|
xPos, yPos float64
|
|
|
|
|
}
|
|
|
|
|
|
2018-08-22 12:29:34 +10:00
|
|
|
|
// newTextState returns a default textState.
|
2018-07-25 12:00:49 +10:00
|
|
|
|
func newTextState() textState {
|
2018-08-22 12:29:34 +10:00
|
|
|
|
return textState{
|
|
|
|
|
Th: 100,
|
|
|
|
|
Tmode: RenderModeFill,
|
|
|
|
|
}
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
|
|
|
|
|
2018-08-22 12:29:34 +10:00
|
|
|
|
// newTextObject returns a default textObject.
|
2018-07-25 12:00:49 +10:00
|
|
|
|
func newTextObject(e *Extractor, gs contentstream.GraphicsState, state *textState,
|
|
|
|
|
fontStack *fontStacker) *textObject {
|
|
|
|
|
return &textObject{
|
2018-07-13 17:40:27 +10:00
|
|
|
|
e: e,
|
|
|
|
|
gs: gs,
|
|
|
|
|
fontStack: fontStack,
|
|
|
|
|
State: state,
|
2018-08-22 12:29:34 +10:00
|
|
|
|
Tm: contentstream.IdentityMatrix(),
|
|
|
|
|
Tlm: contentstream.IdentityMatrix(),
|
2018-03-22 13:01:04 +00:00
|
|
|
|
}
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
|
|
|
|
|
2018-10-09 11:49:59 +11:00
|
|
|
|
// numRenders counts the calls to renderText. renderText returns early once the count
// exceeds 2000.
// !@#$ hack: this is package-level state shared by every Extractor.
var numRenders int
|
|
|
|
|
|
2018-08-22 12:29:34 +10:00
|
|
|
|
// renderText emits byte array `data` to the calling program.
|
2018-07-25 12:00:49 +10:00
|
|
|
|
func (to *textObject) renderText(data []byte) error {
|
2018-10-09 11:49:59 +11:00
|
|
|
|
numRenders++
|
|
|
|
|
if numRenders > 2000 {
|
|
|
|
|
return nil
|
|
|
|
|
}
|
2018-09-20 11:49:44 +10:00
|
|
|
|
font := to.getCurrentFont()
|
2018-09-18 12:18:04 +10:00
|
|
|
|
|
|
|
|
|
text, numChars, numMisses := font.CharcodeBytesToUnicode(data)
|
2018-09-20 11:49:44 +10:00
|
|
|
|
runes := []rune(text)
|
2018-07-25 12:00:49 +10:00
|
|
|
|
to.State.numChars += numChars
|
|
|
|
|
to.State.numMisses += numMisses
|
|
|
|
|
|
2018-09-20 11:49:44 +10:00
|
|
|
|
state := to.State
|
|
|
|
|
tfs := state.Tfs
|
|
|
|
|
th := state.Th / 100.0
|
2018-10-09 11:49:59 +11:00
|
|
|
|
spaceMetrics, err := font.GetRuneCharMetrics(' ')
|
|
|
|
|
if err != nil {
|
|
|
|
|
spaceMetrics, _ = model.DefaultFont().GetRuneCharMetrics(' ')
|
|
|
|
|
}
|
|
|
|
|
spaceWidth := spaceMetrics.Wx * glyphTextRatio
|
|
|
|
|
common.Log.Debug("spaceWidth=%.2f text=%q font=%s fontSize=%.1f", spaceWidth, text,
|
|
|
|
|
font, tfs)
|
2018-09-20 11:49:44 +10:00
|
|
|
|
|
|
|
|
|
stateMatrix := contentstream.NewMatrix(
|
|
|
|
|
tfs*th, 0,
|
|
|
|
|
0, tfs,
|
|
|
|
|
0, state.Trise)
|
|
|
|
|
|
|
|
|
|
for _, r := range runes {
|
|
|
|
|
// The location of the text on the page in device coordinates is given by trm, the text
|
|
|
|
|
// rendering matrix.
|
|
|
|
|
trm := stateMatrix.Mult(to.Tm).Mult(to.gs.CTM)
|
|
|
|
|
|
|
|
|
|
// calculate the text location displacement due to writing `r`. We will use this to update
|
|
|
|
|
// to.Tm
|
|
|
|
|
|
|
|
|
|
// w is the unscaled movement at the end of a word.
|
|
|
|
|
w := 0.0
|
|
|
|
|
if r == ' ' {
|
|
|
|
|
w = state.Tw
|
|
|
|
|
}
|
2018-10-09 11:49:59 +11:00
|
|
|
|
|
2018-09-20 11:49:44 +10:00
|
|
|
|
m, err := font.GetRuneCharMetrics(r)
|
|
|
|
|
if err != nil {
|
|
|
|
|
common.Log.Debug("ERROR: No metric for 0x%04x=%c %s", r, r, font)
|
|
|
|
|
return err
|
|
|
|
|
}
|
2018-10-09 11:49:59 +11:00
|
|
|
|
// c is the character size in unscaled text units.
|
2018-09-20 11:49:44 +10:00
|
|
|
|
c := Point{X: m.Wx * glyphTextRatio, Y: m.Wy * glyphTextRatio}
|
2018-10-09 11:49:59 +11:00
|
|
|
|
// cScaled is the character size
|
|
|
|
|
cScaled := Point{X: c.X * tfs * th}
|
|
|
|
|
// t is the displacement of the text cursor when the character is rendered.
|
2018-09-20 11:49:44 +10:00
|
|
|
|
t := Point{X: (c.X*tfs + state.Tc + w) * th}
|
|
|
|
|
|
2018-10-09 11:49:59 +11:00
|
|
|
|
common.Log.Debug("t=%s cScaled=%s c=%s tfs=%.2f state.Tc=%.2f w=%.2f th=%.2f",
|
|
|
|
|
t.String(), cScaled.String(), c.String(), tfs, state.Tc, w, th)
|
|
|
|
|
|
|
|
|
|
// td is t in matrix from
|
|
|
|
|
td := translationMatrix(t)
|
|
|
|
|
common.Log.Debug("displacement=%s t=%s td=%s m=%s",
|
|
|
|
|
c.String(), t.String(), td.String(), m.String())
|
|
|
|
|
|
|
|
|
|
nextTm := to.Tm.Mult(td)
|
|
|
|
|
common.Log.Debug(" next: td=%s %s->%s", td, to.Tm, nextTm)
|
|
|
|
|
|
|
|
|
|
xyt := XYText{Text: string(r),
|
|
|
|
|
Point: translation(trm),
|
|
|
|
|
End: translation(trm).Displace(cScaled),
|
|
|
|
|
SpaceWidth: spaceWidth * trm.ScalingFactorX(),
|
|
|
|
|
}
|
|
|
|
|
to.Texts = append(to.Texts, xyt)
|
|
|
|
|
common.Log.Debug(" xyt=%s", xyt.String())
|
|
|
|
|
|
|
|
|
|
// update the text matrix by the displacement of the text location.
|
|
|
|
|
to.Tm = nextTm
|
2018-09-20 11:49:44 +10:00
|
|
|
|
}
|
|
|
|
|
|
2018-07-25 12:00:49 +10:00
|
|
|
|
return nil
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
|
|
|
|
|
2018-09-20 11:49:44 +10:00
|
|
|
|
// glyphTextRatio is the factor that converts glyph metrics units to unscaled text space units
// (one glyph unit is a thousandth of a text space unit).
const glyphTextRatio = 1e-3
|
|
|
|
|
|
|
|
|
|
// translation returns the translation part of `m`.
|
|
|
|
|
func translation(m contentstream.Matrix) Point {
|
|
|
|
|
tx, ty := m.Translation()
|
|
|
|
|
return Point{tx, ty}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// translationMatrix returns a matrix that translates by `p`.
|
|
|
|
|
func translationMatrix(p Point) contentstream.Matrix {
|
|
|
|
|
return contentstream.TranslationMatrix(p.X, p.Y)
|
2018-08-22 12:29:34 +10:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// moveTo moves the start of line pointer by `tx`,`ty` and sets the text pointer to the
|
|
|
|
|
// start of line pointer.
|
|
|
|
|
// Move to the start of the next line, offset from the start of the current line by (tx, ty).
|
|
|
|
|
// `tx` and `ty` are in unscaled text space units.
|
|
|
|
|
func (to *textObject) moveTo(tx, ty float64) {
|
|
|
|
|
to.Tlm.Concat(contentstream.NewMatrix(1, 0, 0, 1, tx, ty))
|
|
|
|
|
to.Tm = to.Tlm
|
|
|
|
|
}
|
|
|
|
|
|
2018-09-03 16:38:58 +10:00
|
|
|
|
// XYText represents text drawn on a page and its position in device coordinates.
|
2018-06-27 16:31:28 +10:00
|
|
|
|
type XYText struct {
|
2018-10-09 11:49:59 +11:00
|
|
|
|
Point // Position of text. Left-bottom?
|
|
|
|
|
End Point // End of text. Right-top?
|
2018-08-22 12:29:34 +10:00
|
|
|
|
ColorStroking model.PdfColor // Colour that text is stroked with, if any.
|
|
|
|
|
ColorNonStroking model.PdfColor // Colour that text is filled with, if any.
|
|
|
|
|
Orient contentstream.Orientation
|
|
|
|
|
Text string
|
2018-10-09 11:49:59 +11:00
|
|
|
|
SpaceWidth float64
|
|
|
|
|
Font string
|
|
|
|
|
FontSize float64
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
|
|
|
|
|
2018-08-22 12:29:34 +10:00
|
|
|
|
// String returns a string describing `t`.
|
2018-09-20 11:49:44 +10:00
|
|
|
|
func (t XYText) String() string {
|
2018-10-09 11:49:59 +11:00
|
|
|
|
return fmt.Sprintf("%s,%s %.1f %q",
|
|
|
|
|
t.Point.String(), t.End.String(), t.End.X-t.X, truncate(t.Text, 100))
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Width returns the width of `t`.Text in its orientation.
|
|
|
|
|
func (t XYText) Width() float64 {
|
|
|
|
|
var w float64
|
|
|
|
|
switch t.Orient {
|
|
|
|
|
case contentstream.OrientationLandscape:
|
|
|
|
|
w = math.Abs(t.End.Y - t.Y)
|
|
|
|
|
default:
|
|
|
|
|
w = math.Abs(t.End.X - t.X)
|
|
|
|
|
}
|
|
|
|
|
common.Log.Debug(" Width %q (%s %s) -> %.1f", t.Text, t.Point.String(), t.End.String(), w)
|
|
|
|
|
return w
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
|
|
|
|
|
2018-09-03 16:38:58 +10:00
|
|
|
|
// TextList is a list of texts and their positions on a PDF page.
|
2018-06-27 16:31:28 +10:00
|
|
|
|
type TextList []XYText
|
|
|
|
|
|
2018-09-03 16:38:58 +10:00
|
|
|
|
// Length returns the number of elements in `tl`.
|
2018-06-27 16:31:28 +10:00
|
|
|
|
func (tl *TextList) Length() int {
|
|
|
|
|
return len(*tl)
|
|
|
|
|
}
|
|
|
|
|
|
2018-08-22 12:29:34 +10:00
|
|
|
|
// AppendText appends the location and contents of `text` to a text list.
|
2018-10-09 11:49:59 +11:00
|
|
|
|
func (tl *TextList) AppendText(gs contentstream.GraphicsState, p, e Point, text string, spaceWidth float64) {
|
|
|
|
|
t := XYText{
|
|
|
|
|
Point: p,
|
|
|
|
|
End: e,
|
|
|
|
|
ColorStroking: gs.ColorStroking,
|
|
|
|
|
ColorNonStroking: gs.ColorNonStroking,
|
|
|
|
|
Orient: gs.PageOrientation(),
|
|
|
|
|
Text: text,
|
|
|
|
|
SpaceWidth: spaceWidth,
|
|
|
|
|
}
|
2018-08-22 12:29:34 +10:00
|
|
|
|
common.Log.Debug("AppendText: %s", t.String())
|
|
|
|
|
*tl = append(*tl, t)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// ToText returns the contents of `tl` as a single string.
|
2018-06-27 16:31:28 +10:00
|
|
|
|
func (tl *TextList) ToText() string {
|
2018-08-22 12:29:34 +10:00
|
|
|
|
tl.SortPosition()
|
|
|
|
|
|
|
|
|
|
lines := tl.toLines()
|
|
|
|
|
texts := []string{}
|
|
|
|
|
for _, l := range lines {
|
|
|
|
|
texts = append(texts, l.Text)
|
|
|
|
|
}
|
|
|
|
|
return strings.Join(texts, "\n")
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// SortPosition sorts a text list by its elements' position on a page. Top to bottom, left to right.
|
|
|
|
|
func (tl *TextList) SortPosition() {
|
|
|
|
|
sort.SliceStable(*tl, func(i, j int) bool {
|
|
|
|
|
ti, tj := (*tl)[i], (*tl)[j]
|
|
|
|
|
if ti.Y != tj.Y {
|
|
|
|
|
return ti.Y > tj.Y
|
|
|
|
|
}
|
|
|
|
|
return ti.X < tj.X
|
|
|
|
|
})
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Line represents a line of text on a page.
type Line struct {
	// Y is the y position of the line.
	Y float64
	// Dx holds the x distances between successive words in the line.
	Dx []float64
	// Text is the text in the line.
	Text string
	// Words holds the words in the line.
	Words []string
}
|
|
|
|
|
|
|
|
|
|
// toLines return the text and positions in `tl` as a slice of Line.
|
|
|
|
|
// NOTE: Caller must sort the text list top-to-bottom, left-to-write before calling this function.
|
|
|
|
|
func (tl *TextList) toLines() []Line {
|
2018-10-09 11:49:59 +11:00
|
|
|
|
const wordCharCount = 1 // !@#$ needs to include diactritics
|
2018-09-20 11:49:44 +10:00
|
|
|
|
tl.printTexts()
|
2018-08-22 12:29:34 +10:00
|
|
|
|
if len(*tl) == 0 {
|
|
|
|
|
return []Line{}
|
|
|
|
|
}
|
|
|
|
|
lines := []Line{}
|
2018-10-09 11:49:59 +11:00
|
|
|
|
words := []string{}
|
|
|
|
|
x := []float64{}
|
2018-08-22 12:29:34 +10:00
|
|
|
|
y := (*tl)[0].Y
|
2018-10-09 11:49:59 +11:00
|
|
|
|
|
|
|
|
|
scanning := false
|
|
|
|
|
|
|
|
|
|
averageCharWidth := ExponAve{}
|
|
|
|
|
wordSpacing := ExponAve{}
|
|
|
|
|
lastEndX := 0.0 // (*tl)[i-1).End.X
|
|
|
|
|
|
|
|
|
|
for i, t := range *tl {
|
|
|
|
|
common.Log.Debug("%d --------------------------", i)
|
2018-08-22 12:29:34 +10:00
|
|
|
|
if t.Y < y {
|
|
|
|
|
if len(words) > 0 {
|
2018-10-09 13:47:43 +11:00
|
|
|
|
line := newLine(y, x, words)
|
|
|
|
|
if averageCharWidth.running {
|
|
|
|
|
line = removeDuplicates(line, averageCharWidth.ave)
|
|
|
|
|
}
|
|
|
|
|
lines = append(lines, line)
|
2018-08-22 12:29:34 +10:00
|
|
|
|
}
|
2018-10-09 13:47:43 +11:00
|
|
|
|
words = []string{}
|
2018-08-22 12:29:34 +10:00
|
|
|
|
x = []float64{}
|
2018-10-09 11:49:59 +11:00
|
|
|
|
y = t.Y
|
|
|
|
|
scanning = false
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Detect text movements that represent spaces on the printed page.
|
|
|
|
|
// We use a heuristic from PdfBox: If the next character starts to the right of where a
|
|
|
|
|
// character after a space at "normal spacing" would start, then there is a space before it.
|
|
|
|
|
// The tricky thing to guess here is the width of a space at normal spacing.
|
|
|
|
|
// We follow PdfBox and use min(deltaSpace, deltaCharWidth).
|
|
|
|
|
deltaSpace := 0.0
|
|
|
|
|
if t.SpaceWidth == 0 {
|
|
|
|
|
deltaSpace = math.MaxFloat64
|
|
|
|
|
} else {
|
|
|
|
|
wordSpacing.update(t.SpaceWidth)
|
|
|
|
|
deltaSpace = wordSpacing.ave * 0.5
|
|
|
|
|
}
|
|
|
|
|
averageCharWidth.update(t.Width() / wordCharCount)
|
|
|
|
|
deltaCharWidth := averageCharWidth.ave * 0.3
|
|
|
|
|
|
|
|
|
|
common.Log.Debug(" averageCharWidth=%.1f deltaCharWidth=%1.f"+
|
|
|
|
|
" [SpaceWidth=%.1f wordSpacing=%.1f deltaSpace=%.2f]"+
|
|
|
|
|
" positionWidth=%.1f wordCharCount=%d",
|
|
|
|
|
averageCharWidth.ave, deltaCharWidth,
|
|
|
|
|
t.SpaceWidth, wordSpacing.ave, deltaSpace,
|
|
|
|
|
t.Width(), wordCharCount)
|
|
|
|
|
|
|
|
|
|
isSpace := false
|
|
|
|
|
if scanning && t.Text != " " {
|
|
|
|
|
nextWordX := lastEndX + min(deltaSpace, deltaCharWidth)
|
|
|
|
|
isSpace = nextWordX < t.X
|
|
|
|
|
common.Log.Debug("[%.1f, %.1f] lastEndX=%.1f nextWordX=%.1f",
|
|
|
|
|
t.Y, t.X, lastEndX, nextWordX)
|
|
|
|
|
}
|
|
|
|
|
if isSpace {
|
|
|
|
|
common.Log.Debug("SPACE")
|
|
|
|
|
words = append(words, " ")
|
2018-10-09 13:47:43 +11:00
|
|
|
|
x = append(x, (lastEndX+t.X)*0.5)
|
|
|
|
|
if len(words) != len(x) {
|
|
|
|
|
fmt.Printf("words=%d %+v\n", len(words), words)
|
|
|
|
|
fmt.Printf(" x=%d %+v\n", len(x), x)
|
|
|
|
|
panic("AAAAA 111")
|
|
|
|
|
}
|
2018-08-22 12:29:34 +10:00
|
|
|
|
}
|
2018-10-09 11:49:59 +11:00
|
|
|
|
|
|
|
|
|
// Add the text to the line.
|
|
|
|
|
lastEndX = t.End.X
|
2018-08-22 12:29:34 +10:00
|
|
|
|
words = append(words, t.Text)
|
|
|
|
|
x = append(x, t.X)
|
2018-10-09 13:47:43 +11:00
|
|
|
|
if len(words) != len(x) {
|
|
|
|
|
fmt.Printf("words=%d %+v\n", len(words), words)
|
|
|
|
|
fmt.Printf(" x=%d %+v\n", len(x), x)
|
|
|
|
|
panic("AAAAA")
|
|
|
|
|
}
|
2018-10-09 11:49:59 +11:00
|
|
|
|
scanning = true
|
2018-08-22 12:29:34 +10:00
|
|
|
|
}
|
|
|
|
|
if len(words) > 0 {
|
2018-10-09 13:47:43 +11:00
|
|
|
|
line := newLine(y, x, words)
|
|
|
|
|
if averageCharWidth.running {
|
|
|
|
|
line = removeDuplicates(line, averageCharWidth.ave)
|
|
|
|
|
}
|
|
|
|
|
lines = append(lines, line)
|
2018-08-22 12:29:34 +10:00
|
|
|
|
}
|
|
|
|
|
return lines
|
|
|
|
|
}
|
|
|
|
|
|
2018-10-09 11:49:59 +11:00
|
|
|
|
// min returns the lesser of `a` and `b`. `b` is returned when `a` is not strictly less than `b`.
func min(a, b float64) float64 {
	smaller := b
	if a < b {
		smaller = a
	}
	return smaller
}
|
|
|
|
|
|
|
|
|
|
// ExponAve implements an exponential average in which each new sample is given the same weight
// as the average of all previous samples.
// The zero value is ready to use.
type ExponAve struct {
	ave     float64 // Current average value.
	running bool    // Has `ave` been set?
}

// update folds sample `x` into the exponential average `exp.ave` and returns the new average.
// The first sample becomes the average; every later sample is averaged 50/50 with the current
// value.
func (exp *ExponAve) update(x float64) float64 {
	if exp.running {
		exp.ave = (exp.ave + x) * 0.5
	} else {
		exp.ave = x
		exp.running = true
	}
	return exp.ave
}
|
|
|
|
|
|
2018-09-20 11:49:44 +10:00
|
|
|
|
// printTexts is a debugging function. XXX Remove this.
|
|
|
|
|
func (tl *TextList) printTexts() {
|
|
|
|
|
return
|
|
|
|
|
common.Log.Error("=====================================")
|
|
|
|
|
common.Log.Error("%d texts", len(*tl))
|
|
|
|
|
for i, t := range (*tl)[1:] {
|
|
|
|
|
fmt.Printf("%5d: %s\n", i, t.String())
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2018-09-03 16:38:58 +10:00
|
|
|
|
// newLine returns the Line representation of strings `words` with y coordinate `y` and x
|
|
|
|
|
// coordinates `x`.
|
2018-08-22 12:29:34 +10:00
|
|
|
|
func newLine(y float64, x []float64, words []string) Line {
|
|
|
|
|
dx := []float64{}
|
|
|
|
|
for i := 1; i < len(x); i++ {
|
|
|
|
|
dx = append(dx, x[i]-x[i-1])
|
|
|
|
|
}
|
2018-10-09 13:47:43 +11:00
|
|
|
|
// text := strings.Join(words, "")
|
|
|
|
|
if len(x) != len(words) {
|
|
|
|
|
panic("aaa")
|
|
|
|
|
}
|
|
|
|
|
if len(dx)+1 != len(words) {
|
|
|
|
|
panic("eee")
|
|
|
|
|
}
|
|
|
|
|
// if len(text) != len(words) {
|
|
|
|
|
// fmt.Printf("words=%d %q\n", len(words), words)
|
|
|
|
|
// fmt.Printf(" text=%d %q\n", len(text), text)
|
|
|
|
|
// panic("fff")
|
|
|
|
|
// }
|
|
|
|
|
return Line{Y: y, Dx: dx, Text: strings.Join(words, ""), Words: words}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func removeDuplicates(line Line, charWidth float64) Line {
|
|
|
|
|
if len(line.Dx) == 0 {
|
|
|
|
|
return line
|
|
|
|
|
}
|
|
|
|
|
// width := 0.0
|
|
|
|
|
// for _, dx := range line.Dx {
|
|
|
|
|
// width += dx
|
|
|
|
|
// }
|
|
|
|
|
// tol := 0.3 * width / float64(len(line.Dx))
|
|
|
|
|
tol := charWidth * 0.3
|
|
|
|
|
words := []string{line.Words[0]}
|
|
|
|
|
dxList := []float64{}
|
|
|
|
|
text := strings.Replace(line.Text, " ", "~", -1)
|
|
|
|
|
if len(text) != len(line.Text) {
|
|
|
|
|
panic("FFFF")
|
|
|
|
|
}
|
|
|
|
|
// fmt.Printf("%d %d %q\n", len(line.Words), len(line.Text), text)
|
|
|
|
|
// fmt.Printf("tol=%.2f Dx=%d[%.2f]\n", tol, len(line.Dx), line.Dx)
|
|
|
|
|
w0 := line.Words[0]
|
|
|
|
|
for i, dx := range line.Dx {
|
|
|
|
|
w := line.Words[i+1]
|
|
|
|
|
if w != w0 || dx > tol {
|
|
|
|
|
words = append(words, w)
|
|
|
|
|
dxList = append(dxList, dx)
|
|
|
|
|
} else {
|
|
|
|
|
fmt.Printf("OUT[%d:%s:%.2f<%.2f]", i, line.Words[i+1], dx, tol)
|
|
|
|
|
}
|
|
|
|
|
w0 = w
|
|
|
|
|
}
|
|
|
|
|
// fmt.Printf(" dxList=%d[%.2f]\n", len(dxList), dxList)
|
|
|
|
|
return Line{Y: line.Y, Dx: dxList, Text: strings.Join(words, ""), Words: words}
|
2018-08-22 12:29:34 +10:00
|
|
|
|
}
|
|
|
|
|
|
2018-09-03 16:38:58 +10:00
|
|
|
|
// PageOrientation is a heuristic for the orientation of a page.
|
2018-08-22 12:29:34 +10:00
|
|
|
|
// XXX TODO: Use Page's Rotate flag instead.
|
|
|
|
|
func (tl *TextList) PageOrientation() contentstream.Orientation {
|
|
|
|
|
landscapeCount := 0
|
|
|
|
|
for _, t := range *tl {
|
|
|
|
|
if t.Orient == contentstream.OrientationLandscape {
|
|
|
|
|
landscapeCount++
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
portraitCount := len(*tl) - landscapeCount
|
|
|
|
|
if landscapeCount > portraitCount {
|
|
|
|
|
return contentstream.OrientationLandscape
|
|
|
|
|
}
|
|
|
|
|
return contentstream.OrientationPortrait
|
|
|
|
|
}
|
|
|
|
|
|
2018-09-03 16:38:58 +10:00
|
|
|
|
// Transform transforms all points in `tl` by the affine transformation a, b, c, d, tx, ty.
|
2018-08-22 12:29:34 +10:00
|
|
|
|
func (tl *TextList) Transform(a, b, c, d, tx, ty float64) {
|
|
|
|
|
m := contentstream.NewMatrix(a, b, c, d, tx, ty)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
for _, t := range *tl {
|
2018-08-22 12:29:34 +10:00
|
|
|
|
t.X, t.Y = m.Transform(t.X, t.Y)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2018-09-20 11:49:44 +10:00
|
|
|
|
// getCurrentFont returns the font on top of the font stack, or DefaultFont if the font stack is
|
|
|
|
|
// empty.
|
|
|
|
|
func (to *textObject) getCurrentFont() *model.PdfFont {
|
|
|
|
|
if to.fontStack.empty() {
|
|
|
|
|
common.Log.Debug("ERROR: No font defined. Using default.")
|
|
|
|
|
return model.DefaultFont()
|
|
|
|
|
}
|
|
|
|
|
return to.fontStack.peek()
|
|
|
|
|
}
|
|
|
|
|
|
2018-06-28 11:11:43 +10:00
|
|
|
|
// getFont returns the font named `name` if it exists in the page's resources or an error if it
|
2018-09-17 12:12:06 +10:00
|
|
|
|
// doesn't. It caches the returned fonts.
|
2018-07-25 12:00:49 +10:00
|
|
|
|
func (to *textObject) getFont(name string) (*model.PdfFont, error) {
|
2018-09-22 09:28:18 +10:00
|
|
|
|
if to.e.fontCache != nil {
|
|
|
|
|
to.e.accessCount++
|
|
|
|
|
entry, ok := to.e.fontCache[name]
|
|
|
|
|
if ok {
|
|
|
|
|
entry.access = to.e.accessCount
|
|
|
|
|
return entry.font, nil
|
|
|
|
|
}
|
2018-09-17 12:12:06 +10:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Font not in cache. Load it.
|
|
|
|
|
font, err := to.getFontDirect(name)
|
|
|
|
|
if err != nil {
|
|
|
|
|
return nil, err
|
|
|
|
|
}
|
|
|
|
|
|
2018-09-22 09:28:18 +10:00
|
|
|
|
if to.e.fontCache != nil {
|
|
|
|
|
entry := fontEntry{font, to.e.accessCount}
|
|
|
|
|
|
|
|
|
|
// Eject a victim if the cache is full.
|
|
|
|
|
if len(to.e.fontCache) >= maxFontCache {
|
|
|
|
|
names := []string{}
|
|
|
|
|
for name := range to.e.fontCache {
|
|
|
|
|
names = append(names, name)
|
|
|
|
|
}
|
|
|
|
|
sort.Slice(names, func(i, j int) bool {
|
|
|
|
|
return to.e.fontCache[names[i]].access < to.e.fontCache[names[j]].access
|
|
|
|
|
})
|
|
|
|
|
delete(to.e.fontCache, names[0])
|
2018-09-17 12:12:06 +10:00
|
|
|
|
}
|
2018-09-22 09:28:18 +10:00
|
|
|
|
to.e.fontCache[name] = entry
|
2018-09-17 12:12:06 +10:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return font, nil
|
|
|
|
|
}
|
|
|
|
|
|
2018-09-21 16:43:10 +10:00
|
|
|
|
// fontEntry is a entry in the font cache.
|
2018-09-17 12:12:06 +10:00
|
|
|
|
type fontEntry struct {
|
|
|
|
|
font *model.PdfFont // The font being cached.
|
|
|
|
|
access int64 // Last access. Used to determine LRU cache victims.
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// maxFontCache is the upper limit on the number of PdfFonts held in fontCache.
const maxFontCache = 10
|
|
|
|
|
|
|
|
|
|
// getFontDirect returns the font named `name` if it exists in the page's resources or an error if
|
|
|
|
|
// is doesn't.
|
2018-09-22 09:28:18 +10:00
|
|
|
|
// This is a direct (uncached access).
|
2018-09-17 12:12:06 +10:00
|
|
|
|
func (to *textObject) getFontDirect(name string) (*model.PdfFont, error) {
|
2018-07-07 09:45:55 +10:00
|
|
|
|
|
|
|
|
|
// This is a hack for testing.
|
2018-07-15 17:22:00 +10:00
|
|
|
|
if name == "UniDocCourier" {
|
2018-09-07 19:11:58 +10:00
|
|
|
|
return model.NewStandard14FontMustCompile(model.Courier), nil
|
2018-07-07 09:45:55 +10:00
|
|
|
|
}
|
|
|
|
|
|
2018-06-27 16:31:28 +10:00
|
|
|
|
fontObj, err := to.getFontDict(name)
|
|
|
|
|
if err != nil {
|
|
|
|
|
return nil, err
|
|
|
|
|
}
|
|
|
|
|
font, err := model.NewPdfFontFromPdfObject(fontObj)
|
|
|
|
|
if err != nil {
|
2018-09-17 12:12:06 +10:00
|
|
|
|
common.Log.Debug("getFontDirect: NewPdfFontFromPdfObject failed. name=%#q err=%v", name, err)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
|
|
|
|
return font, err
|
|
|
|
|
}
|
|
|
|
|
|
2018-09-17 12:12:06 +10:00
|
|
|
|
// getFontDict returns the font dict with key `name` if it exists in the page's Font resources or
|
2018-07-25 12:00:49 +10:00
|
|
|
|
// an error if it doesn't.
|
|
|
|
|
func (to *textObject) getFontDict(name string) (fontObj core.PdfObject, err error) {
|
2018-06-27 16:31:28 +10:00
|
|
|
|
resources := to.e.resources
|
|
|
|
|
if resources == nil {
|
|
|
|
|
common.Log.Debug("getFontDict. No resources. name=%#q", name)
|
2018-07-25 12:00:49 +10:00
|
|
|
|
return nil, nil
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
2018-07-15 16:28:56 +10:00
|
|
|
|
fontObj, found := resources.GetFontByName(core.PdfObjectName(name))
|
2018-06-27 16:31:28 +10:00
|
|
|
|
if !found {
|
2018-07-25 12:00:49 +10:00
|
|
|
|
common.Log.Debug("ERROR: getFontDict: Font not found: name=%#q", name)
|
|
|
|
|
return nil, errors.New("Font not in resources")
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
2018-07-25 12:00:49 +10:00
|
|
|
|
return fontObj, nil
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|