2018-03-22 14:03:47 +00:00
|
|
|
|
/*
|
|
|
|
|
* This file is subject to the terms and conditions defined in
|
|
|
|
|
* file 'LICENSE.md', which is part of this source code package.
|
|
|
|
|
*/
|
|
|
|
|
|
2018-03-22 13:01:04 +00:00
|
|
|
|
package extractor
|
|
|
|
|
|
|
|
|
|
import (
|
|
|
|
|
"errors"
|
2018-07-13 17:40:27 +10:00
|
|
|
|
"fmt"
|
2018-10-09 11:49:59 +11:00
|
|
|
|
"math"
|
2018-08-22 12:29:34 +10:00
|
|
|
|
"sort"
|
2018-07-13 17:40:27 +10:00
|
|
|
|
"strings"
|
2018-11-28 18:06:03 +11:00
|
|
|
|
"unicode"
|
2018-03-22 13:01:04 +00:00
|
|
|
|
|
2019-05-16 23:08:40 +03:00
|
|
|
|
"github.com/unidoc/unipdf/v3/common"
|
2019-05-16 23:44:51 +03:00
|
|
|
|
"github.com/unidoc/unipdf/v3/contentstream"
|
|
|
|
|
"github.com/unidoc/unipdf/v3/core"
|
|
|
|
|
"github.com/unidoc/unipdf/v3/internal/transform"
|
|
|
|
|
"github.com/unidoc/unipdf/v3/model"
|
2018-11-28 18:06:03 +11:00
|
|
|
|
"golang.org/x/text/unicode/norm"
|
2018-03-22 13:01:04 +00:00
|
|
|
|
)
|
|
|
|
|
|
2018-06-27 12:25:59 +10:00
|
|
|
|
// ExtractText processes and extracts all text data in content streams and returns as a string.
|
2018-08-22 12:29:34 +10:00
|
|
|
|
// It takes into account character encodings in the PDF file, which are decoded by
|
2018-07-15 16:28:56 +10:00
|
|
|
|
// CharcodeBytesToUnicode.
|
2018-08-22 12:29:34 +10:00
|
|
|
|
// Characters that can't be decoded are replaced with MissingCodeRune ('\ufffd' = <20>).
|
2018-03-22 13:01:04 +00:00
|
|
|
|
func (e *Extractor) ExtractText() (string, error) {
|
2018-11-28 23:25:17 +00:00
|
|
|
|
text, _, _, err := e.ExtractTextWithStats()
|
2018-07-13 17:40:27 +10:00
|
|
|
|
return text, err
|
|
|
|
|
}
|
|
|
|
|
|
2019-01-04 16:07:03 +11:00
|
|
|
|
// ExtractTextWithStats works like ExtractText but returns the number of characters in the output
|
|
|
|
|
// (`numChars`) and the number of characters that were not decoded (`numMisses`).
|
2018-11-30 16:53:48 +00:00
|
|
|
|
func (e *Extractor) ExtractTextWithStats() (extracted string, numChars int, numMisses int, err error) {
|
2019-01-04 16:07:03 +11:00
|
|
|
|
pageText, numChars, numMisses, err := e.ExtractPageText()
|
2018-06-27 16:31:28 +10:00
|
|
|
|
if err != nil {
|
2018-07-13 17:40:27 +10:00
|
|
|
|
return "", numChars, numMisses, err
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
2019-07-18 16:41:47 +10:00
|
|
|
|
return pageText.Text(), numChars, numMisses, nil
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
|
|
|
|
|
2019-01-04 16:07:03 +11:00
|
|
|
|
// ExtractPageText returns the text contents of `e` (an Extractor for a page) as a PageText.
|
|
|
|
|
func (e *Extractor) ExtractPageText() (*PageText, int, int, error) {
|
2019-07-18 16:41:47 +10:00
|
|
|
|
pt, numChars, numMisses, err := e.extractPageText(e.contents, e.resources, 0)
|
|
|
|
|
if err != nil {
|
|
|
|
|
return nil, numChars, numMisses, err
|
|
|
|
|
}
|
|
|
|
|
pt.computeViews()
|
2019-08-04 09:29:21 +00:00
|
|
|
|
procBuf(pt)
|
|
|
|
|
|
2019-07-18 16:41:47 +10:00
|
|
|
|
return pt, numChars, numMisses, err
|
2018-12-27 20:51:34 +11:00
|
|
|
|
}
|
|
|
|
|
|
2019-01-04 16:07:03 +11:00
|
|
|
|
// extractPageText returns the text contents of content stream `e` and resouces `resources` as a
|
|
|
|
|
// PageText.
|
2018-12-27 21:33:31 +11:00
|
|
|
|
// This can be called on a page or a form XObject.
|
2019-07-18 16:41:47 +10:00
|
|
|
|
func (e *Extractor) extractPageText(contents string, resources *model.PdfPageResources, level int) (
|
|
|
|
|
*PageText, int, int, error) {
|
2019-01-04 16:07:03 +11:00
|
|
|
|
common.Log.Trace("extractPageText: level=%d", level)
|
|
|
|
|
pageText := &PageText{}
|
2018-06-27 16:31:28 +10:00
|
|
|
|
state := newTextState()
|
2018-07-13 17:40:27 +10:00
|
|
|
|
fontStack := fontStacker{}
|
2018-07-25 12:00:49 +10:00
|
|
|
|
var to *textObject
|
2018-03-22 13:01:04 +00:00
|
|
|
|
|
2018-12-27 20:51:34 +11:00
|
|
|
|
cstreamParser := contentstream.NewContentStreamParser(contents)
|
2018-03-22 13:01:04 +00:00
|
|
|
|
operations, err := cstreamParser.Parse()
|
|
|
|
|
if err != nil {
|
2019-01-04 16:07:03 +11:00
|
|
|
|
common.Log.Debug("ERROR: extractPageText parse failed. err=%v", err)
|
|
|
|
|
return pageText, state.numChars, state.numMisses, err
|
2018-03-22 13:01:04 +00:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
processor := contentstream.NewContentStreamProcessor(*operations)
|
|
|
|
|
|
|
|
|
|
processor.AddHandler(contentstream.HandlerConditionEnumAllOperands, "",
|
2018-06-27 16:31:28 +10:00
|
|
|
|
func(op *contentstream.ContentStreamOperation, gs contentstream.GraphicsState,
|
|
|
|
|
resources *model.PdfPageResources) error {
|
|
|
|
|
|
2018-03-22 13:01:04 +00:00
|
|
|
|
operand := op.Operand
|
2018-06-27 16:31:28 +10:00
|
|
|
|
|
2018-03-22 13:01:04 +00:00
|
|
|
|
switch operand {
|
2018-07-13 17:40:27 +10:00
|
|
|
|
case "q":
|
|
|
|
|
if !fontStack.empty() {
|
2018-07-15 16:28:56 +10:00
|
|
|
|
common.Log.Trace("Save font state: %s\n%s",
|
2018-07-13 17:40:27 +10:00
|
|
|
|
fontStack.peek(), fontStack.String())
|
|
|
|
|
fontStack.push(fontStack.peek())
|
|
|
|
|
}
|
2019-01-02 10:39:30 +11:00
|
|
|
|
if state.tfont != nil {
|
2018-07-15 16:28:56 +10:00
|
|
|
|
common.Log.Trace("Save font state: %s\n->%s\n%s",
|
2019-01-02 10:39:30 +11:00
|
|
|
|
fontStack.peek(), state.tfont, fontStack.String())
|
|
|
|
|
fontStack.push(state.tfont)
|
2018-07-13 17:40:27 +10:00
|
|
|
|
}
|
|
|
|
|
case "Q":
|
|
|
|
|
if !fontStack.empty() {
|
2018-07-15 16:28:56 +10:00
|
|
|
|
common.Log.Trace("Restore font state: %s\n->%s\n%s",
|
2018-07-13 17:40:27 +10:00
|
|
|
|
fontStack.peek(), fontStack.get(-2), fontStack.String())
|
|
|
|
|
fontStack.pop()
|
|
|
|
|
}
|
|
|
|
|
if len(fontStack) >= 2 {
|
2018-07-15 16:28:56 +10:00
|
|
|
|
common.Log.Trace("Restore font state: %s\n->%s\n%s",
|
2019-01-02 10:39:30 +11:00
|
|
|
|
state.tfont, fontStack.peek(), fontStack.String())
|
|
|
|
|
state.tfont = fontStack.pop()
|
2018-07-13 17:40:27 +10:00
|
|
|
|
}
|
2018-06-27 16:31:28 +10:00
|
|
|
|
case "BT": // Begin text
|
2018-07-15 16:28:56 +10:00
|
|
|
|
// Begin a text object, initializing the text matrix, Tm, and the text line matrix,
|
|
|
|
|
// Tlm, to the identity matrix. Text objects shall not be nested; a second BT shall
|
|
|
|
|
// not appear before an ET.
|
2018-06-27 16:31:28 +10:00
|
|
|
|
if to != nil {
|
|
|
|
|
common.Log.Debug("BT called while in a text object")
|
2018-03-22 13:01:04 +00:00
|
|
|
|
}
|
2018-12-27 20:51:34 +11:00
|
|
|
|
to = newTextObject(e, resources, gs, &state, &fontStack)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
case "ET": // End Text
|
2019-01-05 14:10:54 +11:00
|
|
|
|
pageText.marks = append(pageText.marks, to.marks...)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
to = nil
|
|
|
|
|
case "T*": // Move to start of next text line
|
|
|
|
|
to.nextLine()
|
|
|
|
|
case "Td": // Move text location
|
2018-07-15 16:28:56 +10:00
|
|
|
|
if ok, err := to.checkOp(op, 2, true); !ok {
|
2018-07-07 09:45:55 +10:00
|
|
|
|
common.Log.Debug("ERROR: err=%v", err)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
return err
|
2018-03-22 13:01:04 +00:00
|
|
|
|
}
|
2018-08-22 12:29:34 +10:00
|
|
|
|
x, y, err := toFloatXY(op.Params)
|
|
|
|
|
if err != nil {
|
|
|
|
|
return err
|
|
|
|
|
}
|
|
|
|
|
to.moveText(x, y)
|
2018-11-26 08:09:52 +11:00
|
|
|
|
case "TD": // Move text location and set leading.
|
2018-07-15 16:28:56 +10:00
|
|
|
|
if ok, err := to.checkOp(op, 2, true); !ok {
|
2018-07-07 09:45:55 +10:00
|
|
|
|
common.Log.Debug("ERROR: err=%v", err)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
return err
|
2018-03-22 13:01:04 +00:00
|
|
|
|
}
|
2018-06-27 16:31:28 +10:00
|
|
|
|
x, y, err := toFloatXY(op.Params)
|
|
|
|
|
if err != nil {
|
2018-07-07 09:45:55 +10:00
|
|
|
|
common.Log.Debug("ERROR: err=%v", err)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
return err
|
2018-03-22 13:01:04 +00:00
|
|
|
|
}
|
2018-06-27 16:31:28 +10:00
|
|
|
|
to.moveTextSetLeading(x, y)
|
2018-12-27 21:33:31 +11:00
|
|
|
|
case "Tj": // Show text.
|
2018-07-15 16:28:56 +10:00
|
|
|
|
if ok, err := to.checkOp(op, 1, true); !ok {
|
2018-07-21 21:20:39 +10:00
|
|
|
|
common.Log.Debug("ERROR: Tj op=%s err=%v", op, err)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
return err
|
2018-03-22 13:01:04 +00:00
|
|
|
|
}
|
2018-07-21 21:20:39 +10:00
|
|
|
|
charcodes, ok := core.GetStringBytes(op.Params[0])
|
|
|
|
|
if !ok {
|
|
|
|
|
common.Log.Debug("ERROR: Tj op=%s GetStringBytes failed", op)
|
|
|
|
|
return core.ErrTypeError
|
2018-03-22 13:01:04 +00:00
|
|
|
|
}
|
2018-06-27 16:31:28 +10:00
|
|
|
|
return to.showText(charcodes)
|
2018-12-27 21:33:31 +11:00
|
|
|
|
case "TJ": // Show text with adjustable spacing.
|
2018-07-15 16:28:56 +10:00
|
|
|
|
if ok, err := to.checkOp(op, 1, true); !ok {
|
2018-07-21 21:20:39 +10:00
|
|
|
|
common.Log.Debug("ERROR: TJ err=%v", err)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
return err
|
2018-03-22 13:01:04 +00:00
|
|
|
|
}
|
2018-07-25 13:19:09 +10:00
|
|
|
|
args, ok := core.GetArray(op.Params[0])
|
2018-07-21 21:20:39 +10:00
|
|
|
|
if !ok {
|
2019-01-01 12:22:39 +11:00
|
|
|
|
common.Log.Debug("ERROR: TJ op=%s GetArrayVal failed", op)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
return err
|
2018-03-22 13:01:04 +00:00
|
|
|
|
}
|
2018-06-27 16:31:28 +10:00
|
|
|
|
return to.showTextAdjusted(args)
|
2018-12-27 21:33:31 +11:00
|
|
|
|
case "'": // Move to next line and show text.
|
2018-07-15 16:28:56 +10:00
|
|
|
|
if ok, err := to.checkOp(op, 1, true); !ok {
|
2018-07-21 21:20:39 +10:00
|
|
|
|
common.Log.Debug("ERROR: ' err=%v", err)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
return err
|
|
|
|
|
}
|
2018-07-21 21:20:39 +10:00
|
|
|
|
charcodes, ok := core.GetStringBytes(op.Params[0])
|
2018-03-22 13:01:04 +00:00
|
|
|
|
if !ok {
|
2018-07-21 21:20:39 +10:00
|
|
|
|
common.Log.Debug("ERROR: ' op=%s GetStringBytes failed", op)
|
|
|
|
|
return core.ErrTypeError
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
|
|
|
|
to.nextLine()
|
|
|
|
|
return to.showText(charcodes)
|
2018-12-27 21:33:31 +11:00
|
|
|
|
case `"`: // Set word and character spacing, move to next line, and show text.
|
2018-07-15 16:28:56 +10:00
|
|
|
|
if ok, err := to.checkOp(op, 1, true); !ok {
|
2018-07-21 21:20:39 +10:00
|
|
|
|
common.Log.Debug("ERROR: \" err=%v", err)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
return err
|
|
|
|
|
}
|
2018-08-22 12:29:34 +10:00
|
|
|
|
x, y, err := toFloatXY(op.Params[:2])
|
|
|
|
|
if err != nil {
|
|
|
|
|
return err
|
|
|
|
|
}
|
|
|
|
|
charcodes, ok := core.GetStringBytes(op.Params[2])
|
2018-03-22 13:01:04 +00:00
|
|
|
|
if !ok {
|
2018-07-21 21:20:39 +10:00
|
|
|
|
common.Log.Debug("ERROR: \" op=%s GetStringBytes failed", op)
|
|
|
|
|
return core.ErrTypeError
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
2018-08-22 12:29:34 +10:00
|
|
|
|
to.setCharSpacing(x)
|
|
|
|
|
to.setWordSpacing(y)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
to.nextLine()
|
|
|
|
|
return to.showText(charcodes)
|
2018-12-27 21:33:31 +11:00
|
|
|
|
case "TL": // Set text leading.
|
2018-07-15 16:45:47 +10:00
|
|
|
|
y, err := floatParam(op)
|
2018-07-15 16:28:56 +10:00
|
|
|
|
if err != nil {
|
2018-07-21 21:20:39 +10:00
|
|
|
|
common.Log.Debug("ERROR: TL err=%v", err)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
return err
|
|
|
|
|
}
|
|
|
|
|
to.setTextLeading(y)
|
2018-12-27 21:33:31 +11:00
|
|
|
|
case "Tc": // Set character spacing.
|
2018-07-15 16:45:47 +10:00
|
|
|
|
y, err := floatParam(op)
|
2018-07-15 16:28:56 +10:00
|
|
|
|
if err != nil {
|
2018-07-21 21:20:39 +10:00
|
|
|
|
common.Log.Debug("ERROR: Tc err=%v", err)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
return err
|
|
|
|
|
}
|
|
|
|
|
to.setCharSpacing(y)
|
2018-12-27 21:33:31 +11:00
|
|
|
|
case "Tf": // Set font.
|
2018-11-21 13:14:11 +11:00
|
|
|
|
if to == nil {
|
2018-11-30 23:01:04 +00:00
|
|
|
|
// This is needed for 26-Hazard-Thermal-environment.pdf
|
2018-12-27 20:51:34 +11:00
|
|
|
|
to = newTextObject(e, resources, gs, &state, &fontStack)
|
2018-11-21 13:14:11 +11:00
|
|
|
|
}
|
2018-07-15 16:28:56 +10:00
|
|
|
|
if ok, err := to.checkOp(op, 2, true); !ok {
|
2018-07-21 21:20:39 +10:00
|
|
|
|
common.Log.Debug("ERROR: Tf err=%v", err)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
return err
|
|
|
|
|
}
|
2018-07-21 21:20:39 +10:00
|
|
|
|
name, ok := core.GetNameVal(op.Params[0])
|
2018-03-22 13:01:04 +00:00
|
|
|
|
if !ok {
|
2018-07-21 21:20:39 +10:00
|
|
|
|
common.Log.Debug("ERROR: Tf op=%s GetNameVal failed", op)
|
|
|
|
|
return core.ErrTypeError
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
2018-07-15 16:28:56 +10:00
|
|
|
|
size, err := core.GetNumberAsFloat(op.Params[1])
|
2018-07-21 21:20:39 +10:00
|
|
|
|
if !ok {
|
|
|
|
|
common.Log.Debug("ERROR: Tf op=%s GetFloatVal failed. err=%v", op, err)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
return err
|
|
|
|
|
}
|
|
|
|
|
err = to.setFont(name, size)
|
|
|
|
|
if err != nil {
|
|
|
|
|
return err
|
|
|
|
|
}
|
2018-12-27 21:33:31 +11:00
|
|
|
|
case "Tm": // Set text matrix.
|
2018-07-15 16:28:56 +10:00
|
|
|
|
if ok, err := to.checkOp(op, 6, true); !ok {
|
2018-07-21 21:20:39 +10:00
|
|
|
|
common.Log.Debug("ERROR: Tm err=%v", err)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
return err
|
|
|
|
|
}
|
2018-07-25 12:00:49 +10:00
|
|
|
|
floats, err := core.GetNumbersAsFloat(op.Params)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
if err != nil {
|
2018-07-07 09:45:55 +10:00
|
|
|
|
common.Log.Debug("ERROR: err=%v", err)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
return err
|
|
|
|
|
}
|
|
|
|
|
to.setTextMatrix(floats)
|
2018-12-27 21:33:31 +11:00
|
|
|
|
case "Tr": // Set text rendering mode.
|
2018-07-15 16:28:56 +10:00
|
|
|
|
if ok, err := to.checkOp(op, 1, true); !ok {
|
2018-07-21 21:20:39 +10:00
|
|
|
|
common.Log.Debug("ERROR: Tr err=%v", err)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
return err
|
2018-03-22 13:01:04 +00:00
|
|
|
|
}
|
2018-07-21 21:20:39 +10:00
|
|
|
|
mode, ok := core.GetIntVal(op.Params[0])
|
2018-03-22 13:01:04 +00:00
|
|
|
|
if !ok {
|
2018-07-21 21:20:39 +10:00
|
|
|
|
common.Log.Debug("ERROR: Tr op=%s GetIntVal failed", op)
|
|
|
|
|
return core.ErrTypeError
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
|
|
|
|
to.setTextRenderMode(mode)
|
2018-12-27 21:33:31 +11:00
|
|
|
|
case "Ts": // Set text rise.
|
2018-07-15 16:28:56 +10:00
|
|
|
|
if ok, err := to.checkOp(op, 1, true); !ok {
|
2018-07-21 21:20:39 +10:00
|
|
|
|
common.Log.Debug("ERROR: Ts err=%v", err)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
return err
|
|
|
|
|
}
|
2018-07-15 16:28:56 +10:00
|
|
|
|
y, err := core.GetNumberAsFloat(op.Params[0])
|
2018-06-27 16:31:28 +10:00
|
|
|
|
if err != nil {
|
2018-07-07 09:45:55 +10:00
|
|
|
|
common.Log.Debug("ERROR: err=%v", err)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
return err
|
|
|
|
|
}
|
|
|
|
|
to.setTextRise(y)
|
2018-12-27 21:33:31 +11:00
|
|
|
|
case "Tw": // Set word spacing.
|
2018-07-15 16:28:56 +10:00
|
|
|
|
if ok, err := to.checkOp(op, 1, true); !ok {
|
2018-07-07 09:45:55 +10:00
|
|
|
|
common.Log.Debug("ERROR: err=%v", err)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
return err
|
2018-03-22 13:01:04 +00:00
|
|
|
|
}
|
2018-07-15 16:28:56 +10:00
|
|
|
|
y, err := core.GetNumberAsFloat(op.Params[0])
|
2018-03-22 13:01:04 +00:00
|
|
|
|
if err != nil {
|
2018-07-07 09:45:55 +10:00
|
|
|
|
common.Log.Debug("ERROR: err=%v", err)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
return err
|
2018-03-22 13:01:04 +00:00
|
|
|
|
}
|
2018-06-27 16:31:28 +10:00
|
|
|
|
to.setWordSpacing(y)
|
2018-12-27 21:33:31 +11:00
|
|
|
|
case "Tz": // Set horizontal scaling.
|
2018-07-15 16:28:56 +10:00
|
|
|
|
if ok, err := to.checkOp(op, 1, true); !ok {
|
2018-07-07 09:45:55 +10:00
|
|
|
|
common.Log.Debug("ERROR: err=%v", err)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
return err
|
|
|
|
|
}
|
2018-07-15 16:28:56 +10:00
|
|
|
|
y, err := core.GetNumberAsFloat(op.Params[0])
|
2018-06-27 16:31:28 +10:00
|
|
|
|
if err != nil {
|
2018-07-07 09:45:55 +10:00
|
|
|
|
common.Log.Debug("ERROR: err=%v", err)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
return err
|
|
|
|
|
}
|
|
|
|
|
to.setHorizScaling(y)
|
2018-03-22 13:01:04 +00:00
|
|
|
|
|
2018-12-27 20:51:34 +11:00
|
|
|
|
case "Do":
|
2018-12-27 21:33:31 +11:00
|
|
|
|
// Handle XObjects by recursing through form XObjects.
|
2018-12-27 20:51:34 +11:00
|
|
|
|
name := *op.Params[0].(*core.PdfObjectName)
|
|
|
|
|
_, xtype := resources.GetXObjectByName(name)
|
|
|
|
|
if xtype != model.XObjectTypeForm {
|
|
|
|
|
break
|
|
|
|
|
}
|
2018-12-27 20:53:37 +11:00
|
|
|
|
// Only process each form once.
|
2018-12-27 20:51:34 +11:00
|
|
|
|
formResult, ok := e.formResults[string(name)]
|
|
|
|
|
if !ok {
|
|
|
|
|
xform, err := resources.GetXObjectFormByName(name)
|
|
|
|
|
if err != nil {
|
|
|
|
|
common.Log.Debug("ERROR: %v", err)
|
|
|
|
|
return err
|
|
|
|
|
}
|
|
|
|
|
formContent, err := xform.GetContentStream()
|
|
|
|
|
if err != nil {
|
|
|
|
|
common.Log.Debug("ERROR: %v", err)
|
|
|
|
|
return err
|
|
|
|
|
}
|
|
|
|
|
formResources := xform.Resources
|
|
|
|
|
if formResources == nil {
|
|
|
|
|
formResources = resources
|
|
|
|
|
}
|
2019-01-04 16:07:03 +11:00
|
|
|
|
tList, numChars, numMisses, err := e.extractPageText(string(formContent),
|
2018-12-27 20:51:34 +11:00
|
|
|
|
formResources, level+1)
|
|
|
|
|
if err != nil {
|
|
|
|
|
common.Log.Debug("ERROR: %v", err)
|
|
|
|
|
return err
|
|
|
|
|
}
|
|
|
|
|
formResult = textResult{*tList, numChars, numMisses}
|
|
|
|
|
e.formResults[string(name)] = formResult
|
|
|
|
|
}
|
|
|
|
|
|
2019-01-05 14:10:54 +11:00
|
|
|
|
pageText.marks = append(pageText.marks, formResult.pageText.marks...)
|
2018-12-27 20:51:34 +11:00
|
|
|
|
state.numChars += formResult.numChars
|
|
|
|
|
state.numMisses += formResult.numMisses
|
|
|
|
|
}
|
2018-03-22 13:01:04 +00:00
|
|
|
|
return nil
|
|
|
|
|
})
|
|
|
|
|
|
2018-12-27 20:51:34 +11:00
|
|
|
|
err = processor.Process(resources)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
if err != nil {
|
2018-11-18 17:21:30 +11:00
|
|
|
|
common.Log.Debug("ERROR: Processing: err=%v", err)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
2019-01-04 16:07:03 +11:00
|
|
|
|
return pageText, state.numChars, state.numMisses, err
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
|
|
|
|
|
2018-12-27 20:51:34 +11:00
|
|
|
|
type textResult struct {
|
2019-01-04 16:07:03 +11:00
|
|
|
|
pageText PageText
|
2018-12-27 20:51:34 +11:00
|
|
|
|
numChars int
|
|
|
|
|
numMisses int
|
|
|
|
|
}
|
|
|
|
|
|
2018-06-27 16:31:28 +10:00
|
|
|
|
//
|
|
|
|
|
// Text operators
|
|
|
|
|
//
|
|
|
|
|
|
2018-08-22 12:29:34 +10:00
|
|
|
|
// moveText "Td" Moves start of text by `tx`,`ty`.
|
2018-06-27 16:31:28 +10:00
|
|
|
|
// Move to the start of the next line, offset from the start of the current line by (tx, ty).
|
|
|
|
|
// tx and ty are in unscaled text space units.
|
2018-07-25 12:00:49 +10:00
|
|
|
|
func (to *textObject) moveText(tx, ty float64) {
|
2018-08-22 12:29:34 +10:00
|
|
|
|
to.moveTo(tx, ty)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
|
|
|
|
|
2018-08-22 12:29:34 +10:00
|
|
|
|
// moveTextSetLeading "TD" Move text location and set leading.
|
2018-06-27 16:31:28 +10:00
|
|
|
|
// Move to the start of the next line, offset from the start of the current line by (tx, ty). As a
|
|
|
|
|
// side effect, this operator shall set the leading parameter in the text state. This operator shall
|
|
|
|
|
// have the same effect as this code:
|
|
|
|
|
// −ty TL
|
|
|
|
|
// tx ty Td
|
2018-07-25 12:00:49 +10:00
|
|
|
|
func (to *textObject) moveTextSetLeading(tx, ty float64) {
|
2019-01-02 10:39:30 +11:00
|
|
|
|
to.state.tl = -ty
|
2018-08-22 12:29:34 +10:00
|
|
|
|
to.moveTo(tx, ty)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
|
|
|
|
|
2019-01-03 15:41:36 +11:00
|
|
|
|
// nextLine "T*"" Moves start of text line to next text line
|
2018-06-27 16:31:28 +10:00
|
|
|
|
// Move to the start of the next line. This operator has the same effect as the code
|
|
|
|
|
// 0 -Tl Td
|
|
|
|
|
// where Tl denotes the current leading parameter in the text state. The negative of Tl is used
|
|
|
|
|
// here because Tl is the text leading expressed as a positive number. Going to the next line
|
|
|
|
|
// entails decreasing the y coordinate. (page 250)
|
2018-07-25 12:00:49 +10:00
|
|
|
|
func (to *textObject) nextLine() {
|
2019-01-02 10:39:30 +11:00
|
|
|
|
to.moveTo(0, -to.state.tl)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
|
|
|
|
|
2018-11-10 21:19:02 +11:00
|
|
|
|
// setTextMatrix "Tm".
|
2018-06-27 16:31:28 +10:00
|
|
|
|
// Set the text matrix, Tm, and the text line matrix, Tlm to the Matrix specified by the 6 numbers
|
2018-11-27 13:37:12 +11:00
|
|
|
|
// in `f` (page 250).
|
2018-07-25 12:00:49 +10:00
|
|
|
|
func (to *textObject) setTextMatrix(f []float64) {
|
2018-11-28 23:25:17 +00:00
|
|
|
|
if len(f) != 6 {
|
|
|
|
|
common.Log.Debug("ERROR: len(f) != 6 (%d)", len(f))
|
|
|
|
|
return
|
|
|
|
|
}
|
2018-08-22 12:29:34 +10:00
|
|
|
|
a, b, c, d, tx, ty := f[0], f[1], f[2], f[3], f[4], f[5]
|
2019-01-02 10:39:30 +11:00
|
|
|
|
to.tm = transform.NewMatrix(a, b, c, d, tx, ty)
|
|
|
|
|
to.tlm = to.tm
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
|
|
|
|
|
2018-11-27 13:37:12 +11:00
|
|
|
|
// showText "Tj". Show a text string.
|
2018-07-25 12:00:49 +10:00
|
|
|
|
func (to *textObject) showText(charcodes []byte) error {
|
2018-07-02 16:46:43 +10:00
|
|
|
|
return to.renderText(charcodes)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
|
|
|
|
|
2018-11-27 13:37:12 +11:00
|
|
|
|
// showTextAdjusted "TJ". Show text with adjustable spacing.
|
2018-07-25 13:19:09 +10:00
|
|
|
|
func (to *textObject) showTextAdjusted(args *core.PdfObjectArray) error {
|
2018-08-22 12:29:34 +10:00
|
|
|
|
vertical := false
|
2018-07-25 13:19:09 +10:00
|
|
|
|
for _, o := range args.Elements() {
|
2018-06-27 16:31:28 +10:00
|
|
|
|
switch o.(type) {
|
2018-07-15 16:28:56 +10:00
|
|
|
|
case *core.PdfObjectFloat, *core.PdfObjectInteger:
|
2018-08-22 12:29:34 +10:00
|
|
|
|
x, err := core.GetNumberAsFloat(o)
|
|
|
|
|
if err != nil {
|
2018-11-27 13:37:12 +11:00
|
|
|
|
common.Log.Debug("ERROR: showTextAdjusted. Bad numerical arg. o=%s args=%+v", o, args)
|
2018-08-22 12:29:34 +10:00
|
|
|
|
return err
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
2019-01-02 10:39:30 +11:00
|
|
|
|
dx, dy := -x*0.001*to.state.tfs, 0.0
|
2018-08-22 12:29:34 +10:00
|
|
|
|
if vertical {
|
|
|
|
|
dy, dx = dx, dy
|
|
|
|
|
}
|
2018-11-30 16:53:48 +00:00
|
|
|
|
td := translationMatrix(transform.Point{X: dx, Y: dy})
|
2019-01-22 18:18:27 +11:00
|
|
|
|
to.tm.Concat(td)
|
2019-01-02 10:39:30 +11:00
|
|
|
|
common.Log.Trace("showTextAdjusted: dx,dy=%3f,%.3f Tm=%s", dx, dy, to.tm)
|
2018-07-15 16:28:56 +10:00
|
|
|
|
case *core.PdfObjectString:
|
2018-07-21 21:20:39 +10:00
|
|
|
|
charcodes, ok := core.GetStringBytes(o)
|
|
|
|
|
if !ok {
|
2018-11-27 13:37:12 +11:00
|
|
|
|
common.Log.Trace("showTextAdjusted: Bad string arg. o=%s args=%+v", o, args)
|
2018-07-21 21:20:39 +10:00
|
|
|
|
return core.ErrTypeError
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
2018-08-22 12:29:34 +10:00
|
|
|
|
to.renderText(charcodes)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
default:
|
2018-11-27 13:37:12 +11:00
|
|
|
|
common.Log.Debug("ERROR: showTextAdjusted. Unexpected type (%T) args=%+v", o, args)
|
2018-07-15 16:28:56 +10:00
|
|
|
|
return core.ErrTypeError
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return nil
|
|
|
|
|
}
|
|
|
|
|
|
2018-11-27 13:37:12 +11:00
|
|
|
|
// setTextLeading "TL". Set text leading.
|
2018-07-25 12:00:49 +10:00
|
|
|
|
func (to *textObject) setTextLeading(y float64) {
|
2019-01-02 10:39:30 +11:00
|
|
|
|
if to == nil || to.state == nil {
|
2018-08-22 12:29:34 +10:00
|
|
|
|
return
|
|
|
|
|
}
|
2019-01-02 10:39:30 +11:00
|
|
|
|
to.state.tl = y
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
|
|
|
|
|
2018-11-27 13:37:12 +11:00
|
|
|
|
// setCharSpacing "Tc". Set character spacing.
|
2018-07-25 12:00:49 +10:00
|
|
|
|
func (to *textObject) setCharSpacing(x float64) {
|
2018-08-22 12:29:34 +10:00
|
|
|
|
if to == nil {
|
|
|
|
|
return
|
|
|
|
|
}
|
2019-01-02 10:39:30 +11:00
|
|
|
|
to.state.tc = x
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
|
|
|
|
|
2018-11-27 13:37:12 +11:00
|
|
|
|
// setFont "Tf". Set font.
|
2018-07-25 12:00:49 +10:00
|
|
|
|
func (to *textObject) setFont(name string, size float64) error {
|
2018-08-22 12:29:34 +10:00
|
|
|
|
if to == nil {
|
|
|
|
|
return nil
|
|
|
|
|
}
|
2018-06-27 16:31:28 +10:00
|
|
|
|
font, err := to.getFont(name)
|
2018-07-03 14:26:42 +10:00
|
|
|
|
if err == nil {
|
2019-01-02 10:39:30 +11:00
|
|
|
|
to.state.tfont = font
|
2018-07-13 17:40:27 +10:00
|
|
|
|
if len(*to.fontStack) == 0 {
|
|
|
|
|
to.fontStack.push(font)
|
|
|
|
|
} else {
|
|
|
|
|
(*to.fontStack)[len(*to.fontStack)-1] = font
|
|
|
|
|
}
|
2018-07-15 16:28:56 +10:00
|
|
|
|
} else if err == model.ErrFontNotSupported {
|
2018-12-27 12:40:55 +02:00
|
|
|
|
// TODO(peterwilliams97): Do we need to handle this case in a special way?
|
2018-07-04 18:00:37 +10:00
|
|
|
|
return err
|
2018-07-03 14:26:42 +10:00
|
|
|
|
} else {
|
2018-06-27 16:31:28 +10:00
|
|
|
|
return err
|
|
|
|
|
}
|
2019-01-02 10:39:30 +11:00
|
|
|
|
to.state.tfs = size
|
2018-06-27 16:31:28 +10:00
|
|
|
|
return nil
|
|
|
|
|
}
|
|
|
|
|
|
2018-11-27 13:37:12 +11:00
|
|
|
|
// setTextRenderMode "Tr". Set text rendering mode.
|
2018-07-25 12:00:49 +10:00
|
|
|
|
func (to *textObject) setTextRenderMode(mode int) {
|
2018-08-22 12:29:34 +10:00
|
|
|
|
if to == nil {
|
|
|
|
|
return
|
|
|
|
|
}
|
2019-01-02 10:39:30 +11:00
|
|
|
|
to.state.tmode = RenderMode(mode)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
|
|
|
|
|
2018-11-27 13:37:12 +11:00
|
|
|
|
// setTextRise "Ts". Set text rise.
|
2018-07-25 12:00:49 +10:00
|
|
|
|
func (to *textObject) setTextRise(y float64) {
|
2018-08-22 12:29:34 +10:00
|
|
|
|
if to == nil {
|
|
|
|
|
return
|
|
|
|
|
}
|
2019-01-02 10:39:30 +11:00
|
|
|
|
to.state.trise = y
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
|
|
|
|
|
2018-11-27 13:37:12 +11:00
|
|
|
|
// setWordSpacing "Tw". Set word spacing.
|
2018-07-25 12:00:49 +10:00
|
|
|
|
func (to *textObject) setWordSpacing(y float64) {
|
2018-11-22 22:01:04 +11:00
|
|
|
|
if to == nil {
|
|
|
|
|
return
|
|
|
|
|
}
|
2019-01-02 10:39:30 +11:00
|
|
|
|
to.state.tw = y
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
|
|
|
|
|
2018-11-27 13:37:12 +11:00
|
|
|
|
// setHorizScaling "Tz". Set horizontal scaling.
|
2018-07-25 12:00:49 +10:00
|
|
|
|
func (to *textObject) setHorizScaling(y float64) {
|
2018-08-22 12:29:34 +10:00
|
|
|
|
if to == nil {
|
|
|
|
|
return
|
|
|
|
|
}
|
2019-01-02 10:39:30 +11:00
|
|
|
|
to.state.th = y
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
|
|
|
|
|
2018-11-28 23:25:17 +00:00
|
|
|
|
// floatParam returns the single float parameter of operator `op`, or an error if it doesn't have
|
2018-07-15 16:28:56 +10:00
|
|
|
|
// a single float parameter or we aren't in a text stream.
|
2018-07-15 16:45:47 +10:00
|
|
|
|
func floatParam(op *contentstream.ContentStreamOperation) (float64, error) {
|
|
|
|
|
if len(op.Params) != 1 {
|
2018-11-21 13:14:11 +11:00
|
|
|
|
err := errors.New("incorrect parameter count")
|
2018-07-15 16:45:47 +10:00
|
|
|
|
common.Log.Debug("ERROR: %#q should have %d input params, got %d %+v",
|
|
|
|
|
op.Operand, 1, len(op.Params), op.Params)
|
2018-07-15 16:28:56 +10:00
|
|
|
|
return 0.0, err
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
2018-07-15 16:28:56 +10:00
|
|
|
|
return core.GetNumberAsFloat(op.Params[0])
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
|
|
|
|
|
2018-07-15 16:28:56 +10:00
|
|
|
|
// checkOp returns true if we are in a text stream and `op` has `numParams` params.
|
|
|
|
|
// If `hard` is true and the number of params don't match, an error is returned.
|
2019-07-18 16:41:47 +10:00
|
|
|
|
func (to *textObject) checkOp(op *contentstream.ContentStreamOperation, numParams int, hard bool) (
|
|
|
|
|
ok bool, err error) {
|
2018-06-27 16:31:28 +10:00
|
|
|
|
if to == nil {
|
2018-11-28 23:25:17 +00:00
|
|
|
|
var params []core.PdfObject
|
2018-11-21 13:14:11 +11:00
|
|
|
|
if numParams > 0 {
|
|
|
|
|
params = op.Params
|
|
|
|
|
if len(params) > numParams {
|
|
|
|
|
params = params[:numParams]
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
common.Log.Debug("%#q operand outside text. params=%+v", op.Operand, params)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
|
|
|
|
if numParams >= 0 {
|
|
|
|
|
if len(op.Params) != numParams {
|
|
|
|
|
if hard {
|
2018-11-21 13:14:11 +11:00
|
|
|
|
err = errors.New("incorrect parameter count")
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
2018-07-15 16:28:56 +10:00
|
|
|
|
common.Log.Debug("ERROR: %#q should have %d input params, got %d %+v",
|
2018-06-27 16:31:28 +10:00
|
|
|
|
op.Operand, numParams, len(op.Params), op.Params)
|
2018-07-25 12:00:49 +10:00
|
|
|
|
return false, err
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
|
|
|
|
}
|
2018-07-25 12:00:49 +10:00
|
|
|
|
return true, nil
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
|
|
|
|
|
2018-07-15 16:28:56 +10:00
|
|
|
|
// fontStacker is the PDF font stack implementation.
|
2018-07-13 17:40:27 +10:00
|
|
|
|
type fontStacker []*model.PdfFont
|
|
|
|
|
|
2018-07-15 16:28:56 +10:00
|
|
|
|
// String returns a string describing the current state of the font stack.
|
2018-07-13 17:40:27 +10:00
|
|
|
|
func (fontStack *fontStacker) String() string {
|
|
|
|
|
parts := []string{"---- font stack"}
|
|
|
|
|
for i, font := range *fontStack {
|
|
|
|
|
s := "<nil>"
|
|
|
|
|
if font != nil {
|
|
|
|
|
s = font.String()
|
|
|
|
|
}
|
|
|
|
|
parts = append(parts, fmt.Sprintf("\t%2d: %s", i, s))
|
|
|
|
|
}
|
|
|
|
|
return strings.Join(parts, "\n")
|
|
|
|
|
}
|
2018-07-15 16:28:56 +10:00
|
|
|
|
|
|
|
|
|
// push pushes `font` onto the font stack.
|
2018-07-13 17:40:27 +10:00
|
|
|
|
func (fontStack *fontStacker) push(font *model.PdfFont) {
|
|
|
|
|
*fontStack = append(*fontStack, font)
|
|
|
|
|
}
|
2018-07-15 16:28:56 +10:00
|
|
|
|
|
2018-08-22 12:29:34 +10:00
|
|
|
|
// pop pops and returns the element on the top of the font stack if there is one or nil if there isn't.
|
2018-07-15 16:28:56 +10:00
|
|
|
|
func (fontStack *fontStacker) pop() *model.PdfFont {
|
2018-07-13 17:40:27 +10:00
|
|
|
|
if fontStack.empty() {
|
2018-07-15 16:28:56 +10:00
|
|
|
|
return nil
|
2018-07-13 17:40:27 +10:00
|
|
|
|
}
|
2018-07-15 16:28:56 +10:00
|
|
|
|
font := (*fontStack)[len(*fontStack)-1]
|
2018-07-13 17:40:27 +10:00
|
|
|
|
*fontStack = (*fontStack)[:len(*fontStack)-1]
|
2018-07-15 16:28:56 +10:00
|
|
|
|
return font
|
2018-07-13 17:40:27 +10:00
|
|
|
|
}
|
2018-07-15 16:28:56 +10:00
|
|
|
|
|
2018-08-22 12:29:34 +10:00
|
|
|
|
// peek returns the element on the top of the font stack if there is one or nil if there isn't.
|
2018-07-25 12:00:49 +10:00
|
|
|
|
func (fontStack *fontStacker) peek() *model.PdfFont {
|
2018-07-13 17:40:27 +10:00
|
|
|
|
if fontStack.empty() {
|
2018-07-25 12:00:49 +10:00
|
|
|
|
return nil
|
2018-07-13 17:40:27 +10:00
|
|
|
|
}
|
2018-07-25 12:00:49 +10:00
|
|
|
|
return (*fontStack)[len(*fontStack)-1]
|
2018-07-13 17:40:27 +10:00
|
|
|
|
}
|
2018-07-15 16:28:56 +10:00
|
|
|
|
|
2018-08-22 12:29:34 +10:00
|
|
|
|
// get returns the `idx`'th element of the font stack if there is one or nil if there isn't.
|
2018-07-15 16:28:56 +10:00
|
|
|
|
// idx = 0: bottom of font stack
|
|
|
|
|
// idx = len(fontstack) - 1: top of font stack
|
|
|
|
|
// idx = -n is same as dx = len(fontstack) - n, so fontstack.get(-1) is same as fontstack.peek()
|
2018-07-25 12:00:49 +10:00
|
|
|
|
func (fontStack *fontStacker) get(idx int) *model.PdfFont {
|
2018-07-13 17:40:27 +10:00
|
|
|
|
if idx < 0 {
|
|
|
|
|
idx += fontStack.size()
|
|
|
|
|
}
|
|
|
|
|
if idx < 0 || idx > fontStack.size()-1 {
|
2018-07-25 12:00:49 +10:00
|
|
|
|
return nil
|
2018-07-13 17:40:27 +10:00
|
|
|
|
}
|
2018-07-25 12:00:49 +10:00
|
|
|
|
return (*fontStack)[idx]
|
2018-07-13 17:40:27 +10:00
|
|
|
|
}
|
2018-07-15 16:28:56 +10:00
|
|
|
|
|
|
|
|
|
// empty returns true if the font stack is empty.
|
2018-07-13 17:40:27 +10:00
|
|
|
|
func (fontStack *fontStacker) empty() bool {
|
|
|
|
|
return len(*fontStack) == 0
|
|
|
|
|
}
|
2018-07-15 16:28:56 +10:00
|
|
|
|
|
|
|
|
|
// size returns the number of elements in the font stack.
|
2018-07-13 17:40:27 +10:00
|
|
|
|
func (fontStack *fontStacker) size() int {
|
|
|
|
|
return len(*fontStack)
|
|
|
|
|
}
|
|
|
|
|
|
2018-06-27 16:31:28 +10:00
|
|
|
|
// 9.3 Text State Parameters and Operators (page 243)
|
|
|
|
|
// Some of these parameters are expressed in unscaled text space units. This means that they shall
|
|
|
|
|
// be specified in a coordinate system that shall be defined by the text matrix, Tm but shall not be
|
|
|
|
|
// scaled by the font size parameter, Tfs.
|
2018-07-25 12:00:49 +10:00
|
|
|
|
|
|
|
|
|
// textState represents the text state.
|
|
|
|
|
type textState struct {
|
2019-01-02 10:39:30 +11:00
|
|
|
|
tc float64 // Character spacing. Unscaled text space units.
|
|
|
|
|
tw float64 // Word spacing. Unscaled text space units.
|
|
|
|
|
th float64 // Horizontal scaling.
|
|
|
|
|
tl float64 // Leading. Unscaled text space units. Used by TD,T*,'," see Table 108.
|
|
|
|
|
tfs float64 // Text font size.
|
|
|
|
|
tmode RenderMode // Text rendering mode.
|
|
|
|
|
trise float64 // Text rise. Unscaled text space units. Set by Ts.
|
|
|
|
|
tfont *model.PdfFont // Text font.
|
2018-07-13 17:40:27 +10:00
|
|
|
|
// For debugging
|
|
|
|
|
numChars int
|
|
|
|
|
numMisses int
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// 9.4.1 General (page 248)
|
|
|
|
|
// A PDF text object consists of operators that may show text strings, move the text position, and
|
|
|
|
|
// set text state and certain other parameters. In addition, two parameters may be specified only
|
|
|
|
|
// within a text object and shall not persist from one text object to the next:
|
2018-06-28 11:11:43 +10:00
|
|
|
|
// • Tm, the text matrix
|
|
|
|
|
// • Tlm, the text line matrix
|
2018-06-27 16:31:28 +10:00
|
|
|
|
//
|
|
|
|
|
// Text space is converted to device space by this transform (page 252)
|
2018-09-20 11:49:44 +10:00
|
|
|
|
// Trm is the text rendering matrix
|
2018-06-27 16:31:28 +10:00
|
|
|
|
// | Tfs x Th 0 0 |
|
|
|
|
|
// Trm = | 0 Tfs 0 | × Tm × CTM
|
|
|
|
|
// | 0 Trise 1 |
|
2018-09-20 11:49:44 +10:00
|
|
|
|
// This corresponds to the following code in renderText()
|
2019-01-22 18:18:27 +11:00
|
|
|
|
// trm := to.gs.CTM.Mult(stateMatrix).Mult(to.tm)
|
2018-07-25 12:00:49 +10:00
|
|
|
|
|
|
|
|
|
// textObject represents a PDF text object.
|
|
|
|
|
type textObject struct {
|
2018-07-13 17:40:27 +10:00
|
|
|
|
e *Extractor
|
2018-12-27 20:51:34 +11:00
|
|
|
|
resources *model.PdfPageResources
|
2018-07-13 17:40:27 +10:00
|
|
|
|
gs contentstream.GraphicsState
|
|
|
|
|
fontStack *fontStacker
|
2019-01-02 10:39:30 +11:00
|
|
|
|
state *textState
|
|
|
|
|
tm transform.Matrix // Text matrix. For the character pointer.
|
|
|
|
|
tlm transform.Matrix // Text line matrix. For the start of line pointer.
|
2019-01-04 16:02:22 +11:00
|
|
|
|
marks []textMark // Text marks get written here.
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
|
|
|
|
|
2018-08-22 12:29:34 +10:00
|
|
|
|
// newTextState returns a default textState.
|
2018-07-25 12:00:49 +10:00
|
|
|
|
func newTextState() textState {
|
2018-08-22 12:29:34 +10:00
|
|
|
|
return textState{
|
2019-01-02 10:39:30 +11:00
|
|
|
|
th: 100,
|
|
|
|
|
tmode: RenderModeFill,
|
2018-08-22 12:29:34 +10:00
|
|
|
|
}
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
|
|
|
|
|
2018-08-22 12:29:34 +10:00
|
|
|
|
// newTextObject returns a default textObject.
|
2018-12-27 20:51:34 +11:00
|
|
|
|
func newTextObject(e *Extractor, resources *model.PdfPageResources, gs contentstream.GraphicsState,
|
2019-07-18 16:41:47 +10:00
|
|
|
|
state *textState, fontStack *fontStacker) *textObject {
|
2018-07-25 12:00:49 +10:00
|
|
|
|
return &textObject{
|
2018-07-13 17:40:27 +10:00
|
|
|
|
e: e,
|
2018-12-27 20:51:34 +11:00
|
|
|
|
resources: resources,
|
2018-07-13 17:40:27 +10:00
|
|
|
|
gs: gs,
|
|
|
|
|
fontStack: fontStack,
|
2019-01-02 10:39:30 +11:00
|
|
|
|
state: state,
|
|
|
|
|
tm: transform.IdentityMatrix(),
|
|
|
|
|
tlm: transform.IdentityMatrix(),
|
2018-03-22 13:01:04 +00:00
|
|
|
|
}
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
|
|
|
|
|
2018-11-28 23:25:17 +00:00
|
|
|
|
// renderText processes and renders byte array `data` for extraction purposes.
|
2018-07-25 12:00:49 +10:00
|
|
|
|
func (to *textObject) renderText(data []byte) error {
|
2018-09-20 11:49:44 +10:00
|
|
|
|
font := to.getCurrentFont()
|
2018-10-30 21:55:30 +11:00
|
|
|
|
charcodes := font.BytesToCharcodes(data)
|
2019-01-01 12:22:39 +11:00
|
|
|
|
runes, numChars, numMisses := font.CharcodesToUnicodeWithStats(charcodes)
|
2018-11-28 18:06:03 +11:00
|
|
|
|
if numMisses > 0 {
|
|
|
|
|
common.Log.Debug("renderText: numChars=%d numMisses=%d", numChars, numMisses)
|
|
|
|
|
}
|
2018-10-30 21:55:30 +11:00
|
|
|
|
|
2019-01-02 10:39:30 +11:00
|
|
|
|
to.state.numChars += numChars
|
|
|
|
|
to.state.numMisses += numMisses
|
2018-07-25 12:00:49 +10:00
|
|
|
|
|
2019-01-02 10:39:30 +11:00
|
|
|
|
state := to.state
|
|
|
|
|
tfs := state.tfs
|
|
|
|
|
th := state.th / 100.0
|
2019-03-09 18:03:43 +00:00
|
|
|
|
spaceMetrics, ok := font.GetRuneMetrics(' ')
|
2018-12-02 13:09:32 +11:00
|
|
|
|
if !ok {
|
|
|
|
|
spaceMetrics, ok = font.GetCharMetrics(32)
|
|
|
|
|
}
|
2018-12-02 09:14:58 +11:00
|
|
|
|
if !ok {
|
2019-03-09 18:03:43 +00:00
|
|
|
|
spaceMetrics, _ = model.DefaultFont().GetRuneMetrics(' ')
|
2018-10-09 11:49:59 +11:00
|
|
|
|
}
|
|
|
|
|
spaceWidth := spaceMetrics.Wx * glyphTextRatio
|
2019-01-01 12:22:39 +11:00
|
|
|
|
common.Log.Trace("spaceWidth=%.2f text=%q font=%s fontSize=%.1f", spaceWidth, runes, font, tfs)
|
2018-09-20 11:49:44 +10:00
|
|
|
|
|
2018-11-30 16:53:48 +00:00
|
|
|
|
stateMatrix := transform.NewMatrix(
|
2018-09-20 11:49:44 +10:00
|
|
|
|
tfs*th, 0,
|
|
|
|
|
0, tfs,
|
2019-01-02 10:39:30 +11:00
|
|
|
|
0, state.trise)
|
2018-09-20 11:49:44 +10:00
|
|
|
|
|
2019-01-01 12:22:39 +11:00
|
|
|
|
common.Log.Trace("renderText: %d codes=%+v runes=%q", len(charcodes), charcodes, runes)
|
2018-11-18 17:21:30 +11:00
|
|
|
|
|
2019-01-01 12:22:39 +11:00
|
|
|
|
for i, r := range runes {
|
2018-11-30 23:01:04 +00:00
|
|
|
|
// TODO(peterwilliams97): Need to find and fix cases where this happens.
|
2018-12-07 18:43:24 +02:00
|
|
|
|
if r == '\x00' {
|
2018-11-28 18:06:03 +11:00
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
|
2018-10-30 21:55:30 +11:00
|
|
|
|
code := charcodes[i]
|
2018-09-20 11:49:44 +10:00
|
|
|
|
// The location of the text on the page in device coordinates is given by trm, the text
|
|
|
|
|
// rendering matrix.
|
2019-01-22 18:18:27 +11:00
|
|
|
|
trm := to.gs.CTM.Mult(to.tm).Mult(stateMatrix)
|
2018-09-20 11:49:44 +10:00
|
|
|
|
|
|
|
|
|
// calculate the text location displacement due to writing `r`. We will use this to update
|
2019-01-02 10:39:30 +11:00
|
|
|
|
// to.tm
|
2018-09-20 11:49:44 +10:00
|
|
|
|
|
|
|
|
|
// w is the unscaled movement at the end of a word.
|
|
|
|
|
w := 0.0
|
2018-12-07 18:43:24 +02:00
|
|
|
|
if r == ' ' {
|
2019-01-02 10:39:30 +11:00
|
|
|
|
w = state.tw
|
2018-09-20 11:49:44 +10:00
|
|
|
|
}
|
2018-10-09 11:49:59 +11:00
|
|
|
|
|
2018-10-30 21:55:30 +11:00
|
|
|
|
m, ok := font.GetCharMetrics(code)
|
|
|
|
|
if !ok {
|
2018-11-08 15:20:12 +11:00
|
|
|
|
common.Log.Debug("ERROR: No metric for code=%d r=0x%04x=%+q %s", code, r, r, font)
|
2018-10-30 21:55:30 +11:00
|
|
|
|
return errors.New("no char metrics")
|
2018-09-20 11:49:44 +10:00
|
|
|
|
}
|
2018-10-30 21:55:30 +11:00
|
|
|
|
|
2018-10-09 11:49:59 +11:00
|
|
|
|
// c is the character size in unscaled text units.
|
2018-11-30 16:53:48 +00:00
|
|
|
|
c := transform.Point{X: m.Wx * glyphTextRatio, Y: m.Wy * glyphTextRatio}
|
2018-10-30 21:55:30 +11:00
|
|
|
|
|
2018-11-27 13:37:12 +11:00
|
|
|
|
// t0 is the end of this character.
|
2018-10-09 11:49:59 +11:00
|
|
|
|
// t is the displacement of the text cursor when the character is rendered.
|
2018-11-30 16:53:48 +00:00
|
|
|
|
t0 := transform.Point{X: (c.X*tfs + w) * th}
|
2019-01-02 10:39:30 +11:00
|
|
|
|
t := transform.Point{X: (c.X*tfs + state.tc + w) * th}
|
2018-09-20 11:49:44 +10:00
|
|
|
|
|
2018-11-19 14:19:50 +11:00
|
|
|
|
// td, td0 are t, t0 in matrix form.
|
2018-11-27 13:37:12 +11:00
|
|
|
|
// td0 is where this character ends. td is where the next character starts.
|
2018-11-18 17:21:30 +11:00
|
|
|
|
td0 := translationMatrix(t0)
|
2018-10-09 11:49:59 +11:00
|
|
|
|
td := translationMatrix(t)
|
|
|
|
|
|
2019-01-02 10:39:30 +11:00
|
|
|
|
common.Log.Trace("\"%c\" stateMatrix=%s CTM=%s Tm=%s", r, stateMatrix, to.gs.CTM, to.tm)
|
|
|
|
|
common.Log.Trace("tfs=%.3f th=%.3f Tc=%.3f w=%.3f (Tw=%.3f)", tfs, th, state.tc, w, state.tw)
|
|
|
|
|
common.Log.Trace("m=%s c=%+v t0=%+v td0=%s trm0=%s", m, c, t0, td0, td0.Mult(to.tm).Mult(to.gs.CTM))
|
2018-11-26 17:17:17 +11:00
|
|
|
|
|
2019-01-04 16:02:22 +11:00
|
|
|
|
mark := to.newTextMark(
|
2018-11-18 17:21:30 +11:00
|
|
|
|
string(r),
|
2018-11-26 17:17:17 +11:00
|
|
|
|
trm,
|
2019-01-22 18:18:27 +11:00
|
|
|
|
translation(to.gs.CTM.Mult(to.tm).Mult(td0)),
|
2019-07-18 16:41:47 +10:00
|
|
|
|
math.Abs(spaceWidth*trm.ScalingFactorX()),
|
|
|
|
|
font,
|
|
|
|
|
to.state.tc)
|
|
|
|
|
if font == nil {
|
|
|
|
|
common.Log.Debug("ERROR: No font.")
|
|
|
|
|
} else if font.Encoder() == nil {
|
|
|
|
|
common.Log.Debug("ERROR: No encoding. font=%s", font)
|
|
|
|
|
} else {
|
|
|
|
|
original, ok := font.Encoder().CharcodeToRune(code)
|
|
|
|
|
if ok {
|
|
|
|
|
mark.original = string(original)
|
|
|
|
|
}
|
|
|
|
|
}
|
2019-01-04 16:02:22 +11:00
|
|
|
|
common.Log.Trace("i=%d code=%d mark=%s trm=%s", i, code, mark, trm)
|
|
|
|
|
to.marks = append(to.marks, mark)
|
2018-10-09 11:49:59 +11:00
|
|
|
|
|
|
|
|
|
// update the text matrix by the displacement of the text location.
|
2019-01-22 18:18:27 +11:00
|
|
|
|
to.tm.Concat(td)
|
2019-01-02 10:39:30 +11:00
|
|
|
|
common.Log.Trace("to.tm=%s", to.tm)
|
2018-09-20 11:49:44 +10:00
|
|
|
|
}
|
|
|
|
|
|
2018-07-25 12:00:49 +10:00
|
|
|
|
return nil
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
|
|
|
|
|
2018-09-20 11:49:44 +10:00
|
|
|
|
// glyphTextRatio converts Glyph metrics units to unscaled text space units.
// Glyph metrics are multiplied by this factor, i.e. 1000 glyph units make one text space unit.
const glyphTextRatio = 1.0 / 1000.0
|
|
|
|
|
|
|
|
|
|
// translation returns the translation part of `m`.
|
2018-11-30 16:53:48 +00:00
|
|
|
|
func translation(m transform.Matrix) transform.Point {
|
2018-09-20 11:49:44 +10:00
|
|
|
|
tx, ty := m.Translation()
|
2019-03-09 20:45:19 +00:00
|
|
|
|
return transform.Point{X: tx, Y: ty}
|
2018-09-20 11:49:44 +10:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// translationMatrix returns a matrix that translates by `p`.
|
2018-11-30 16:53:48 +00:00
|
|
|
|
func translationMatrix(p transform.Point) transform.Matrix {
|
|
|
|
|
return transform.TranslationMatrix(p.X, p.Y)
|
2018-08-22 12:29:34 +10:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// moveTo moves the start of line pointer by `tx`,`ty` and sets the text pointer to the
|
|
|
|
|
// start of line pointer.
|
|
|
|
|
// Move to the start of the next line, offset from the start of the current line by (tx, ty).
|
|
|
|
|
// `tx` and `ty` are in unscaled text space units.
|
|
|
|
|
func (to *textObject) moveTo(tx, ty float64) {
|
2019-01-22 18:18:27 +11:00
|
|
|
|
to.tlm.Concat(transform.NewMatrix(1, 0, 0, 1, tx, ty))
|
2019-01-02 10:39:30 +11:00
|
|
|
|
to.tm = to.tlm
|
2018-08-22 12:29:34 +10:00
|
|
|
|
}
|
|
|
|
|
|
2019-01-02 10:39:30 +11:00
|
|
|
|
// textMark represents text drawn on a page and its position in device coordinates.
|
2018-11-27 13:37:12 +11:00
|
|
|
|
// All dimensions are in device coordinates.
|
2019-01-02 10:39:30 +11:00
|
|
|
|
type textMark struct {
|
2019-07-18 16:41:47 +10:00
|
|
|
|
text string // The text (decoded via ToUnicode).
|
|
|
|
|
original string // Original text (decoded).
|
|
|
|
|
bbox model.PdfRectangle // Text bounding box.
|
|
|
|
|
orient int // The text orientation in degrees. This is the current TRM rounded to 10°.
|
|
|
|
|
orientedStart transform.Point // Left of text in orientation where text is horizontal.
|
|
|
|
|
orientedEnd transform.Point // Right of text in orientation where text is horizontal.
|
|
|
|
|
height float64 // Text height.
|
|
|
|
|
spaceWidth float64 // Best guess at the width of a space in the font the text was rendered with.
|
|
|
|
|
font *model.PdfFont // The font the mark was drawn with.
|
|
|
|
|
fontsize float64 // The font size the mark was drawn with.
|
|
|
|
|
charspacing float64 // TODO (peterwilliams97: Should this be exposed in TextMark?
|
|
|
|
|
trm transform.Matrix // The current text rendering matrix (TRM above).
|
|
|
|
|
end transform.Point // The end of character device coordinates.
|
|
|
|
|
count int64 // To help with reading debug logs.
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// newTextMark returns a textMark for text `text` rendered with text rendering matrix (TRM) `trm`
|
|
|
|
|
// and end of character device coordinates `end`. `spaceWidth` is our best guess at the width of a
|
|
|
|
|
// space in the font the text is rendered in device coordinates.
|
|
|
|
|
func (to *textObject) newTextMark(text string, trm transform.Matrix, end transform.Point,
|
|
|
|
|
spaceWidth float64, font *model.PdfFont, charspacing float64) textMark {
|
2018-11-27 13:37:12 +11:00
|
|
|
|
to.e.textCount++
|
2018-11-26 17:17:17 +11:00
|
|
|
|
theta := trm.Angle()
|
2018-11-29 17:04:20 +11:00
|
|
|
|
orient := nearestMultiple(theta, 10)
|
|
|
|
|
var height float64
|
|
|
|
|
if orient%180 != 90 {
|
2018-11-28 18:06:03 +11:00
|
|
|
|
height = trm.ScalingFactorY()
|
|
|
|
|
} else {
|
|
|
|
|
height = trm.ScalingFactorX()
|
|
|
|
|
}
|
|
|
|
|
|
2019-07-18 16:41:47 +10:00
|
|
|
|
start := translation(trm)
|
|
|
|
|
bbox := model.PdfRectangle{Llx: start.X, Lly: start.Y, Urx: end.X, Ury: end.Y}
|
|
|
|
|
switch orient % 360 {
|
|
|
|
|
case 90:
|
|
|
|
|
bbox.Urx -= height
|
|
|
|
|
case 180:
|
|
|
|
|
bbox.Ury -= height
|
|
|
|
|
case 270:
|
|
|
|
|
bbox.Urx += height
|
|
|
|
|
default:
|
|
|
|
|
bbox.Ury += height
|
|
|
|
|
}
|
|
|
|
|
tm := textMark{
|
2019-01-02 10:39:30 +11:00
|
|
|
|
text: text,
|
|
|
|
|
orient: orient,
|
2019-07-18 16:41:47 +10:00
|
|
|
|
bbox: bbox,
|
|
|
|
|
orientedStart: start.Rotate(theta),
|
2019-01-02 10:39:30 +11:00
|
|
|
|
orientedEnd: end.Rotate(theta),
|
2019-07-18 16:41:47 +10:00
|
|
|
|
height: math.Abs(height),
|
2019-01-02 10:39:30 +11:00
|
|
|
|
spaceWidth: spaceWidth,
|
2019-07-18 16:41:47 +10:00
|
|
|
|
font: font,
|
|
|
|
|
fontsize: to.state.tfs,
|
|
|
|
|
charspacing: charspacing,
|
|
|
|
|
trm: trm,
|
|
|
|
|
end: end,
|
2018-11-27 13:37:12 +11:00
|
|
|
|
count: to.e.textCount,
|
2018-11-18 17:21:30 +11:00
|
|
|
|
}
|
2019-07-18 16:41:47 +10:00
|
|
|
|
if !isTextSpace(tm.text) && tm.Width() == 0.0 {
|
|
|
|
|
common.Log.Debug("ERROR: Zero width text. tm=%s\n\tm=%#v", tm, tm)
|
|
|
|
|
}
|
|
|
|
|
return tm
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// isTextSpace returns true if `text` contains nothing but space code points, as defined by
// unicode.IsSpace. The empty string therefore also yields true.
func isTextSpace(text string) bool {
	for _, r := range text {
		if !unicode.IsSpace(r) {
			return false
		}
	}
	return true
}
|
|
|
|
|
|
2019-01-01 12:22:39 +11:00
|
|
|
|
// nearestMultiple returns the integer multiple of `m` that is closest to `x`.
// A zero `m` is treated as 1, so `x` is then simply rounded to the nearest integer.
func nearestMultiple(x float64, m int) int {
	if m == 0 {
		m = 1
	}
	fac := float64(m)
	return int(math.Round(x/fac) * fac)
}
|
|
|
|
|
|
2019-07-18 16:41:47 +10:00
|
|
|
|
// String returns a string describing `tm`.
|
|
|
|
|
func (tm textMark) String() string {
|
|
|
|
|
return fmt.Sprintf("textMark{@%03d [%.3f,%.3f] w=%.1f %d° %q}",
|
|
|
|
|
tm.count, tm.orientedStart.X, tm.orientedStart.Y, tm.Width(), tm.orient,
|
|
|
|
|
truncate(tm.text, 100))
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Width returns the width of `tm`.text in the text direction.
|
|
|
|
|
func (tm textMark) Width() float64 {
|
|
|
|
|
return math.Abs(tm.orientedStart.X - tm.orientedEnd.X)
|
2018-10-09 11:49:59 +11:00
|
|
|
|
}
|
|
|
|
|
|
2019-07-18 16:41:47 +10:00
|
|
|
|
// ToTextMark returns the public view of `tm`.
|
|
|
|
|
func (tm textMark) ToTextMark() TextMark {
|
|
|
|
|
return TextMark{
|
|
|
|
|
Text: tm.text,
|
|
|
|
|
Original: tm.original,
|
|
|
|
|
BBox: tm.bbox,
|
|
|
|
|
Font: tm.font,
|
|
|
|
|
FontSize: tm.fontsize,
|
|
|
|
|
}
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
|
|
|
|
|
2019-01-04 16:07:03 +11:00
|
|
|
|
// PageText represents the layout of text on a device page.
|
|
|
|
|
type PageText struct {
|
2019-07-18 16:41:47 +10:00
|
|
|
|
marks []textMark // Texts and their positions on a PDF page.
|
|
|
|
|
viewText string // Extracted page text.
|
|
|
|
|
viewMarks []TextMark // Public view of `marks`.
|
2019-01-04 16:02:22 +11:00
|
|
|
|
}
|
2018-06-27 16:31:28 +10:00
|
|
|
|
|
2019-01-04 16:07:03 +11:00
|
|
|
|
// String returns a string describing `pt`.
|
|
|
|
|
func (pt PageText) String() string {
|
2019-07-18 16:41:47 +10:00
|
|
|
|
summary := fmt.Sprintf("PageText: %d elements", len(pt.marks))
|
|
|
|
|
parts := []string{"-" + summary}
|
|
|
|
|
for _, tm := range pt.marks {
|
|
|
|
|
parts = append(parts, tm.String())
|
2019-01-01 12:22:39 +11:00
|
|
|
|
}
|
2019-07-18 16:41:47 +10:00
|
|
|
|
parts = append(parts, "+"+summary)
|
2019-01-01 12:22:39 +11:00
|
|
|
|
return strings.Join(parts, "\n")
|
|
|
|
|
}
|
|
|
|
|
|
2019-01-04 16:07:03 +11:00
|
|
|
|
// length returns the number of elements in `pt.marks`.
|
|
|
|
|
func (pt PageText) length() int {
|
|
|
|
|
return len(pt.marks)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
|
|
|
|
|
2019-07-18 16:41:47 +10:00
|
|
|
|
// Text returns the extracted page text.
|
|
|
|
|
func (pt PageText) Text() string {
|
|
|
|
|
return pt.viewText
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// ToText returns the page text as a single string.
|
|
|
|
|
// Deprecated: This function is deprecated and will be removed in a future major version. Please use
|
|
|
|
|
// Text() instead.
|
|
|
|
|
func (pt PageText) ToText() string {
|
|
|
|
|
return pt.Text()
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Marks returns the TextMark collection for a page. It represents all the text on the page.
|
|
|
|
|
func (pt PageText) Marks() *TextMarkArray {
|
|
|
|
|
return &TextMarkArray{marks: pt.viewMarks}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// TextMarkArray is a collection of TextMarks.
|
|
|
|
|
type TextMarkArray struct {
|
|
|
|
|
marks []TextMark
|
|
|
|
|
}
|
|
|
|
|
|
2019-08-04 09:29:21 +00:00
|
|
|
|
// Append appends `mark` to the mark array.
|
|
|
|
|
func (ma *TextMarkArray) Append(mark TextMark) {
|
|
|
|
|
ma.marks = append(ma.marks, mark)
|
|
|
|
|
}
|
|
|
|
|
|
2019-07-18 16:41:47 +10:00
|
|
|
|
// String returns a string describing `ma`.
|
|
|
|
|
func (ma TextMarkArray) String() string {
|
|
|
|
|
n := len(ma.marks)
|
|
|
|
|
if n == 0 {
|
|
|
|
|
return "EMPTY"
|
|
|
|
|
}
|
|
|
|
|
m0 := ma.marks[0]
|
|
|
|
|
m1 := ma.marks[n-1]
|
|
|
|
|
return fmt.Sprintf("{TEXTMARKARRAY: %d elements\n\tfirst=%s\n\t last=%s}", n, m0, m1)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Elements returns the TextMarks in `ma`.
|
|
|
|
|
func (ma *TextMarkArray) Elements() []TextMark {
|
|
|
|
|
return ma.marks
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Len returns the number of TextMarks in `ma`.
|
|
|
|
|
func (ma *TextMarkArray) Len() int {
|
|
|
|
|
if ma == nil {
|
|
|
|
|
return 0
|
|
|
|
|
}
|
|
|
|
|
return len(ma.marks)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// RangeOffset returns the TextMarks in `ma` that have `start` <= TextMark.Offset < `end`.
|
|
|
|
|
func (ma *TextMarkArray) RangeOffset(start, end int) (*TextMarkArray, error) {
|
|
|
|
|
if ma == nil {
|
|
|
|
|
return nil, errors.New("ma==nil")
|
|
|
|
|
}
|
|
|
|
|
if end < start {
|
|
|
|
|
return nil, fmt.Errorf("end < start. RangeOffset not defined. start=%d end=%d ", start, end)
|
|
|
|
|
}
|
|
|
|
|
n := len(ma.marks)
|
|
|
|
|
if n == 0 {
|
|
|
|
|
return ma, nil
|
|
|
|
|
}
|
|
|
|
|
if start < ma.marks[0].Offset {
|
|
|
|
|
start = ma.marks[0].Offset
|
|
|
|
|
}
|
|
|
|
|
if end > ma.marks[n-1].Offset+1 {
|
|
|
|
|
end = ma.marks[n-1].Offset + 1
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
iStart := sort.Search(n, func(i int) bool { return ma.marks[i].Offset >= start })
|
|
|
|
|
if !(0 <= iStart && iStart < n) {
|
|
|
|
|
err := fmt.Errorf("Out of range. start=%d iStart=%d len=%d\n\tfirst=%v\n\t last=%v",
|
|
|
|
|
start, iStart, n, ma.marks[0], ma.marks[n-1])
|
|
|
|
|
return nil, err
|
|
|
|
|
}
|
|
|
|
|
iEnd := sort.Search(n, func(i int) bool { return ma.marks[i].Offset > end-1 })
|
|
|
|
|
if !(0 <= iEnd && iEnd < n) {
|
|
|
|
|
err := fmt.Errorf("Out of range. end=%d iEnd=%d len=%d\n\tfirst=%v\n\t last=%v",
|
|
|
|
|
end, iEnd, n, ma.marks[0], ma.marks[n-1])
|
|
|
|
|
return nil, err
|
|
|
|
|
}
|
|
|
|
|
if iEnd <= iStart {
|
|
|
|
|
// This should never happen.
|
|
|
|
|
return nil, fmt.Errorf("start=%d end=%d iStart=%d iEnd=%d", start, end, iStart, iEnd)
|
|
|
|
|
}
|
|
|
|
|
return &TextMarkArray{marks: ma.marks[iStart:iEnd]}, nil
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// BBox returns the smallest axis-aligned rectangle that encloses all the TextMarks in `ma`.
|
|
|
|
|
func (ma *TextMarkArray) BBox() (model.PdfRectangle, bool) {
|
|
|
|
|
if len(ma.marks) == 0 {
|
|
|
|
|
return model.PdfRectangle{}, false
|
|
|
|
|
}
|
|
|
|
|
bbox := ma.marks[0].BBox
|
|
|
|
|
for _, tm := range ma.marks[1:] {
|
|
|
|
|
if isTextSpace(tm.Text) {
|
|
|
|
|
continue
|
2018-11-28 18:06:03 +11:00
|
|
|
|
}
|
2019-07-18 16:41:47 +10:00
|
|
|
|
bbox = rectUnion(bbox, tm.BBox)
|
2018-11-28 18:06:03 +11:00
|
|
|
|
}
|
2019-07-18 16:41:47 +10:00
|
|
|
|
return bbox, true
|
2018-11-28 18:06:03 +11:00
|
|
|
|
}
|
|
|
|
|
|
2019-07-18 16:41:47 +10:00
|
|
|
|
// rectUnion returns the smallest axis-aligned rectangle that contains `b1` and `b2`.
|
|
|
|
|
func rectUnion(b1, b2 model.PdfRectangle) model.PdfRectangle {
|
|
|
|
|
return model.PdfRectangle{
|
|
|
|
|
Llx: math.Min(b1.Llx, b2.Llx),
|
|
|
|
|
Lly: math.Min(b1.Lly, b2.Lly),
|
|
|
|
|
Urx: math.Max(b1.Urx, b2.Urx),
|
|
|
|
|
Ury: math.Max(b1.Ury, b2.Ury),
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// TextMark represents extracted text on a page with information regarding both textual content,
|
|
|
|
|
// formatting (font and size) and positioning.
|
|
|
|
|
// It is the smallest unit of text on a PDF page, typically a single character.
|
|
|
|
|
//
|
|
|
|
|
// getBBox() in test_text.go shows how to compute bounding boxes of substrings of extracted text.
|
|
|
|
|
// The following code extracts the text on PDF page `page` into `text` then finds the bounding box
|
|
|
|
|
// `bbox` of substring `term` in `text`.
|
|
|
|
|
//
|
|
|
|
|
// ex, _ := New(page)
|
|
|
|
|
// // handle errors
|
|
|
|
|
// pageText, _, _, err := ex.ExtractPageText()
|
|
|
|
|
// // handle errors
|
|
|
|
|
// text := pageText.Text()
|
|
|
|
|
// textMarks := pageText.Marks()
|
|
|
|
|
//
|
|
|
|
|
// start := strings.Index(text, term)
|
|
|
|
|
// end := start + len(term)
|
|
|
|
|
// spanMarks, err := textMarks.RangeOffset(start, end)
|
|
|
|
|
// // handle errors
|
|
|
|
|
// bbox, ok := spanMarks.BBox()
|
|
|
|
|
// // handle errors
|
|
|
|
|
type TextMark struct {
|
|
|
|
|
// Text is the extracted text. It has been decoded to Unicode via ToUnicode().
|
|
|
|
|
Text string
|
|
|
|
|
// Original is the text in the PDF. It has not been decoded like `Text`.
|
|
|
|
|
Original string
|
|
|
|
|
// BBox is the bounding box of the text.
|
|
|
|
|
BBox model.PdfRectangle
|
|
|
|
|
// Font is the font the text was drawn with.
|
|
|
|
|
Font *model.PdfFont
|
|
|
|
|
// FontSize is the font size the text was drawn with.
|
|
|
|
|
FontSize float64
|
|
|
|
|
// Offset is the offset of the start of TextMark.Text in the extracted text. If you do this
|
|
|
|
|
// text, textMarks := pageText.Text(), pageText.Marks()
|
|
|
|
|
// marks := textMarks.Elements()
|
|
|
|
|
// then marks[i].Offset is the offset of marks[i].Text in text.
|
|
|
|
|
Offset int
|
|
|
|
|
// Meta is set true for spaces and line breaks that we insert in the extracted text. We insert
|
|
|
|
|
// spaces (line breaks) when we see characters that are over a threshold horizontal (vertical)
|
|
|
|
|
// distance apart. See wordJoiner (lineJoiner) in PageText.computeViews().
|
|
|
|
|
Meta bool
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// String returns a string describing `tm`.
|
|
|
|
|
func (tm TextMark) String() string {
|
|
|
|
|
b := tm.BBox
|
|
|
|
|
var font string
|
|
|
|
|
if tm.Font != nil {
|
|
|
|
|
font = tm.Font.String()
|
|
|
|
|
if len(font) > 50 {
|
|
|
|
|
font = font[:50] + "..."
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
var meta string
|
|
|
|
|
if tm.Meta {
|
|
|
|
|
meta = " *M*"
|
|
|
|
|
}
|
|
|
|
|
return fmt.Sprintf("{TextMark: %d %q=%02x (%5.1f, %5.1f) (%5.1f, %5.1f) %s%s}",
|
|
|
|
|
tm.Offset, tm.Text, []rune(tm.Text), b.Llx, b.Lly, b.Urx, b.Ury, font, meta)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// computeViews processes the page TextMarks sorting by position and populates `pt.viewText` and
|
|
|
|
|
// `pt.viewMarks` which represent the text and marks in the order which it is read on the page.
|
|
|
|
|
// The comments above the TextMark definition describe how to use the []TextMark to
|
|
|
|
|
// maps substrings of the page text to locations on the PDF page.
|
|
|
|
|
func (pt *PageText) computeViews() {
|
2019-01-04 16:07:03 +11:00
|
|
|
|
fontHeight := pt.height()
|
2018-11-28 22:13:56 +11:00
|
|
|
|
// We sort with a y tolerance to allow for subscripts, diacritics etc.
|
2018-11-30 23:01:04 +00:00
|
|
|
|
tol := minFloat(fontHeight*0.2, 5.0)
|
2019-07-18 16:41:47 +10:00
|
|
|
|
common.Log.Trace("ToTextLocation: %d elements fontHeight=%.1f tol=%.1f", len(pt.marks), fontHeight, tol)
|
|
|
|
|
// Uncomment the 2 following Debug statements to see the effects of sorting.
|
|
|
|
|
// common.Log.Debug("computeViews: Before sorting %s", pt)
|
2019-01-04 16:07:03 +11:00
|
|
|
|
pt.sortPosition(tol)
|
2019-07-18 16:41:47 +10:00
|
|
|
|
// common.Log.Debug("computeViews: After sorting %s", pt)
|
2019-01-04 16:07:03 +11:00
|
|
|
|
lines := pt.toLines(tol)
|
2019-07-18 16:41:47 +10:00
|
|
|
|
texts := make([]string, len(lines))
|
|
|
|
|
for i, l := range lines {
|
|
|
|
|
texts[i] = strings.Join(l.words(), wordJoiner)
|
|
|
|
|
}
|
|
|
|
|
text := strings.Join(texts, lineJoiner)
|
|
|
|
|
var marks []TextMark
|
|
|
|
|
offset := 0
|
|
|
|
|
for i, l := range lines {
|
|
|
|
|
for j, tm := range l.marks {
|
|
|
|
|
tm.Offset = offset
|
|
|
|
|
marks = append(marks, tm)
|
|
|
|
|
offset += len(tm.Text)
|
|
|
|
|
if j == len(l.marks)-1 {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
if wordJoinerLen > 0 {
|
|
|
|
|
tm := TextMark{
|
|
|
|
|
Offset: offset,
|
|
|
|
|
Text: wordJoiner,
|
|
|
|
|
Meta: true,
|
|
|
|
|
}
|
|
|
|
|
marks = append(marks, tm)
|
|
|
|
|
offset += wordJoinerLen
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if i == len(lines)-1 {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
if lineJoinerLen > 0 {
|
|
|
|
|
tm := TextMark{
|
|
|
|
|
Offset: offset,
|
|
|
|
|
Text: lineJoiner,
|
|
|
|
|
Meta: true,
|
|
|
|
|
}
|
|
|
|
|
marks = append(marks, tm)
|
|
|
|
|
offset += lineJoinerLen
|
|
|
|
|
}
|
2018-08-22 12:29:34 +10:00
|
|
|
|
}
|
2019-07-18 16:41:47 +10:00
|
|
|
|
pt.viewText = text
|
|
|
|
|
pt.viewMarks = marks
|
2018-08-22 12:29:34 +10:00
|
|
|
|
}
|
|
|
|
|
|
2019-07-18 16:41:47 +10:00
|
|
|
|
// height returns the max height of the elements in `pt.marks`.
|
|
|
|
|
func (pt PageText) height() float64 {
|
|
|
|
|
fontHeight := 0.0
|
|
|
|
|
for _, tm := range pt.marks {
|
|
|
|
|
if tm.height > fontHeight {
|
|
|
|
|
fontHeight = tm.height
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return fontHeight
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
const (
	// wordJoiner is added between text marks in extracted text.
	// It is empty, so marks within a line are concatenated directly.
	wordJoiner = ""
	// lineJoiner is added between lines in extracted text.
	lineJoiner = "\n"
)
|
|
|
|
|
|
|
|
|
|
var (
|
|
|
|
|
wordJoinerLen = len(wordJoiner)
|
|
|
|
|
lineJoinerLen = len(lineJoiner)
|
|
|
|
|
// spaceMark is a special TextMark used for spaces.
|
|
|
|
|
spaceMark = TextMark{
|
|
|
|
|
Text: " ",
|
|
|
|
|
Original: " ",
|
|
|
|
|
Meta: true,
|
|
|
|
|
}
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
// sortPosition sorts a text list by its elements' positions on a page.
|
2018-11-10 21:19:02 +11:00
|
|
|
|
// Sorting is by orientation then top to bottom, left to right when page is orientated so that text
|
|
|
|
|
// is horizontal.
|
2019-01-04 16:07:03 +11:00
|
|
|
|
func (pt *PageText) sortPosition(tol float64) {
|
2019-01-05 09:14:10 +11:00
|
|
|
|
sort.SliceStable(pt.marks, func(i, j int) bool {
|
|
|
|
|
ti, tj := pt.marks[i], pt.marks[j]
|
2019-01-02 10:39:30 +11:00
|
|
|
|
if ti.orient != tj.orient {
|
|
|
|
|
return ti.orient < tj.orient
|
2018-08-22 12:29:34 +10:00
|
|
|
|
}
|
2019-01-02 10:39:30 +11:00
|
|
|
|
if math.Abs(ti.orientedStart.Y-tj.orientedStart.Y) > tol {
|
|
|
|
|
return ti.orientedStart.Y > tj.orientedStart.Y
|
2018-11-10 21:19:02 +11:00
|
|
|
|
}
|
2019-01-02 10:39:30 +11:00
|
|
|
|
return ti.orientedStart.X < tj.orientedStart.X
|
2018-08-22 12:29:34 +10:00
|
|
|
|
})
|
|
|
|
|
}
|
|
|
|
|
|
2019-01-02 10:39:30 +11:00
|
|
|
|
// textLine represents a line of text on a page.
|
|
|
|
|
type textLine struct {
|
2019-07-18 16:41:47 +10:00
|
|
|
|
x float64 // x position of line.
|
|
|
|
|
y float64 // y position of line.
|
|
|
|
|
h float64 // height of line text.
|
|
|
|
|
dxList []float64 // x distance between successive words in line.
|
|
|
|
|
marks []TextMark // TextMarks in the line.
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// words returns the texts in `tl`.
|
|
|
|
|
func (tl textLine) words() []string {
|
|
|
|
|
var texts []string
|
|
|
|
|
for _, tm := range tl.marks {
|
|
|
|
|
texts = append(texts, tm.Text)
|
|
|
|
|
}
|
|
|
|
|
return texts
|
2018-08-22 12:29:34 +10:00
|
|
|
|
}
|
|
|
|
|
|
2019-01-04 16:07:03 +11:00
|
|
|
|
// toLines returns the text and positions in `pt.marks` as a slice of textLine.
|
2018-12-02 18:41:48 +11:00
|
|
|
|
// NOTE: Caller must sort the text list top-to-bottom, left-to-right (for orientation adjusted so
|
2018-11-10 21:19:02 +11:00
|
|
|
|
// that text is horizontal) before calling this function.
|
2019-01-04 16:07:03 +11:00
|
|
|
|
func (pt PageText) toLines(tol float64) []textLine {
|
2019-07-18 16:41:47 +10:00
|
|
|
|
// We divide `pt.marks` into slices which contain texts with the same orientation, extract the
|
|
|
|
|
// lines for each orientation then return the concatenation of these lines sorted by orientation.
|
2019-01-04 16:07:03 +11:00
|
|
|
|
tlOrient := make(map[int][]textMark, len(pt.marks))
|
2019-07-18 16:41:47 +10:00
|
|
|
|
for _, tm := range pt.marks {
|
|
|
|
|
tlOrient[tm.orient] = append(tlOrient[tm.orient], tm)
|
2018-11-10 21:19:02 +11:00
|
|
|
|
}
|
2019-01-02 10:39:30 +11:00
|
|
|
|
var lines []textLine
|
2018-11-27 13:37:12 +11:00
|
|
|
|
for _, o := range orientKeys(tlOrient) {
|
2019-07-18 16:41:47 +10:00
|
|
|
|
lns := PageText{marks: tlOrient[o]}.toLinesOrient(tol)
|
|
|
|
|
lines = append(lines, lns...)
|
2018-11-26 17:17:17 +11:00
|
|
|
|
}
|
|
|
|
|
return lines
|
2018-11-10 21:19:02 +11:00
|
|
|
|
}
|
|
|
|
|
|
2019-01-04 16:07:03 +11:00
|
|
|
|
// toLinesOrient returns the text and positions in `pt.marks` as a slice of textLine.
|
2018-11-27 13:37:12 +11:00
|
|
|
|
// NOTE: This function only works on text lists where all text is the same orientation so it should
|
|
|
|
|
// only be called from toLines.
|
2018-12-02 18:41:48 +11:00
|
|
|
|
// Caller must sort the text list top-to-bottom, left-to-right (for orientation adjusted so
|
2018-11-27 13:37:12 +11:00
|
|
|
|
// that text is horizontal) before calling this function.
|
2019-01-04 16:07:03 +11:00
|
|
|
|
func (pt PageText) toLinesOrient(tol float64) []textLine {
|
|
|
|
|
if len(pt.marks) == 0 {
|
2019-01-02 10:39:30 +11:00
|
|
|
|
return []textLine{}
|
2018-08-22 12:29:34 +10:00
|
|
|
|
}
|
2019-07-18 16:41:47 +10:00
|
|
|
|
var marks []TextMark
|
2019-01-02 10:39:30 +11:00
|
|
|
|
var lines []textLine
|
2019-07-18 16:41:47 +10:00
|
|
|
|
var xx []float64
|
2019-01-04 16:07:03 +11:00
|
|
|
|
y := pt.marks[0].orientedStart.Y
|
2018-10-09 11:49:59 +11:00
|
|
|
|
|
|
|
|
|
scanning := false
|
|
|
|
|
|
2018-11-28 23:25:17 +00:00
|
|
|
|
averageCharWidth := exponAve{}
|
|
|
|
|
wordSpacing := exponAve{}
|
2019-01-04 16:07:03 +11:00
|
|
|
|
lastEndX := 0.0 // lastEndX is pt.marks[i-1].orientedEnd.X
|
2018-10-09 11:49:59 +11:00
|
|
|
|
|
2019-07-18 16:41:47 +10:00
|
|
|
|
for _, tm := range pt.marks {
|
|
|
|
|
if tm.orientedStart.Y+tol < y {
|
|
|
|
|
if len(marks) > 0 {
|
|
|
|
|
tl := newLine(y, xx, marks)
|
2018-10-09 13:47:43 +11:00
|
|
|
|
if averageCharWidth.running {
|
2019-01-01 12:22:39 +11:00
|
|
|
|
// FIXME(peterwilliams97): Fix and reinstate combineDiacritics.
|
2019-07-18 16:41:47 +10:00
|
|
|
|
// tl = combineDiacritics(tl, averageCharWidth.ave)
|
|
|
|
|
tl = removeDuplicates(tl, averageCharWidth.ave)
|
2018-10-09 13:47:43 +11:00
|
|
|
|
}
|
2019-07-18 16:41:47 +10:00
|
|
|
|
lines = append(lines, tl)
|
2018-08-22 12:29:34 +10:00
|
|
|
|
}
|
2019-07-18 16:41:47 +10:00
|
|
|
|
marks = []TextMark{}
|
|
|
|
|
xx = []float64{}
|
|
|
|
|
y = tm.orientedStart.Y
|
2018-10-09 11:49:59 +11:00
|
|
|
|
scanning = false
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Detect text movements that represent spaces on the printed page.
|
|
|
|
|
// We use a heuristic from PdfBox: If the next character starts to the right of where a
|
|
|
|
|
// character after a space at "normal spacing" would start, then there is a space before it.
|
|
|
|
|
// The tricky thing to guess here is the width of a space at normal spacing.
|
2019-07-18 16:41:47 +10:00
|
|
|
|
// We follow PdfBox and use min(deltaSpace, deltaCharWidth).
|
2018-10-09 11:49:59 +11:00
|
|
|
|
deltaSpace := 0.0
|
2019-07-18 16:41:47 +10:00
|
|
|
|
if tm.spaceWidth == 0 {
|
2018-10-09 11:49:59 +11:00
|
|
|
|
deltaSpace = math.MaxFloat64
|
|
|
|
|
} else {
|
2019-07-18 16:41:47 +10:00
|
|
|
|
wordSpacing.update(tm.spaceWidth)
|
2018-10-09 11:49:59 +11:00
|
|
|
|
deltaSpace = wordSpacing.ave * 0.5
|
|
|
|
|
}
|
2019-07-18 16:41:47 +10:00
|
|
|
|
averageCharWidth.update(tm.Width())
|
2018-10-09 11:49:59 +11:00
|
|
|
|
deltaCharWidth := averageCharWidth.ave * 0.3
|
|
|
|
|
|
|
|
|
|
isSpace := false
|
2018-11-28 23:25:17 +00:00
|
|
|
|
nextWordX := lastEndX + minFloat(deltaSpace, deltaCharWidth)
|
2019-07-18 16:41:47 +10:00
|
|
|
|
if scanning && !isTextSpace(tm.text) {
|
|
|
|
|
isSpace = nextWordX < tm.orientedStart.X
|
2018-10-09 11:49:59 +11:00
|
|
|
|
}
|
2019-07-18 16:41:47 +10:00
|
|
|
|
common.Log.Trace("tm=%s", tm)
|
2018-11-26 17:17:17 +11:00
|
|
|
|
common.Log.Trace("width=%.2f delta=%.2f deltaSpace=%.2g deltaCharWidth=%.2g",
|
2019-07-18 16:41:47 +10:00
|
|
|
|
tm.Width(), minFloat(deltaSpace, deltaCharWidth), deltaSpace, deltaCharWidth)
|
2018-11-26 17:17:17 +11:00
|
|
|
|
common.Log.Trace("%+q [%.1f, %.1f] lastEndX=%.2f nextWordX=%.2f (%.2f) isSpace=%t",
|
2019-07-18 16:41:47 +10:00
|
|
|
|
tm.text, tm.orientedStart.X, tm.orientedStart.Y, lastEndX, nextWordX,
|
|
|
|
|
nextWordX-tm.orientedStart.X, isSpace)
|
2018-11-18 17:21:30 +11:00
|
|
|
|
|
2018-10-09 11:49:59 +11:00
|
|
|
|
if isSpace {
|
2019-07-18 16:41:47 +10:00
|
|
|
|
marks = append(marks, spaceMark)
|
|
|
|
|
xx = append(xx, (lastEndX+tm.orientedStart.X)*0.5)
|
2018-08-22 12:29:34 +10:00
|
|
|
|
}
|
2018-10-09 11:49:59 +11:00
|
|
|
|
|
|
|
|
|
// Add the text to the line.
|
2019-07-18 16:41:47 +10:00
|
|
|
|
lastEndX = tm.orientedEnd.X
|
|
|
|
|
marks = append(marks, tm.ToTextMark())
|
|
|
|
|
xx = append(xx, tm.orientedStart.X)
|
2018-10-09 11:49:59 +11:00
|
|
|
|
scanning = true
|
2018-11-26 17:17:17 +11:00
|
|
|
|
common.Log.Trace("lastEndX=%.2f", lastEndX)
|
2018-08-22 12:29:34 +10:00
|
|
|
|
}
|
2019-07-18 16:41:47 +10:00
|
|
|
|
if len(marks) > 0 {
|
|
|
|
|
tl := newLine(y, xx, marks)
|
2018-10-09 13:47:43 +11:00
|
|
|
|
if averageCharWidth.running {
|
2019-07-18 16:41:47 +10:00
|
|
|
|
tl = removeDuplicates(tl, averageCharWidth.ave)
|
2018-10-09 13:47:43 +11:00
|
|
|
|
}
|
2019-07-18 16:41:47 +10:00
|
|
|
|
lines = append(lines, tl)
|
2018-08-22 12:29:34 +10:00
|
|
|
|
}
|
|
|
|
|
return lines
|
|
|
|
|
}
|
|
|
|
|
|
2018-11-27 13:37:12 +11:00
|
|
|
|
// orientKeys returns the keys of `tlOrient` as a sorted slice.
|
2019-01-04 16:02:22 +11:00
|
|
|
|
func orientKeys(tlOrient map[int][]textMark) []int {
|
2018-11-27 13:37:12 +11:00
|
|
|
|
keys := []int{}
|
|
|
|
|
for k := range tlOrient {
|
|
|
|
|
keys = append(keys, k)
|
|
|
|
|
}
|
|
|
|
|
sort.Ints(keys)
|
|
|
|
|
return keys
|
|
|
|
|
}
|
|
|
|
|
|
2018-11-28 23:25:17 +00:00
|
|
|
|
// exponAve is a running exponential average. The zero value is ready to use.
type exponAve struct {
	ave     float64 // Current average value.
	running bool    // Has `ave` been set?
}

// update folds the latest value `x` into the average and returns the new `exp`.ave.
// The first value seeds the average; every later value is averaged 50/50 with the current one.
func (exp *exponAve) update(x float64) float64 {
	if exp.running {
		// NOTE(peterwilliams97): 0.5 is a guess. It may be possible to improve average character
		// and space width estimation by tuning this value. It may be that different exponents
		// would work better for character and space estimation.
		exp.ave = (exp.ave + x) * 0.5
		return exp.ave
	}
	exp.ave, exp.running = x, true
	return exp.ave
}
|
|
|
|
|
|
2019-01-02 10:39:30 +11:00
|
|
|
|
// newLine returns the textLine representation of strings `words` with y coordinate `y` and x
|
2019-07-18 16:41:47 +10:00
|
|
|
|
// coordinates `xx` and height `h`.
|
|
|
|
|
func newLine(y float64, xx []float64, marks []TextMark) textLine {
|
|
|
|
|
dxList := make([]float64, len(xx)-1)
|
|
|
|
|
for i := 1; i < len(xx); i++ {
|
|
|
|
|
dxList[i-1] = xx[i] - xx[i-1]
|
|
|
|
|
}
|
|
|
|
|
return textLine{
|
|
|
|
|
x: xx[0],
|
|
|
|
|
y: y,
|
|
|
|
|
dxList: dxList,
|
|
|
|
|
marks: marks,
|
2018-08-22 12:29:34 +10:00
|
|
|
|
}
|
2018-10-09 13:47:43 +11:00
|
|
|
|
}
|
|
|
|
|
|
2019-07-18 16:41:47 +10:00
|
|
|
|
// removeDuplicates returns `tl` with duplicate characters removed. `charWidth` is the average
|
2018-11-10 21:19:02 +11:00
|
|
|
|
// character width for the line.
|
2019-07-18 16:41:47 +10:00
|
|
|
|
func removeDuplicates(tl textLine, charWidth float64) textLine {
|
|
|
|
|
if len(tl.dxList) == 0 || len(tl.marks) == 0 {
|
|
|
|
|
return tl
|
2018-10-09 13:47:43 +11:00
|
|
|
|
}
|
2018-12-02 18:13:40 +11:00
|
|
|
|
// NOTE(peterwilliams97) 0.3 is a guess. It may be possible to tune this to a better value.
|
2018-10-09 13:47:43 +11:00
|
|
|
|
tol := charWidth * 0.3
|
2019-07-18 16:41:47 +10:00
|
|
|
|
marks := []TextMark{tl.marks[0]}
|
2018-12-02 18:13:40 +11:00
|
|
|
|
var dxList []float64
|
2018-10-09 19:05:38 +11:00
|
|
|
|
|
2019-07-18 16:41:47 +10:00
|
|
|
|
tm0 := tl.marks[0]
|
|
|
|
|
for i, dx := range tl.dxList {
|
|
|
|
|
tm := tl.marks[i+1]
|
|
|
|
|
if tm.Text != tm0.Text || dx > tol {
|
|
|
|
|
marks = append(marks, tm)
|
2018-10-09 13:47:43 +11:00
|
|
|
|
dxList = append(dxList, dx)
|
|
|
|
|
}
|
2019-07-18 16:41:47 +10:00
|
|
|
|
tm0 = tm
|
|
|
|
|
}
|
|
|
|
|
return textLine{
|
|
|
|
|
x: tl.x,
|
|
|
|
|
y: tl.y,
|
|
|
|
|
dxList: dxList,
|
|
|
|
|
marks: marks,
|
2018-10-09 13:47:43 +11:00
|
|
|
|
}
|
2018-08-22 12:29:34 +10:00
|
|
|
|
}
|
|
|
|
|
|
2018-11-28 18:06:03 +11:00
|
|
|
|
// combineDiacritics returns `line` with diacritics close to characters combined with the characters.
|
|
|
|
|
// `charWidth` is the average character width for the line.
|
|
|
|
|
// We have to do this because PDF can render diacritics separately to the characters they attach to
|
|
|
|
|
// in extracted text.
|
2019-07-18 16:41:47 +10:00
|
|
|
|
func combineDiacritics(tl textLine, charWidth float64) textLine {
|
|
|
|
|
if len(tl.dxList) == 0 || len(tl.marks) == 0 {
|
|
|
|
|
return tl
|
2018-11-28 18:06:03 +11:00
|
|
|
|
}
|
2018-12-02 18:13:40 +11:00
|
|
|
|
// NOTE(peterwilliams97) 0.2 is a guess. It may be possible to tune this to a better value.
|
2018-11-28 18:06:03 +11:00
|
|
|
|
tol := charWidth * 0.2
|
|
|
|
|
common.Log.Trace("combineDiacritics: charWidth=%.2f tol=%.2f", charWidth, tol)
|
|
|
|
|
|
2019-07-18 16:41:47 +10:00
|
|
|
|
var marks []TextMark
|
2018-11-30 23:01:04 +00:00
|
|
|
|
var dxList []float64
|
2019-07-18 16:41:47 +10:00
|
|
|
|
tm := marks[0]
|
|
|
|
|
w, c := countDiacritic(tm.Text)
|
2018-11-28 18:06:03 +11:00
|
|
|
|
delta := 0.0
|
|
|
|
|
dx0 := 0.0
|
|
|
|
|
parts := []string{w}
|
|
|
|
|
numChars := c
|
|
|
|
|
|
2019-07-18 16:41:47 +10:00
|
|
|
|
for i, dx := range tl.dxList {
|
|
|
|
|
tm = marks[i+1]
|
|
|
|
|
w, c := countDiacritic(tm.Text)
|
2018-11-28 18:06:03 +11:00
|
|
|
|
if numChars+c <= 1 && delta+dx <= tol {
|
|
|
|
|
if len(parts) == 0 {
|
|
|
|
|
dx0 = dx
|
|
|
|
|
} else {
|
|
|
|
|
delta += dx
|
|
|
|
|
}
|
|
|
|
|
parts = append(parts, w)
|
|
|
|
|
numChars += c
|
|
|
|
|
} else {
|
|
|
|
|
if len(parts) > 0 {
|
2019-07-18 16:41:47 +10:00
|
|
|
|
if len(marks) > 0 {
|
2018-11-28 18:06:03 +11:00
|
|
|
|
dxList = append(dxList, dx0)
|
|
|
|
|
}
|
2019-07-18 16:41:47 +10:00
|
|
|
|
tm.Text = combine(parts)
|
|
|
|
|
marks = append(marks, tm)
|
2018-11-28 18:06:03 +11:00
|
|
|
|
}
|
|
|
|
|
parts = []string{w}
|
|
|
|
|
numChars = c
|
|
|
|
|
dx0 = dx
|
|
|
|
|
delta = 0.0
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if len(parts) > 0 {
|
2019-07-18 16:41:47 +10:00
|
|
|
|
if len(marks) > 0 {
|
2018-11-28 18:06:03 +11:00
|
|
|
|
dxList = append(dxList, dx0)
|
|
|
|
|
}
|
2019-07-18 16:41:47 +10:00
|
|
|
|
tm.Text = combine(parts)
|
|
|
|
|
marks = append(marks, tm)
|
2018-11-28 18:06:03 +11:00
|
|
|
|
}
|
2019-07-18 16:41:47 +10:00
|
|
|
|
if len(marks) != len(dxList)+1 {
|
|
|
|
|
common.Log.Error("Inconsistent: \nwords=%d \ndxList=%d %.2f",
|
|
|
|
|
len(marks), len(dxList), dxList)
|
|
|
|
|
return tl
|
|
|
|
|
}
|
|
|
|
|
return textLine{
|
|
|
|
|
x: tl.x,
|
|
|
|
|
y: tl.y,
|
|
|
|
|
dxList: dxList,
|
|
|
|
|
marks: marks,
|
2018-11-28 18:06:03 +11:00
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// combine combines any diacritics in `parts` with the single non-diacritic character in `parts`.
|
|
|
|
|
func combine(parts []string) string {
|
|
|
|
|
if len(parts) == 1 {
|
|
|
|
|
// Must be a non-diacritic.
|
|
|
|
|
return parts[0]
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// We need to put the diacritics before the non-diacritic for NFKC normalization to work.
|
|
|
|
|
diacritic := map[string]bool{}
|
|
|
|
|
for _, w := range parts {
|
|
|
|
|
r := []rune(w)[0]
|
|
|
|
|
diacritic[w] = unicode.Is(unicode.Mn, r) || unicode.Is(unicode.Sk, r)
|
|
|
|
|
}
|
|
|
|
|
sort.SliceStable(parts, func(i, j int) bool { return !diacritic[parts[i]] && diacritic[parts[j]] })
|
|
|
|
|
|
|
|
|
|
// Construct the NFKC-normalized concatenation of the diacritics and the non-diacritic.
|
|
|
|
|
for i, w := range parts {
|
|
|
|
|
parts[i] = strings.TrimSpace(norm.NFKC.String(w))
|
|
|
|
|
}
|
|
|
|
|
return strings.Join(parts, "")
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// countDiacritic returns the combining diacritic version of `w` (usually itself) and the number of
|
2018-12-27 20:51:34 +11:00
|
|
|
|
// non-diacritics in `w` (0 or 1).
|
2018-11-28 18:06:03 +11:00
|
|
|
|
func countDiacritic(w string) (string, int) {
|
|
|
|
|
runes := []rune(w)
|
|
|
|
|
if len(runes) != 1 {
|
|
|
|
|
return w, 1
|
|
|
|
|
}
|
|
|
|
|
r := runes[0]
|
|
|
|
|
c := 1
|
2018-12-27 20:51:34 +11:00
|
|
|
|
if (unicode.Is(unicode.Mn, r) || unicode.Is(unicode.Sk, r)) &&
|
|
|
|
|
r != '\'' && r != '"' && r != '`' {
|
2018-11-28 18:06:03 +11:00
|
|
|
|
c = 0
|
|
|
|
|
}
|
|
|
|
|
if w2, ok := diacritics[r]; ok {
|
|
|
|
|
c = 0
|
|
|
|
|
w = w2
|
|
|
|
|
}
|
|
|
|
|
return w, c
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// diacritics maps characters that should be treated as diacritics to the combining mark
// (U+0300 to U+0359) that replaces them. It is consulted by countDiacritic.
// This map was copied from PdfBox.
// (https://svn.apache.org/repos/asf/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java)
var diacritics = map[rune]string{
	'\u0060': "\u0300",
	'\u02CB': "\u0300",
	'\u0027': "\u0301",
	'\u02B9': "\u0301",
	'\u02CA': "\u0301",
	'\u005E': "\u0302",
	'\u02C6': "\u0302",
	'\u007E': "\u0303",
	'\u02C9': "\u0304",
	'\u00B0': "\u030A",
	'\u02BA': "\u030B",
	'\u02C7': "\u030C",
	'\u02C8': "\u030D",
	'\u0022': "\u030E",
	'\u02BB': "\u0312",
	'\u02BC': "\u0313",
	'\u0486': "\u0313",
	'\u055A': "\u0313",
	'\u02BD': "\u0314",
	'\u0485': "\u0314",
	'\u0559': "\u0314",
	'\u02D4': "\u031D",
	'\u02D5': "\u031E",
	'\u02D6': "\u031F",
	'\u02D7': "\u0320",
	'\u02B2': "\u0321",
	'\u02CC': "\u0329",
	'\u02B7': "\u032B",
	'\u02CD': "\u0331",
	'\u005F': "\u0332",
	'\u204E': "\u0359",
}
|
|
|
|
|
|
2018-09-20 11:49:44 +10:00
|
|
|
|
// getCurrentFont returns the font on top of the font stack, or DefaultFont if the font stack is
|
|
|
|
|
// empty.
|
|
|
|
|
func (to *textObject) getCurrentFont() *model.PdfFont {
|
|
|
|
|
if to.fontStack.empty() {
|
|
|
|
|
common.Log.Debug("ERROR: No font defined. Using default.")
|
|
|
|
|
return model.DefaultFont()
|
|
|
|
|
}
|
|
|
|
|
return to.fontStack.peek()
|
|
|
|
|
}
|
|
|
|
|
|
2018-06-28 11:11:43 +10:00
|
|
|
|
// getFont returns the font named `name` if it exists in the page's resources or an error if it
|
2018-09-17 12:12:06 +10:00
|
|
|
|
// doesn't. It caches the returned fonts.
|
2018-07-25 12:00:49 +10:00
|
|
|
|
func (to *textObject) getFont(name string) (*model.PdfFont, error) {
|
2018-09-22 09:28:18 +10:00
|
|
|
|
if to.e.fontCache != nil {
|
|
|
|
|
to.e.accessCount++
|
|
|
|
|
entry, ok := to.e.fontCache[name]
|
|
|
|
|
if ok {
|
|
|
|
|
entry.access = to.e.accessCount
|
|
|
|
|
return entry.font, nil
|
|
|
|
|
}
|
2018-09-17 12:12:06 +10:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Font not in cache. Load it.
|
|
|
|
|
font, err := to.getFontDirect(name)
|
|
|
|
|
if err != nil {
|
|
|
|
|
return nil, err
|
|
|
|
|
}
|
|
|
|
|
|
2018-09-22 09:28:18 +10:00
|
|
|
|
if to.e.fontCache != nil {
|
|
|
|
|
entry := fontEntry{font, to.e.accessCount}
|
|
|
|
|
|
|
|
|
|
// Eject a victim if the cache is full.
|
|
|
|
|
if len(to.e.fontCache) >= maxFontCache {
|
2018-12-09 19:28:50 +02:00
|
|
|
|
var names []string
|
2018-09-22 09:28:18 +10:00
|
|
|
|
for name := range to.e.fontCache {
|
|
|
|
|
names = append(names, name)
|
|
|
|
|
}
|
|
|
|
|
sort.Slice(names, func(i, j int) bool {
|
|
|
|
|
return to.e.fontCache[names[i]].access < to.e.fontCache[names[j]].access
|
|
|
|
|
})
|
|
|
|
|
delete(to.e.fontCache, names[0])
|
2018-09-17 12:12:06 +10:00
|
|
|
|
}
|
2018-09-22 09:28:18 +10:00
|
|
|
|
to.e.fontCache[name] = entry
|
2018-09-17 12:12:06 +10:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return font, nil
|
|
|
|
|
}
|
|
|
|
|
|
2018-09-21 16:43:10 +10:00
|
|
|
|
// fontEntry is a entry in the font cache.
|
2018-09-17 12:12:06 +10:00
|
|
|
|
type fontEntry struct {
|
|
|
|
|
font *model.PdfFont // The font being cached.
|
|
|
|
|
access int64 // Last access. Used to determine LRU cache victims.
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// maxFontCache is the largest number of fonts fontCache may hold before an entry is ejected.
const maxFontCache = 10
|
|
|
|
|
|
|
|
|
|
// getFontDirect returns the font named `name` if it exists in the page's resources or an error if
|
2018-11-28 23:25:17 +00:00
|
|
|
|
// it doesn't. Accesses page resources directly (not cached).
|
2018-09-17 12:12:06 +10:00
|
|
|
|
func (to *textObject) getFontDirect(name string) (*model.PdfFont, error) {
|
2018-06-27 16:31:28 +10:00
|
|
|
|
fontObj, err := to.getFontDict(name)
|
|
|
|
|
if err != nil {
|
|
|
|
|
return nil, err
|
|
|
|
|
}
|
|
|
|
|
font, err := model.NewPdfFontFromPdfObject(fontObj)
|
|
|
|
|
if err != nil {
|
2018-09-17 12:12:06 +10:00
|
|
|
|
common.Log.Debug("getFontDirect: NewPdfFontFromPdfObject failed. name=%#q err=%v", name, err)
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
|
|
|
|
return font, err
|
|
|
|
|
}
|
|
|
|
|
|
2018-12-27 21:33:31 +11:00
|
|
|
|
// getFontDict returns the font dict with key `name` if it exists in the page's or form's Font
|
|
|
|
|
// resources or an error if it doesn't.
|
2018-07-25 12:00:49 +10:00
|
|
|
|
func (to *textObject) getFontDict(name string) (fontObj core.PdfObject, err error) {
|
2018-12-27 20:51:34 +11:00
|
|
|
|
resources := to.resources
|
2018-06-27 16:31:28 +10:00
|
|
|
|
if resources == nil {
|
|
|
|
|
common.Log.Debug("getFontDict. No resources. name=%#q", name)
|
2018-07-25 12:00:49 +10:00
|
|
|
|
return nil, nil
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
2018-07-15 16:28:56 +10:00
|
|
|
|
fontObj, found := resources.GetFontByName(core.PdfObjectName(name))
|
2018-06-27 16:31:28 +10:00
|
|
|
|
if !found {
|
2018-07-25 12:00:49 +10:00
|
|
|
|
common.Log.Debug("ERROR: getFontDict: Font not found: name=%#q", name)
|
2018-11-21 13:14:11 +11:00
|
|
|
|
return nil, errors.New("font not in resources")
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|
2018-07-25 12:00:49 +10:00
|
|
|
|
return fontObj, nil
|
2018-06-27 16:31:28 +10:00
|
|
|
|
}
|