unipdf/extractor/text.go

/*
 * This file is subject to the terms and conditions defined in
 * file 'LICENSE.md', which is part of this source code package.
 */

package extractor

import (
	"bytes"
	"errors"
	"fmt"
	"math"
	"sort"
	"strings"
	"unicode"

	"github.com/unidoc/unipdf/v3/common"
	"github.com/unidoc/unipdf/v3/contentstream"
	"github.com/unidoc/unipdf/v3/core"
	"github.com/unidoc/unipdf/v3/internal/textencoding"
	"github.com/unidoc/unipdf/v3/internal/transform"
	"github.com/unidoc/unipdf/v3/model"
)

// verbose gates the extra Info-level logging of operators and text state in this file.
// It is a compile-time constant meant to be flipped by hand while debugging.
const verbose = false

// maxFormStack is the maximum form stack recursion depth. It has to be low enough to avoid a stack
// overflow and high enough to accommodate customers' PDFs.
const maxFormStack = 10

// ExtractText processes and extracts all text data in content streams and returns as a string.
// It takes into account character encodings in the PDF file, which are decoded by
// CharcodeBytesToUnicode.
// Characters that can't be decoded are replaced with MissingCodeRune ('\ufffd', the Unicode
// replacement character).
// It is a convenience wrapper around ExtractTextWithStats that drops the character statistics.
func (e *Extractor) ExtractText() (string, error) {
	text, _, _, err := e.ExtractTextWithStats()
	return text, err
}

// ExtractTextWithStats behaves like ExtractText and additionally reports how many characters
// were produced (`numChars`) and how many of them could not be decoded (`numMisses`).
// On error the returned text is empty, while the counters are passed through unchanged from
// ExtractPageText.
func (e *Extractor) ExtractTextWithStats() (extracted string, numChars int, numMisses int, err error) {
	pt, numChars, numMisses, err := e.ExtractPageText()
	if err == nil {
		extracted = pt.Text()
	}
	return extracted, numChars, numMisses, err
}

// ExtractPageText extracts the text of the page that `e` was created for and returns it as a
// PageText, together with the number of characters extracted and the number of characters that
// could not be decoded.
// The views of the returned PageText (text and marks) are computed before it is returned.
func (e *Extractor) ExtractPageText() (*PageText, int, int, error) {
	pageText, chars, misses, err := e.extractPageText(e.contents, e.resources, 0)
	if err == nil {
		pageText.computeViews()
		return pageText, chars, misses, nil
	}
	return nil, chars, misses, err
}

// extractPageText returns the text contents of content stream `contents` and resources
// `resources` as a PageText.
// This can be called on a page or a form XObject.
//
// `level` is the form XObject nesting depth (0 for the page itself). When it exceeds maxFormStack
// a "form stack overflow" error is returned. Form XObjects are processed recursively by the "Do"
// handler and their results are cached in `e.formResults`, keyed by XObject name.
//
// The returned ints are the number of characters extracted and the number of characters that
// could not be decoded. The PageText is returned even when an error occurs; it then holds the
// marks collected up to the point of failure.
func (e *Extractor) extractPageText(contents string, resources *model.PdfPageResources, level int) (
	*PageText, int, int, error) {
	common.Log.Trace("extractPageText: level=%d", level)
	pageText := &PageText{pageSize: e.mediaBox}
	state := newTextState(e.mediaBox)
	var savedStates stateStack
	to := newTextObject(e, resources, contentstream.GraphicsState{}, &state, &savedStates)
	var inTextObj bool

	if level > maxFormStack {
		err := errors.New("form stack overflow")
		// %w is only meaningful in fmt.Errorf; log messages must use %v.
		common.Log.Debug("ERROR: extractPageText. recursion level=%d err=%v", level, err)
		return pageText, state.numChars, state.numMisses, err
	}

	// Uncomment the following 3 statements to log the content stream.
	// common.Log.Info("contents* %d -----------------------------", len(contents))
	// fmt.Println(contents)
	// common.Log.Info("contents+ -----------------------------")

	cstreamParser := contentstream.NewContentStreamParser(contents)
	operations, err := cstreamParser.Parse()
	if err != nil {
		common.Log.Debug("ERROR: extractPageText parse failed. err=%v", err)
		return pageText, state.numChars, state.numMisses, err
	}

	processor := contentstream.NewContentStreamProcessor(*operations)

	processor.AddHandler(contentstream.HandlerConditionEnumAllOperands, "",
		func(op *contentstream.ContentStreamOperation, gs contentstream.GraphicsState,
			resources *model.PdfPageResources) error {

			operand := op.Operand

			if verbose {
				common.Log.Info("&&& op=%s", op)
			}

			switch operand {
			case "q":
				savedStates.push(&state)
				// common.Log.Info("Save state: stack=%d\n %s", len(savedStates), state.String())
			case "Q":
				if verbose {
					common.Log.Info("Restore state: %s", savedStates.String())
				}
				if !savedStates.empty() {
					// oldState := state
					state = *savedStates.top()
					// common.Log.Info("Restore state: stack=%d\n %s\n→%s",
					// 	len(savedStates), oldState.String(), state.String())
					// The bottom entry of the stack is never popped.
					if len(savedStates) >= 2 {
						savedStates.pop()
					}
				}
			case "BT": // Begin text
				// Begin a text object, initializing the text matrix, Tm, and
				// the text line matrix, Tlm, to the identity matrix. Text
				// objects shall not be nested. A second BT shall not appear
				// before an ET. However, if that happens, all existing marks
				// are added to the  page marks, in order to avoid losing content.
				if inTextObj {
					common.Log.Debug("BT called while in a text object")
					pageText.marks = append(pageText.marks, to.marks...)
				}
				inTextObj = true
				to = newTextObject(e, resources, gs, &state, &savedStates)
			case "ET": // End Text
				// End text object, discarding text matrix. If the current
				// text object contains text marks, they are added to the
				// page text marks collection.
				// The ET operator should always have a matching BT operator.
				// However, if ET appears outside of a text object, the behavior
				// does not change: the text matrices are discarded and all
				// existing marks in the text object are added to the page marks.
				if !inTextObj {
					common.Log.Debug("ET called outside of a text object")
				}
				inTextObj = false
				pageText.marks = append(pageText.marks, to.marks...)
				to.reset()
			case "T*": // Move to start of next text line
				to.nextLine()
			case "Td": // Move text location
				if ok, err := to.checkOp(op, 2, true); !ok {
					common.Log.Debug("ERROR: err=%v", err)
					return err
				}
				x, y, err := toFloatXY(op.Params)
				if err != nil {
					return err
				}
				to.moveText(x, y)
			case "TD": // Move text location and set leading.
				if ok, err := to.checkOp(op, 2, true); !ok {
					common.Log.Debug("ERROR: err=%v", err)
					return err
				}
				x, y, err := toFloatXY(op.Params)
				if err != nil {
					common.Log.Debug("ERROR: err=%v", err)
					return err
				}
				to.moveTextSetLeading(x, y)
			case "Tj": // Show text.
				if ok, err := to.checkOp(op, 1, true); !ok {
					common.Log.Debug("ERROR: Tj op=%s err=%v", op, err)
					return err
				}
				charcodes, ok := core.GetStringBytes(op.Params[0])
				if !ok {
					common.Log.Debug("ERROR: Tj op=%s GetStringBytes failed", op)
					return core.ErrTypeError
				}
				return to.showText(charcodes)
			case "TJ": // Show text with adjustable spacing.
				if ok, err := to.checkOp(op, 1, true); !ok {
					common.Log.Debug("ERROR: TJ err=%v", err)
					return err
				}
				args, ok := core.GetArray(op.Params[0])
				if !ok {
					common.Log.Debug("ERROR: TJ op=%s GetArrayVal failed", op)
					// The only `err` in scope here is the (nil) parse error of the enclosing
					// function, so a type error has to be returned explicitly.
					return core.ErrTypeError
				}
				return to.showTextAdjusted(args)
			case "'": // Move to next line and show text.
				if ok, err := to.checkOp(op, 1, true); !ok {
					common.Log.Debug("ERROR: ' err=%v", err)
					return err
				}
				charcodes, ok := core.GetStringBytes(op.Params[0])
				if !ok {
					common.Log.Debug("ERROR: ' op=%s GetStringBytes failed", op)
					return core.ErrTypeError
				}
				to.nextLine()
				return to.showText(charcodes)
			case `"`: // Set word and character spacing, move to next line, and show text.
				if ok, err := to.checkOp(op, 3, true); !ok {
					common.Log.Debug("ERROR: \" err=%v", err)
					return err
				}
				x, y, err := toFloatXY(op.Params[:2])
				if err != nil {
					return err
				}
				charcodes, ok := core.GetStringBytes(op.Params[2])
				if !ok {
					common.Log.Debug("ERROR: \" op=%s GetStringBytes failed", op)
					return core.ErrTypeError
				}
				to.setCharSpacing(x)
				to.setWordSpacing(y)
				to.nextLine()
				return to.showText(charcodes)
			case "TL": // Set text leading.
				y, err := floatParam(op)
				if err != nil {
					common.Log.Debug("ERROR: TL err=%v", err)
					return err
				}
				to.setTextLeading(y)
			case "Tc": // Set character spacing.
				y, err := floatParam(op)
				if err != nil {
					common.Log.Debug("ERROR: Tc err=%v", err)
					return err
				}
				to.setCharSpacing(y)
			case "Tf": // Set font.
				if ok, err := to.checkOp(op, 2, true); !ok {
					common.Log.Debug("ERROR: Tf err=%v", err)
					return err
				}
				name, ok := core.GetNameVal(op.Params[0])
				if !ok {
					common.Log.Debug("ERROR: Tf op=%s GetNameVal failed", op)
					return core.ErrTypeError
				}
				size, err := core.GetNumberAsFloat(op.Params[1])
				// Check the error of this call, not the stale `ok` of the name lookup above.
				if err != nil {
					common.Log.Debug("ERROR: Tf op=%s GetFloatVal failed. err=%v", op, err)
					return err
				}
				err = to.setFont(name, size)
				// Unsupported fonts are not fatal: text shown with them is skipped by renderText.
				to.invalidFont = unsupportedFontErr(err)
				if err != nil && !to.invalidFont {
					return err
				}
			case "Tm": // Set text matrix.
				if ok, err := to.checkOp(op, 6, true); !ok {
					common.Log.Debug("ERROR: Tm err=%v", err)
					return err
				}
				floats, err := core.GetNumbersAsFloat(op.Params)
				if err != nil {
					common.Log.Debug("ERROR: err=%v", err)
					return err
				}
				to.setTextMatrix(floats)
			case "Tr": // Set text rendering mode.
				if ok, err := to.checkOp(op, 1, true); !ok {
					common.Log.Debug("ERROR: Tr err=%v", err)
					return err
				}
				mode, ok := core.GetIntVal(op.Params[0])
				if !ok {
					common.Log.Debug("ERROR: Tr op=%s GetIntVal failed", op)
					return core.ErrTypeError
				}
				to.setTextRenderMode(mode)
			case "Ts": // Set text rise.
				if ok, err := to.checkOp(op, 1, true); !ok {
					common.Log.Debug("ERROR: Ts err=%v", err)
					return err
				}
				y, err := core.GetNumberAsFloat(op.Params[0])
				if err != nil {
					common.Log.Debug("ERROR: err=%v", err)
					return err
				}
				to.setTextRise(y)
			case "Tw": // Set word spacing.
				if ok, err := to.checkOp(op, 1, true); !ok {
					common.Log.Debug("ERROR: err=%v", err)
					return err
				}
				y, err := core.GetNumberAsFloat(op.Params[0])
				if err != nil {
					common.Log.Debug("ERROR: err=%v", err)
					return err
				}
				to.setWordSpacing(y)
			case "Tz": // Set horizontal scaling.
				if ok, err := to.checkOp(op, 1, true); !ok {
					common.Log.Debug("ERROR: err=%v", err)
					return err
				}
				y, err := core.GetNumberAsFloat(op.Params[0])
				if err != nil {
					common.Log.Debug("ERROR: err=%v", err)
					return err
				}
				to.setHorizScaling(y)
			case "Do":
				// Handle XObjects by recursing through form XObjects.
				if len(op.Params) == 0 {
					common.Log.Debug("ERROR: expected XObject name operand for Do operator. Got %+v.", op.Params)
					return core.ErrRangeError
				}

				// Get XObject name.
				name, ok := core.GetName(op.Params[0])
				if !ok {
					common.Log.Debug("ERROR: invalid Do operator XObject name operand: %+v.", op.Params[0])
					return core.ErrTypeError
				}

				// Only form XObjects can contain text; everything else is ignored.
				_, xtype := resources.GetXObjectByName(*name)
				if xtype != model.XObjectTypeForm {
					break
				}
				// Only process each form once.
				formResult, ok := e.formResults[name.String()]
				if !ok {
					xform, err := resources.GetXObjectFormByName(*name)
					if err != nil {
						common.Log.Debug("ERROR: %v", err)
						return err
					}
					formContent, err := xform.GetContentStream()
					if err != nil {
						common.Log.Debug("ERROR: %v", err)
						return err
					}
					// A form without its own resources uses the resources of its parent.
					formResources := xform.Resources
					if formResources == nil {
						formResources = resources
					}
					tList, numChars, numMisses, err := e.extractPageText(string(formContent),
						formResources, level+1)
					if err != nil {
						common.Log.Debug("ERROR: %v", err)
						return err
					}
					formResult = textResult{*tList, numChars, numMisses}
					e.formResults[name.String()] = formResult
				}

				pageText.marks = append(pageText.marks, formResult.pageText.marks...)
				state.numChars += formResult.numChars
				state.numMisses += formResult.numMisses
			}
			return nil
		})

	err = processor.Process(resources)
	if err != nil {
		common.Log.Debug("ERROR: Processing: err=%v", err)
	}
	return pageText, state.numChars, state.numMisses, err
}

// unsupportedFontErr reports whether `err` says that the selected font or encoding is not
// supported. It recognises the model.Err*NotSupported sentinel errors and a few known error
// message fragments. A nil `err` yields false.
func unsupportedFontErr(err error) bool {
	switch err {
	case nil:
		return false
	case model.ErrFontNotSupported,
		model.ErrType1CFontNotSupported,
		model.ErrType3FontNotSupported,
		model.ErrTTCmapNotSupported:
		return true
	}

	msg := err.Error()
	for _, fragment := range []string{
		"unsupported font encoding:",
		"unexpected subtable format:",
		"fonts based on PostScript outlines are not supported",
	} {
		if strings.Contains(msg, fragment) {
			return true
		}
	}
	return false
}

// textResult holds the result of processing a PDF form XObject: the extracted page text and the
// character statistics that go with it.
type textResult struct {
	pageText  PageText
	numChars  int
	numMisses int
}

//
// Text operators
//

// moveText "Td" Moves start of text by `tx`,`ty`.
// Move to the start of the next line, offset from the start of the current line by (tx, ty).
// tx and ty are in unscaled text space units.
// The work is delegated to moveTo.
func (to *textObject) moveText(tx, ty float64) {
	to.moveTo(tx, ty)
}

// moveTextSetLeading "TD" Move text location and set leading.
// Move to the start of the next line, offset from the start of the current line by (tx, ty). As a
// side effect, this operator shall set the leading parameter in the text state. This operator shall
// have the same effect as this code:
//  −ty TL
//  tx ty Td
func (to *textObject) moveTextSetLeading(tx, ty float64) {
	// The leading is stored as a positive number, hence the negation of `ty`.
	to.state.tl = -ty
	to.moveTo(tx, ty)
}

// nextLine "T*" Moves start of text line to next text line.
// Move to the start of the next line. This operator has the same effect as the code
//    0 -Tl Td
// where Tl denotes the current leading parameter in the text state. The negative of Tl is used
// here because Tl is the text leading expressed as a positive number. Going to the next line
// entails decreasing the y coordinate. (page 250)
func (to *textObject) nextLine() {
	to.moveTo(0, -to.state.tl)
}

// setTextMatrix "Tm".
// Sets both the text matrix, Tm, and the text line matrix, Tlm, to the matrix given by the 6
// numbers in `f` (page 250). If `f` does not hold exactly 6 numbers the call is logged and
// otherwise ignored.
func (to *textObject) setTextMatrix(f []float64) {
	if n := len(f); n != 6 {
		common.Log.Debug("ERROR: len(f) != 6 (%d)", n)
		return
	}
	to.tm = transform.NewMatrix(f[0], f[1], f[2], f[3], f[4], f[5])
	to.tlm = to.tm
	to.logCursor()
}

// showText "Tj". Show a text string.
// `charcodes` are the raw bytes of the string operand; they are passed to renderText, whose error
// is returned unchanged.
func (to *textObject) showText(charcodes []byte) error {
	return to.renderText(charcodes)
}

// showTextAdjusted "TJ". Show text with adjustable spacing.
// `args` is the operand array of the TJ operator. String elements are rendered with renderText;
// numeric elements move the text position by -x/1000 * font size.
// It returns core.ErrTypeError if an element is neither a number nor a string, and it returns any
// error reported by renderText, so that TJ fails in the same way as Tj, ' and ".
func (to *textObject) showTextAdjusted(args *core.PdfObjectArray) error {
	// `vertical` is never set to true in this function, so adjustments are always applied along
	// the x axis.
	vertical := false
	for _, o := range args.Elements() {
		switch o.(type) {
		case *core.PdfObjectFloat, *core.PdfObjectInteger:
			x, err := core.GetNumberAsFloat(o)
			if err != nil {
				common.Log.Debug("ERROR: showTextAdjusted. Bad numerical arg. o=%s args=%+v", o, args)
				return err
			}
			// The adjustment is expressed in thousandths of a unit of text space and is
			// subtracted from the current position.
			dx, dy := -x*0.001*to.state.tfs, 0.0
			if vertical {
				dy, dx = dx, dy
			}
			td := translationMatrix(transform.Point{X: dx, Y: dy})
			to.tm.Concat(td)
			to.logCursor()
		case *core.PdfObjectString:
			charcodes, ok := core.GetStringBytes(o)
			if !ok {
				common.Log.Trace("showTextAdjusted: Bad string arg. o=%s args=%+v", o, args)
				return core.ErrTypeError
			}
			// Do not drop rendering errors.
			if err := to.renderText(charcodes); err != nil {
				common.Log.Debug("ERROR: showTextAdjusted. renderText failed. err=%v", err)
				return err
			}
		default:
			common.Log.Debug("ERROR: showTextAdjusted. Unexpected type (%T) args=%+v", o, args)
			return core.ErrTypeError
		}
	}
	return nil
}

// setTextLeading "TL". Stores `y` as the text leading of the current text state.
// It does nothing when the text object or its state is nil.
func (to *textObject) setTextLeading(y float64) {
	if to != nil && to.state != nil {
		to.state.tl = y
	}
}

// setCharSpacing "Tc". Set character spacing.
// Stores `x` as the character spacing of the current text state. It does nothing when the text
// object is nil.
func (to *textObject) setCharSpacing(x float64) {
	if to == nil {
		return
	}
	to.state.tc = x
	if verbose {
		// Both format verbs need an argument: the new spacing and the resulting state.
		common.Log.Info("setCharSpacing: %.2f state=%s", x, to.state.String())
	}
}

// setFont "Tf". Selects the font called `name` at size `size`.
// The font size is recorded before the font is looked up, so it is updated even when the lookup
// fails. On success the font is also written to the top of the saved state stack, or the current
// state is pushed if that stack is empty.
// It does nothing and returns nil when the text object is nil.
func (to *textObject) setFont(name string, size float64) error {
	if to == nil {
		return nil
	}
	to.state.tfs = size

	font, err := to.getFont(name)
	if err != nil {
		// TODO(peterwilliams97): Do we need to handle model.ErrFontNotSupported in a special way?
		return err
	}
	to.state.tfont = font

	if !to.savedStates.empty() {
		to.savedStates.top().tfont = to.state.tfont
		return nil
	}
	to.savedStates.push(to.state)
	return nil
}

// setTextRenderMode "Tr". Stores `mode`, converted to a RenderMode, as the text rendering mode of
// the current text state. It does nothing when the text object is nil.
func (to *textObject) setTextRenderMode(mode int) {
	if to != nil {
		to.state.tmode = RenderMode(mode)
	}
}

// setTextRise "Ts". Stores `y` as the text rise of the current text state.
// It does nothing when the text object is nil.
func (to *textObject) setTextRise(y float64) {
	if to != nil {
		to.state.trise = y
	}
}

// setWordSpacing "Tw". Stores `y` as the word spacing of the current text state.
// It does nothing when the text object is nil.
func (to *textObject) setWordSpacing(y float64) {
	if to != nil {
		to.state.tw = y
	}
}

// setHorizScaling "Tz". Stores `y` as the horizontal scaling of the current text state.
// It does nothing when the text object is nil.
func (to *textObject) setHorizScaling(y float64) {
	if to != nil {
		to.state.th = y
	}
}

// floatParam returns the single numeric parameter of operator `op` as a float.
// An "incorrect parameter count" error is returned when `op` does not have exactly one
// parameter; otherwise the result of converting that parameter is returned.
func floatParam(op *contentstream.ContentStreamOperation) (float64, error) {
	if len(op.Params) == 1 {
		return core.GetNumberAsFloat(op.Params[0])
	}
	common.Log.Debug("ERROR: %#q should have %d input params, got %d %+v",
		op.Operand, 1, len(op.Params), op.Params)
	return 0.0, errors.New("incorrect parameter count")
}

// checkOp reports whether `op` has `numParams` params. A negative `numParams` disables the count
// check. When the count does not match, false is returned, together with an error if `hard` is
// true.
// A nil text object (operator outside of a text object) is only logged; it does not change the
// result.
func (to *textObject) checkOp(op *contentstream.ContentStreamOperation, numParams int, hard bool) (
	ok bool, err error) {
	if to == nil {
		// Log at most the first `numParams` params.
		var shown []core.PdfObject
		if numParams > 0 {
			shown = op.Params
			if len(shown) > numParams {
				shown = shown[:numParams]
			}
		}
		common.Log.Debug("%#q operand outside text. params=%+v", op.Operand, shown)
	}

	if numParams < 0 || len(op.Params) == numParams {
		return true, nil
	}

	common.Log.Debug("ERROR: %#q should have %d input params, got %d %+v",
		op.Operand, numParams, len(op.Params), op.Params)
	if hard {
		return false, errors.New("incorrect parameter count")
	}
	return false, nil
}

// stateStack is the PDF textState stack implementation.
// It is a slice of textState pointers whose last element is the top of the stack.
type stateStack []*textState

// String returns a multi-line description of the textState stack: a header with the stack depth
// followed by one line per entry. Nil entries are shown as "<nil>".
func (savedStates *stateStack) String() string {
	var b strings.Builder
	fmt.Fprintf(&b, "---- font stack: %d", len(*savedStates))
	for i, state := range *savedStates {
		desc := "<nil>"
		if state != nil {
			desc = state.String()
		}
		fmt.Fprintf(&b, "\n\t%2d: %s", i, desc)
	}
	return b.String()
}

// push appends a copy of `state` to the textState stack, so that later changes to `state` do not
// affect the saved entry.
func (savedStates *stateStack) push(state *textState) {
	saved := new(textState)
	*saved = *state
	*savedStates = append(*savedStates, saved)
}

// pop removes the last state from the textState stack and returns a copy of it, or returns nil
// if the stack is empty.
func (savedStates *stateStack) pop() *textState {
	stack := *savedStates
	n := len(stack)
	if n == 0 {
		return nil
	}
	popped := *stack[n-1]
	*savedStates = stack[:n-1]
	return &popped
}

// top returns the last saved state, or nil if the stack is empty.
// NOTE: The returned pointer refers to the entry in the stack, so modifying it modifies the stack.
func (savedStates *stateStack) top() *textState {
	if n := len(*savedStates); n > 0 {
		return (*savedStates)[n-1]
	}
	return nil
}

// empty reports whether the textState stack holds no entries.
func (savedStates *stateStack) empty() bool {
	return savedStates.size() == 0
}

// size returns the number of elements in the textState stack, i.e. the length of the underlying
// slice.
func (savedStates *stateStack) size() int {
	return len(*savedStates)
}

// 9.3 Text State Parameters and Operators (page 243)
// Some of these parameters are expressed in unscaled text space units. This means that they shall
// be specified in a coordinate system that shall be defined by the text matrix, Tm but shall not be
// scaled by the font size parameter, Tfs.

// textState represents the text state.
// It is copied by value when it is saved on and restored from the stateStack.
type textState struct {
	tc       float64        // Character spacing. Unscaled text space units.
	tw       float64        // Word spacing. Unscaled text space units.
	th       float64        // Horizontal scaling.
	tl       float64        // Leading. Unscaled text space units. Used by TD,T*,'," see Table 108.
	tfs      float64        // Text font size.
	tmode    RenderMode     // Text rendering mode.
	trise    float64        // Text rise. Unscaled text space units. Set by Ts.
	tfont    *model.PdfFont // Text font.
	mediaBox model.PdfRectangle
	// For debugging. Running totals of characters processed and of characters that could not be
	// decoded; both are updated by renderText.
	numChars  int
	numMisses int
}

// String returns a one-line summary of `state`: character spacing, word spacing, font size and
// the base font name, or "[NOT SET]" when no font has been selected.
func (state *textState) String() string {
	var fontName string
	if state.tfont == nil {
		fontName = "[NOT SET]"
	} else {
		fontName = state.tfont.BaseFont()
	}
	return fmt.Sprintf("tc=%.2f tw=%.2f tfs=%.2f font=%q",
		state.tc, state.tw, state.tfs, fontName)
}

// 9.4.1 General (page 248)
// A PDF text object consists of operators that may show text strings, move the text position, and
// set text state and certain other parameters. In addition, two parameters may be specified only
// within a text object and shall not persist from one text object to the next:
//   • Tm, the text matrix
//   • Tlm, the text line matrix
//
// Text space is converted to device space by this transform (page 252)
// Trm is the text rendering matrix
//        | Tfs x Th   0      0 |
// Trm  = | 0         Tfs     0 | × Tm × CTM
//        | 0         Trise   1 |
// This corresponds to the following code in renderText()
//  trm := to.gs.CTM.Mult(to.tm).Mult(stateMatrix)

// textObject represents a PDF text object.
// It holds the per-text-object matrices and the marks collected while the object is open, plus
// pointers to the text state and saved state stack it shares with the enclosing content stream
// processing.
type textObject struct {
	e           *Extractor
	resources   *model.PdfPageResources
	gs          contentstream.GraphicsState
	state       *textState
	savedStates *stateStack
	tm          transform.Matrix // Text matrix. For the character pointer.
	tlm         transform.Matrix // Text line matrix. For the start of line pointer.
	marks       []*textMark      // Text marks get written here.
	invalidFont bool             // Flag that gets set true when we can't handle the current font.
}

// newTextState returns a textState with default values: 100% horizontal scaling, fill rendering
// mode and the given `mediaBox`. All other fields are zero.
func newTextState(mediaBox model.PdfRectangle) textState {
	var ts textState
	ts.th = 100
	ts.tmode = RenderModeFill
	ts.mediaBox = mediaBox
	return ts
}

// newTextObject returns a textObject that refers to the given extractor, resources, graphics
// state, text state and saved state stack. Its text matrix and text line matrix start as the
// identity matrix and it holds no marks.
func newTextObject(e *Extractor, resources *model.PdfPageResources, gs contentstream.GraphicsState,
	state *textState, savedStates *stateStack) *textObject {
	to := new(textObject)
	to.e = e
	to.resources = resources
	to.gs = gs
	to.state = state
	to.savedStates = savedStates
	to.tm = transform.IdentityMatrix()
	to.tlm = transform.IdentityMatrix()
	return to
}

// reset clears the marks collection of the text object and sets both its text matrix `Tm` and
// its text line matrix `Tlm` back to the identity matrix.
func (to *textObject) reset() {
	to.marks = nil
	to.tlm = transform.IdentityMatrix()
	to.tm = transform.IdentityMatrix()
	to.logCursor()
}

// logCursor logs the current text cursor position together with the text matrix and the CTM.
// It is for debugging only and produces output only when `verbose` is enabled.
func (to *textObject) logCursor() {
	// Gate on `verbose` instead of an unconditional return, which left the rest of the body
	// unreachable.
	if !verbose {
		return
	}
	state := to.state
	tfs := state.tfs
	th := state.th / 100.0
	stateMatrix := transform.NewMatrix(
		tfs*th, 0,
		0, tfs,
		0, state.trise)
	trm := to.gs.CTM.Mult(to.tm).Mult(stateMatrix)
	cur := translation(trm)
	common.Log.Info("showTrm: %s cur=%.2f tm=%.2f CTM=%.2f",
		fileLine(1, false), cur, to.tm, to.gs.CTM)
}

// renderText processes and renders byte array `data` for extraction purposes.
// It extracts textMarks based on the charcodes in `data` and the current text and graphics states
// that are tracked in `to`.
// The marks are appended to `to.marks`, the text matrix `to.tm` is advanced past each rendered
// character and the character counters in `to.state` are updated.
// Nothing is rendered and nil is returned when the current font was flagged as invalid.
// An error is returned when the font has no metrics for one of the character codes.
func (to *textObject) renderText(data []byte) error {
	if to.invalidFont {
		common.Log.Debug("renderText: Invalid font. Not processing.")
		return nil
	}
	font := to.getCurrentFont()
	charcodes := font.BytesToCharcodes(data)
	runeSlices, numChars, numMisses := font.CharcodesToRuneSlices(charcodes)
	if numMisses > 0 {
		common.Log.Debug("renderText: numChars=%d numMisses=%d", numChars, numMisses)
	}

	to.state.numChars += numChars
	to.state.numMisses += numMisses

	state := to.state
	tfs := state.tfs
	th := state.th / 100.0
	// Look up the width of a space: first by rune, then by character code 32 and finally in the
	// default font.
	spaceMetrics, ok := font.GetRuneMetrics(' ')
	if !ok {
		spaceMetrics, ok = font.GetCharMetrics(32)
	}
	if !ok {
		spaceMetrics, _ = model.DefaultFont().GetRuneMetrics(' ')
	}
	spaceWidth := spaceMetrics.Wx * glyphTextRatio
	common.Log.Trace("spaceWidth=%.2f text=%q font=%s fontSize=%.1f", spaceWidth, runeSlices, font, tfs)

	// stateMatrix holds the font size, horizontal scaling and text rise part of the text
	// rendering matrix.
	stateMatrix := transform.NewMatrix(
		tfs*th, 0,
		0, tfs,
		0, state.trise)
	if verbose {
		common.Log.Info("renderText: %d codes=%+v runes=%q", len(charcodes), charcodes, runeSlices)
	}

	for i, r := range runeSlices {
		// A single NUL rune is skipped entirely; the text matrix is not advanced for it.
		if len(r) == 1 && r[0] == '\x00' {
			continue
		}

		code := charcodes[i]
		// The location of the text on the page in device coordinates is given by trm, the text
		// rendering matrix.
		trm := to.gs.CTM.Mult(to.tm).Mult(stateMatrix)

		// calculate the text location displacement due to writing `r`. We will use this to update
		// to.tm

		// w is the unscaled movement at the end of a word.
		w := 0.0
		if len(r) == 1 && r[0] == 32 {
			w = state.tw
		}

		m, ok := font.GetCharMetrics(code)
		if !ok {
			common.Log.Debug("ERROR: No metric for code=%d r=0x%04x=%+q %s", code, r, r, font)
			return fmt.Errorf("no char metrics: font=%s code=%d", font.String(), code)
		}

		// c is the character size in unscaled text units.
		c := transform.Point{X: m.Wx * glyphTextRatio, Y: m.Wy * glyphTextRatio}

		// t0 is the end of this character.
		// t is the displacement of the text cursor when the character is rendered.
		// The only difference between them is the character spacing `state.tc`, which is part of
		// t but not of t0.
		t0 := transform.Point{X: (c.X*tfs + w) * th}
		t := transform.Point{X: (c.X*tfs + state.tc + w) * th}
		if verbose {
			common.Log.Info("tfs=%.2f tc=%.2f tw=%.2f th=%.2f", tfs, state.tc, state.tw, th)
			common.Log.Info("dx,dy=%.3f t0=%.2f t=%.2f", c, t0, t)
		}

		// td, td0 are t, t0 in matrix form.
		// td0 is where this character ends. td is where the next character starts.
		td0 := translationMatrix(t0)
		td := translationMatrix(t)
		end := to.gs.CTM.Mult(to.tm).Mult(td0)

		if verbose {
			common.Log.Info("end:\n\tCTM=%s\n\t tm=%s\n"+
				"\t td=%s xlat=%s\n"+
				"\ttd0=%s\n\t → %s xlat=%s",
				to.gs.CTM, to.tm,
				td, translation(to.gs.CTM.Mult(to.tm).Mult(td)),
				td0, end, translation(end))
		}

		mark, onPage := to.newTextMark(
			textencoding.ExpandLigatures(r),
			trm,
			translation(end),
			math.Abs(spaceWidth*trm.ScalingFactorX()),
			font,
			to.state.tc)
		// NOTE: marks outside the page are skipped before the text matrix update at the end of
		// the loop body, so `to.tm` is not advanced for them.
		if !onPage {
			common.Log.Debug("Text mark outside page. Skipping")
			continue
		}
		// NOTE(review): `font` has already been dereferenced above, so this nil check only
		// guards the lookup of the original (undecoded) rune below.
		if font == nil {
			common.Log.Debug("ERROR: No font.")
		} else if font.Encoder() == nil {
			common.Log.Debug("ERROR: No encoding. font=%s", font)
		} else {
			original, ok := font.Encoder().CharcodeToRune(code)
			if ok {
				mark.original = string(original)
			}
		}
		common.Log.Trace("i=%d code=%d mark=%s trm=%s", i, code, mark, trm)
		to.marks = append(to.marks, &mark)

		// update the text matrix by the displacement of the text location.
		to.tm.Concat(td)
		if i != len(runeSlices)-1 {
			to.logCursor()
		}
	}

	return nil
}

// glyphTextRatio converts Glyph metrics units to unscaled text space units.
// Glyph metrics are multiplied by it, i.e. they are expressed in thousandths of a text space unit.
const glyphTextRatio = 1.0 / 1000.0

// translation returns the translation part of `m` as a point whose X and Y are the values
// returned by m.Translation().
func translation(m transform.Matrix) transform.Point {
	tx, ty := m.Translation()
	return transform.Point{X: tx, Y: ty}
}

// translationMatrix returns a matrix that translates by `p`, i.e. by p.X horizontally and by p.Y
// vertically.
func translationMatrix(p transform.Point) transform.Matrix {
	return transform.TranslationMatrix(p.X, p.Y)
}

// moveTo offsets the text line matrix (the start of line pointer) by `tx`,`ty` and then sets the
// text matrix (the text pointer) to the updated text line matrix.
// This moves to the start of the next line, offset from the start of the current line by
// (tx, ty). `tx` and `ty` are in unscaled text space units.
func (to *textObject) moveTo(tx, ty float64) {
	offset := transform.NewMatrix(1, 0, 0, 1, tx, ty)
	to.tlm.Concat(offset)
	to.tm = to.tlm
}

// isTextSpace reports whether every code point in `text` is a space as defined by
// unicode.IsSpace. The empty string counts as all spaces.
func isTextSpace(text string) bool {
	allSpace := true
	for _, r := range text {
		if allSpace = unicode.IsSpace(r); !allSpace {
			break
		}
	}
	return allSpace
}

// PageText represents the layout of text on a device page.
// `viewText` and `viewMarks` are derived from `marks` by computeViews.
type PageText struct {
	marks     []*textMark // Texts and their positions on a PDF page.
	viewText  string      // Extracted page text.
	viewMarks []TextMark  // Public view of `marks`.
	pageSize  model.PdfRectangle
}

// String returns a multi-line description of `pt`: a header line, one line per text mark and a
// footer line. Header and footer carry the number of marks.
func (pt PageText) String() string {
	n := len(pt.marks)
	summary := fmt.Sprintf("PageText: %d elements", n)

	lines := make([]string, 0, n+2)
	lines = append(lines, "-"+summary)
	for i := range pt.marks {
		lines = append(lines, pt.marks[i].String())
	}
	lines = append(lines, "+"+summary)

	return strings.Join(lines, "\n")
}

// Text returns the extracted page text.
// This is the `viewText` field, which is populated by computeViews.
func (pt PageText) Text() string {
	return pt.viewText
}

// ToText returns the page text as a single string.
//
// Deprecated: This function is deprecated and will be removed in a future major version. Please use
// Text() instead.
func (pt PageText) ToText() string {
	return pt.Text()
}

// Marks returns the TextMark collection for a page. It represents all the text on the page.
// The returned array wraps the `viewMarks` slice of `pt`; the slice is not copied.
func (pt PageText) Marks() *TextMarkArray {
	return &TextMarkArray{marks: pt.viewMarks}
}

// computeViews builds the text page from the marks of `pt` and uses it to populate `pt.viewText`
// and `pt.viewMarks`, which represent the text and the marks in the order in which they are read
// on the page.
// The comments above the TextMark definition describe how to use the []TextMark to map
// substrings of the page text to locations on the PDF page.
func (pt *PageText) computeViews() {
	common.Log.Trace("ToTextLocation: %d elements", len(pt.marks))

	var buf bytes.Buffer
	paras := makeTextPage(pt.marks, pt.pageSize, 0)
	paras.writeText(&buf)

	pt.viewText = buf.String()
	pt.viewMarks = paras.toTextMarks()
}

// TextMarkArray is a collection of TextMarks.
// RangeOffset uses a binary search on the Offset of the marks.
type TextMarkArray struct {
	marks []TextMark
}

// Append appends `mark` to the mark array.
// The mark is added at the end; no sorting or offset adjustment is performed.
func (ma *TextMarkArray) Append(mark TextMark) {
	ma.marks = append(ma.marks, mark)
}

// String returns a short description of `ma`: the number of marks together with the first and
// the last mark, or "EMPTY" when `ma` holds no marks.
func (ma TextMarkArray) String() string {
	count := len(ma.marks)
	if count == 0 {
		return "EMPTY"
	}
	first, last := ma.marks[0], ma.marks[count-1]
	return fmt.Sprintf("{TEXTMARKARRAY: %d elements\n\tfirst=%s\n\t last=%s}", count, first, last)
}

// Elements returns the TextMarks in `ma`.
// The returned slice is the internal slice of `ma`, not a copy.
func (ma *TextMarkArray) Elements() []TextMark {
	return ma.marks
}

// Len returns the number of TextMarks in `ma`. A nil `ma` has length 0.
func (ma *TextMarkArray) Len() int {
	if ma != nil {
		return len(ma.marks)
	}
	return 0
}

// RangeOffset returns the TextMarks in `ma` that overlap text[start:end] in the extracted text.
// These are tm: `start` < tm.Offset + len(tm.Text) && tm.Offset < `end` where
// `start` and `end` are offsets in the extracted text.
// `start` and `end` are clamped to the offsets covered by `ma`, so a range that extends up to or
// past the last mark includes the last mark.
// An error is returned if `ma` is nil, if `end` < `start` or if no mark overlaps the range.
// An empty `ma` is returned unchanged.
// NOTE: TextMarks can contain multiple characters. e.g. "ffi" for the ﬃ ligature so the first and
// last elements of the returned TextMarkArray may only partially overlap text[start:end].
func (ma *TextMarkArray) RangeOffset(start, end int) (*TextMarkArray, error) {
	if ma == nil {
		return nil, errors.New("ma==nil")
	}
	if end < start {
		return nil, fmt.Errorf("end < start. RangeOffset not defined. start=%d end=%d ", start, end)
	}
	n := len(ma.marks)
	if n == 0 {
		return ma, nil
	}
	if start < ma.marks[0].Offset {
		start = ma.marks[0].Offset
	}
	if end > ma.marks[n-1].Offset+1 {
		end = ma.marks[n-1].Offset + 1
	}

	// iStart is the index of the first mark that ends after `start`.
	iStart := sort.Search(n, func(i int) bool { return ma.marks[i].Offset+len(ma.marks[i].Text)-1 >= start })
	if !(0 <= iStart && iStart < n) {
		err := fmt.Errorf("Out of range. start=%d iStart=%d len=%d\n\tfirst=%v\n\t last=%v",
			start, iStart, n, ma.marks[0], ma.marks[n-1])
		return nil, err
	}
	// iEnd is the index of the first mark that starts at or after `end`. sort.Search returns n
	// when there is no such mark, which means that the range reaches the last mark. n is a valid
	// exclusive upper bound for the slice below, so it must not be treated as out of range.
	iEnd := sort.Search(n, func(i int) bool { return ma.marks[i].Offset > end-1 })
	if iEnd <= iStart {
		// No mark overlaps the (possibly empty) range.
		return nil, fmt.Errorf("iEnd <= iStart: start=%d end=%d iStart=%d iEnd=%d",
			start, end, iStart, iEnd)
	}
	return &TextMarkArray{marks: ma.marks[iStart:iEnd]}, nil
}

// BBox returns the smallest axis-aligned rectangle that encloses all the TextMarks in `ma`.
// Marks with Meta set and marks for which isTextSpace(tm.Text) is true are skipped.
// The returned bool is false, and the rectangle is the zero value, when `ma` is nil or when no
// mark contributed to the bounding box.
func (ma *TextMarkArray) BBox() (model.PdfRectangle, bool) {
	var bbox model.PdfRectangle
	if ma == nil {
		// RangeOffset returns a nil array on error; don't panic if the caller chains into BBox.
		return bbox, false
	}
	found := false
	// Index rather than range-by-value to avoid copying every TextMark.
	for i := range ma.marks {
		tm := &ma.marks[i]
		if tm.Meta || isTextSpace(tm.Text) {
			continue
		}
		if !found {
			bbox = tm.BBox
			found = true
			continue
		}
		bbox = rectUnion(bbox, tm.BBox)
	}
	return bbox, found
}

// TextMark represents extracted text on a page with information regarding both textual content,
// formatting (font and size) and positioning.
// It is the smallest unit of text on a PDF page, typically a single character.
//
// getBBox() in test_text.go shows how to compute bounding boxes of substrings of extracted text.
// The following code extracts the text on PDF page `page` into `text` then finds the bounding box
// `bbox` of substring `term` in `text`.
//
//     ex, _ := New(page)
//     // handle errors
//     pageText, _, _, err := ex.ExtractPageText()
//     // handle errors
//     text := pageText.Text()
//     textMarks := pageText.Marks()
//
//     start := strings.Index(text, term)
//     end := start + len(term)
//     spanMarks, err := textMarks.RangeOffset(start, end)
//     // handle errors
//     bbox, ok := spanMarks.BBox()
//     // handle errors
type TextMark struct {
	// count is printed by String() as the leading "@NNNN" field.
	// NOTE(review): presumably a sequence number for the mark; confirm where it is assigned.
	count int64
	// Text is the extracted text. It has been decoded to Unicode via ToUnicode().
	Text string
	// Original is the text in the PDF. It has not been decoded like `Text`.
	Original string
	// BBox is the bounding box of the text.
	BBox model.PdfRectangle
	// Font is the font the text was drawn with. String() treats a nil Font as "no font".
	Font *model.PdfFont
	// FontSize is the font size the text was drawn with.
	FontSize float64
	// Offset is the offset of the start of TextMark.Text in the extracted text. If you do this
	//   text, textMarks := pageText.Text(), pageText.Marks()
	//   marks := textMarks.Elements()
	// then marks[i].Offset is the offset of marks[i].Text in text.
	Offset int
	// Meta is set true for spaces and line breaks that we insert in the extracted text. We insert
	// spaces (line breaks) when we see characters that are over a threshold horizontal (vertical)
	// distance apart. See wordJoiner (lineJoiner) in PageText.computeViews().
	Meta bool
}

// String returns a one-line description of `tm`: its count, offset, text (quoted and as hex rune
// values), bounding box corners, an abbreviated font description and a marker for Meta marks.
func (tm TextMark) String() string {
	fontDesc := ""
	if tm.Font != nil {
		// Keep the description short: at most 50 bytes of the font string, then an ellipsis.
		if fontDesc = tm.Font.String(); len(fontDesc) > 50 {
			fontDesc = fontDesc[:50] + "..."
		}
	}
	metaTag := ""
	if tm.Meta {
		metaTag = " *M*"
	}
	box := tm.BBox
	return fmt.Sprintf("{@%04d TextMark: %d %q=%02x (%6.2f, %6.2f) (%6.2f, %6.2f) %s%s}",
		tm.count, tm.Offset, tm.Text, []rune(tm.Text),
		box.Llx, box.Lly, box.Urx, box.Ury, fontDesc, metaTag)
}

// spaceMark is a special TextMark used for spaces. It has Meta set, the flag that TextMark
// documents for spaces and line breaks inserted into the extracted text.
// NOTE(review): Text is "[X]" rather than " "; presumably code that copies spaceMark overwrites
// Text before use — confirm.
var spaceMark = TextMark{
	Text:     "[X]",
	Original: " ",
	Meta:     true,
}

// getCurrentFont returns the font (`tfont`) of the state on top of `to.savedStates`.
// If the stack is empty or that font is nil, it logs the problem and returns model.DefaultFont().
func (to *textObject) getCurrentFont() *model.PdfFont {
	if !to.savedStates.empty() {
		if font := to.savedStates.top().tfont; font != nil {
			return font
		}
	}
	common.Log.Debug("ERROR: No font defined. Using default.")
	return model.DefaultFont()
}

// getFont returns the font named `name` if it exists in the page's resources or an error if it
// doesn't. It caches the returned fonts in `to.e.fontCache` when that cache is non-nil.
//
// The cache holds at most maxFontCache fonts. Every lookup increments `to.e.accessCount` and a
// hit stamps the entry with the new count, so that when the cache is full the entry with the
// smallest stamp (the least recently used font) is evicted to make room.
func (to *textObject) getFont(name string) (*model.PdfFont, error) {
	if to.e.fontCache != nil {
		to.e.accessCount++
		if entry, ok := to.e.fontCache[name]; ok {
			// The cache stores fontEntry values, so `entry` is a copy. Store it back, otherwise
			// the access stamp is lost and eviction degrades from LRU to insertion order.
			entry.access = to.e.accessCount
			to.e.fontCache[name] = entry
			return entry.font, nil
		}
	}

	// Font not in cache. Load it.
	font, err := to.getFontDirect(name)
	if err != nil {
		return nil, err
	}

	if to.e.fontCache != nil {
		// Eject the least recently used font if the cache is full.
		if len(to.e.fontCache) >= maxFontCache {
			var victim string
			var oldest int64
			first := true
			for key, entry := range to.e.fontCache {
				if first || entry.access < oldest {
					victim, oldest, first = key, entry.access, false
				}
			}
			delete(to.e.fontCache, victim)
		}
		to.e.fontCache[name] = fontEntry{font, to.e.accessCount}
	}

	return font, nil
}

// fontEntry is an entry in the font cache. It pairs a cached font with the value of the
// extractor's access counter at the time the entry was last stamped.
type fontEntry struct {
	font   *model.PdfFont // The font being cached.
	access int64          // Last access. Used to determine LRU cache victims.
}

// maxFontCache is the maximum number of PdfFonts in fontCache. getFont evicts an entry before
// inserting a new one once the cache holds this many fonts.
const maxFontCache = 10

// getFontDirect looks up the font object named `name` with getFontDict and builds a PdfFont from
// it with model.NewPdfFontFromPdfObject. Nothing is cached here.
// A lookup error is returned as is with a nil font. A construction error is logged and returned
// together with whatever font value the constructor produced.
func (to *textObject) getFontDirect(name string) (*model.PdfFont, error) {
	dict, err := to.getFontDict(name)
	if err != nil {
		return nil, err
	}
	font, err := model.NewPdfFontFromPdfObject(dict)
	if err == nil {
		return font, nil
	}
	common.Log.Debug("getFontDirect: NewPdfFontFromPdfObject failed. name=%#q err=%v", name, err)
	return font, err
}

// getFontDict returns the font object stored under key `name` in the Font resources of
// `to.resources`, or an error if there is no font with that name.
// NOTE(review): when `to.resources` is nil this returns (nil, nil), not an error, so callers
// receive a nil object with no error — confirm that this is intended.
func (to *textObject) getFontDict(name string) (fontObj core.PdfObject, err error) {
	if to.resources == nil {
		common.Log.Debug("getFontDict. No resources. name=%#q", name)
		return nil, nil
	}
	obj, ok := to.resources.GetFontByName(core.PdfObjectName(name))
	if ok {
		return obj, nil
	}
	common.Log.Debug("ERROR: getFontDict: Font not found: name=%#q", name)
	return nil, errors.New("font not in resources")
}
-												Add LICENSE.md with reference to AGPL and Commercial license.  Add license header info to code.

											
										
										
											2018-03-22 14:03:47 +00:00
+								/*
 								 * This file is subject to the terms and conditions defined in
 								 * file 'LICENSE.md', which is part of this source code package.
 								 */
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+								package extractor
 								import (
-												First version of text extraction that recognizes columns

											
										
										
											2020-05-24 21:00:37 +10:00
+									"bytes"
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+									"errors"
-												Refactored font code to improve text extraction

											
										
										
											2018-07-13 17:40:27 +10:00
+									"fmt"
-												First attempt at splitting words in text extraction using a space detection heuristic

											
										
										
											2018-10-09 11:49:59 +11:00
+									"math"
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+									"sort"
-												Refactored font code to improve text extraction

											
										
										
											2018-07-13 17:40:27 +10:00
+									"strings"
-												Combine diacritics in text extraction.

											
										
										
											2018-11-28 18:06:03 +11:00
+									"unicode"
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
-												Update module version and import paths (#1)

* Update import path to use unipdf
* Update module name and version

											
										
										
											2019-05-16 23:08:40 +03:00
+									"github.com/unidoc/unipdf/v3/common"
-												Remmove pdf folder and move packages up one level (#2)


											
										
										
											2019-05-16 23:44:51 +03:00
+									"github.com/unidoc/unipdf/v3/contentstream"
 									"github.com/unidoc/unipdf/v3/core"
-												Got text_test.go passing.

											
										
										
											2020-05-27 18:15:18 +10:00
+									"github.com/unidoc/unipdf/v3/internal/textencoding"
-												Remmove pdf folder and move packages up one level (#2)


											
										
										
											2019-05-16 23:44:51 +03:00
+									"github.com/unidoc/unipdf/v3/internal/transform"
 									"github.com/unidoc/unipdf/v3/model"
-												Prevent panics (#305)

* Remove panic on font nil Differences array

* Remove unused bcmaps function

* Remove panics from the core/security/crypt package

* Fix extractor invalid Do operand crash

* Fix TTF parser crash for invalid hhea number of hMetrics

* Remove ECB crypt panics

* Remove standard_r6 panics

* Remove panic from render package
											
										
										
											2020-04-15 00:09:16 +03:00
+								)
-												Got text_test.go passing.

											
										
										
											2020-05-27 18:15:18 +10:00
+								const verbose = false
-												Handle more cases of fonts not being set in text extraction code.

											
										
										
											2020-05-28 12:08:15 +10:00
+								// maxFormStack is the maximum form stack recursion depth. It has to be low enough to avoid a stack
 								// overflow and high enough to accommodate customers' PDFs
 								const maxFormStack 10
-												Major changes to font code
- Added Type1 font parsing.
- Added Standard 14 font parsing.
- Fixed some bugs in cmap code.
- Started re-structuring of font code. Moved common font fields to `fontSkeleton`

											
										
										
											2018-06-27 12:25:59 +10:00
+								// ExtractText processes and extracts all text data in content streams and returns as a string.
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+								// It takes into account character encodings in the PDF file, which are decoded by
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+								// CharcodeBytesToUnicode.
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+								// Characters that can't be decoded are replaced with MissingCodeRune ('\ufffd' = <20>).
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+								func (e *Extractor) ExtractText() (string, error) {
-												Addressing review comments

											
										
										
											2018-11-28 23:25:17 +00:00
+									text, _, _, err := e.ExtractTextWithStats()
-												Refactored font code to improve text extraction

											
										
										
											2018-07-13 17:40:27 +10:00
+									return text, err
 								}
-												Changes missed in previous commit.

											
										
										
											2019-01-04 16:07:03 +11:00
+								// ExtractTextWithStats works like ExtractText but returns the number of characters in the output
 								// (`numChars`) and the number of characters that were not decoded (`numMisses`).
-												Another round of addressing review comments

											
										
										
											2018-11-30 16:53:48 +00:00
+								func (e *Extractor) ExtractTextWithStats() (extracted string, numChars int, numMisses int, err error) {
-												Changes missed in previous commit.

											
										
										
											2019-01-04 16:07:03 +11:00
+									pageText, numChars, numMisses, err := e.ExtractPageText()
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+									if err != nil {
-												Refactored font code to improve text extraction

											
										
										
											2018-07-13 17:40:27 +10:00
+										return "", numChars, numMisses, err
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+									}
-												Finding bounding boxes of substrings of extracted text. (#109)

* Added text bounding box extraction.
* Add `font` field to textMark struct;
Create a new method `TextComponents` to retrieve all the text components of the extracted text in the page, with position and character informations
* Reorganizing extractor/text.go
* Added a text extraction position test.
* Added another text extraction location test.
* Text extraction location testing.
* Added tests for text extraction with location information.
* Cleaned up text extraction tests. No changes to functionality.
* Simplifying text extraction code.
* Simplified line construction in text.go
* Returning TextMark's in TextMarkArray which are based on PdfObjectArray but read-only, so not pointers.
* Added text extraction to show PDFs marked-up with bounding boxes of substring in extracted text.
* Add comments explaining how to calculate text bounding boxes.
* Made text_test.go naming consistent with function comments in text.go
* Use tm, pt, tl for textMark/TextMark PageText and TextLine receivers and local variables.
* uncommeted text stress test. Use go test --short to skip
* TextMark.Offset is now an index into the extracted text. It was an index into []rune(text)
											
										
										
											2019-07-18 16:41:47 +10:00
+									return pageText.Text(), numChars, numMisses, nil
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+								}
-												Changes missed in previous commit.

											
										
										
											2019-01-04 16:07:03 +11:00
+								// ExtractPageText returns the text contents of `e` (an Extractor for a page) as a PageText.
 								func (e *Extractor) ExtractPageText() (*PageText, int, int, error) {
-												Finding bounding boxes of substrings of extracted text. (#109)

* Added text bounding box extraction.
* Add `font` field to textMark struct;
Create a new method `TextComponents` to retrieve all the text components of the extracted text in the page, with position and character informations
* Reorganizing extractor/text.go
* Added a text extraction position test.
* Added another text extraction location test.
* Text extraction location testing.
* Added tests for text extraction with location information.
* Cleaned up text extraction tests. No changes to functionality.
* Simplifying text extraction code.
* Simplified line construction in text.go
* Returning TextMark's in TextMarkArray which are based on PdfObjectArray but read-only, so not pointers.
* Added text extraction to show PDFs marked-up with bounding boxes of substring in extracted text.
* Add comments explaining how to calculate text bounding boxes.
* Made text_test.go naming consistent with function comments in text.go
* Use tm, pt, tl for textMark/TextMark PageText and TextLine receivers and local variables.
* uncommeted text stress test. Use go test --short to skip
* TextMark.Offset is now an index into the extracted text. It was an index into []rune(text)
											
										
										
											2019-07-18 16:41:47 +10:00
+									pt, numChars, numMisses, err := e.extractPageText(e.contents, e.resources, 0)
 									if err != nil {
 										return nil, numChars, numMisses, err
 									}
 									pt.computeViews()
-												First version of text extraction that recognizes columns

											
										
										
											2020-05-24 21:00:37 +10:00
+									// procBuf(pt)
-												Add Append to TextMarkArray

Useful when processing and grouping text marks.

											
										
										
											2019-08-04 09:29:21 +00:00
-												Finding bounding boxes of substrings of extracted text. (#109)

* Added text bounding box extraction.
* Add `font` field to textMark struct;
Create a new method `TextComponents` to retrieve all the text components of the extracted text in the page, with position and character informations
* Reorganizing extractor/text.go
* Added a text extraction position test.
* Added another text extraction location test.
* Text extraction location testing.
* Added tests for text extraction with location information.
* Cleaned up text extraction tests. No changes to functionality.
* Simplifying text extraction code.
* Simplified line construction in text.go
* Returning TextMark's in TextMarkArray which are based on PdfObjectArray but read-only, so not pointers.
* Added text extraction to show PDFs marked-up with bounding boxes of substring in extracted text.
* Add comments explaining how to calculate text bounding boxes.
* Made text_test.go naming consistent with function comments in text.go
* Use tm, pt, tl for textMark/TextMark PageText and TextLine receivers and local variables.
* uncommeted text stress test. Use go test --short to skip
* TextMark.Offset is now an index into the extracted text. It was an index into []rune(text)
											
										
										
											2019-07-18 16:41:47 +10:00
+									return pt, numChars, numMisses, err
-												Recurse through form XObjects for text extractions.

											
										
										
											2018-12-27 20:51:34 +11:00
+								}
-												Changes missed in previous commit.

											
										
										
											2019-01-04 16:07:03 +11:00
+								// extractPageText returns the text contents of content stream `e` and resources `resources` as a
 								// PageText.
-												Fixed incorrectly named variable.

											
										
										
											2018-12-27 21:33:31 +11:00
+								// This can be called on a page or a form XObject.
-												Finding bounding boxes of substrings of extracted text. (#109)

* Added text bounding box extraction.
* Add `font` field to textMark struct;
Create a new method `TextComponents` to retrieve all the text components of the extracted text in the page, with position and character informations
* Reorganizing extractor/text.go
* Added a text extraction position test.
* Added another text extraction location test.
* Text extraction location testing.
* Added tests for text extraction with location information.
* Cleaned up text extraction tests. No changes to functionality.
* Simplifying text extraction code.
* Simplified line construction in text.go
* Returning TextMark's in TextMarkArray which are based on PdfObjectArray but read-only, so not pointers.
* Added text extraction to show PDFs marked-up with bounding boxes of substring in extracted text.
* Add comments explaining how to calculate text bounding boxes.
* Made text_test.go naming consistent with function comments in text.go
* Use tm, pt, tl for textMark/TextMark PageText and TextLine receivers and local variables.
* uncommeted text stress test. Use go test --short to skip
* TextMark.Offset is now an index into the extracted text. It was an index into []rune(text)
											
										
										
											2019-07-18 16:41:47 +10:00
+								func (e *Extractor) extractPageText(contents string, resources *model.PdfPageResources, level int) (
 									*PageText, int, int, error) {
-												Changes missed in previous commit.

											
										
										
											2019-01-04 16:07:03 +11:00
+									common.Log.Trace("extractPageText: level=%d", level)
-												First version of text extraction that recognizes columns

											
										
										
											2020-05-24 21:00:37 +10:00
+									pageText := &PageText{pageSize: e.mediaBox}
 									state := newTextState(e.mediaBox)
-												Fixed text state save/restore.

											
										
										
											2020-05-26 13:26:09 +10:00
+									var savedStates stateStack
 									to := newTextObject(e, resources, contentstream.GraphicsState{}, &state, &savedStates)
-												Prevent extractor panic for invalid PDF text objects (#196)

* Prevent extractor panic for invalid PDF text objects
* Document text extraction behavior of invalid text objects

											
										
										
											2019-10-30 22:36:35 +02:00
+									var inTextObj bool
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
-												Handle more cases of fonts not being set in text extraction code.

											
										
										
											2020-05-28 12:08:15 +10:00
+									if level > maxFormStack {
 										err := errors.New("form stack overflow")
-												Got text_test.go passing.

											
										
										
											2020-05-27 18:15:18 +10:00
+										common.Log.Debug("ERROR: extractPageText. recursion level=%d err=%w", level, err)
 										return pageText, state.numChars, state.numMisses, err
 									}
-												First version of text extraction that recognizes columns

											
										
										
											2020-05-24 21:00:37 +10:00
+									// Uncomment the following 3 statements to log the content stream.
 									// common.Log.Info("contents* %d -----------------------------", len(contents))
 									// fmt.Println(contents)
 									// common.Log.Info("contents+ -----------------------------")
-												Recurse through form XObjects for text extractions.

											
										
										
											2018-12-27 20:51:34 +11:00
+									cstreamParser := contentstream.NewContentStreamParser(contents)
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+									operations, err := cstreamParser.Parse()
 									if err != nil {
-												Got text_test.go passing.

											
										
										
											2020-05-27 18:15:18 +10:00
+										common.Log.Debug("ERROR: extractPageText parse failed. err=%w", err)
-												Changes missed in previous commit.

											
										
										
											2019-01-04 16:07:03 +11:00
+										return pageText, state.numChars, state.numMisses, err
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+									}
 									processor := contentstream.NewContentStreamProcessor(*operations)
 									processor.AddHandler(contentstream.HandlerConditionEnumAllOperands, "",
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+										func(op *contentstream.ContentStreamOperation, gs contentstream.GraphicsState,
 											resources *model.PdfPageResources) error {
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+											operand := op.Operand
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
-												Got text_test.go passing.

											
										
										
											2020-05-27 18:15:18 +10:00
+											if verbose {
 												common.Log.Info("&&& op=%s", op)
 											}
-												Fixed text state save/restore.

											
										
										
											2020-05-26 13:26:09 +10:00
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+											switch operand {
-												Refactored font code to improve text extraction

											
										
										
											2018-07-13 17:40:27 +10:00
+											case "q":
-												Fixed text state save/restore.

											
										
										
											2020-05-26 13:26:09 +10:00
+												savedStates.push(&state)
 												// common.Log.Info("Save state: stack=%d\n %s", len(savedStates), state.String())
-												Refactored font code to improve text extraction

											
										
										
											2018-07-13 17:40:27 +10:00
+											case "Q":
-												Got text_test.go passing.

											
										
										
											2020-05-27 18:15:18 +10:00
+												if verbose {
 													common.Log.Info("Restore state: %s", savedStates.String())
 												}
-												Fixed text state save/restore.

											
										
										
											2020-05-26 13:26:09 +10:00
+												if !savedStates.empty() {
 													// oldState := state
 													state = *savedStates.top()
 													// common.Log.Info("Restore state: stack=%d\n %s\n→%s",
 													// 	len(savedStates), oldState.String(), state.String())
 													if len(savedStates) >= 2 {
 														savedStates.pop()
 													}
-												Refactored font code to improve text extraction

											
										
										
											2018-07-13 17:40:27 +10:00
+												}
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+											case "BT": // Begin text
-												Prevent extractor panic for invalid PDF text objects (#196)

* Prevent extractor panic for invalid PDF text objects
* Document text extraction behavior of invalid text objects

											
										
										
											2019-10-30 22:36:35 +02:00
+												// Begin a text object, initializing the text matrix, Tm, and
 												// the text line matrix, Tlm, to the identity matrix. Text
 												// objects shall not be nested. A second BT shall not appear
 												// before an ET. However, if that happens, all existing marks
 												// are added to the  page marks, in order to avoid losing content.
 												if inTextObj {
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+													common.Log.Debug("BT called while in a text object")
-												Prevent extractor panic for invalid PDF text objects (#196)

* Prevent extractor panic for invalid PDF text objects
* Document text extraction behavior of invalid text objects

											
										
										
											2019-10-30 22:36:35 +02:00
+													pageText.marks = append(pageText.marks, to.marks...)
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+												}
-												Prevent extractor panic for invalid PDF text objects (#196)

* Prevent extractor panic for invalid PDF text objects
* Document text extraction behavior of invalid text objects

											
										
										
											2019-10-30 22:36:35 +02:00
+												inTextObj = true
-												Fixed text state save/restore.

											
										
										
											2020-05-26 13:26:09 +10:00
+												to = newTextObject(e, resources, gs, &state, &savedStates)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+											case "ET": // End Text
-												Prevent extractor panic for invalid PDF text objects (#196)

* Prevent extractor panic for invalid PDF text objects
* Document text extraction behavior of invalid text objects

											
										
										
											2019-10-30 22:36:35 +02:00
+												// End text object, discarding text matrix. If the current
 												// text object contains text marks, they are added to the
 												// page text marks collection.
 												// The ET operator should always have a matching BT operator.
 												// However, if ET appears outside of a text object, the behavior
 												// does not change: the text matrices are discarded and all
 												// existing marks in the text object are added to the page marks.
 												if !inTextObj {
 													common.Log.Debug("ET called outside of a text object")
 												}
 												inTextObj = false
-												(*pageText). -> pageText.

											
										
										
											2019-01-05 14:10:54 +11:00
+												pageText.marks = append(pageText.marks, to.marks...)
-												Prevent extractor panic for invalid PDF text objects (#196)

* Prevent extractor panic for invalid PDF text objects
* Document text extraction behavior of invalid text objects

											
										
										
											2019-10-30 22:36:35 +02:00
+												to.reset()
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+											case "T*": // Move to start of next text line
 												to.nextLine()
 											case "Td": // Move text location
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+												if ok, err := to.checkOp(op, 2, true); !ok {
-												Added NewStandard14Font() to make existing fonts.Font code work with *PdfFont

											
										
										
											2018-07-07 09:45:55 +10:00
+													common.Log.Debug("ERROR: err=%v", err)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+													return err
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+												}
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+												x, y, err := toFloatXY(op.Params)
 												if err != nil {
 													return err
 												}
 												to.moveText(x, y)
-												Premultiply coordinate transforms to text matrix in text extraction.

											
										
										
											2018-11-26 08:09:52 +11:00
+											case "TD": // Move text location and set leading.
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+												if ok, err := to.checkOp(op, 2, true); !ok {
-												Added NewStandard14Font() to make existing fonts.Font code work with *PdfFont

											
										
										
											2018-07-07 09:45:55 +10:00
+													common.Log.Debug("ERROR: err=%v", err)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+													return err
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+												}
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+												x, y, err := toFloatXY(op.Params)
 												if err != nil {
-												Added NewStandard14Font() to make existing fonts.Font code work with *PdfFont

											
										
										
											2018-07-07 09:45:55 +10:00
+													common.Log.Debug("ERROR: err=%v", err)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+													return err
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+												}
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+												to.moveTextSetLeading(x, y)
-												Fixed incorrectly named variable.

											
										
										
											2018-12-27 21:33:31 +11:00
+											case "Tj": // Show text.
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+												if ok, err := to.checkOp(op, 1, true); !ok {
-												Merge remote-tracking branch 'upstream/v3' into render.v3

											
										
										
											2018-07-21 21:20:39 +10:00
+													common.Log.Debug("ERROR: Tj op=%s err=%v", op, err)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+													return err
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+												}
-												Merge remote-tracking branch 'upstream/v3' into render.v3

											
										
										
											2018-07-21 21:20:39 +10:00
+												charcodes, ok := core.GetStringBytes(op.Params[0])
 												if !ok {
 													common.Log.Debug("ERROR: Tj op=%s GetStringBytes failed", op)
 													return core.ErrTypeError
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+												}
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+												return to.showText(charcodes)
-												Fixed incorrectly named variable.

											
										
										
											2018-12-27 21:33:31 +11:00
+											case "TJ": // Show text with adjustable spacing.
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+												if ok, err := to.checkOp(op, 1, true); !ok {
-												Merge remote-tracking branch 'upstream/v3' into render.v3

											
										
										
											2018-07-21 21:20:39 +10:00
+													common.Log.Debug("ERROR: TJ err=%v", err)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+													return err
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+												}
-												Removed GetArrayVal

											
										
										
											2018-07-25 13:19:09 +10:00
+												args, ok := core.GetArray(op.Params[0])
-												Merge remote-tracking branch 'upstream/v3' into render.v3

											
										
										
											2018-07-21 21:20:39 +10:00
+												if !ok {
-												Removed combineDiacritics from text extraction because it was causing ' and " to be combined with the letters proceeding them.
Need to fix this and reinstate combineDiacritics.

											
										
										
											2019-01-01 12:22:39 +11:00
+													common.Log.Debug("ERROR: TJ op=%s GetArrayVal failed", op)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+													return err
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+												}
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+												return to.showTextAdjusted(args)
-												Fixed incorrectly named variable.

											
										
										
											2018-12-27 21:33:31 +11:00
+											case "'": // Move to next line and show text.
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+												if ok, err := to.checkOp(op, 1, true); !ok {
-												Merge remote-tracking branch 'upstream/v3' into render.v3

											
										
										
											2018-07-21 21:20:39 +10:00
+													common.Log.Debug("ERROR: ' err=%v", err)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+													return err
 												}
-												Merge remote-tracking branch 'upstream/v3' into render.v3

											
										
										
											2018-07-21 21:20:39 +10:00
+												charcodes, ok := core.GetStringBytes(op.Params[0])
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+												if !ok {
-												Merge remote-tracking branch 'upstream/v3' into render.v3

											
										
										
											2018-07-21 21:20:39 +10:00
+													common.Log.Debug("ERROR: ' op=%s GetStringBytes failed", op)
 													return core.ErrTypeError
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+												}
 												to.nextLine()
 												return to.showText(charcodes)
-												Fixed incorrectly named variable.

											
										
										
											2018-12-27 21:33:31 +11:00
+											case `"`: // Set word and character spacing, move to next line, and show text.
-												Double quote content stream operator fixes (#313)

* Fix wrong symbol checks used for the double quote content stream operator

* Fix text extraction parameter check for the double quote operator
											
										
										
											2020-04-16 17:32:34 +03:00
+												if ok, err := to.checkOp(op, 3, true); !ok {
-												Merge remote-tracking branch 'upstream/v3' into render.v3

											
										
										
											2018-07-21 21:20:39 +10:00
+													common.Log.Debug("ERROR: \" err=%v", err)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+													return err
 												}
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+												x, y, err := toFloatXY(op.Params[:2])
 												if err != nil {
 													return err
 												}
 												charcodes, ok := core.GetStringBytes(op.Params[2])
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+												if !ok {
-												Merge remote-tracking branch 'upstream/v3' into render.v3

											
										
										
											2018-07-21 21:20:39 +10:00
+													common.Log.Debug("ERROR: \" op=%s GetStringBytes failed", op)
 													return core.ErrTypeError
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+												}
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+												to.setCharSpacing(x)
 												to.setWordSpacing(y)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+												to.nextLine()
 												return to.showText(charcodes)
-												Fixed incorrectly named variable.

											
										
										
											2018-12-27 21:33:31 +11:00
+											case "TL": // Set text leading.
-												allow change of text state outside BT..ET

											
										
										
											2018-07-15 16:45:47 +10:00
+												y, err := floatParam(op)
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+												if err != nil {
-												Merge remote-tracking branch 'upstream/v3' into render.v3

											
										
										
											2018-07-21 21:20:39 +10:00
+													common.Log.Debug("ERROR: TL err=%v", err)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+													return err
 												}
 												to.setTextLeading(y)
-												Fixed incorrectly named variable.

											
										
										
											2018-12-27 21:33:31 +11:00
+											case "Tc": // Set character spacing.
-												allow change of text state outside BT..ET

											
										
										
											2018-07-15 16:45:47 +10:00
+												y, err := floatParam(op)
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+												if err != nil {
-												Merge remote-tracking branch 'upstream/v3' into render.v3

											
										
										
											2018-07-21 21:20:39 +10:00
+													common.Log.Debug("ERROR: Tc err=%v", err)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+													return err
 												}
 												to.setCharSpacing(y)
-												Fixed incorrectly named variable.

											
										
										
											2018-12-27 21:33:31 +11:00
+											case "Tf": // Set font.
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+												if ok, err := to.checkOp(op, 2, true); !ok {
-												Merge remote-tracking branch 'upstream/v3' into render.v3

											
										
										
											2018-07-21 21:20:39 +10:00
+													common.Log.Debug("ERROR: Tf err=%v", err)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+													return err
 												}
-												Merge remote-tracking branch 'upstream/v3' into render.v3

											
										
										
											2018-07-21 21:20:39 +10:00
+												name, ok := core.GetNameVal(op.Params[0])
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+												if !ok {
-												Merge remote-tracking branch 'upstream/v3' into render.v3

											
										
										
											2018-07-21 21:20:39 +10:00
+													common.Log.Debug("ERROR: Tf op=%s GetNameVal failed", op)
 													return core.ErrTypeError
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+												}
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+												size, err := core.GetNumberAsFloat(op.Params[1])
-												Merge remote-tracking branch 'upstream/v3' into render.v3

											
										
										
											2018-07-21 21:20:39 +10:00
+												if !ok {
 													common.Log.Debug("ERROR: Tf op=%s GetFloatVal failed. err=%v", op, err)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+													return err
 												}
 												err = to.setFont(name, size)
-												Handle more cases of fonts not being set in text extraction code.

											
										
										
											2020-05-28 12:08:15 +10:00
+												to.invalidFont = unsupportedFontErr(err)
-												Got text_test.go passing.

											
										
										
											2020-05-27 18:15:18 +10:00
+												if err != nil && !to.invalidFont {
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+													return err
 												}
-												Fixed incorrectly named variable.

											
										
										
											2018-12-27 21:33:31 +11:00
+											case "Tm": // Set text matrix.
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+												if ok, err := to.checkOp(op, 6, true); !ok {
-												Merge remote-tracking branch 'upstream/v3' into render.v3

											
										
										
											2018-07-21 21:20:39 +10:00
+													common.Log.Debug("ERROR: Tm err=%v", err)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+													return err
 												}
-												Removed naked returns. Fixed godoc. Reorganized object extractors

											
										
										
											2018-07-25 12:00:49 +10:00
+												floats, err := core.GetNumbersAsFloat(op.Params)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+												if err != nil {
-												Added NewStandard14Font() to make existing fonts.Font code work with *PdfFont

											
										
										
											2018-07-07 09:45:55 +10:00
+													common.Log.Debug("ERROR: err=%v", err)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+													return err
 												}
 												to.setTextMatrix(floats)
-												Fixed incorrectly named variable.

											
										
										
											2018-12-27 21:33:31 +11:00
+											case "Tr": // Set text rendering mode.
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+												if ok, err := to.checkOp(op, 1, true); !ok {
-												Merge remote-tracking branch 'upstream/v3' into render.v3

											
										
										
											2018-07-21 21:20:39 +10:00
+													common.Log.Debug("ERROR: Tr err=%v", err)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+													return err
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+												}
-												Merge remote-tracking branch 'upstream/v3' into render.v3

											
										
										
											2018-07-21 21:20:39 +10:00
+												mode, ok := core.GetIntVal(op.Params[0])
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+												if !ok {
-												Merge remote-tracking branch 'upstream/v3' into render.v3

											
										
										
											2018-07-21 21:20:39 +10:00
+													common.Log.Debug("ERROR: Tr op=%s GetIntVal failed", op)
 													return core.ErrTypeError
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+												}
 												to.setTextRenderMode(mode)
-												Fixed incorrectly named variable.

											
										
										
											2018-12-27 21:33:31 +11:00
+											case "Ts": // Set text rise.
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+												if ok, err := to.checkOp(op, 1, true); !ok {
-												Merge remote-tracking branch 'upstream/v3' into render.v3

											
										
										
											2018-07-21 21:20:39 +10:00
+													common.Log.Debug("ERROR: Ts err=%v", err)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+													return err
 												}
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+												y, err := core.GetNumberAsFloat(op.Params[0])
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+												if err != nil {
-												Added NewStandard14Font() to make existing fonts.Font code work with *PdfFont

											
										
										
											2018-07-07 09:45:55 +10:00
+													common.Log.Debug("ERROR: err=%v", err)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+													return err
 												}
 												to.setTextRise(y)
-												Fixed incorrectly named variable.

											
										
										
											2018-12-27 21:33:31 +11:00
+											case "Tw": // Set word spacing.
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+												if ok, err := to.checkOp(op, 1, true); !ok {
-												Added NewStandard14Font() to make existing fonts.Font code work with *PdfFont

											
										
										
											2018-07-07 09:45:55 +10:00
+													common.Log.Debug("ERROR: err=%v", err)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+													return err
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+												}
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+												y, err := core.GetNumberAsFloat(op.Params[0])
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+												if err != nil {
-												Added NewStandard14Font() to make existing fonts.Font code work with *PdfFont

											
										
										
											2018-07-07 09:45:55 +10:00
+													common.Log.Debug("ERROR: err=%v", err)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+													return err
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+												}
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+												to.setWordSpacing(y)
-												Fixed incorrectly named variable.

											
										
										
											2018-12-27 21:33:31 +11:00
+											case "Tz": // Set horizontal scaling.
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+												if ok, err := to.checkOp(op, 1, true); !ok {
-												Added NewStandard14Font() to make existing fonts.Font code work with *PdfFont

											
										
										
											2018-07-07 09:45:55 +10:00
+													common.Log.Debug("ERROR: err=%v", err)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+													return err
 												}
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+												y, err := core.GetNumberAsFloat(op.Params[0])
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+												if err != nil {
-												Added NewStandard14Font() to make existing fonts.Font code work with *PdfFont

											
										
										
											2018-07-07 09:45:55 +10:00
+													common.Log.Debug("ERROR: err=%v", err)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+													return err
 												}
 												to.setHorizScaling(y)
-												Recurse through form XObjects for text extractions.

											
										
										
											2018-12-27 20:51:34 +11:00
+											case "Do":
-												Fixed incorrectly named variable.

											
										
										
											2018-12-27 21:33:31 +11:00
+												// Handle XObjects by recursing through form XObjects.
-												Prevent panics (#305)

* Remove panic on font nil Differences array

* Remove unused bcmaps function

* Remove panics from the core/security/crypt package

* Fix extractor invalid Do operand crash

* Fix TTF parser crash for invalid hhea number of hMetrics

* Remove ECB crypt panics

* Remove standard_r6 panics

* Remove panic from render package
											
										
										
											2020-04-15 00:09:16 +03:00
+												if len(op.Params) == 0 {
 													common.Log.Debug("ERROR: expected XObject name operand for Do operator. Got %+v.", op.Params)
-												First version of text extraction that recognizes columns

											
										
										
											2020-05-24 21:00:37 +10:00
+													return core.ErrRangeError
-												Prevent panics (#305)

* Remove panic on font nil Differences array

* Remove unused bcmaps function

* Remove panics from the core/security/crypt package

* Fix extractor invalid Do operand crash

* Fix TTF parser crash for invalid hhea number of hMetrics

* Remove ECB crypt panics

* Remove standard_r6 panics

* Remove panic from render package
											
										
										
											2020-04-15 00:09:16 +03:00
+												}
 												// Get XObject name.
 												name, ok := core.GetName(op.Params[0])
 												if !ok {
 													common.Log.Debug("ERROR: invalid Do operator XObject name operand: %+v.", op.Params[0])
-												First version of text extraction that recognizes columns

											
										
										
											2020-05-24 21:00:37 +10:00
+													return core.ErrTypeError
-												Prevent panics (#305)

* Remove panic on font nil Differences array

* Remove unused bcmaps function

* Remove panics from the core/security/crypt package

* Fix extractor invalid Do operand crash

* Fix TTF parser crash for invalid hhea number of hMetrics

* Remove ECB crypt panics

* Remove standard_r6 panics

* Remove panic from render package
											
										
										
											2020-04-15 00:09:16 +03:00
+												}
 												_, xtype := resources.GetXObjectByName(*name)
-												Recurse through form XObjects for text extractions.

											
										
										
											2018-12-27 20:51:34 +11:00
+												if xtype != model.XObjectTypeForm {
 													break
 												}
-												fixed comment

											
										
										
											2018-12-27 20:53:37 +11:00
+												// Only process each form once.
-												Prevent panics (#305)

* Remove panic on font nil Differences array

* Remove unused bcmaps function

* Remove panics from the core/security/crypt package

* Fix extractor invalid Do operand crash

* Fix TTF parser crash for invalid hhea number of hMetrics

* Remove ECB crypt panics

* Remove standard_r6 panics

* Remove panic from render package
											
										
										
											2020-04-15 00:09:16 +03:00
+												formResult, ok := e.formResults[name.String()]
-												Recurse through form XObjects for text extractions.

											
										
										
											2018-12-27 20:51:34 +11:00
+												if !ok {
-												Prevent panics (#305)

* Remove panic on font nil Differences array

* Remove unused bcmaps function

* Remove panics from the core/security/crypt package

* Fix extractor invalid Do operand crash

* Fix TTF parser crash for invalid hhea number of hMetrics

* Remove ECB crypt panics

* Remove standard_r6 panics

* Remove panic from render package
											
										
										
											2020-04-15 00:09:16 +03:00
+													xform, err := resources.GetXObjectFormByName(*name)
-												Recurse through form XObjects for text extractions.

											
										
										
											2018-12-27 20:51:34 +11:00
+													if err != nil {
 														common.Log.Debug("ERROR: %v", err)
 														return err
 													}
 													formContent, err := xform.GetContentStream()
 													if err != nil {
 														common.Log.Debug("ERROR: %v", err)
 														return err
 													}
 													formResources := xform.Resources
 													if formResources == nil {
 														formResources = resources
 													}
-												Changes missed in previous commit.

											
										
										
											2019-01-04 16:07:03 +11:00
+													tList, numChars, numMisses, err := e.extractPageText(string(formContent),
-												Recurse through form XObjects for text extractions.

											
										
										
											2018-12-27 20:51:34 +11:00
+														formResources, level+1)
 													if err != nil {
 														common.Log.Debug("ERROR: %v", err)
 														return err
 													}
 													formResult = textResult{*tList, numChars, numMisses}
-												Prevent panics (#305)

* Remove panic on font nil Differences array

* Remove unused bcmaps function

* Remove panics from the core/security/crypt package

* Fix extractor invalid Do operand crash

* Fix TTF parser crash for invalid hhea number of hMetrics

* Remove ECB crypt panics

* Remove standard_r6 panics

* Remove panic from render package
											
										
										
											2020-04-15 00:09:16 +03:00
+													e.formResults[name.String()] = formResult
-												Recurse through form XObjects for text extractions.

											
										
										
											2018-12-27 20:51:34 +11:00
+												}
-												(*pageText). -> pageText.

											
										
										
											2019-01-05 14:10:54 +11:00
+												pageText.marks = append(pageText.marks, formResult.pageText.marks...)
-												Recurse through form XObjects for text extractions.

											
										
										
											2018-12-27 20:51:34 +11:00
+												state.numChars += formResult.numChars
 												state.numMisses += formResult.numMisses
 											}
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+											return nil
 										})
-												Recurse through form XObjects for text extractions.

											
										
										
											2018-12-27 20:51:34 +11:00
+									err = processor.Process(resources)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+									if err != nil {
-												Fixes for text extraction corpus testing.

- Correct matrix multiplication order in text.go
- Look up standard 14 font widths after applying custom encoding.

											
										
										
											2018-11-18 17:21:30 +11:00
+										common.Log.Debug("ERROR: Processing: err=%v", err)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+									}
-												Changes missed in previous commit.

											
										
										
											2019-01-04 16:07:03 +11:00
+									return pageText, state.numChars, state.numMisses, err
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+								}
-												Handle more cases of fonts not being set in text extraction code.

											
										
										
											2020-05-28 12:08:15 +10:00
+								// unsupportedFontErr returns true if `err` indicates that the selected font or encoding is not supported.
 								func unsupportedFontErr(err error) bool {
 									// Sentinel errors are checked first, in the same order as before; a nil
 									// error is never an unsupported-font error.
 									switch err {
 									case model.ErrFontNotSupported,
 										model.ErrType1CFontNotSupported,
 										model.ErrType3FontNotSupported,
 										model.ErrTTCmapNotSupported:
 										return true
 									case nil:
 										return false
 									}
 									// Any other error is classified by looking for known fragments in its
 									// message text.
 									msg := err.Error()
 									for _, marker := range []string{
 										"unsupported font encoding:",
 										"unexpected subtable format:",
 										"fonts based on PostScript outlines are not supported",
 									} {
 										if strings.Contains(msg, marker) {
 											return true
 										}
 									}
 									return false
 								}
 								// textResult is used for holding results of PDF form processing.
-												Recurse through form XObjects for text extractions.

											
										
										
											2018-12-27 20:51:34 +11:00
+								type textResult struct {
-												Changes missed in previous commit.

											
										
										
											2019-01-04 16:07:03 +11:00
+									pageText  PageText
-												Recurse through form XObjects for text extractions.

											
										
										
											2018-12-27 20:51:34 +11:00
+									numChars  int
 									numMisses int
 								}
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+								//
 								// Text operators
 								//
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+								// moveText "Td" Moves start of text by `tx`,`ty`.
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+								// Move to the start of the next line, offset from the start of the current line by (tx, ty).
 								// tx and ty are in unscaled text space units.
-												Removed naked returns. Fixed godoc. Reorganized object extractors

											
										
										
											2018-07-25 12:00:49 +10:00
+								func (to *textObject) moveText(tx, ty float64) {
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+									to.moveTo(tx, ty)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+								}
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+								// moveTextSetLeading "TD" Move text location and set leading.
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+								// Move to the start of the next line, offset from the start of the current line by (tx, ty). As a
 								// side effect, this operator shall set the leading parameter in the text state. This operator shall
 								// have the same effect as this code:
 								//  −ty TL
 								//  tx ty Td
-												Removed naked returns. Fixed godoc. Reorganized object extractors

											
										
										
											2018-07-25 12:00:49 +10:00
+								func (to *textObject) moveTextSetLeading(tx, ty float64) {
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+									to.state.tl = -ty
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+									to.moveTo(tx, ty)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+								}
-												Fixed some typos.

											
										
										
											2019-01-03 15:41:36 +11:00
+								// nextLine "T*"" Moves start of text line to next text line
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+								// Move to the start of the next line. This operator has the same effect as the code
 								//    0 -Tl Td
 								// where Tl denotes the current leading parameter in the text state. The negative of Tl is used
 								// here because Tl is the text leading expressed as a positive number. Going to the next line
 								// entails decreasing the y coordinate. (page 250)
-												Removed naked returns. Fixed godoc. Reorganized object extractors

											
										
										
											2018-07-25 12:00:49 +10:00
+								func (to *textObject) nextLine() {
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+									to.moveTo(0, -to.state.tl)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+								}
-												Fixed position sorting for text extraction for landscape text.

											
										
										
											2018-11-10 21:19:02 +11:00
+								// setTextMatrix "Tm".
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+								// Set the text matrix, Tm, and the text line matrix, Tlm to the Matrix specified by the 6 numbers
-												Removed some unused struct fields.

											
										
										
											2018-11-27 13:37:12 +11:00
+								// in `f` (page 250).
-												Removed naked returns. Fixed godoc. Reorganized object extractors

											
										
										
											2018-07-25 12:00:49 +10:00
+								func (to *textObject) setTextMatrix(f []float64) {
-												Addressing review comments

											
										
										
											2018-11-28 23:25:17 +00:00
+									if len(f) != 6 {
 										common.Log.Debug("ERROR: len(f) != 6 (%d)", len(f))
 										return
 									}
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+									a, b, c, d, tx, ty := f[0], f[1], f[2], f[3], f[4], f[5]
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+									to.tm = transform.NewMatrix(a, b, c, d, tx, ty)
 									to.tlm = to.tm
-												First version of text extraction that recognizes columns

											
										
										
											2020-05-24 21:00:37 +10:00
+									to.logCursor()
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+								}
-												Removed some unused struct fields.

											
										
										
											2018-11-27 13:37:12 +11:00
+								// showText "Tj". Show a text string.
-												Removed naked returns. Fixed godoc. Reorganized object extractors

											
										
										
											2018-07-25 12:00:49 +10:00
+								func (to *textObject) showText(charcodes []byte) error {
-												first attempt at parsing FontFile

											
										
										
											2018-07-02 16:46:43 +10:00
+									return to.renderText(charcodes)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+								}
-												Removed some unused struct fields.

											
										
										
											2018-11-27 13:37:12 +11:00
+								// showTextAdjusted "TJ". Show text with adjustable spacing.
-												Removed GetArrayVal

											
										
										
											2018-07-25 13:19:09 +10:00
+								func (to *textObject) showTextAdjusted(args *core.PdfObjectArray) error {
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+									vertical := false
-												Removed GetArrayVal

											
										
										
											2018-07-25 13:19:09 +10:00
+									for _, o := range args.Elements() {
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+										switch o.(type) {
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+										case *core.PdfObjectFloat, *core.PdfObjectInteger:
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+											x, err := core.GetNumberAsFloat(o)
 											if err != nil {
-												Removed some unused struct fields.

											
										
										
											2018-11-27 13:37:12 +11:00
+												common.Log.Debug("ERROR: showTextAdjusted. Bad numerical arg. o=%s args=%+v", o, args)
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+												return err
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+											}
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+											dx, dy := -x*0.001*to.state.tfs, 0.0
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+											if vertical {
 												dy, dx = dx, dy
 											}
-												Another round of addressing review comments

											
										
										
											2018-11-30 16:53:48 +00:00
+											td := translationMatrix(transform.Point{X: dx, Y: dy})
-												Corrected order of matrix multiplication for cm operator.

The change to Matrix.Concat made for this fix simplified some text extraction matrix code.

											
										
										
											2019-01-22 18:18:27 +11:00
+											to.tm.Concat(td)
-												First version of text extraction that recognizes columns

											
										
										
											2020-05-24 21:00:37 +10:00
+											to.logCursor()
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+										case *core.PdfObjectString:
-												Merge remote-tracking branch 'upstream/v3' into render.v3

											
										
										
											2018-07-21 21:20:39 +10:00
+											charcodes, ok := core.GetStringBytes(o)
 											if !ok {
-												Removed some unused struct fields.

											
										
										
											2018-11-27 13:37:12 +11:00
+												common.Log.Trace("showTextAdjusted: Bad string arg. o=%s args=%+v", o, args)
-												Merge remote-tracking branch 'upstream/v3' into render.v3

											
										
										
											2018-07-21 21:20:39 +10:00
+												return core.ErrTypeError
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+											}
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+											to.renderText(charcodes)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+										default:
-												Removed some unused struct fields.

											
										
										
											2018-11-27 13:37:12 +11:00
+											common.Log.Debug("ERROR: showTextAdjusted. Unexpected type (%T) args=%+v", o, args)
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+											return core.ErrTypeError
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+										}
 									}
 									return nil
 								}
-												Removed some unused struct fields.

											
										
										
											2018-11-27 13:37:12 +11:00
+								// setTextLeading "TL". Set text leading.
-												Removed naked returns. Fixed godoc. Reorganized object extractors

											
										
										
											2018-07-25 12:00:49 +10:00
+								func (to *textObject) setTextLeading(y float64) {
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+									if to == nil || to.state == nil {
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+										return
 									}
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+									to.state.tl = y
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+								}
-												Removed some unused struct fields.

											
										
										
											2018-11-27 13:37:12 +11:00
+								// setCharSpacing "Tc". Set character spacing.
-												Removed naked returns. Fixed godoc. Reorganized object extractors

											
										
										
											2018-07-25 12:00:49 +10:00
+								func (to *textObject) setCharSpacing(x float64) {
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+									if to == nil {
 										return
 									}
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+									to.state.tc = x
-												Got text_test.go passing.

											
										
										
											2020-05-27 18:15:18 +10:00
+									if verbose {
 										common.Log.Info("setCharSpacing: %.2f state=%s", to.state.String())
 									}
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+								}
-												Removed some unused struct fields.

											
										
										
											2018-11-27 13:37:12 +11:00
+								// setFont "Tf". Set font.
-												Removed naked returns. Fixed godoc. Reorganized object extractors

											
										
										
											2018-07-25 12:00:49 +10:00
+								func (to *textObject) setFont(name string, size float64) error {
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+									if to == nil {
 										return nil
 									}
-												Fixed text state save/restore.

											
										
										
											2020-05-26 13:26:09 +10:00
+									to.state.tfs = size
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+									font, err := to.getFont(name)
-												Fixed text state save/restore.

											
										
										
											2020-05-26 13:26:09 +10:00
+									if err != nil {
 										if err == model.ErrFontNotSupported {
 											// TODO(peterwilliams97): Do we need to handle this case in a special way?
 											return err
-												Refactored font code to improve text extraction

											
										
										
											2018-07-13 17:40:27 +10:00
+										}
-												Changed error handling. Allow partial encoding maps. Don't continue processing unsupported fonts

											
										
										
											2018-07-04 18:00:37 +10:00
+										return err
-												Fixed text state save/restore.

											
										
										
											2020-05-26 13:26:09 +10:00
+									}
 									to.state.tfont = font
 									if to.savedStates.empty() {
 										to.savedStates.push(to.state)
-												Parse FontFile entry in FontDescriptor

											
										
										
											2018-07-03 14:26:42 +10:00
+									} else {
-												Fixed text state save/restore.

											
										
										
											2020-05-26 13:26:09 +10:00
+										to.savedStates.top().tfont = to.state.tfont
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+									}
-												Fixed text state save/restore.

											
										
										
											2020-05-26 13:26:09 +10:00
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+									return nil
 								}
-												Removed some unused struct fields.

											
										
										
											2018-11-27 13:37:12 +11:00
+								// setTextRenderMode "Tr". Set text rendering mode.
-												Removed naked returns. Fixed godoc. Reorganized object extractors

											
										
										
											2018-07-25 12:00:49 +10:00
+								func (to *textObject) setTextRenderMode(mode int) {
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+									if to == nil {
 										return
 									}
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+									to.state.tmode = RenderMode(mode)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+								}
-												Removed some unused struct fields.

											
										
										
											2018-11-27 13:37:12 +11:00
+								// setTextRise "Ts". Set text rise.
-												Removed naked returns. Fixed godoc. Reorganized object extractors

											
										
										
											2018-07-25 12:00:49 +10:00
+								func (to *textObject) setTextRise(y float64) {
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+									if to == nil {
 										return
 									}
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+									to.state.trise = y
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+								}
-												Removed some unused struct fields.

											
										
										
											2018-11-27 13:37:12 +11:00
+								// setWordSpacing "Tw". Set word spacing.
-												Removed naked returns. Fixed godoc. Reorganized object extractors

											
										
										
											2018-07-25 12:00:49 +10:00
+								func (to *textObject) setWordSpacing(y float64) {
-												Fixed encoding selection for standard 14 fonts.

											
										
										
											2018-11-22 22:01:04 +11:00
+									if to == nil {
 										return
 									}
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+									to.state.tw = y
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+								}
-												Removed some unused struct fields.

											
										
										
											2018-11-27 13:37:12 +11:00
+								// setHorizScaling "Tz". Set horizontal scaling.
-												Removed naked returns. Fixed godoc. Reorganized object extractors

											
										
										
											2018-07-25 12:00:49 +10:00
+								func (to *textObject) setHorizScaling(y float64) {
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+									if to == nil {
 										return
 									}
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+									to.state.th = y
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+								}
-												Addressing review comments

											
										
										
											2018-11-28 23:25:17 +00:00
+								// floatParam returns the single float parameter of operator `op`, or an error if it doesn't have
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+								// a single float parameter or we aren't in a text stream.
-												allow change of text state outside BT..ET

											
										
										
											2018-07-15 16:45:47 +10:00
+								func floatParam(op *contentstream.ContentStreamOperation) (float64, error) {
 									if len(op.Params) != 1 {
-												Set font even when Tf operator is not between BT and ET.

											
										
										
											2018-11-21 13:14:11 +11:00
+										err := errors.New("incorrect parameter count")
-												allow change of text state outside BT..ET

											
										
										
											2018-07-15 16:45:47 +10:00
+										common.Log.Debug("ERROR: %#q should have %d input params, got %d %+v",
 											op.Operand, 1, len(op.Params), op.Params)
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+										return 0.0, err
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+									}
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+									return core.GetNumberAsFloat(op.Params[0])
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+								}
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+								// checkOp returns true if we are in a text stream and `op` has `numParams` params.
 								// If `hard` is true and the number of params don't match, an error is returned.
-												Finding bounding boxes of substrings of extracted text. (#109)

* Added text bounding box extraction.
* Add `font` field to textMark struct;
Create a new method `TextComponents` to retrieve all the text components of the extracted text in the page, with position and character informations
* Reorganizing extractor/text.go
* Added a text extraction position test.
* Added another text extraction location test.
* Text extraction location testing.
* Added tests for text extraction with location information.
* Cleaned up text extraction tests. No changes to functionality.
* Simplifying text extraction code.
* Simplified line construction in text.go
* Returning TextMark's in TextMarkArray which are based on PdfObjectArray but read-only, so not pointers.
* Added text extraction to show PDFs marked-up with bounding boxes of substring in extracted text.
* Add comments explaining how to calculate text bounding boxes.
* Made text_test.go naming consistent with function comments in text.go
* Use tm, pt, tl for textMark/TextMark PageText and TextLine receivers and local variables.
* uncommeted text stress test. Use go test --short to skip
* TextMark.Offset is now an index into the extracted text. It was an index into []rune(text)
											
										
										
											2019-07-18 16:41:47 +10:00
+								func (to *textObject) checkOp(op *contentstream.ContentStreamOperation, numParams int, hard bool) (
 									ok bool, err error) {
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+									if to == nil {
-												Addressing review comments

											
										
										
											2018-11-28 23:25:17 +00:00
+										var params []core.PdfObject
-												Set font even when Tf operator is not between BT and ET.

											
										
										
											2018-11-21 13:14:11 +11:00
+										if numParams > 0 {
 											params = op.Params
 											if len(params) > numParams {
 												params = params[:numParams]
 											}
 										}
 										common.Log.Debug("%#q operand outside text. params=%+v", op.Operand, params)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+									}
 									if numParams >= 0 {
 										if len(op.Params) != numParams {
 											if hard {
-												Set font even when Tf operator is not between BT and ET.

											
										
										
											2018-11-21 13:14:11 +11:00
+												err = errors.New("incorrect parameter count")
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+											}
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+											common.Log.Debug("ERROR: %#q should have %d input params, got %d %+v",
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+												op.Operand, numParams, len(op.Params), op.Params)
-												Removed naked returns. Fixed godoc. Reorganized object extractors

											
										
										
											2018-07-25 12:00:49 +10:00
+											return false, err
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+										}
 									}
-												Removed naked returns. Fixed godoc. Reorganized object extractors

											
										
										
											2018-07-25 12:00:49 +10:00
+									return true, nil
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+								}
-												Fixed text state save/restore.

											
										
										
											2020-05-26 13:26:09 +10:00
+								// stateStack is the PDF textState stack implementation.
 								type stateStack []*textState
-												Refactored font code to improve text extraction

											
										
										
											2018-07-13 17:40:27 +10:00
-												Fixed text state save/restore.

											
										
										
											2020-05-26 13:26:09 +10:00
+								// String returns a string describing the current state of the textState stack.
 								func (savedStates *stateStack) String() string {
 									parts := []string{fmt.Sprintf("---- font stack: %d", len(*savedStates))}
 									for i, state := range *savedStates {
-												Refactored font code to improve text extraction

											
										
										
											2018-07-13 17:40:27 +10:00
+										s := "<nil>"
-												Fixed text state save/restore.

											
										
										
											2020-05-26 13:26:09 +10:00
+										if state != nil {
 											s = state.String()
-												Refactored font code to improve text extraction

											
										
										
											2018-07-13 17:40:27 +10:00
+										}
 										parts = append(parts, fmt.Sprintf("\t%2d: %s", i, s))
 									}
 									return strings.Join(parts, "\n")
 								}
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
-												Fixed text state save/restore.

											
										
										
											2020-05-26 13:26:09 +10:00
+								// push pushes a copy of `state` onto the textState stack.
 								func (savedStates *stateStack) push(state *textState) {
 									s := *state
 									*savedStates = append(*savedStates, &s)
-												Refactored font code to improve text extraction

											
										
										
											2018-07-13 17:40:27 +10:00
+								}
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
-												Fixed text state save/restore.

											
										
										
											2020-05-26 13:26:09 +10:00
+								// pop pops and returns a copy of the last state on the textState stack there is one or nil if
 								// there isn't.
 								func (savedStates *stateStack) pop() *textState {
 									if savedStates.empty() {
-												Removed naked returns. Fixed godoc. Reorganized object extractors

											
										
										
											2018-07-25 12:00:49 +10:00
+										return nil
-												Refactored font code to improve text extraction

											
										
										
											2018-07-13 17:40:27 +10:00
+									}
-												Fixed text state save/restore.

											
										
										
											2020-05-26 13:26:09 +10:00
+									state := *(*savedStates)[len(*savedStates)-1]
 									*savedStates = (*savedStates)[:len(*savedStates)-1]
 									return &state
-												Refactored font code to improve text extraction

											
										
										
											2018-07-13 17:40:27 +10:00
+								}
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
-												Fixed text state save/restore.

											
										
										
											2020-05-26 13:26:09 +10:00
+								// top returns the last saved state if there is one or nil if there isn't.
 								// NOTE: The return is a pointer. Modifying it will modify the stack.
 								func (savedStates *stateStack) top() *textState {
 									if savedStates.empty() {
-												Removed naked returns. Fixed godoc. Reorganized object extractors

											
										
										
											2018-07-25 12:00:49 +10:00
+										return nil
-												Refactored font code to improve text extraction

											
										
										
											2018-07-13 17:40:27 +10:00
+									}
-												Fixed text state save/restore.

											
										
										
											2020-05-26 13:26:09 +10:00
+									return (*savedStates)[savedStates.size()-1]
-												Refactored font code to improve text extraction

											
										
										
											2018-07-13 17:40:27 +10:00
+								}
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
-												Fixed text state save/restore.

											
										
										
											2020-05-26 13:26:09 +10:00
+								// empty returns true if the textState stack is empty.
 								func (savedStates *stateStack) empty() bool {
 									return len(*savedStates) == 0
-												Refactored font code to improve text extraction

											
										
										
											2018-07-13 17:40:27 +10:00
+								}
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
-												Fixed text state save/restore.

											
										
										
											2020-05-26 13:26:09 +10:00
+								// size returns the number of elements in the textState stack.
 								func (savedStates *stateStack) size() int {
 									return len(*savedStates)
-												Refactored font code to improve text extraction

											
										
										
											2018-07-13 17:40:27 +10:00
+								}
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+								// 9.3 Text State Parameters and Operators (page 243)
 								// Some of these parameters are expressed in unscaled text space units. This means that they shall
 								// be specified in a coordinate system that shall be defined by the text matrix, Tm but shall not be
 								// scaled by the font size parameter, Tfs.
-												Removed naked returns. Fixed godoc. Reorganized object extractors

											
										
										
											2018-07-25 12:00:49 +10:00
 								// textState represents the text state.
 								type textState struct {
-												First version of text extraction that recognizes columns

											
										
										
											2020-05-24 21:00:37 +10:00
+									tc       float64        // Character spacing. Unscaled text space units.
 									tw       float64        // Word spacing. Unscaled text space units.
 									th       float64        // Horizontal scaling.
 									tl       float64        // Leading. Unscaled text space units. Used by TD,T*,'," see Table 108.
 									tfs      float64        // Text font size.
 									tmode    RenderMode     // Text rendering mode.
 									trise    float64        // Text rise. Unscaled text space units. Set by Ts.
 									tfont    *model.PdfFont // Text font.
 									mediaBox model.PdfRectangle
-												Refactored font code to improve text extraction

											
										
										
											2018-07-13 17:40:27 +10:00
+									// For debugging
 									numChars  int
 									numMisses int
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+								}
-												Fixed text state save/restore.

											
										
										
											2020-05-26 13:26:09 +10:00
+								// String returns a description of `state`.
 								func (state *textState) String() string {
 									fontName := "[NOT SET]"
 									if state.tfont != nil {
 										fontName = state.tfont.BaseFont()
 									}
 									return fmt.Sprintf("tc=%.2f tw=%.2f tfs=%.2f font=%q",
 										state.tc, state.tw, state.tfs, fontName)
 								}
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+								// 9.4.1 General (page 248)
 								// A PDF text object consists of operators that may show text strings, move the text position, and
 								// set text state and certain other parameters. In addition, two parameters may be specified only
 								// within a text object and shall not persist from one text object to the next:
-												Noted that text extractor is an intermediate version

											
										
										
											2018-06-28 11:11:43 +10:00
+								//   • Tm, the text matrix
 								//   • Tlm, the text line matrix
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+								//
 								// Text space is converted to device space by this transform (page 252)
-												Improvements to text extraction.

											
										
										
											2018-09-20 11:49:44 +10:00
+								// Trm is the text rendering matrix
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+								//        | Tfs x Th   0      0 |
 								// Trm  = | 0         Tfs     0 | × Tm × CTM
 								//        | 0         Trise   1 |
-												Improvements to text extraction.

											
										
										
											2018-09-20 11:49:44 +10:00
+								// This corresponds to the following code in renderText()
-												Corrected order of matrix multiplication for cm operator.

The change to Matrix.Concat made for this fix simplified some text extraction matrix code.

											
										
										
											2019-01-22 18:18:27 +11:00
+								//  trm := to.gs.CTM.Mult(to.tm).Mult(stateMatrix)
-												Removed naked returns. Fixed godoc. Reorganized object extractors

											
										
										
											2018-07-25 12:00:49 +10:00
 								// textObject represents a PDF text object.
 								type textObject struct {
-												Fixed text state save/restore.

											
										
										
											2020-05-26 13:26:09 +10:00
+									e           *Extractor
 									resources   *model.PdfPageResources
 									gs          contentstream.GraphicsState
 									state       *textState
 									savedStates *stateStack
 									tm          transform.Matrix // Text matrix. For the character pointer.
 									tlm         transform.Matrix // Text line matrix. For the start of line pointer.
 									marks       []*textMark      // Text marks get written here.
-												Got text_test.go passing.

											
										
										
											2020-05-27 18:15:18 +10:00
+									invalidFont bool             // Flag that gets set true when we can't handle the current font.
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+								}
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+								// newTextState returns a default textState.
-												First version of text extraction that recognizes columns

											
										
										
											2020-05-24 21:00:37 +10:00
+								func newTextState(mediaBox model.PdfRectangle) textState {
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+									return textState{
-												First version of text extraction that recognizes columns

											
										
										
											2020-05-24 21:00:37 +10:00
+										th:       100,
 										tmode:    RenderModeFill,
 										mediaBox: mediaBox,
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+									}
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+								}
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+								// newTextObject returns a default textObject.
-												Recurse through form XObjects for text extractions.

											
										
										
											2018-12-27 20:51:34 +11:00
+								func newTextObject(e *Extractor, resources *model.PdfPageResources, gs contentstream.GraphicsState,
-												Fixed text state save/restore.

											
										
										
											2020-05-26 13:26:09 +10:00
+									state *textState, savedStates *stateStack) *textObject {
-												Removed naked returns. Fixed godoc. Reorganized object extractors

											
										
										
											2018-07-25 12:00:49 +10:00
+									return &textObject{
-												Fixed text state save/restore.

											
										
										
											2020-05-26 13:26:09 +10:00
+										e:           e,
 										resources:   resources,
 										gs:          gs,
 										savedStates: savedStates,
 										state:       state,
 										tm:          transform.IdentityMatrix(),
 										tlm:         transform.IdentityMatrix(),
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+									}
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+								}
-												Prevent extractor panic for invalid PDF text objects (#196)

* Prevent extractor panic for invalid PDF text objects
* Document text extraction behavior of invalid text objects

											
										
										
											2019-10-30 22:36:35 +02:00
+								// reset sets the text matrix `Tm` and the text line matrix `Tlm` of the text
 								// object to the identity matrix. In addition, the marks collection is cleared.
 								func (to *textObject) reset() {
 									to.tm = transform.IdentityMatrix()
 									to.tlm = transform.IdentityMatrix()
 									to.marks = nil
-												First version of text extraction that recognizes columns

											
										
										
											2020-05-24 21:00:37 +10:00
+									to.logCursor()
 								}
 								// logCursor is for debugging only. Remove !@#$
 								func (to *textObject) logCursor() {
 									return
 									state := to.state
 									tfs := state.tfs
 									th := state.th / 100.0
 									stateMatrix := transform.NewMatrix(
 										tfs*th, 0,
 , tfs,
 , state.trise)
 									trm := to.gs.CTM.Mult(to.tm).Mult(stateMatrix)
 									cur := translation(trm)
 									common.Log.Info("showTrm: %s cur=%.2f tm=%.2f CTM=%.2f",
 										fileLine(1, false), cur, to.tm, to.gs.CTM)
-												Prevent extractor panic for invalid PDF text objects (#196)

* Prevent extractor panic for invalid PDF text objects
* Document text extraction behavior of invalid text objects

											
										
										
											2019-10-30 22:36:35 +02:00
+								}
-												Addressing review comments

											
										
										
											2018-11-28 23:25:17 +00:00
+								// renderText processes and renders byte array `data` for extraction purposes.
-												First version of text extraction that recognizes columns

											
										
										
											2020-05-24 21:00:37 +10:00
+								// It extracts textMarks based on the charcodes in `data`; the current text and graphics states
 								// are tracked in `to`.
-												Removed naked returns. Fixed godoc. Reorganized object extractors

											
										
										
											2018-07-25 12:00:49 +10:00
+								func (to *textObject) renderText(data []byte) error {
-												Got text_test.go passing.

											
										
										
											2020-05-27 18:15:18 +10:00
+									if to.invalidFont {
 										common.Log.Debug("renderText: Invalid font. Not processing.")
 										return nil
 									}
-												Improvements to text extraction.

											
										
										
											2018-09-20 11:49:44 +10:00
+									font := to.getCurrentFont()
-												Fixed text position tracking.

											
										
										
											2018-10-30 21:55:30 +11:00
+									charcodes := font.BytesToCharcodes(data)
-												Treat CMap entries as strings instead of runes to handle multi-byte encodings.

											
										
										
											2020-05-20 18:43:09 +10:00
+									runeSlices, numChars, numMisses := font.CharcodesToRuneSlices(charcodes)
-												Combine diacritics in text extraction.

											
										
										
											2018-11-28 18:06:03 +11:00
+									if numMisses > 0 {
 										common.Log.Debug("renderText: numChars=%d numMisses=%d", numChars, numMisses)
 									}
-												Fixed text position tracking.

											
										
										
											2018-10-30 21:55:30 +11:00
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+									to.state.numChars += numChars
 									to.state.numMisses += numMisses
-												Removed naked returns. Fixed godoc. Reorganized object extractors

											
										
										
											2018-07-25 12:00:49 +10:00
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+									state := to.state
 									tfs := state.tfs
 									th := state.th / 100.0
-												Remove extra GetRuneCharMetrics function - use GetRuneMetrics

											
										
										
											2019-03-09 18:03:43 +00:00
+									spaceMetrics, ok := font.GetRuneMetrics(' ')
-												Look for CharMetrics for char code 32 when finding space width.

											
										
										
											2018-12-02 13:09:32 +11:00
+									if !ok {
 										spaceMetrics, ok = font.GetCharMetrics(32)
 									}
-												Documented font code. Fall back to StandardEncoding when no encoding is speficied for a font.

											
										
										
											2018-12-02 09:14:58 +11:00
+									if !ok {
-												Remove extra GetRuneCharMetrics function - use GetRuneMetrics

											
										
										
											2019-03-09 18:03:43 +00:00
+										spaceMetrics, _ = model.DefaultFont().GetRuneMetrics(' ')
-												First attempt at splitting words in text extraction using a space detection heuristic

											
										
										
											2018-10-09 11:49:59 +11:00
+									}
 									spaceWidth := spaceMetrics.Wx * glyphTextRatio
-												First version of text extraction that recognizes columns

											
										
										
											2020-05-24 21:00:37 +10:00
+									common.Log.Trace("spaceWidth=%.2f text=%q font=%s fontSize=%.1f", spaceWidth, runeSlices, font, tfs)
-												Improvements to text extraction.

											
										
										
											2018-09-20 11:49:44 +10:00
-												Another round of addressing review comments

											
										
										
											2018-11-30 16:53:48 +00:00
+									stateMatrix := transform.NewMatrix(
-												Improvements to text extraction.

											
										
										
											2018-09-20 11:49:44 +10:00
+										tfs*th, 0,
 , tfs,
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+, state.trise)
-												Got text_test.go passing.

											
										
										
											2020-05-27 18:15:18 +10:00
+									if verbose {
 										common.Log.Info("renderText: %d codes=%+v runes=%q", len(charcodes), charcodes, runeSlices)
 									}
-												Fixes for text extraction corpus testing.

- Correct matrix multiplication order in text.go
- Look up standard 14 font widths after applying custom encoding.

											
										
										
											2018-11-18 17:21:30 +11:00
-												Treat CMap entries as strings instead of runes to handle multi-byte encodings.

											
										
										
											2020-05-20 18:43:09 +10:00
+									for i, r := range runeSlices {
 										if len(r) == 1 && r[0] == '\x00' {
-												Combine diacritics in text extraction.

											
										
										
											2018-11-28 18:06:03 +11:00
+											continue
 										}
-												Fixed text position tracking.

											
										
										
											2018-10-30 21:55:30 +11:00
+										code := charcodes[i]
-												Improvements to text extraction.

											
										
										
											2018-09-20 11:49:44 +10:00
+										// The location of the text on the page in device coordinates is given by trm, the text
 										// rendering matrix.
-												Corrected order of matrix multiplication for cm operator.

The change to Matrix.Concat made for this fix simplified some text extraction matrix code.

											
										
										
											2019-01-22 18:18:27 +11:00
+										trm := to.gs.CTM.Mult(to.tm).Mult(stateMatrix)
-												Improvements to text extraction.

											
										
										
											2018-09-20 11:49:44 +10:00
 										// calculate the text location displacement due to writing `r`. We will use this to update
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+										// to.tm
-												Improvements to text extraction.

											
										
										
											2018-09-20 11:49:44 +10:00
 										// w is the unscaled movement at the end of a word.
 										w := 0.0
-												First version of text extraction that recognizes columns

											
										
										
											2020-05-24 21:00:37 +10:00
+										if len(r) == 1 && r[0] == 32 {
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+											w = state.tw
-												Improvements to text extraction.

											
										
										
											2018-09-20 11:49:44 +10:00
+										}
-												First attempt at splitting words in text extraction using a space detection heuristic

											
										
										
											2018-10-09 11:49:59 +11:00
-												Fixed text position tracking.

											
										
										
											2018-10-30 21:55:30 +11:00
+										m, ok := font.GetCharMetrics(code)
 										if !ok {
-												First attempt at getting font metrics by character code.

											
										
										
											2018-11-08 15:20:12 +11:00
+											common.Log.Debug("ERROR: No metric for code=%d r=0x%04x=%+q %s", code, r, r, font)
-												Treat CMap entries as strings instead of runes to handle multi-byte encodings.

											
										
										
											2020-05-20 18:43:09 +10:00
+											return fmt.Errorf("no char metrics: font=%s code=%d", font.String(), code)
-												Improvements to text extraction.

											
										
										
											2018-09-20 11:49:44 +10:00
+										}
-												Fixed text position tracking.

											
										
										
											2018-10-30 21:55:30 +11:00
-												First attempt at splitting words in text extraction using a space detection heuristic

											
										
										
											2018-10-09 11:49:59 +11:00
+										// c is the character size in unscaled text units.
-												Another round of addressing review comments

											
										
										
											2018-11-30 16:53:48 +00:00
+										c := transform.Point{X: m.Wx * glyphTextRatio, Y: m.Wy * glyphTextRatio}
-												Fixed text position tracking.

											
										
										
											2018-10-30 21:55:30 +11:00
-												Removed some unused struct fields.

											
										
										
											2018-11-27 13:37:12 +11:00
+										// t0 is the end of this character.
-												First attempt at splitting words in text extraction using a space detection heuristic

											
										
										
											2018-10-09 11:49:59 +11:00
+										// t is the displacement of the text cursor when the character is rendered.
-												Another round of addressing review comments

											
										
										
											2018-11-30 16:53:48 +00:00
+										t0 := transform.Point{X: (c.X*tfs + w) * th}
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+										t := transform.Point{X: (c.X*tfs + state.tc + w) * th}
-												Got text_test.go passing.

											
										
										
											2020-05-27 18:15:18 +10:00
+										if verbose {
 											common.Log.Info("tfs=%.2f tc=%.2f tw=%.2f th=%.2f", tfs, state.tc, state.tw, th)
 											common.Log.Info("dx,dy=%.3f t0=%.2f t=%.2f", c, t0, t)
 										}
-												Improvements to text extraction.

											
										
										
											2018-09-20 11:49:44 +10:00
-												Fixed text matrix multiplication order.

											
										
										
											2018-11-19 14:19:50 +11:00
+										// td, td0 are t, t0 in matrix form.
-												Removed some unused struct fields.

											
										
										
											2018-11-27 13:37:12 +11:00
+										// td0 is where this character ends. td is where the next character starts.
-												Fixes for text extraction corpus testing.

- Correct matrix multiplication order in text.go
- Look up standard 14 font widths after applying custom encoding.

											
										
										
											2018-11-18 17:21:30 +11:00
+										td0 := translationMatrix(t0)
-												First attempt at splitting words in text extraction using a space detection heuristic

											
										
										
											2018-10-09 11:49:59 +11:00
+										td := translationMatrix(t)
-												First version of text extraction that recognizes columns

											
										
										
											2020-05-24 21:00:37 +10:00
+										end := to.gs.CTM.Mult(to.tm).Mult(td0)
-												First attempt at splitting words in text extraction using a space detection heuristic

											
										
										
											2018-10-09 11:49:59 +11:00
-												Got text_test.go passing.

											
										
										
											2020-05-27 18:15:18 +10:00
+										if verbose {
 											common.Log.Info("end:\n\tCTM=%s\n\t tm=%s\n"+
 												"\t td=%s xlat=%s\n"+
 												"\ttd0=%s\n\t → %s xlat=%s",
 												to.gs.CTM, to.tm,
 												td, translation(to.gs.CTM.Mult(to.tm).Mult(td)),
 												td0, end, translation(end))
 										}
-												Fixed orientation handling in text extraction.

											
										
										
											2018-11-26 17:17:17 +11:00
-												First version of text extraction that recognizes columns

											
										
										
											2020-05-24 21:00:37 +10:00
+										mark, onPage := to.newTextMark(
-												Got text_test.go passing.

											
										
										
											2020-05-27 18:15:18 +10:00
+											textencoding.ExpandLigatures(r),
-												Fixed orientation handling in text extraction.

											
										
										
											2018-11-26 17:17:17 +11:00
+											trm,
-												First version of text extraction that recognizes columns

											
										
										
											2020-05-24 21:00:37 +10:00
+											translation(end),
-												Finding bounding boxes of substrings of extracted text. (#109)

* Added text bounding box extraction.
* Add `font` field to textMark struct;
Create a new method `TextComponents` to retrieve all the text components of the extracted text in the page, with position and character informations
* Reorganizing extractor/text.go
* Added a text extraction position test.
* Added another text extraction location test.
* Text extraction location testing.
* Added tests for text extraction with location information.
* Cleaned up text extraction tests. No changes to functionality.
* Simplifying text extraction code.
* Simplified line construction in text.go
* Returning TextMark's in TextMarkArray which are based on PdfObjectArray but read-only, so not pointers.
* Added text extraction to show PDFs marked-up with bounding boxes of substring in extracted text.
* Add comments explaining how to calculate text bounding boxes.
* Made text_test.go naming consistent with function comments in text.go
* Use tm, pt, tl for textMark/TextMark PageText and TextLine receivers and local variables.
* uncommeted text stress test. Use go test --short to skip
* TextMark.Offset is now an index into the extracted text. It was an index into []rune(text)
											
										
										
											2019-07-18 16:41:47 +10:00
+											math.Abs(spaceWidth*trm.ScalingFactorX()),
 											font,
 											to.state.tc)
-												First version of text extraction that recognizes columns

											
										
										
											2020-05-24 21:00:37 +10:00
+										if !onPage {
 											common.Log.Debug("Text mark outside page. Skipping")
 											continue
 										}
-												Finding bounding boxes of substrings of extracted text. (#109)

* Added text bounding box extraction.
* Add `font` field to textMark struct;
Create a new method `TextComponents` to retrieve all the text components of the extracted text in the page, with position and character informations
* Reorganizing extractor/text.go
* Added a text extraction position test.
* Added another text extraction location test.
* Text extraction location testing.
* Added tests for text extraction with location information.
* Cleaned up text extraction tests. No changes to functionality.
* Simplifying text extraction code.
* Simplified line construction in text.go
* Returning TextMark's in TextMarkArray which are based on PdfObjectArray but read-only, so not pointers.
* Added text extraction to show PDFs marked-up with bounding boxes of substring in extracted text.
* Add comments explaining how to calculate text bounding boxes.
* Made text_test.go naming consistent with function comments in text.go
* Use tm, pt, tl for textMark/TextMark PageText and TextLine receivers and local variables.
* uncommeted text stress test. Use go test --short to skip
* TextMark.Offset is now an index into the extracted text. It was an index into []rune(text)
											
										
										
											2019-07-18 16:41:47 +10:00
+										if font == nil {
 											common.Log.Debug("ERROR: No font.")
 										} else if font.Encoder() == nil {
 											common.Log.Debug("ERROR: No encoding. font=%s", font)
 										} else {
 											original, ok := font.Encoder().CharcodeToRune(code)
 											if ok {
 												mark.original = string(original)
 											}
 										}
-												Made TextList an opaque struct and renamed it to PageText to reflect its  purpose rather than its current implementation.

											
										
										
											2019-01-04 16:02:22 +11:00
+										common.Log.Trace("i=%d code=%d mark=%s trm=%s", i, code, mark, trm)
-												Abstracted textWord depth calculation. This required change textMark to *textMark in a lot of code.

											
										
										
											2020-05-25 09:39:30 +10:00
+										to.marks = append(to.marks, &mark)
-												First attempt at splitting words in text extraction using a space detection heuristic

											
										
										
											2018-10-09 11:49:59 +11:00
 										// update the text matrix by the displacement of the text location.
-												Corrected order of matrix multiplication for cm operator.

The change to Matrix.Concat made for this fix simplified some text extraction matrix code.

											
										
										
											2019-01-22 18:18:27 +11:00
+										to.tm.Concat(td)
-												First version of text extraction that recognizes columns

											
										
										
											2020-05-24 21:00:37 +10:00
+										if i != len(runeSlices)-1 {
 											to.logCursor()
 										}
-												Improvements to text extraction.

											
										
										
											2018-09-20 11:49:44 +10:00
+									}
-												Removed naked returns. Fixed godoc. Reorganized object extractors

											
										
										
											2018-07-25 12:00:49 +10:00
+									return nil
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+								}
-												Improvements to text extraction.

											
										
										
											2018-09-20 11:49:44 +10:00
+								// glyphTextRatio converts Glyph metrics units to unscaled text space units.
 								const glyphTextRatio = 1.0 / 1000.0
 								// translation returns the translation part of `m`.
-												Another round of addressing review comments

											
										
										
											2018-11-30 16:53:48 +00:00
+								func translation(m transform.Matrix) transform.Point {
-												Improvements to text extraction.

											
										
										
											2018-09-20 11:49:44 +10:00
+									tx, ty := m.Translation()
-												Update Jenkinsfile for matching examples branch. Address go vet.

											
										
										
											2019-03-09 20:45:19 +00:00
+									return transform.Point{X: tx, Y: ty}
-												Improvements to text extraction.

											
										
										
											2018-09-20 11:49:44 +10:00
+								}
 								// translationMatrix returns a matrix that translates by `p`.
-												Another round of addressing review comments

											
										
										
											2018-11-30 16:53:48 +00:00
+								func translationMatrix(p transform.Point) transform.Matrix {
 									return transform.TranslationMatrix(p.X, p.Y)
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+								}
 								// moveTo moves the start of line pointer by `tx`,`ty` and sets the text pointer to the
 								// start of line pointer.
 								// Move to the start of the next line, offset from the start of the current line by (tx, ty).
 								// `tx` and `ty` are in unscaled text space units.
 								func (to *textObject) moveTo(tx, ty float64) {
-												Corrected order of matrix multiplication for cm operator.

The change to Matrix.Concat made for this fix simplified some text extraction matrix code.

											
										
										
											2019-01-22 18:18:27 +11:00
+									to.tlm.Concat(transform.NewMatrix(1, 0, 0, 1, tx, ty))
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+									to.tm = to.tlm
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+								}
-												Finding bounding boxes of substrings of extracted text. (#109)

* Added text bounding box extraction.
* Add `font` field to textMark struct;
Create a new method `TextComponents` to retrieve all the text components of the extracted text in the page, with position and character informations
* Reorganizing extractor/text.go
* Added a text extraction position test.
* Added another text extraction location test.
* Text extraction location testing.
* Added tests for text extraction with location information.
* Cleaned up text extraction tests. No changes to functionality.
* Simplifying text extraction code.
* Simplified line construction in text.go
* Returning TextMark's in TextMarkArray which are based on PdfObjectArray but read-only, so not pointers.
* Added text extraction to show PDFs marked-up with bounding boxes of substring in extracted text.
* Add comments explaining how to calculate text bounding boxes.
* Made text_test.go naming consistent with function comments in text.go
* Use tm, pt, tl for textMark/TextMark PageText and TextLine receivers and local variables.
* uncommeted text stress test. Use go test --short to skip
* TextMark.Offset is now an index into the extracted text. It was an index into []rune(text)
											
										
										
											2019-07-18 16:41:47 +10:00
+								// isTextSpace returns true if `text` contains nothing but space code points.
 								func isTextSpace(text string) bool {
 									for _, r := range text {
 										if !unicode.IsSpace(r) {
 											return false
 										}
 									}
 									return true
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+								}
-												Changes missed in previous commit.

											
										
										
											2019-01-04 16:07:03 +11:00
+								// PageText represents the layout of text on a device page.
 								type PageText struct {
-												Abstracted textWord depth calculation. This required change textMark to *textMark in a lot of code.

											
										
										
											2020-05-25 09:39:30 +10:00
+									marks     []*textMark // Texts and their positions on a PDF page.
 									viewText  string      // Extracted page text.
 									viewMarks []TextMark  // Public view of `marks`.
-												First version of text extraction that recognizes columns

											
										
										
											2020-05-24 21:00:37 +10:00
+									pageSize  model.PdfRectangle
-												Made TextList an opaque struct and renamed it to PageText to reflect its  purpose rather than its current implementation.

											
										
										
											2019-01-04 16:02:22 +11:00
+								}
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
-												Changes missed in previous commit.

											
										
										
											2019-01-04 16:07:03 +11:00
+								// String returns a string describing `pt`.
 								func (pt PageText) String() string {
-												Finding bounding boxes of substrings of extracted text. (#109)

* Added text bounding box extraction.
* Add `font` field to textMark struct;
Create a new method `TextComponents` to retrieve all the text components of the extracted text in the page, with position and character informations
* Reorganizing extractor/text.go
* Added a text extraction position test.
* Added another text extraction location test.
* Text extraction location testing.
* Added tests for text extraction with location information.
* Cleaned up text extraction tests. No changes to functionality.
* Simplifying text extraction code.
* Simplified line construction in text.go
* Returning TextMark's in TextMarkArray which are based on PdfObjectArray but read-only, so not pointers.
* Added text extraction to show PDFs marked-up with bounding boxes of substring in extracted text.
* Add comments explaining how to calculate text bounding boxes.
* Made text_test.go naming consistent with function comments in text.go
* Use tm, pt, tl for textMark/TextMark PageText and TextLine receivers and local variables.
* uncommeted text stress test. Use go test --short to skip
* TextMark.Offset is now an index into the extracted text. It was an index into []rune(text)
											
										
										
											2019-07-18 16:41:47 +10:00
+									summary := fmt.Sprintf("PageText: %d elements", len(pt.marks))
 									parts := []string{"-" + summary}
 									for _, tm := range pt.marks {
 										parts = append(parts, tm.String())
-												Removed combineDiacritics from text extraction because it was causing ' and " to be combined with the letters proceeding them.
Need to fix this and reinstate combineDiacritics.

											
										
										
											2019-01-01 12:22:39 +11:00
+									}
-												Finding bounding boxes of substrings of extracted text. (#109)

* Added text bounding box extraction.
* Add `font` field to textMark struct;
Create a new method `TextComponents` to retrieve all the text components of the extracted text in the page, with position and character informations
* Reorganizing extractor/text.go
* Added a text extraction position test.
* Added another text extraction location test.
* Text extraction location testing.
* Added tests for text extraction with location information.
* Cleaned up text extraction tests. No changes to functionality.
* Simplifying text extraction code.
* Simplified line construction in text.go
* Returning TextMark's in TextMarkArray which are based on PdfObjectArray but read-only, so not pointers.
* Added text extraction to show PDFs marked-up with bounding boxes of substring in extracted text.
* Add comments explaining how to calculate text bounding boxes.
* Made text_test.go naming consistent with function comments in text.go
* Use tm, pt, tl for textMark/TextMark PageText and TextLine receivers and local variables.
* uncommeted text stress test. Use go test --short to skip
* TextMark.Offset is now an index into the extracted text. It was an index into []rune(text)
											
										
										
											2019-07-18 16:41:47 +10:00
+									parts = append(parts, "+"+summary)
-												Removed combineDiacritics from text extraction because it was causing ' and " to be combined with the letters proceeding them.
Need to fix this and reinstate combineDiacritics.

											
										
										
											2019-01-01 12:22:39 +11:00
+									return strings.Join(parts, "\n")
 								}
-												Finding bounding boxes of substrings of extracted text. (#109)

* Added text bounding box extraction.
* Add `font` field to textMark struct;
Create a new method `TextComponents` to retrieve all the text components of the extracted text in the page, with position and character information
* Reorganizing extractor/text.go
* Added a text extraction position test.
* Added another text extraction location test.
* Text extraction location testing.
* Added tests for text extraction with location information.
* Cleaned up text extraction tests. No changes to functionality.
* Simplifying text extraction code.
* Simplified line construction in text.go
* Returning TextMark's in TextMarkArray which are based on PdfObjectArray but read-only, so not pointers.
* Added text extraction to show PDFs marked-up with bounding boxes of substring in extracted text.
* Add comments explaining how to calculate text bounding boxes.
* Made text_test.go naming consistent with function comments in text.go
* Use tm, pt, tl for textMark/TextMark PageText and TextLine receivers and local variables.
* Uncommented text stress test. Use go test --short to skip
* TextMark.Offset is now an index into the extracted text. It was an index into []rune(text)
											
										
										
											2019-07-18 16:41:47 +10:00
+								// Text returns the extracted page text.
 								func (pt PageText) Text() string {
 									return pt.viewText
 								}
 								// ToText returns the page text as a single string. It simply forwards to Text().
 								//
 								// Deprecated: This function is deprecated and will be removed in a future major version. Please use
 								// Text() instead.
 								func (pt PageText) ToText() string {
 									text := pt.Text()
 									return text
 								}
 								// Marks returns the TextMark collection for a page. It represents all the text on the page.
 								// Each call returns a new TextMarkArray wrapping the `viewMarks` slice of `pt`.
 								func (pt PageText) Marks() *TextMarkArray {
 									ma := TextMarkArray{marks: pt.viewMarks}
 									return &ma
 								}
-												First version of text extraction that recognizes columns

											
										
										
											2020-05-24 21:00:37 +10:00
+								// computeViews processes the page TextMarks sorting by position and populates `pt.viewText` and
 								// `pt.viewMarks` which represent the text and marks in the order which it is read on the page.
 								// The comments above the TextMark definition describe how to use the []TextMark to
 								// maps substrings of the page text to locations on the PDF page.
 								func (pt *PageText) computeViews() {
 									common.Log.Trace("ToTextLocation: %d elements", len(pt.marks))
 									paras := makeTextPage(pt.marks, pt.pageSize, 0)
 									b := new(bytes.Buffer)
 									paras.writeText(b)
 									pt.viewText = b.String()
-												Got text_test.go passing.

											
										
										
											2020-05-27 18:15:18 +10:00
+									pt.viewMarks = paras.toTextMarks()
-												First version of text extraction that recognizes columns

											
										
										
											2020-05-24 21:00:37 +10:00
+								}
-												Finding bounding boxes of substrings of extracted text. (#109)

* Added text bounding box extraction.
* Add `font` field to textMark struct;
Create a new method `TextComponents` to retrieve all the text components of the extracted text in the page, with position and character information
* Reorganizing extractor/text.go
* Added a text extraction position test.
* Added another text extraction location test.
* Text extraction location testing.
* Added tests for text extraction with location information.
* Cleaned up text extraction tests. No changes to functionality.
* Simplifying text extraction code.
* Simplified line construction in text.go
* Returning TextMark's in TextMarkArray which are based on PdfObjectArray but read-only, so not pointers.
* Added text extraction to show PDFs marked-up with bounding boxes of substring in extracted text.
* Add comments explaining how to calculate text bounding boxes.
* Made text_test.go naming consistent with function comments in text.go
* Use tm, pt, tl for textMark/TextMark PageText and TextLine receivers and local variables.
* Uncommented text stress test. Use go test --short to skip
* TextMark.Offset is now an index into the extracted text. It was an index into []rune(text)
											
										
										
											2019-07-18 16:41:47 +10:00
+								// TextMarkArray is a collection of TextMarks.
 								type TextMarkArray struct {
 									marks []TextMark
 								}
-												Add Append to TextMarkArray

Useful when processing and grouping text marks.

											
										
										
											2019-08-04 09:29:21 +00:00
+								// Append appends `mark` to the mark array.
 								func (ma *TextMarkArray) Append(mark TextMark) {
 									ma.marks = append(ma.marks, mark)
 								}
-												Finding bounding boxes of substrings of extracted text. (#109)

* Added text bounding box extraction.
* Add `font` field to textMark struct;
Create a new method `TextComponents` to retrieve all the text components of the extracted text in the page, with position and character information
* Reorganizing extractor/text.go
* Added a text extraction position test.
* Added another text extraction location test.
* Text extraction location testing.
* Added tests for text extraction with location information.
* Cleaned up text extraction tests. No changes to functionality.
* Simplifying text extraction code.
* Simplified line construction in text.go
* Returning TextMark's in TextMarkArray which are based on PdfObjectArray but read-only, so not pointers.
* Added text extraction to show PDFs marked-up with bounding boxes of substring in extracted text.
* Add comments explaining how to calculate text bounding boxes.
* Made text_test.go naming consistent with function comments in text.go
* Use tm, pt, tl for textMark/TextMark PageText and TextLine receivers and local variables.
* Uncommented text stress test. Use go test --short to skip
* TextMark.Offset is now an index into the extracted text. It was an index into []rune(text)
											
										
										
											2019-07-18 16:41:47 +10:00
+								// String returns a string describing `ma`.
 								func (ma TextMarkArray) String() string {
 									n := len(ma.marks)
 									if n == 0 {
 										return "EMPTY"
 									}
 									m0 := ma.marks[0]
 									m1 := ma.marks[n-1]
 									return fmt.Sprintf("{TEXTMARKARRAY: %d elements\n\tfirst=%s\n\t last=%s}", n, m0, m1)
 								}
 								// Elements returns the TextMarks in `ma`.
 								// The returned slice is the array's own backing slice, not a copy. A nil `ma` yields nil, in
 								// line with the nil handling of Len and RangeOffset.
 								func (ma *TextMarkArray) Elements() []TextMark {
 									if ma == nil {
 										return nil
 									}
 									return ma.marks
 								}
 								// Len returns the number of TextMarks in `ma`. A nil `ma` has length 0.
 								func (ma *TextMarkArray) Len() int {
 									if ma != nil {
 										return len(ma.marks)
 									}
 									return 0
 								}
-												Got text_test.go passing.

											
										
										
											2020-05-27 18:15:18 +10:00
+								// RangeOffset returns the TextMarks in `ma` that overlap text[start:end] in the extracted text.
 								// These are tm: `start` <= tm.Offset + len(tm.Text) && tm.Offset < `end` where
 								// `start` and `end` are offsets in the extracted text.
 								// NOTE: TextMarks can contain multiple characters. e.g. "ffi" for the ﬃ ligature so the first and
 								// last elements of the returned TextMarkArray may only partially overlap text[start:end].
-												Finding bounding boxes of substrings of extracted text. (#109)

* Added text bounding box extraction.
* Add `font` field to textMark struct;
Create a new method `TextComponents` to retrieve all the text components of the extracted text in the page, with position and character informations
* Reorganizing extractor/text.go
* Added a text extraction position test.
* Added another text extraction location test.
* Text extraction location testing.
* Added tests for text extraction with location information.
* Cleaned up text extraction tests. No changes to functionality.
* Simplifying text extraction code.
* Simplified line construction in text.go
* Returning TextMark's in TextMarkArray which are based on PdfObjectArray but read-only, so not pointers.
* Added text extraction to show PDFs marked-up with bounding boxes of substring in extracted text.
* Add comments explaining how to calculate text bounding boxes.
* Made text_test.go naming consistent with function comments in text.go
* Use tm, pt, tl for textMark/TextMark PageText and TextLine receivers and local variables.
* uncommeted text stress test. Use go test --short to skip
* TextMark.Offset is now an index into the extracted text. It was an index into []rune(text)
											
										
										
											2019-07-18 16:41:47 +10:00
+								func (ma *TextMarkArray) RangeOffset(start, end int) (*TextMarkArray, error) {
 									if ma == nil {
 										return nil, errors.New("ma==nil")
 									}
 									if end < start {
 										return nil, fmt.Errorf("end < start. RangeOffset not defined. start=%d end=%d ", start, end)
 									}
 									n := len(ma.marks)
 									if n == 0 {
 										return ma, nil
 									}
 									if start < ma.marks[0].Offset {
 										start = ma.marks[0].Offset
 									}
 									if end > ma.marks[n-1].Offset+1 {
 										end = ma.marks[n-1].Offset + 1
 									}
-												Got text_test.go passing.

											
										
										
											2020-05-27 18:15:18 +10:00
+									iStart := sort.Search(n, func(i int) bool { return ma.marks[i].Offset+len(ma.marks[i].Text)-1 >= start })
-												Finding bounding boxes of substrings of extracted text. (#109)

* Added text bounding box extraction.
* Add `font` field to textMark struct;
Create a new method `TextComponents` to retrieve all the text components of the extracted text in the page, with position and character informations
* Reorganizing extractor/text.go
* Added a text extraction position test.
* Added another text extraction location test.
* Text extraction location testing.
* Added tests for text extraction with location information.
* Cleaned up text extraction tests. No changes to functionality.
* Simplifying text extraction code.
* Simplified line construction in text.go
* Returning TextMark's in TextMarkArray which are based on PdfObjectArray but read-only, so not pointers.
* Added text extraction to show PDFs marked-up with bounding boxes of substring in extracted text.
* Add comments explaining how to calculate text bounding boxes.
* Made text_test.go naming consistent with function comments in text.go
* Use tm, pt, tl for textMark/TextMark PageText and TextLine receivers and local variables.
* uncommeted text stress test. Use go test --short to skip
* TextMark.Offset is now an index into the extracted text. It was an index into []rune(text)
											
										
										
											2019-07-18 16:41:47 +10:00
+									if !(0 <= iStart && iStart < n) {
 										err := fmt.Errorf("Out of range. start=%d iStart=%d len=%d\n\tfirst=%v\n\t last=%v",
 											start, iStart, n, ma.marks[0], ma.marks[n-1])
 										return nil, err
 									}
 									iEnd := sort.Search(n, func(i int) bool { return ma.marks[i].Offset > end-1 })
 									if !(0 <= iEnd && iEnd < n) {
 										err := fmt.Errorf("Out of range. end=%d iEnd=%d len=%d\n\tfirst=%v\n\t last=%v",
 											end, iEnd, n, ma.marks[0], ma.marks[n-1])
 										return nil, err
 									}
 									if iEnd <= iStart {
 										// This should never happen.
-												Got text_test.go passing.

											
										
										
											2020-05-27 18:15:18 +10:00
+										return nil, fmt.Errorf("iEnd <= iStart: start=%d end=%d iStart=%d iEnd=%d",
 											start, end, iStart, iEnd)
-												Finding bounding boxes of substrings of extracted text. (#109)

* Added text bounding box extraction.
* Add `font` field to textMark struct;
Create a new method `TextComponents` to retrieve all the text components of the extracted text in the page, with position and character informations
* Reorganizing extractor/text.go
* Added a text extraction position test.
* Added another text extraction location test.
* Text extraction location testing.
* Added tests for text extraction with location information.
* Cleaned up text extraction tests. No changes to functionality.
* Simplifying text extraction code.
* Simplified line construction in text.go
* Returning TextMark's in TextMarkArray which are based on PdfObjectArray but read-only, so not pointers.
* Added text extraction to show PDFs marked-up with bounding boxes of substring in extracted text.
* Add comments explaining how to calculate text bounding boxes.
* Made text_test.go naming consistent with function comments in text.go
* Use tm, pt, tl for textMark/TextMark PageText and TextLine receivers and local variables.
* uncommeted text stress test. Use go test --short to skip
* TextMark.Offset is now an index into the extracted text. It was an index into []rune(text)
											
										
										
											2019-07-18 16:41:47 +10:00
+									}
 									return &TextMarkArray{marks: ma.marks[iStart:iEnd]}, nil
 								}
 								// BBox returns the smallest axis-aligned rectangle that encloses all the TextMarks in `ma`.
 								func (ma *TextMarkArray) BBox() (model.PdfRectangle, bool) {
-												First version of text extraction that recognizes columns

											
										
										
											2020-05-24 21:00:37 +10:00
+									var bbox model.PdfRectangle
 									found := false
 									for _, tm := range ma.marks {
 										if tm.Meta || isTextSpace(tm.Text) {
-												Finding bounding boxes of substrings of extracted text. (#109)

* Added text bounding box extraction.
* Add `font` field to textMark struct;
Create a new method `TextComponents` to retrieve all the text components of the extracted text in the page, with position and character informations
* Reorganizing extractor/text.go
* Added a text extraction position test.
* Added another text extraction location test.
* Text extraction location testing.
* Added tests for text extraction with location information.
* Cleaned up text extraction tests. No changes to functionality.
* Simplifying text extraction code.
* Simplified line construction in text.go
* Returning TextMark's in TextMarkArray which are based on PdfObjectArray but read-only, so not pointers.
* Added text extraction to show PDFs marked-up with bounding boxes of substring in extracted text.
* Add comments explaining how to calculate text bounding boxes.
* Made text_test.go naming consistent with function comments in text.go
* Use tm, pt, tl for textMark/TextMark PageText and TextLine receivers and local variables.
* uncommeted text stress test. Use go test --short to skip
* TextMark.Offset is now an index into the extracted text. It was an index into []rune(text)
											
										
										
											2019-07-18 16:41:47 +10:00
+											continue
-												Combine diacritics in text extraction.

											
										
										
											2018-11-28 18:06:03 +11:00
+										}
-												First version of text extraction that recognizes columns

											
										
										
											2020-05-24 21:00:37 +10:00
+										if found {
 											bbox = rectUnion(bbox, tm.BBox)
 										} else {
 											bbox = tm.BBox
 											found = true
 										}
-												Finding bounding boxes of substrings of extracted text. (#109)

* Added text bounding box extraction.
* Add `font` field to textMark struct;
Create a new method `TextComponents` to retrieve all the text components of the extracted text in the page, with position and character informations
* Reorganizing extractor/text.go
* Added a text extraction position test.
* Added another text extraction location test.
* Text extraction location testing.
* Added tests for text extraction with location information.
* Cleaned up text extraction tests. No changes to functionality.
* Simplifying text extraction code.
* Simplified line construction in text.go
* Returning TextMark's in TextMarkArray which are based on PdfObjectArray but read-only, so not pointers.
* Added text extraction to show PDFs marked-up with bounding boxes of substring in extracted text.
* Add comments explaining how to calculate text bounding boxes.
* Made text_test.go naming consistent with function comments in text.go
* Use tm, pt, tl for textMark/TextMark PageText and TextLine receivers and local variables.
* uncommeted text stress test. Use go test --short to skip
* TextMark.Offset is now an index into the extracted text. It was an index into []rune(text)
											
										
										
											2019-07-18 16:41:47 +10:00
+									}
-												First version of text extraction that recognizes columns

											
										
										
											2020-05-24 21:00:37 +10:00
+									return bbox, found
-												Finding bounding boxes of substrings of extracted text. (#109)

* Added text bounding box extraction.
* Add `font` field to textMark struct;
Create a new method `TextComponents` to retrieve all the text components of the extracted text in the page, with position and character informations
* Reorganizing extractor/text.go
* Added a text extraction position test.
* Added another text extraction location test.
* Text extraction location testing.
* Added tests for text extraction with location information.
* Cleaned up text extraction tests. No changes to functionality.
* Simplifying text extraction code.
* Simplified line construction in text.go
* Returning TextMark's in TextMarkArray which are based on PdfObjectArray but read-only, so not pointers.
* Added text extraction to show PDFs marked-up with bounding boxes of substring in extracted text.
* Add comments explaining how to calculate text bounding boxes.
* Made text_test.go naming consistent with function comments in text.go
* Use tm, pt, tl for textMark/TextMark PageText and TextLine receivers and local variables.
* uncommeted text stress test. Use go test --short to skip
* TextMark.Offset is now an index into the extracted text. It was an index into []rune(text)
											
										
										
											2019-07-18 16:41:47 +10:00
+								}
 								// TextMark represents extracted text on a page with information regarding both textual content,
 								// formatting (font and size) and positioning.
 								// It is the smallest unit of text on a PDF page, typically a single character.
 								//
 								// getBBox() in test_text.go shows how to compute bounding boxes of substrings of extracted text.
 								// The following code extracts the text on PDF page `page` into `text` then finds the bounding box
 								// `bbox` of substring `term` in `text`.
 								//
 								//     ex, _ := New(page)
 								//     // handle errors
 								//     pageText, _, _, err := ex.ExtractPageText()
 								//     // handle errors
 								//     text := pageText.Text()
 								//     textMarks := pageText.Marks()
 								//
 								//     	start := strings.Index(text, term)
 								//      end := start + len(term)
 								//      spanMarks, err := textMarks.RangeOffset(start, end)
 								//      // handle errors
 								//      bbox, ok := spanMarks.BBox()
 								//      // handle errors
 								type TextMark struct {
-												First version of text extraction that recognizes columns

											
										
										
											2020-05-24 21:00:37 +10:00
+									count int64
-												Finding bounding boxes of substrings of extracted text. (#109)

* Added text bounding box extraction.
* Add `font` field to textMark struct;
Create a new method `TextComponents` to retrieve all the text components of the extracted text in the page, with position and character informations
* Reorganizing extractor/text.go
* Added a text extraction position test.
* Added another text extraction location test.
* Text extraction location testing.
* Added tests for text extraction with location information.
* Cleaned up text extraction tests. No changes to functionality.
* Simplifying text extraction code.
* Simplified line construction in text.go
* Returning TextMark's in TextMarkArray which are based on PdfObjectArray but read-only, so not pointers.
* Added text extraction to show PDFs marked-up with bounding boxes of substring in extracted text.
* Add comments explaining how to calculate text bounding boxes.
* Made text_test.go naming consistent with function comments in text.go
* Use tm, pt, tl for textMark/TextMark PageText and TextLine receivers and local variables.
* uncommeted text stress test. Use go test --short to skip
* TextMark.Offset is now an index into the extracted text. It was an index into []rune(text)
											
										
										
											2019-07-18 16:41:47 +10:00
+									// Text is the extracted text. It has been decoded to Unicode via ToUnicode().
 									Text string
 									// Original is the text in the PDF. It has not been decoded like `Text`.
 									Original string
 									// BBox is the bounding box of the text.
 									BBox model.PdfRectangle
 									// Font is the font the text was drawn with.
 									Font *model.PdfFont
 									// FontSize is the font size the text was drawn with.
 									FontSize float64
 									// Offset is the offset of the start of TextMark.Text in the extracted text. If you do this
 									//   text, textMarks := pageText.Text(), pageText.Marks()
 									//   marks := textMarks.Elements()
 									// then marks[i].Offset is the offset of marks[i].Text in text.
 									Offset int
 									// Meta is set true for spaces and line breaks that we insert in the extracted text. We insert
 									// spaces (line breaks) when we see characters that are over a threshold horizontal (vertical)
 									//  distance  apart. See wordJoiner (lineJoiner) in PageText.computeViews().
 									Meta bool
 								}
 								// String returns a string describing `tm`.
 								func (tm TextMark) String() string {
 									b := tm.BBox
 									var font string
 									if tm.Font != nil {
 										font = tm.Font.String()
 										if len(font) > 50 {
 											font = font[:50] + "..."
 										}
 									}
 									var meta string
 									if tm.Meta {
 										meta = " *M*"
 									}
-												Got text_test.go passing.

											
										
										
											2020-05-27 18:15:18 +10:00
+									return fmt.Sprintf("{@%04d TextMark: %d %q=%02x (%6.2f, %6.2f) (%6.2f, %6.2f) %s%s}",
-												First version of text extraction that recognizes columns

											
										
										
											2020-05-24 21:00:37 +10:00
+										tm.count, tm.Offset, tm.Text, []rune(tm.Text), b.Llx, b.Lly, b.Urx, b.Ury, font, meta)
-												Combine diacritics in text extraction.

											
										
										
											2018-11-28 18:06:03 +11:00
+								}
-												First version of text extraction that recognizes columns

											
										
										
											2020-05-24 21:00:37 +10:00
+								// spaceMark is a special TextMark used for spaces.
 								var spaceMark = TextMark{
 									Text:     "[X]",
 									Original: " ",
 									Meta:     true,
-												Combine diacritics in text extraction.

											
										
										
											2018-11-28 18:06:03 +11:00
+								}
-												Improvements to text extraction.

											
										
										
											2018-09-20 11:49:44 +10:00
+								// getCurrentFont returns the font on top of the font stack, or DefaultFont if the font stack is
 								// empty.
 								func (to *textObject) getCurrentFont() *model.PdfFont {
-												Handle more cases of fonts not being set in text extraction code.

											
										
										
											2020-05-28 12:08:15 +10:00
+									var font *model.PdfFont
 									if !to.savedStates.empty() {
 										font = to.savedStates.top().tfont
 									}
 									if font == nil {
-												Improvements to text extraction.

											
										
										
											2018-09-20 11:49:44 +10:00
+										common.Log.Debug("ERROR: No font defined. Using default.")
 										return model.DefaultFont()
 									}
-												Handle more cases of fonts not being set in text extraction code.

											
										
										
											2020-05-28 12:08:15 +10:00
+									return font
-												Improvements to text extraction.

											
										
										
											2018-09-20 11:49:44 +10:00
+								}
-												Noted that text extractor is an intermediate version

											
										
										
											2018-06-28 11:11:43 +10:00
+								// getFont returns the font named `name` if it exists in the page's resources or an error if it
-												Cache PdfFont's in text extractor

											
										
										
											2018-09-17 12:12:06 +10:00
+								// doesn't. It caches the returned fonts.
-												Removed naked returns. Fixed godoc. Reorganized object extractors

											
										
										
											2018-07-25 12:00:49 +10:00
+								func (to *textObject) getFont(name string) (*model.PdfFont, error) {
-												Moved font cache from global variable to Extractor.

											
										
										
											2018-09-22 09:28:18 +10:00
+									if to.e.fontCache != nil {
 										to.e.accessCount++
 										entry, ok := to.e.fontCache[name]
 										if ok {
 											entry.access = to.e.accessCount
 											return entry.font, nil
 										}
-												Cache PdfFont's in text extractor

											
										
										
											2018-09-17 12:12:06 +10:00
+									}
 									// Font not in cache. Load it.
 									font, err := to.getFontDirect(name)
 									if err != nil {
 										return nil, err
 									}
-												Moved font cache from global variable to Extractor.

											
										
										
											2018-09-22 09:28:18 +10:00
+									if to.e.fontCache != nil {
 										entry := fontEntry{font, to.e.accessCount}
 										// Eject a victim if the cache is full.
 										if len(to.e.fontCache) >= maxFontCache {
-												define slices with a var instead of an empty literal

											
										
										
											2018-12-09 19:28:50 +02:00
+											var names []string
-												Moved font cache from global variable to Extractor.

											
										
										
											2018-09-22 09:28:18 +10:00
+											for name := range to.e.fontCache {
 												names = append(names, name)
 											}
 											sort.Slice(names, func(i, j int) bool {
 												return to.e.fontCache[names[i]].access < to.e.fontCache[names[j]].access
 											})
 											delete(to.e.fontCache, names[0])
-												Cache PdfFont's in text extractor

											
										
										
											2018-09-17 12:12:06 +10:00
+										}
-												Moved font cache from global variable to Extractor.

											
										
										
											2018-09-22 09:28:18 +10:00
+										to.e.fontCache[name] = entry
-												Cache PdfFont's in text extractor

											
										
										
											2018-09-17 12:12:06 +10:00
+									}
 									return font, nil
 								}
-												Cleaned up some comments.

											
										
										
											2018-09-21 16:43:10 +10:00
+								// fontEntry is a entry in the font cache.
-												Cache PdfFont's in text extractor

											
										
										
											2018-09-17 12:12:06 +10:00
+								type fontEntry struct {
 									font   *model.PdfFont // The font being cached.
 									access int64          // Last access. Used to determine LRU cache victims.
 								}
 								// maxFontCache is the maximum number of PdfFonts held in fontCache. When the cache already holds
 								// this many fonts, getFont evicts one entry before adding a new one.
 								const maxFontCache = 10
 								// getFontDirect returns the font named `name` if it exists in the page's resources or an error if
-												Addressing review comments

											
										
										
											2018-11-28 23:25:17 +00:00
+								// it doesn't. Accesses page resources directly (not cached).
-												Cache PdfFont's in text extractor

											
										
										
											2018-09-17 12:12:06 +10:00
+								func (to *textObject) getFontDirect(name string) (*model.PdfFont, error) {
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+									fontObj, err := to.getFontDict(name)
 									if err != nil {
 										return nil, err
 									}
 									font, err := model.NewPdfFontFromPdfObject(fontObj)
 									if err != nil {
-												Cache PdfFont's in text extractor

											
										
										
											2018-09-17 12:12:06 +10:00
+										common.Log.Debug("getFontDirect: NewPdfFontFromPdfObject failed. name=%#q err=%v", name, err)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+									}
 									return font, err
 								}
-												Fixed incorrectly named variable.

											
										
										
											2018-12-27 21:33:31 +11:00
+								// getFontDict returns the font dict with key `name` if it exists in the page's or form's Font
 								// resources or an error if it doesn't.
-												Removed naked returns. Fixed godoc. Reorganized object extractors

											
										
										
											2018-07-25 12:00:49 +10:00
+								func (to *textObject) getFontDict(name string) (fontObj core.PdfObject, err error) {
-												Recurse through form XObjects for text extractions.

											
										
										
											2018-12-27 20:51:34 +11:00
+									resources := to.resources
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+									if resources == nil {
 										common.Log.Debug("getFontDict. No resources. name=%#q", name)
-												Removed naked returns. Fixed godoc. Reorganized object extractors

											
										
										
											2018-07-25 12:00:49 +10:00
+										return nil, nil
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+									}
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+									fontObj, found := resources.GetFontByName(core.PdfObjectName(name))
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+									if !found {
-												Removed naked returns. Fixed godoc. Reorganized object extractors

											
										
										
											2018-07-25 12:00:49 +10:00
+										common.Log.Debug("ERROR: getFontDict: Font not found: name=%#q", name)
-												Set font even when Tf operator is not between BT and ET.

											
										
										
											2018-11-21 13:14:11 +11:00
+										return nil, errors.New("font not in resources")
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+									}
-												Removed naked returns. Fixed godoc. Reorganized object extractors

											
										
										
											2018-07-25 12:00:49 +10:00
+									return fontObj, nil
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+								}