unipdf/extractor/text.go

/*
 * This file is subject to the terms and conditions defined in
 * file 'LICENSE.md', which is part of this source code package.
 */

package extractor

import (
	"errors"
	"fmt"
	"math"
	"sort"
	"strings"
	"unicode"

	"github.com/unidoc/unipdf/v3/common"
	"github.com/unidoc/unipdf/v3/contentstream"
	"github.com/unidoc/unipdf/v3/core"
	"github.com/unidoc/unipdf/v3/internal/transform"
	"github.com/unidoc/unipdf/v3/model"
	"golang.org/x/text/unicode/norm"
)

// ExtractText processes and extracts all text data in content streams and returns it as a string.
// It is a convenience wrapper around ExtractTextWithStats that discards the character counts.
// It takes into account character encodings in the PDF file, which are decoded by
// CharcodeBytesToUnicode.
// Characters that can't be decoded are replaced with MissingCodeRune ('\ufffd', the Unicode
// replacement character).
func (e *Extractor) ExtractText() (string, error) {
	text, _, _, err := e.ExtractTextWithStats()
	return text, err
}

// ExtractTextWithStats behaves like ExtractText and additionally reports `numChars`, the number
// of characters in the output, and `numMisses`, the number of characters that could not be
// decoded. When extraction fails the returned text is empty but the counts gathered so far are
// still returned together with the error.
func (e *Extractor) ExtractTextWithStats() (extracted string, numChars int, numMisses int, err error) {
	var pageText *PageText
	pageText, numChars, numMisses, err = e.ExtractPageText()
	if err == nil {
		extracted = pageText.ToText()
	}
	return extracted, numChars, numMisses, err
}

// ExtractPageText returns the text contents of `e` (an Extractor for a page) as a PageText.
// It runs extractPageText on the extractor's own content stream and resources at nesting level 0.
// The two ints returned are the character count and the undecoded character count reported by
// extractPageText.
func (e *Extractor) ExtractPageText() (*PageText, int, int, error) {
	return e.extractPageText(e.contents, e.resources, 0)
}

// extractPageText returns the text contents of content stream `contents` and resources
// `resources` as a PageText.
// This can be called on a page or a form XObject. `level` is the form XObject nesting depth (0 for
// the page itself); it is logged and incremented when recursing into a form.
// The two ints returned are the number of characters extracted and the number of characters that
// could not be decoded. A non-nil PageText is returned even when an error occurs.
func (e *Extractor) extractPageText(contents string, resources *model.PdfPageResources, level int) (*PageText, int, int, error) {

	common.Log.Trace("extractPageText: level=%d", level)
	pageText := &PageText{}
	state := newTextState()
	fontStack := fontStacker{}
	var to *textObject

	cstreamParser := contentstream.NewContentStreamParser(contents)
	operations, err := cstreamParser.Parse()
	if err != nil {
		common.Log.Debug("ERROR: extractPageText parse failed. err=%v", err)
		return pageText, state.numChars, state.numMisses, err
	}

	processor := contentstream.NewContentStreamProcessor(*operations)

	processor.AddHandler(contentstream.HandlerConditionEnumAllOperands, "",
		func(op *contentstream.ContentStreamOperation, gs contentstream.GraphicsState,
			resources *model.PdfPageResources) error {

			operand := op.Operand

			// These operators read or update the text object itself (its matrices or marks).
			// Outside a BT..ET pair there is no text object, so ignore them instead of
			// dereferencing a nil pointer.
			switch operand {
			case "ET", "T*", "Td", "TD", "Tj", "TJ", "'", `"`, "Tm":
				if to == nil {
					common.Log.Debug("ERROR: %#q operator outside text object. Ignoring.", operand)
					return nil
				}
			}

			switch operand {
			case "q":
				if !fontStack.empty() {
					common.Log.Trace("Save font state: %s\n%s",
						fontStack.peek(), fontStack.String())
					fontStack.push(fontStack.peek())
				}
				if state.tfont != nil {
					common.Log.Trace("Save font state: %s\n->%s\n%s",
						fontStack.peek(), state.tfont, fontStack.String())
					fontStack.push(state.tfont)
				}
			case "Q":
				if !fontStack.empty() {
					common.Log.Trace("Restore font state: %s\n->%s\n%s",
						fontStack.peek(), fontStack.get(-2), fontStack.String())
					fontStack.pop()
				}
				if len(fontStack) >= 2 {
					common.Log.Trace("Restore font state: %s\n->%s\n%s",
						state.tfont, fontStack.peek(), fontStack.String())
					state.tfont = fontStack.pop()
				}
			case "BT": // Begin text
				// Begin a text object, initializing the text matrix, Tm, and the text line matrix,
				// Tlm, to the identity matrix. Text objects shall not be nested; a second BT shall
				// not appear before an ET.
				if to != nil {
					common.Log.Debug("BT called while in a text object")
				}
				to = newTextObject(e, resources, gs, &state, &fontStack)
			case "ET": // End Text
				pageText.marks = append(pageText.marks, to.marks...)
				to = nil
			case "T*": // Move to start of next text line
				to.nextLine()
			case "Td": // Move text location
				if ok, err := to.checkOp(op, 2, true); !ok {
					common.Log.Debug("ERROR: err=%v", err)
					return err
				}
				x, y, err := toFloatXY(op.Params)
				if err != nil {
					return err
				}
				to.moveText(x, y)
			case "TD": // Move text location and set leading.
				if ok, err := to.checkOp(op, 2, true); !ok {
					common.Log.Debug("ERROR: err=%v", err)
					return err
				}
				x, y, err := toFloatXY(op.Params)
				if err != nil {
					common.Log.Debug("ERROR: err=%v", err)
					return err
				}
				to.moveTextSetLeading(x, y)
			case "Tj": // Show text.
				if ok, err := to.checkOp(op, 1, true); !ok {
					common.Log.Debug("ERROR: Tj op=%s err=%v", op, err)
					return err
				}
				charcodes, ok := core.GetStringBytes(op.Params[0])
				if !ok {
					common.Log.Debug("ERROR: Tj op=%s GetStringBytes failed", op)
					return core.ErrTypeError
				}
				return to.showText(charcodes)
			case "TJ": // Show text with adjustable spacing.
				if ok, err := to.checkOp(op, 1, true); !ok {
					common.Log.Debug("ERROR: TJ err=%v", err)
					return err
				}
				args, ok := core.GetArray(op.Params[0])
				if !ok {
					common.Log.Debug("ERROR: TJ op=%s GetArrayVal failed", op)
					// The `err` in scope here is the (nil) parse error from the enclosing
					// function, so an explicit error must be returned.
					return core.ErrTypeError
				}
				return to.showTextAdjusted(args)
			case "'": // Move to next line and show text.
				if ok, err := to.checkOp(op, 1, true); !ok {
					common.Log.Debug("ERROR: ' err=%v", err)
					return err
				}
				charcodes, ok := core.GetStringBytes(op.Params[0])
				if !ok {
					common.Log.Debug("ERROR: ' op=%s GetStringBytes failed", op)
					return core.ErrTypeError
				}
				to.nextLine()
				return to.showText(charcodes)
			case `"`: // Set word and character spacing, move to next line, and show text.
				// The operands are `aw ac string`, so 3 params are required.
				if ok, err := to.checkOp(op, 3, true); !ok {
					common.Log.Debug("ERROR: \" err=%v", err)
					return err
				}
				aw, ac, err := toFloatXY(op.Params[:2])
				if err != nil {
					return err
				}
				charcodes, ok := core.GetStringBytes(op.Params[2])
				if !ok {
					common.Log.Debug("ERROR: \" op=%s GetStringBytes failed", op)
					return core.ErrTypeError
				}
				// Same as: aw Tw ac Tc string '
				to.setWordSpacing(aw)
				to.setCharSpacing(ac)
				to.nextLine()
				return to.showText(charcodes)
			case "TL": // Set text leading.
				y, err := floatParam(op)
				if err != nil {
					common.Log.Debug("ERROR: TL err=%v", err)
					return err
				}
				to.setTextLeading(y)
			case "Tc": // Set character spacing.
				y, err := floatParam(op)
				if err != nil {
					common.Log.Debug("ERROR: Tc err=%v", err)
					return err
				}
				to.setCharSpacing(y)
			case "Tf": // Set font.
				if to == nil {
					// This is needed for 26-Hazard-Thermal-environment.pdf
					to = newTextObject(e, resources, gs, &state, &fontStack)
				}
				if ok, err := to.checkOp(op, 2, true); !ok {
					common.Log.Debug("ERROR: Tf err=%v", err)
					return err
				}
				name, ok := core.GetNameVal(op.Params[0])
				if !ok {
					common.Log.Debug("ERROR: Tf op=%s GetNameVal failed", op)
					return core.ErrTypeError
				}
				size, err := core.GetNumberAsFloat(op.Params[1])
				if err != nil {
					common.Log.Debug("ERROR: Tf op=%s GetFloatVal failed. err=%v", op, err)
					return err
				}
				err = to.setFont(name, size)
				if err != nil {
					return err
				}
			case "Tm": // Set text matrix.
				if ok, err := to.checkOp(op, 6, true); !ok {
					common.Log.Debug("ERROR: Tm err=%v", err)
					return err
				}
				floats, err := core.GetNumbersAsFloat(op.Params)
				if err != nil {
					common.Log.Debug("ERROR: err=%v", err)
					return err
				}
				to.setTextMatrix(floats)
			case "Tr": // Set text rendering mode.
				if ok, err := to.checkOp(op, 1, true); !ok {
					common.Log.Debug("ERROR: Tr err=%v", err)
					return err
				}
				mode, ok := core.GetIntVal(op.Params[0])
				if !ok {
					common.Log.Debug("ERROR: Tr op=%s GetIntVal failed", op)
					return core.ErrTypeError
				}
				to.setTextRenderMode(mode)
			case "Ts": // Set text rise.
				if ok, err := to.checkOp(op, 1, true); !ok {
					common.Log.Debug("ERROR: Ts err=%v", err)
					return err
				}
				y, err := core.GetNumberAsFloat(op.Params[0])
				if err != nil {
					common.Log.Debug("ERROR: err=%v", err)
					return err
				}
				to.setTextRise(y)
			case "Tw": // Set word spacing.
				if ok, err := to.checkOp(op, 1, true); !ok {
					common.Log.Debug("ERROR: err=%v", err)
					return err
				}
				y, err := core.GetNumberAsFloat(op.Params[0])
				if err != nil {
					common.Log.Debug("ERROR: err=%v", err)
					return err
				}
				to.setWordSpacing(y)
			case "Tz": // Set horizontal scaling.
				if ok, err := to.checkOp(op, 1, true); !ok {
					common.Log.Debug("ERROR: err=%v", err)
					return err
				}
				y, err := core.GetNumberAsFloat(op.Params[0])
				if err != nil {
					common.Log.Debug("ERROR: err=%v", err)
					return err
				}
				to.setHorizScaling(y)

			case "Do":
				// Handle XObjects by recursing through form XObjects.
				if len(op.Params) == 0 {
					common.Log.Debug("ERROR: %#q should have %d input params, got %d %+v",
						op.Operand, 1, len(op.Params), op.Params)
					return errors.New("incorrect parameter count")
				}
				nameObj, ok := op.Params[0].(*core.PdfObjectName)
				if !ok || nameObj == nil {
					common.Log.Debug("ERROR: Do op=%s param is not a name", op)
					return core.ErrTypeError
				}
				name := *nameObj
				_, xtype := resources.GetXObjectByName(name)
				if xtype != model.XObjectTypeForm {
					break
				}
				// Only process each form once.
				formResult, ok := e.formResults[string(name)]
				if !ok {
					xform, err := resources.GetXObjectFormByName(name)
					if err != nil {
						common.Log.Debug("ERROR: %v", err)
						return err
					}
					formContent, err := xform.GetContentStream()
					if err != nil {
						common.Log.Debug("ERROR: %v", err)
						return err
					}
					// A form without its own resources uses those of the enclosing stream.
					formResources := xform.Resources
					if formResources == nil {
						formResources = resources
					}
					tList, numChars, numMisses, err := e.extractPageText(string(formContent),
						formResources, level+1)
					if err != nil {
						common.Log.Debug("ERROR: %v", err)
						return err
					}
					formResult = textResult{*tList, numChars, numMisses}
					e.formResults[string(name)] = formResult
				}

				pageText.marks = append(pageText.marks, formResult.pageText.marks...)
				state.numChars += formResult.numChars
				state.numMisses += formResult.numMisses
			}
			return nil
		})

	err = processor.Process(resources)
	if err != nil {
		common.Log.Debug("ERROR: Processing: err=%v", err)
	}
	return pageText, state.numChars, state.numMisses, err
}

// textResult is the cached outcome of extracting text from a form XObject: the extracted
// PageText together with the character counts returned by extractPageText.
type textResult struct {
	pageText  PageText // Text marks extracted from the form.
	numChars  int      // Number of characters extracted.
	numMisses int      // Number of characters that could not be decoded.
}

//
// Text operators
//

// moveText "Td" Moves start of text by `tx`,`ty`.
// Move to the start of the next line, offset from the start of the current line by (tx, ty).
// tx and ty are in unscaled text space units.
// This is a direct call to moveTo.
func (to *textObject) moveText(tx, ty float64) {
	to.moveTo(tx, ty)
}

// moveTextSetLeading "TD" Move text location and set leading.
// Move to the start of the next line, offset from the start of the current line by (tx, ty). As a
// side effect, this operator shall set the leading parameter in the text state. This operator shall
// have the same effect as this code:
//  -ty TL
//  tx ty Td
func (to *textObject) moveTextSetLeading(tx, ty float64) {
	// The leading is stored as a positive number, hence the negation of ty.
	to.state.tl = -ty
	to.moveTo(tx, ty)
}

// nextLine "T*" Moves start of text line to next text line.
// Move to the start of the next line. This operator has the same effect as the code
//    0 -Tl Td
// where Tl denotes the current leading parameter in the text state. The negative of Tl is used
// here because Tl is the text leading expressed as a positive number. Going to the next line
// entails decreasing the y coordinate. (page 250)
func (to *textObject) nextLine() {
	to.moveTo(0, -to.state.tl)
}

// setTextMatrix "Tm".
// Replaces both the text matrix, Tm, and the text line matrix, Tlm, with the matrix built from
// the 6 numbers in `f` (page 250). If `f` does not hold exactly 6 numbers the call is logged and
// ignored.
func (to *textObject) setTextMatrix(f []float64) {
	if n := len(f); n != 6 {
		common.Log.Debug("ERROR: len(f) != 6 (%d)", n)
		return
	}
	to.tm = transform.NewMatrix(f[0], f[1], f[2], f[3], f[4], f[5])
	to.tlm = to.tm
}

// showText "Tj". Show a text string.
// `charcodes` are the raw string bytes of the operand; they are passed straight to renderText and
// its error is returned.
func (to *textObject) showText(charcodes []byte) error {
	return to.renderText(charcodes)
}

// showTextAdjusted "TJ". Show text with adjustable spacing.
// `args` is the operand array. Each element is either a string, which is rendered with
// renderText, or a number, which moves the text position by -number/1000 * font size.
// Returns core.ErrTypeError for an element of any other type, and returns the first error
// reported by renderText, as showText ("Tj") does.
func (to *textObject) showTextAdjusted(args *core.PdfObjectArray) error {
	// Only horizontal displacement is applied; `vertical` is never set in this function.
	vertical := false
	for _, o := range args.Elements() {
		switch o.(type) {
		case *core.PdfObjectFloat, *core.PdfObjectInteger:
			x, err := core.GetNumberAsFloat(o)
			if err != nil {
				common.Log.Debug("ERROR: showTextAdjusted. Bad numerical arg. o=%s args=%+v", o, args)
				return err
			}
			// The number is in thousandths of a unit of text space and is subtracted from the
			// current position.
			dx, dy := -x*0.001*to.state.tfs, 0.0
			if vertical {
				dy, dx = dx, dy
			}
			td := translationMatrix(transform.Point{X: dx, Y: dy})
			to.tm.Concat(td)
			common.Log.Trace("showTextAdjusted: dx,dy=%.3f,%.3f Tm=%s", dx, dy, to.tm)
		case *core.PdfObjectString:
			charcodes, ok := core.GetStringBytes(o)
			if !ok {
				common.Log.Trace("showTextAdjusted: Bad string arg. o=%s args=%+v", o, args)
				return core.ErrTypeError
			}
			if err := to.renderText(charcodes); err != nil {
				common.Log.Debug("ERROR: showTextAdjusted. renderText failed. err=%v", err)
				return err
			}
		default:
			common.Log.Debug("ERROR: showTextAdjusted. Unexpected type (%T) args=%+v", o, args)
			return core.ErrTypeError
		}
	}
	return nil
}

// setTextLeading "TL". Set text leading.
// Stores `y` as the leading in the text state. Does nothing when there is no text object or the
// text object has no state.
func (to *textObject) setTextLeading(y float64) {
	if to != nil && to.state != nil {
		to.state.tl = y
	}
}

// setCharSpacing "Tc". Set character spacing.
// Stores `x` as the character spacing in the text state. Does nothing when there is no text
// object.
func (to *textObject) setCharSpacing(x float64) {
	if to != nil {
		to.state.tc = x
	}
}

// setFont "Tf". Set font.
// Looks up the font called `name` and, on success, makes it the current font with size `size`.
// The font also replaces the top of the font stack, or is pushed when the stack is empty.
// Does nothing when there is no text object. Any lookup error is returned unchanged and leaves
// the text state untouched.
func (to *textObject) setFont(name string, size float64) error {
	if to == nil {
		return nil
	}
	font, err := to.getFont(name)
	if err != nil {
		// TODO(peterwilliams97): Do we need to handle model.ErrFontNotSupported in a special way?
		return err
	}
	to.state.tfont = font
	if depth := len(*to.fontStack); depth > 0 {
		(*to.fontStack)[depth-1] = font
	} else {
		to.fontStack.push(font)
	}
	to.state.tfs = size
	return nil
}

// setTextRenderMode "Tr". Set text rendering mode.
// Stores `mode`, converted to a RenderMode, in the text state. Does nothing when there is no text
// object.
func (to *textObject) setTextRenderMode(mode int) {
	if to != nil {
		to.state.tmode = RenderMode(mode)
	}
}

// setTextRise "Ts". Set text rise.
// Stores `y` as the text rise in the text state. Does nothing when there is no text object.
func (to *textObject) setTextRise(y float64) {
	if to != nil {
		to.state.trise = y
	}
}

// setWordSpacing "Tw". Set word spacing.
// Stores `y` as the word spacing in the text state. Does nothing when there is no text object.
func (to *textObject) setWordSpacing(y float64) {
	if to != nil {
		to.state.tw = y
	}
}

// setHorizScaling "Tz". Set horizontal scaling.
// Stores `y` as the horizontal scaling in the text state. Does nothing when there is no text
// object.
func (to *textObject) setHorizScaling(y float64) {
	if to != nil {
		to.state.th = y
	}
}

// floatParam returns the value of the single numeric parameter of operator `op`.
// An error is returned if `op` does not have exactly one parameter or if that parameter cannot
// be converted to a float.
func floatParam(op *contentstream.ContentStreamOperation) (float64, error) {
	if n := len(op.Params); n != 1 {
		common.Log.Debug("ERROR: %#q should have %d input params, got %d %+v",
			op.Operand, 1, n, op.Params)
		return 0.0, errors.New("incorrect parameter count")
	}
	return core.GetNumberAsFloat(op.Params[0])
}

// checkOp reports whether `op` has `numParams` params. A negative `numParams` disables the count
// check. When the count is wrong, the mismatch is logged and ok is false; err is only set when
// `hard` is true.
// A nil text object (an operator outside a text object) is logged but does not by itself make
// the check fail.
func (to *textObject) checkOp(op *contentstream.ContentStreamOperation, numParams int,
	hard bool) (ok bool, err error) {
	if to == nil {
		// Log at most `numParams` of the operator's params.
		var shown []core.PdfObject
		if numParams > 0 {
			shown = op.Params
			if numParams < len(shown) {
				shown = shown[:numParams]
			}
		}
		common.Log.Debug("%#q operand outside text. params=%+v", op.Operand, shown)
	}
	if numParams < 0 || len(op.Params) == numParams {
		return true, nil
	}
	common.Log.Debug("ERROR: %#q should have %d input params, got %d %+v",
		op.Operand, numParams, len(op.Params), op.Params)
	if hard {
		return false, errors.New("incorrect parameter count")
	}
	return false, nil
}

// fontStacker is the PDF font stack implementation.
// It is a slice of fonts in which the last element is the top of the stack.
type fontStacker []*model.PdfFont

// String returns a multi-line description of the font stack: a header line followed by one line
// per entry, bottom of the stack first. Nil entries are shown as "<nil>".
func (fontStack *fontStacker) String() string {
	lines := make([]string, 0, len(*fontStack)+1)
	lines = append(lines, "---- font stack")
	for i, font := range *fontStack {
		desc := "<nil>"
		if font != nil {
			desc = font.String()
		}
		lines = append(lines, fmt.Sprintf("\t%2d: %s", i, desc))
	}
	return strings.Join(lines, "\n")
}

// push pushes `font` onto the font stack, making it the new top element.
func (fontStack *fontStacker) push(font *model.PdfFont) {
	*fontStack = append(*fontStack, font)
}

// pop removes the top element of the font stack and returns it. It returns nil and leaves the
// stack unchanged if the stack is empty.
func (fontStack *fontStacker) pop() *model.PdfFont {
	n := len(*fontStack)
	if n == 0 {
		return nil
	}
	top := (*fontStack)[n-1]
	*fontStack = (*fontStack)[:n-1]
	return top
}

// peek returns the top element of the font stack without removing it, or nil if the stack is
// empty.
func (fontStack *fontStacker) peek() *model.PdfFont {
	if n := len(*fontStack); n > 0 {
		return (*fontStack)[n-1]
	}
	return nil
}

// get returns the `idx`'th element of the font stack, or nil if `idx` is out of range.
//  idx = 0: bottom of font stack
//  idx = len(fontstack) - 1: top of font stack
//  idx = -n is same as idx = len(fontstack) - n, so fontstack.get(-1) is same as fontstack.peek()
func (fontStack *fontStacker) get(idx int) *model.PdfFont {
	n := fontStack.size()
	if idx < 0 {
		// Negative indexes count back from the top of the stack.
		idx += n
	}
	if idx >= 0 && idx < n {
		return (*fontStack)[idx]
	}
	return nil
}

// empty returns true if the font stack has no elements.
func (fontStack *fontStacker) empty() bool {
	return len(*fontStack) == 0
}

// size returns the number of elements currently in the font stack.
func (fontStack *fontStacker) size() int {
	return len(*fontStack)
}

// 9.3 Text State Parameters and Operators (page 243)
// Some of these parameters are expressed in unscaled text space units. This means that they shall
// be specified in a coordinate system that shall be defined by the text matrix, Tm but shall not be
// scaled by the font size parameter, Tfs.

// textState represents the text state.
// One textState is shared, via pointer, by the text objects created while processing a content
// stream.
type textState struct {
	tc    float64        // Character spacing. Unscaled text space units.
	tw    float64        // Word spacing. Unscaled text space units.
	th    float64        // Horizontal scaling. A percentage: renderText divides it by 100.
	tl    float64        // Leading. Unscaled text space units. Used by TD,T*,'," see Table 108.
	tfs   float64        // Text font size.
	tmode RenderMode     // Text rendering mode.
	trise float64        // Text rise. Unscaled text space units. Set by Ts.
	tfont *model.PdfFont // Text font.
	// For debugging
	numChars  int // Running total of characters processed by renderText.
	numMisses int // Running total of characters renderText could not decode.
}

// 9.4.1 General (page 248)
// A PDF text object consists of operators that may show text strings, move the text position, and
// set text state and certain other parameters. In addition, two parameters may be specified only
// within a text object and shall not persist from one text object to the next:
//   • Tm, the text matrix
//   • Tlm, the text line matrix
//
// Text space is converted to device space by this transform (page 252)
// Trm is the text rendering matrix
//        | Tfs x Th   0      0 |
// Trm  = | 0         Tfs     0 | × Tm × CTM
//        | 0         Trise   1 |
// This corresponds to the following code in renderText()
//  trm := to.gs.CTM.Mult(to.tm).Mult(stateMatrix)

// textObject represents a PDF text object.
// It holds the per-object matrices and the marks produced while the object is open, plus
// references to the extractor, resources, font stack and text state it was created with.
type textObject struct {
	e         *Extractor                  // Extractor that owns this text object.
	resources *model.PdfPageResources     // Resources passed in when the object was created.
	gs        contentstream.GraphicsState // Graphics state captured when the object was created.
	fontStack *fontStacker                // Font stack shared with the content stream handler.
	state     *textState                  // Text state shared with the content stream handler.
	tm        transform.Matrix            // Text matrix. For the character pointer.
	tlm       transform.Matrix            // Text line matrix. For the start of line pointer.
	marks     []textMark                  // Text marks get written here.
}

// newTextState returns a textState with default values: horizontal scaling of 100 and the fill
// rendering mode. All other fields are zero.
func newTextState() textState {
	var ts textState
	ts.th = 100
	ts.tmode = RenderModeFill
	return ts
}

// newTextObject returns a textObject bound to the given extractor, resources, graphics state,
// text state and font stack. Its text matrix and text line matrix start as the identity matrix
// and it has no marks.
func newTextObject(e *Extractor, resources *model.PdfPageResources, gs contentstream.GraphicsState,
	state *textState,
	fontStack *fontStacker) *textObject {
	obj := textObject{
		e:         e,
		resources: resources,
		gs:        gs,
		fontStack: fontStack,
		state:     state,
	}
	obj.tm = transform.IdentityMatrix()
	obj.tlm = transform.IdentityMatrix()
	return &obj
}

// renderText processes and renders byte array `data` for extraction purposes.
// The bytes are converted to character codes and then to runes with the current font. For each
// rune a textMark is appended to `to.marks` and the text matrix is advanced by the width of the
// character plus character spacing (and word spacing for a space).
// The character counts in the text state are updated before any marks are created.
// Returns an error if the font has no metrics for one of the character codes; marks created
// before the failing character are kept.
func (to *textObject) renderText(data []byte) error {

	font := to.getCurrentFont()

	charcodes := font.BytesToCharcodes(data)

	runes, numChars, numMisses := font.CharcodesToUnicodeWithStats(charcodes)
	if numMisses > 0 {
		common.Log.Debug("renderText: numChars=%d numMisses=%d", numChars, numMisses)
	}

	to.state.numChars += numChars
	to.state.numMisses += numMisses

	state := to.state
	tfs := state.tfs
	// Horizontal scaling is stored as a percentage.
	th := state.th / 100.0
	// Find the width of a space, falling back from the font's rune metrics to its metrics for
	// character code 32 and finally to the default font.
	spaceMetrics, ok := font.GetRuneMetrics(' ')
	if !ok {
		spaceMetrics, ok = font.GetCharMetrics(32)
	}
	if !ok {
		spaceMetrics, _ = model.DefaultFont().GetRuneMetrics(' ')
	}
	spaceWidth := spaceMetrics.Wx * glyphTextRatio
	common.Log.Trace("spaceWidth=%.2f text=%q font=%s fontSize=%.1f", spaceWidth, runes, font, tfs)

	// stateMatrix applies font size, horizontal scaling and text rise.
	stateMatrix := transform.NewMatrix(
		tfs*th, 0,
		0, tfs,
		0, state.trise)

	common.Log.Trace("renderText: %d codes=%+v runes=%q", len(charcodes), charcodes, runes)

	// NOTE(review): the loop indexes `charcodes` with the index from `runes`, so it assumes
	// CharcodesToUnicodeWithStats returns exactly one rune per character code — confirm.
	for i, r := range runes {
		// TODO(peterwilliams97): Need to find and fix cases where this happens.
		if r == '\x00' {
			continue
		}

		code := charcodes[i]
		// The location of the text on the page in device coordinates is given by trm, the text
		// rendering matrix.
		trm := to.gs.CTM.Mult(to.tm).Mult(stateMatrix)

		// calculate the text location displacement due to writing `r`. We will use this to update
		// to.tm

		// w is the unscaled movement at the end of a word.
		w := 0.0
		if r == ' ' {
			w = state.tw
		}

		m, ok := font.GetCharMetrics(code)
		if !ok {
			common.Log.Debug("ERROR: No metric for code=%d r=0x%04x=%+q %s", code, r, r, font)
			return errors.New("no char metrics")
		}

		// c is the character size in unscaled text units.
		c := transform.Point{X: m.Wx * glyphTextRatio, Y: m.Wy * glyphTextRatio}

		// t0 is the end of this character.
		// t is the displacement of the text cursor when the character is rendered.
		// Only the X component is set; c.Y is not used for the displacement.
		t0 := transform.Point{X: (c.X*tfs + w) * th}
		t := transform.Point{X: (c.X*tfs + state.tc + w) * th}

		// td, td0 are t, t0 in matrix form.
		// td0 is where this character ends. td is where the next character starts.
		td0 := translationMatrix(t0)
		td := translationMatrix(t)

		common.Log.Trace("\"%c\" stateMatrix=%s CTM=%s Tm=%s", r, stateMatrix, to.gs.CTM, to.tm)
		common.Log.Trace("tfs=%.3f th=%.3f Tc=%.3f w=%.3f (Tw=%.3f)", tfs, th, state.tc, w, state.tw)
		// NOTE(review): trm0 in this trace is composed in a different order from the end point
		// passed to newTextMark below; it only affects the log output.
		common.Log.Trace("m=%s c=%+v t0=%+v td0=%s trm0=%s", m, c, t0, td0, td0.Mult(to.tm).Mult(to.gs.CTM))

		// The mark records the rune, its rendering matrix, the device position of its end and
		// the space width scaled by the X scaling factor of trm.
		mark := to.newTextMark(
			string(r),
			trm,
			translation(to.gs.CTM.Mult(to.tm).Mult(td0)),
			spaceWidth*trm.ScalingFactorX())
		common.Log.Trace("i=%d code=%d mark=%s trm=%s", i, code, mark, trm)
		to.marks = append(to.marks, mark)

		// update the text matrix by the displacement of the text location.
		to.tm.Concat(td)
		common.Log.Trace("to.tm=%s", to.tm)
	}

	return nil
}

// glyphTextRatio converts Glyph metrics units to unscaled text space units.
// Glyph metrics are multiplied by this ratio, i.e. divided by 1000.
const glyphTextRatio = 1.0 / 1000.0

// translation returns the translation component of `m` as a Point.
func translation(m transform.Matrix) transform.Point {
	var p transform.Point
	p.X, p.Y = m.Translation()
	return p
}

// translationMatrix returns a matrix that translates by `p`.
// It passes p.X and p.Y to transform.TranslationMatrix.
func translationMatrix(p transform.Point) transform.Matrix {
	return transform.TranslationMatrix(p.X, p.Y)
}

// moveTo moves the start of line pointer by `tx`,`ty` and sets the text pointer to the
// start of line pointer.
// Move to the start of the next line, offset from the start of the current line by (tx, ty).
// `tx` and `ty` are in unscaled text space units.
func (to *textObject) moveTo(tx, ty float64) {
	// Concatenate a pure translation onto the text line matrix, then restart the text matrix
	// from it.
	to.tlm.Concat(transform.NewMatrix(1, 0, 0, 1, tx, ty))
	to.tm = to.tlm
}

// textMark represents text drawn on a page and its position in device coordinates.
// All dimensions are in device coordinates.
// Marks are created by newTextMark, one per rendered rune.
type textMark struct {
	text          string          // The text.
	orient        int             // The text orientation in degrees. This is the current TRM rounded to 10°.
	orientedStart transform.Point // Left of text in orientation where text is horizontal.
	orientedEnd   transform.Point // Right of text in orientation where text is horizontal.
	height        float64         // Text height.
	spaceWidth    float64         // Best guess at the width of a space in the font the text was rendered with.
	count         int64           // To help with reading debug logs. Taken from the extractor's textCount.
}

// newTextMark returns a textMark for text `text` rendered with text rendering matrix (TRM) `trm`
// and end of character device coordinates `end`. `spaceWidth` is our best guess at the width of a
// space in the font the text is rendered in, in device coordinates.
// The extractor's text counter is incremented and recorded in the mark. The start and end points
// are rotated by the angle of `trm`, and the height is taken from the Y scaling factor of `trm`,
// or from the X scaling factor when the orientation is 90 modulo 180.
func (to *textObject) newTextMark(text string, trm transform.Matrix, end transform.Point, spaceWidth float64) textMark {
	to.e.textCount++
	angle := trm.Angle()
	orient := nearestMultiple(angle, 10)

	var height float64
	if orient%180 == 90 {
		height = trm.ScalingFactorX()
	} else {
		height = trm.ScalingFactorY()
	}

	start := translation(trm).Rotate(angle)
	finish := end.Rotate(angle)

	var mark textMark
	mark.text = text
	mark.orient = orient
	mark.orientedStart = start
	mark.orientedEnd = finish
	mark.height = height
	mark.spaceWidth = spaceWidth
	mark.count = to.e.textCount
	return mark
}

// nearestMultiple returns the integer multiple of `m` that is closest to `x`.
// An `m` of 0 is treated as 1, so `x` is then rounded to the nearest integer.
func nearestMultiple(x float64, m int) int {
	step := 1.0
	if m != 0 {
		step = float64(m)
	}
	return int(math.Round(x/step) * step)
}

// String returns a one-line description of `t`: its count, oriented start position, width,
// orientation and text (truncated to 100).
func (t textMark) String() string {
	start := t.orientedStart
	return fmt.Sprintf("textMark{@%03d [%.3f,%.3f] %.1f %d° %q}",
		t.count, start.X, start.Y, t.Width(), t.orient, truncate(t.text, 100))
}

// Width returns the width of `t`.text in the text direction, i.e. the absolute X distance between
// the oriented start and end of the mark.
func (t textMark) Width() float64 {
	dx := t.orientedStart.X - t.orientedEnd.X
	return math.Abs(dx)
}

// PageText represents the layout of text on a device page.
// Its implementation is opaque to allow for future optimizations.
type PageText struct {
	// PageText is currently implemented as a list of texts and their positions on a PDF page.
	marks []textMark
}

// String returns a multi-line description of `pt`: a header with the number of marks followed by
// one line per mark.
func (pt PageText) String() string {
	lines := make([]string, 0, len(pt.marks)+1)
	lines = append(lines, fmt.Sprintf("PageText: %d elements", pt.length()))
	for i := range pt.marks {
		lines = append(lines, pt.marks[i].String())
	}
	return strings.Join(lines, "\n")
}

// length returns the number of elements in `pt.marks`.
func (pt PageText) length() int {
	return len(pt.marks)
}

// height returns the largest height among the elements of `pt.marks`, or 0 if there are no
// marks or none has a positive height.
func (pt PageText) height() float64 {
	tallest := 0.0
	for i := range pt.marks {
		if h := pt.marks[i].height; h > tallest {
			tallest = h
		}
	}
	return tallest
}

// ToText returns the contents of `pt` as a single string, with the extracted lines separated by
// newlines.
// The marks are sorted by position (this reorders the underlying `pt.marks` slice in place) and
// then grouped into lines.
func (pt PageText) ToText() string {
	fontHeight := pt.height()
	// The y tolerance used for sorting and line grouping allows for subscripts, diacritics etc.
	tol := minFloat(fontHeight*0.2, 5.0)
	common.Log.Trace("ToText: %d elements fontHeight=%.1f tol=%.1f", len(pt.marks), fontHeight, tol)

	pt.sortPosition(tol)

	lines := pt.toLines(tol)
	texts := make([]string, len(lines))
	for i := range lines {
		texts[i] = lines[i].text
	}
	return strings.Join(texts, "\n")
}

// sortPosition stably sorts the marks of `pt` by their position on the page.
// Marks are ordered by orientation first. Within an orientation, marks whose oriented Y
// coordinates differ by more than `tol` are ordered top to bottom (larger Y first); otherwise
// they are ordered left to right by oriented X.
func (pt *PageText) sortPosition(tol float64) {
	marks := pt.marks
	less := func(i, j int) bool {
		a, b := marks[i], marks[j]
		switch {
		case a.orient != b.orient:
			return a.orient < b.orient
		case math.Abs(a.orientedStart.Y-b.orientedStart.Y) > tol:
			return a.orientedStart.Y > b.orientedStart.Y
		default:
			return a.orientedStart.X < b.orientedStart.X
		}
	}
	sort.SliceStable(marks, less)
}

// textLine represents a line of text on a page.
// Lines are built by newLine from the words and x coordinates collected in toLinesOrient.
type textLine struct {
	y      float64   // y position of line.
	dxList []float64 // x distance between successive words in line.
	text   string    // text in the line. This is `words` joined without separators.
	words  []string  // words in the line.
}

// toLines returns the text and positions in `pt.marks` as a slice of textLine.
// The marks are grouped by orientation, lines are extracted for each group with toLinesOrient,
// and the lines are returned concatenated in increasing orientation order.
// NOTE: Caller must sort the text list top-to-bottom, left-to-right (for orientation adjusted so
// that text is horizontal) before calling this function.
func (pt PageText) toLines(tol float64) []textLine {
	byOrient := make(map[int][]textMark, len(pt.marks))
	for i := range pt.marks {
		mark := pt.marks[i]
		byOrient[mark.orient] = append(byOrient[mark.orient], mark)
	}

	var result []textLine
	for _, orient := range orientKeys(byOrient) {
		group := PageText{marks: byOrient[orient]}
		result = append(result, group.toLinesOrient(tol)...)
	}
	return result
}

// toLinesOrient returns the text and positions in `pt.marks` as a slice of textLine.
// A new line is started whenever a mark's oriented Y coordinate is more than `tol` below the
// current line's. Within a line, a " " word is inserted where the horizontal gap to the previous
// mark suggests a space. Completed lines are passed through removeDuplicates.
// NOTE: This function only works on text lists where all text is the same orientation so it should
// only be called from toLines.
// Caller must sort the text list top-to-bottom, left-to-right (for orientation adjusted so
// that text is horizontal) before calling this function.
func (pt PageText) toLinesOrient(tol float64) []textLine {
	if len(pt.marks) == 0 {
		return []textLine{}
	}
	var lines []textLine
	var words []string
	var x []float64
	y := pt.marks[0].orientedStart.Y

	// scanning is true once the current line has at least one mark, so no space is inserted
	// before the first mark of a line.
	scanning := false

	averageCharWidth := exponAve{}
	wordSpacing := exponAve{}
	lastEndX := 0.0 // lastEndX is pt.marks[i-1].orientedEnd.X

	for _, t := range pt.marks {
		// The mark is below the current line: emit the current line and start a new one.
		if t.orientedStart.Y+tol < y {
			if len(words) > 0 {
				line := newLine(y, x, words)
				if averageCharWidth.running {
					// FIXME(peterwilliams97): Fix and reinstate combineDiacritics.
					// line = combineDiacritics(line, averageCharWidth.ave)
					line = removeDuplicates(line, averageCharWidth.ave)
				}
				lines = append(lines, line)
			}
			words = []string{}
			x = []float64{}
			y = t.orientedStart.Y
			scanning = false
		}

		// Detect text movements that represent spaces on the printed page.
		// We use a heuristic from PdfBox: If the next character starts to the right of where a
		// character after a space at "normal spacing" would start, then there is a space before it.
		// The tricky thing to guess here is the width of a space at normal spacing.
		// We follow PdfBox and use minFloat(deltaSpace, deltaCharWidth).
		deltaSpace := 0.0
		if t.spaceWidth == 0 {
			// No space width is known for this mark, so only the character width is used.
			deltaSpace = math.MaxFloat64
		} else {
			wordSpacing.update(t.spaceWidth)
			deltaSpace = wordSpacing.ave * 0.5
		}
		averageCharWidth.update(t.Width())
		deltaCharWidth := averageCharWidth.ave * 0.3

		isSpace := false
		nextWordX := lastEndX + minFloat(deltaSpace, deltaCharWidth)
		if scanning && t.text != " " {
			isSpace = nextWordX < t.orientedStart.X
		}
		common.Log.Trace("t=%s", t)
		common.Log.Trace("width=%.2f delta=%.2f deltaSpace=%.2g deltaCharWidth=%.2g",
			t.Width(), minFloat(deltaSpace, deltaCharWidth), deltaSpace, deltaCharWidth)
		common.Log.Trace("%+q [%.1f, %.1f] lastEndX=%.2f nextWordX=%.2f (%.2f) isSpace=%t",
			t.text, t.orientedStart.X, t.orientedStart.Y, lastEndX, nextWordX,
			nextWordX-t.orientedStart.X, isSpace)

		if isSpace {
			// The inserted space is positioned midway between the previous mark's end and this
			// mark's start.
			words = append(words, " ")
			x = append(x, (lastEndX+t.orientedStart.X)*0.5)
		}

		// Add the text to the line.
		lastEndX = t.orientedEnd.X
		words = append(words, t.text)
		x = append(x, t.orientedStart.X)
		scanning = true
		common.Log.Trace("lastEndX=%.2f", lastEndX)
	}
	// Emit the final line.
	if len(words) > 0 {
		line := newLine(y, x, words)
		if averageCharWidth.running {
			line = removeDuplicates(line, averageCharWidth.ave)
		}
		lines = append(lines, line)
	}
	return lines
}

// orientKeys collects the orientation keys of `tlOrient` and returns them in ascending order.
// The result is always a non-nil slice, even when `tlOrient` is empty.
func orientKeys(tlOrient map[int][]textMark) []int {
	orientations := make([]int, len(tlOrient))
	n := 0
	for orient := range tlOrient {
		orientations[n] = orient
		n++
	}
	// Map iteration order is random, so sort to make the result deterministic.
	sort.Ints(orientations)
	return orientations
}

// exponAve is a running exponential average of a stream of values.
type exponAve struct {
	ave     float64 // Current average value.
	running bool    // Has `ave` been set?
}

// update folds `x` into the exponential average `exp.ave` and returns the new average.
// The first value seen seeds the average; every later value is averaged 50/50 with the
// current average.
func (exp *exponAve) update(x float64) float64 {
	if exp.running {
		// NOTE(peterwilliams97): 0.5 is a guess. It may be possible to improve average character
		// and space width estimation by tuning this value. It may be that different exponents
		// would work better for character and space estimation.
		exp.ave = (exp.ave + x) * 0.5
		return exp.ave
	}
	exp.ave, exp.running = x, true
	return exp.ave
}

// newLine builds the textLine for the strings `words` located at y coordinate `y` and x
// coordinates `x`. The line's dxList holds the differences between consecutive x coordinates and
// its text is the concatenation of `words`.
func newLine(y float64, x []float64, words []string) textLine {
	gaps := make([]float64, 0, len(x))
	for i, xi := range x {
		if i == 0 {
			// The first coordinate has no predecessor to measure from.
			continue
		}
		gaps = append(gaps, xi-x[i-1])
	}
	text := strings.Join(words, "")
	return textLine{y: y, dxList: gaps, text: text, words: words}
}

// removeDuplicates returns a copy of `line` without duplicated characters. A word counts as a
// duplicate when it equals the word immediately before it and its x offset from that word is no
// more than a fraction of `charWidth`, the average character width for the line.
// Lines with an empty dxList are returned unchanged.
func removeDuplicates(line textLine, charWidth float64) textLine {
	if len(line.dxList) == 0 {
		return line
	}

	// NOTE(peterwilliams97) 0.3 is a guess. It may be possible to tune this to a better value.
	tol := charWidth * 0.3

	prev := line.words[0]
	kept := []string{prev}
	var keptDx []float64
	for i := range line.dxList {
		cur, dx := line.words[i+1], line.dxList[i]
		// Keep the word unless it repeats its predecessor within the tolerance.
		if dx > tol || cur != prev {
			kept = append(kept, cur)
			keptDx = append(keptDx, dx)
		}
		prev = cur
	}
	return textLine{y: line.y, dxList: keptDx, text: strings.Join(kept, ""), words: kept}
}

// combineDiacritics returns `line` with diacritics close to characters combined with the characters.
// `charWidth` is the average character width for the line.
// We have to do this because PDF can render diacritics separately to the characters they attach to
// in extracted text.
// Consecutive words are gathered into a group while the group holds at most one non-diacritic (as
// counted by countDiacritic) and the x offsets accumulated inside the group stay within a
// tolerance derived from `charWidth`. Each group is merged into one word by combine.
// `line` is returned unchanged if it has an empty dxList or if the rebuilt word and offset lists
// end up with inconsistent lengths.
func combineDiacritics(line textLine, charWidth float64) textLine {
	if len(line.dxList) == 0 {
		return line
	}

	// NOTE(peterwilliams97) 0.2 is a guess. It may be possible to tune this to a better value.
	tol := charWidth * 0.2
	common.Log.Trace("combineDiacritics: charWidth=%.2f tol=%.2f", charWidth, tol)

	// `words` and `dxList` are the outputs: one entry per merged group, and the offset that
	// precedes every group except the first.
	var words []string
	var dxList []float64
	w := line.words[0]
	w, c := countDiacritic(w)
	delta := 0.0          // Sum of the offsets added to the current group.
	dx0 := 0.0            // Offset that preceded the first word of the current group.
	parts := []string{w}  // Words in the current group.
	numChars := c         // Number of non-diacritics in the current group.

	for i := 0; i < len(line.dxList); i++ {
		w = line.words[i+1]
		w, c := countDiacritic(w)
		dx := line.dxList[i]
		if numChars+c <= 1 && delta+dx <= tol {
			// The word fits in the current group: still at most one non-diacritic and close enough.
			// NOTE(review): `parts` is seeded with one word and always reset to a one-element
			// slice, so the `len(parts) == 0` branch looks unreachable — confirm before relying on it.
			if len(parts) == 0 {
				dx0 = dx
			} else {
				delta += dx
			}
			parts = append(parts, w)
			numChars += c
		} else {
			// The word starts a new group. Flush the current group first.
			if len(parts) > 0 {
				if len(words) > 0 {
					dxList = append(dxList, dx0)
				}
				words = append(words, combine(parts))
			}
			parts = []string{w}
			numChars = c
			dx0 = dx
			delta = 0.0
		}
	}
	// Flush the final group.
	if len(parts) > 0 {
		if len(words) > 0 {
			dxList = append(dxList, dx0)
		}
		words = append(words, combine(parts))
	}

	// A textLine needs exactly one offset between each pair of adjacent words.
	if len(words) != len(dxList)+1 {
		common.Log.Error("Inconsistent: \nwords=%d %q\ndxList=%d %.2f",
			len(words), words, len(dxList), dxList)
		return line
	}
	return textLine{y: line.y, dxList: dxList, text: strings.Join(words, ""), words: words}
}

// combine combines any diacritics in `parts` with the single non-diacritic character in `parts`
// and returns the resulting string.
// A part is treated as a diacritic when its first rune is in unicode.Mn or unicode.Sk. Empty parts
// are treated as non-diacritics.
// NOTE: `parts` is reordered and its elements are overwritten in place.
func combine(parts []string) string {
	if len(parts) == 1 {
		// Must be a non-diacritic.
		return parts[0]
	}

	// Classify each part by its first rune.
	diacritic := make(map[string]bool, len(parts))
	for _, w := range parts {
		if w == "" {
			// No first rune to inspect. Indexing the rune slice of an empty string would panic.
			diacritic[w] = false
			continue
		}
		r := []rune(w)[0]
		diacritic[w] = unicode.Is(unicode.Mn, r) || unicode.Is(unicode.Sk, r)
	}

	// Order the parts so that the non-diacritic comes first and the diacritics follow it. The sort
	// is stable, so diacritics keep their relative order.
	sort.SliceStable(parts, func(i, j int) bool { return !diacritic[parts[i]] && diacritic[parts[j]] })

	// NFKC-normalize each part, strip surrounding white space and concatenate the results.
	for i, w := range parts {
		parts[i] = strings.TrimSpace(norm.NFKC.String(w))
	}
	return strings.Join(parts, "")
}

// countDiacritic returns the combining diacritic version of `w` (usually itself) and the number of
// non-diacritics in `w` (0 or 1).
// Strings that are not exactly one rune long are returned unchanged and counted as one
// non-diacritic. The quote characters ', " and ` are never treated as diacritics, although they
// appear in the `diacritics` table, because combining them with the preceding letter corrupts
// ordinary quoted text.
func countDiacritic(w string) (string, int) {
	runes := []rune(w)
	if len(runes) != 1 {
		return w, 1
	}
	r := runes[0]

	switch r {
	case '\'', '"', '`':
		// Must be checked before the table lookup, which would otherwise reclassify these quotes
		// as combining marks.
		return w, 1
	}
	if w2, ok := diacritics[r]; ok {
		// Spacing character with a combining equivalent. Return the combining form.
		return w2, 0
	}
	if unicode.Is(unicode.Mn, r) || unicode.Is(unicode.Sk, r) {
		return w, 0
	}
	return w, 1
}

// diacritics is a map of diacritic characters that are not classified as unicode.Mn or unicode.Sk
// and the corresponding unicode.Mn or unicode.Sk characters. This map was copied from PdfBox.
// (https://svn.apache.org/repos/asf/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java)
var diacritics = map[rune]string{
	0x0060: "\u0300",
	0x02CB: "\u0300",
	0x0027: "\u0301",
	0x02B9: "\u0301",
	0x02CA: "\u0301",
	0x005e: "\u0302",
	0x02C6: "\u0302",
	0x007E: "\u0303",
	0x02C9: "\u0304",
	0x00B0: "\u030A",
	0x02BA: "\u030B",
	0x02C7: "\u030C",
	0x02C8: "\u030D",
	0x0022: "\u030E",
	0x02BB: "\u0312",
	0x02BC: "\u0313",
	0x0486: "\u0313",
	0x055A: "\u0313",
	0x02BD: "\u0314",
	0x0485: "\u0314",
	0x0559: "\u0314",
	0x02D4: "\u031D",
	0x02D5: "\u031E",
	0x02D6: "\u031F",
	0x02D7: "\u0320",
	0x02B2: "\u0321",
	0x02CC: "\u0329",
	0x02B7: "\u032B",
	0x02CD: "\u0331",
	0x005F: "\u0332",
	0x204E: "\u0359",
}

// getCurrentFont returns the font at the top of the font stack. If the font stack is empty, it
// logs the problem and returns the default font instead.
func (to *textObject) getCurrentFont() *model.PdfFont {
	if !to.fontStack.empty() {
		return to.fontStack.peek()
	}
	common.Log.Debug("ERROR: No font defined. Using default.")
	return model.DefaultFont()
}

// getFont returns the font named `name` if it exists in the page's resources or an error if it
// doesn't. It caches the returned fonts.
// Caching is only active when the extractor's fontCache is non-nil. The cache holds at most
// maxFontCache fonts; when it is full, the least recently accessed entry is evicted to make room.
func (to *textObject) getFont(name string) (*model.PdfFont, error) {
	if to.e.fontCache != nil {
		to.e.accessCount++
		if entry, ok := to.e.fontCache[name]; ok {
			// The map stores fontEntry values, so `entry` is a copy. Write the refreshed access
			// time back, otherwise hits never update the LRU order.
			entry.access = to.e.accessCount
			to.e.fontCache[name] = entry
			return entry.font, nil
		}
	}

	// Font not in cache. Load it.
	font, err := to.getFontDirect(name)
	if err != nil {
		return nil, err
	}

	if to.e.fontCache != nil {
		// Eject a victim if the cache is full.
		if len(to.e.fontCache) >= maxFontCache {
			// The victim is the entry with the smallest (oldest) access value.
			var victim string
			var oldest int64
			first := true
			for cached, entry := range to.e.fontCache {
				if first || entry.access < oldest {
					victim, oldest, first = cached, entry.access, false
				}
			}
			delete(to.e.fontCache, victim)
		}
		to.e.fontCache[name] = fontEntry{font, to.e.accessCount}
	}

	return font, nil
}

// fontEntry is an entry in the font cache.
type fontEntry struct {
	font   *model.PdfFont // The font being cached.
	access int64          // Last access. Used to determine LRU cache victims.
}

// maxFontCache is the maximum number of PdfFonts held in fontCache. Once the cache has reached
// this size, getFont removes the entry with the smallest `access` value before adding a new font.
const maxFontCache = 10

// getFontDirect looks up the font dict named `name` with getFontDict and builds a PdfFont from it.
// It reads the resources directly and does not use the font cache.
// A lookup error is returned as is. A font construction error is logged and returned together
// with whatever font value the constructor produced.
func (to *textObject) getFontDirect(name string) (*model.PdfFont, error) {
	dict, dictErr := to.getFontDict(name)
	if dictErr != nil {
		return nil, dictErr
	}

	font, err := model.NewPdfFontFromPdfObject(dict)
	if err == nil {
		return font, nil
	}
	common.Log.Debug("getFontDirect: NewPdfFontFromPdfObject failed. name=%#q err=%v", name, err)
	return font, err
}

// getFontDict returns the font dict stored under key `name` in the Font resources of the text
// object, or an error if there is no such font.
// If the text object has no resources at all, it returns a nil object and a nil error.
func (to *textObject) getFontDict(name string) (fontObj core.PdfObject, err error) {
	if to.resources == nil {
		common.Log.Debug("getFontDict. No resources. name=%#q", name)
		return nil, nil
	}

	obj, ok := to.resources.GetFontByName(core.PdfObjectName(name))
	if ok {
		return obj, nil
	}
	common.Log.Debug("ERROR: getFontDict: Font not found: name=%#q", name)
	return nil, errors.New("font not in resources")
}
-												Add LICENSE.md with reference to AGPL and Commercial license.  Add license header info to code.

											
										
										
											2018-03-22 14:03:47 +00:00
+								/*
 								 * This file is subject to the terms and conditions defined in
 								 * file 'LICENSE.md', which is part of this source code package.
 								 */
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+								package extractor
 								import (
 									"errors"
-												Refactored font code to improve text extraction

											
										
										
											2018-07-13 17:40:27 +10:00
+									"fmt"
-												First attempt at splitting words in text extraction using a space detection heuristic

											
										
										
											2018-10-09 11:49:59 +11:00
+									"math"
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+									"sort"
-												Refactored font code to improve text extraction

											
										
										
											2018-07-13 17:40:27 +10:00
+									"strings"
-												Combine diacritics in text extraction.

											
										
										
											2018-11-28 18:06:03 +11:00
+									"unicode"
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
-												Update module version and import paths (#1)

* Update import path to use unipdf
* Update module name and version

											
										
										
											2019-05-16 23:08:40 +03:00
+									"github.com/unidoc/unipdf/v3/common"
-												Remmove pdf folder and move packages up one level (#2)


											
										
										
											2019-05-16 23:44:51 +03:00
+									"github.com/unidoc/unipdf/v3/contentstream"
 									"github.com/unidoc/unipdf/v3/core"
 									"github.com/unidoc/unipdf/v3/internal/transform"
 									"github.com/unidoc/unipdf/v3/model"
-												Combine diacritics in text extraction.

											
										
										
											2018-11-28 18:06:03 +11:00
+									"golang.org/x/text/unicode/norm"
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+								)
-												Major changes to font code
- Added Type1 font parsing.
- Added Standard 14 font parsing.
- Fixed some bugs in cmap code.
- Started re-structuring of font code. Moved common font fields to `fontSkeleton`

											
										
										
											2018-06-27 12:25:59 +10:00
+								// ExtractText processes and extracts all text data in content streams and returns as a string.
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+								// It takes into account character encodings in the PDF file, which are decoded by
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+								// CharcodeBytesToUnicode.
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+								// Characters that can't be decoded are replaced with MissingCodeRune ('\ufffd' = <20>).
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+								func (e *Extractor) ExtractText() (string, error) {
-												Addressing review comments

											
										
										
											2018-11-28 23:25:17 +00:00
+									text, _, _, err := e.ExtractTextWithStats()
-												Refactored font code to improve text extraction

											
										
										
											2018-07-13 17:40:27 +10:00
+									return text, err
 								}
-												Changes missed in previous commit.

											
										
										
											2019-01-04 16:07:03 +11:00
+								// ExtractTextWithStats works like ExtractText but returns the number of characters in the output
 								// (`numChars`) and the number of characters that were not decoded (`numMisses`).
-												Another round of addressing review comments

											
										
										
											2018-11-30 16:53:48 +00:00
+								func (e *Extractor) ExtractTextWithStats() (extracted string, numChars int, numMisses int, err error) {
-												Changes missed in previous commit.

											
										
										
											2019-01-04 16:07:03 +11:00
+									pageText, numChars, numMisses, err := e.ExtractPageText()
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+									if err != nil {
-												Refactored font code to improve text extraction

											
										
										
											2018-07-13 17:40:27 +10:00
+										return "", numChars, numMisses, err
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+									}
-												Changes missed in previous commit.

											
										
										
											2019-01-04 16:07:03 +11:00
+									return pageText.ToText(), numChars, numMisses, nil
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+								}
-												Changes missed in previous commit.

											
										
										
											2019-01-04 16:07:03 +11:00
+								// ExtractPageText returns the text contents of `e` (an Extractor for a page) as a PageText.
 								func (e *Extractor) ExtractPageText() (*PageText, int, int, error) {
 									return e.extractPageText(e.contents, e.resources, 0)
-												Recurse through form XObjects for text extractions.

											
										
										
											2018-12-27 20:51:34 +11:00
+								}
-												Changes missed in previous commit.

											
										
										
											2019-01-04 16:07:03 +11:00
+								// extractPageText returns the text contents of content stream `e` and resouces `resources` as a
 								// PageText.
-												Fixed incorrectly named variable.

											
										
										
											2018-12-27 21:33:31 +11:00
+								// This can be called on a page or a form XObject.
-												Changes missed in previous commit.

											
										
										
											2019-01-04 16:07:03 +11:00
+								func (e *Extractor) extractPageText(contents string, resources *model.PdfPageResources, level int) (*PageText, int, int, error) {
-												Recurse through form XObjects for text extractions.

											
										
										
											2018-12-27 20:51:34 +11:00
-												Changes missed in previous commit.

											
										
										
											2019-01-04 16:07:03 +11:00
+									common.Log.Trace("extractPageText: level=%d", level)
 									pageText := &PageText{}
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+									state := newTextState()
-												Refactored font code to improve text extraction

											
										
										
											2018-07-13 17:40:27 +10:00
+									fontStack := fontStacker{}
-												Removed naked returns. Fixed godoc. Reorganized object extractors

											
										
										
											2018-07-25 12:00:49 +10:00
+									var to *textObject
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
-												Recurse through form XObjects for text extractions.

											
										
										
											2018-12-27 20:51:34 +11:00
+									cstreamParser := contentstream.NewContentStreamParser(contents)
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+									operations, err := cstreamParser.Parse()
 									if err != nil {
-												Changes missed in previous commit.

											
										
										
											2019-01-04 16:07:03 +11:00
+										common.Log.Debug("ERROR: extractPageText parse failed. err=%v", err)
 										return pageText, state.numChars, state.numMisses, err
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+									}
 									processor := contentstream.NewContentStreamProcessor(*operations)
 									processor.AddHandler(contentstream.HandlerConditionEnumAllOperands, "",
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+										func(op *contentstream.ContentStreamOperation, gs contentstream.GraphicsState,
 											resources *model.PdfPageResources) error {
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+											operand := op.Operand
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+											switch operand {
-												Refactored font code to improve text extraction

											
										
										
											2018-07-13 17:40:27 +10:00
+											case "q":
 												if !fontStack.empty() {
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+													common.Log.Trace("Save font state: %s\n%s",
-												Refactored font code to improve text extraction

											
										
										
											2018-07-13 17:40:27 +10:00
+														fontStack.peek(), fontStack.String())
 													fontStack.push(fontStack.peek())
 												}
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+												if state.tfont != nil {
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+													common.Log.Trace("Save font state: %s\n->%s\n%s",
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+														fontStack.peek(), state.tfont, fontStack.String())
 													fontStack.push(state.tfont)
-												Refactored font code to improve text extraction

											
										
										
											2018-07-13 17:40:27 +10:00
+												}
 											case "Q":
 												if !fontStack.empty() {
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+													common.Log.Trace("Restore font state: %s\n->%s\n%s",
-												Refactored font code to improve text extraction

											
										
										
											2018-07-13 17:40:27 +10:00
+														fontStack.peek(), fontStack.get(-2), fontStack.String())
 													fontStack.pop()
 												}
 												if len(fontStack) >= 2 {
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+													common.Log.Trace("Restore font state: %s\n->%s\n%s",
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+														state.tfont, fontStack.peek(), fontStack.String())
 													state.tfont = fontStack.pop()
-												Refactored font code to improve text extraction

											
										
										
											2018-07-13 17:40:27 +10:00
+												}
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+											case "BT": // Begin text
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+												// Begin a text object, initializing the text matrix, Tm, and the text line matrix,
 												// Tlm, to the identity matrix. Text objects shall not be nested; a second BT shall
 												// not appear before an ET.
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+												if to != nil {
 													common.Log.Debug("BT called while in a text object")
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+												}
-												Recurse through form XObjects for text extractions.

											
										
										
											2018-12-27 20:51:34 +11:00
+												to = newTextObject(e, resources, gs, &state, &fontStack)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+											case "ET": // End Text
-												(*pageText). -> pageText.

											
										
										
											2019-01-05 14:10:54 +11:00
+												pageText.marks = append(pageText.marks, to.marks...)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+												to = nil
 											case "T*": // Move to start of next text line
 												to.nextLine()
 											case "Td": // Move text location
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+												if ok, err := to.checkOp(op, 2, true); !ok {
-												Added NewStandard14Font() to make existing fonts.Font code work with *PdfFont

											
										
										
											2018-07-07 09:45:55 +10:00
+													common.Log.Debug("ERROR: err=%v", err)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+													return err
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+												}
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+												x, y, err := toFloatXY(op.Params)
 												if err != nil {
 													return err
 												}
 												to.moveText(x, y)
-												Premultiply coordinate transforms to text matrix in text extraction.

											
										
										
											2018-11-26 08:09:52 +11:00
+											case "TD": // Move text location and set leading.
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+												if ok, err := to.checkOp(op, 2, true); !ok {
-												Added NewStandard14Font() to make existing fonts.Font code work with *PdfFont

											
										
										
											2018-07-07 09:45:55 +10:00
+													common.Log.Debug("ERROR: err=%v", err)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+													return err
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+												}
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+												x, y, err := toFloatXY(op.Params)
 												if err != nil {
-												Added NewStandard14Font() to make existing fonts.Font code work with *PdfFont

											
										
										
											2018-07-07 09:45:55 +10:00
+													common.Log.Debug("ERROR: err=%v", err)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+													return err
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+												}
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+												to.moveTextSetLeading(x, y)
-												Fixed incorrectly named variable.

											
										
										
											2018-12-27 21:33:31 +11:00
+											case "Tj": // Show text.
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+												if ok, err := to.checkOp(op, 1, true); !ok {
-												Merge remote-tracking branch 'upstream/v3' into render.v3

											
										
										
											2018-07-21 21:20:39 +10:00
+													common.Log.Debug("ERROR: Tj op=%s err=%v", op, err)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+													return err
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+												}
-												Merge remote-tracking branch 'upstream/v3' into render.v3

											
										
										
											2018-07-21 21:20:39 +10:00
+												charcodes, ok := core.GetStringBytes(op.Params[0])
 												if !ok {
 													common.Log.Debug("ERROR: Tj op=%s GetStringBytes failed", op)
 													return core.ErrTypeError
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+												}
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+												return to.showText(charcodes)
-												Fixed incorrectly named variable.

											
										
										
											2018-12-27 21:33:31 +11:00
+											case "TJ": // Show text with adjustable spacing.
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+												if ok, err := to.checkOp(op, 1, true); !ok {
-												Merge remote-tracking branch 'upstream/v3' into render.v3

											
										
										
											2018-07-21 21:20:39 +10:00
+													common.Log.Debug("ERROR: TJ err=%v", err)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+													return err
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+												}
-												Removed GetArrayVal

											
										
										
											2018-07-25 13:19:09 +10:00
+												args, ok := core.GetArray(op.Params[0])
-												Merge remote-tracking branch 'upstream/v3' into render.v3

											
										
										
											2018-07-21 21:20:39 +10:00
+												if !ok {
-												Removed combineDiacritics from text extraction because it was causing ' and " to be combined with the letters proceeding them.
Need to fix this and reinstate combineDiacritics.

											
										
										
											2019-01-01 12:22:39 +11:00
+													common.Log.Debug("ERROR: TJ op=%s GetArrayVal failed", op)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+													return err
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+												}
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+												return to.showTextAdjusted(args)
-												Fixed incorrectly named variable.

											
										
										
											2018-12-27 21:33:31 +11:00
+											case "'": // Move to next line and show text.
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+												if ok, err := to.checkOp(op, 1, true); !ok {
-												Merge remote-tracking branch 'upstream/v3' into render.v3

											
										
										
											2018-07-21 21:20:39 +10:00
+													common.Log.Debug("ERROR: ' err=%v", err)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+													return err
 												}
-												Merge remote-tracking branch 'upstream/v3' into render.v3

											
										
										
											2018-07-21 21:20:39 +10:00
+												charcodes, ok := core.GetStringBytes(op.Params[0])
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+												if !ok {
-												Merge remote-tracking branch 'upstream/v3' into render.v3

											
										
										
											2018-07-21 21:20:39 +10:00
+													common.Log.Debug("ERROR: ' op=%s GetStringBytes failed", op)
 													return core.ErrTypeError
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+												}
 												to.nextLine()
 												return to.showText(charcodes)
-												Fixed incorrectly named variable.

											
										
										
											2018-12-27 21:33:31 +11:00
+											case `"`: // Set word and character spacing, move to next line, and show text.
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+												if ok, err := to.checkOp(op, 1, true); !ok {
-												Merge remote-tracking branch 'upstream/v3' into render.v3

											
										
										
											2018-07-21 21:20:39 +10:00
+													common.Log.Debug("ERROR: \" err=%v", err)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+													return err
 												}
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+												x, y, err := toFloatXY(op.Params[:2])
 												if err != nil {
 													return err
 												}
 												charcodes, ok := core.GetStringBytes(op.Params[2])
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+												if !ok {
-												Merge remote-tracking branch 'upstream/v3' into render.v3

											
										
										
											2018-07-21 21:20:39 +10:00
+													common.Log.Debug("ERROR: \" op=%s GetStringBytes failed", op)
 													return core.ErrTypeError
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+												}
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+												to.setCharSpacing(x)
 												to.setWordSpacing(y)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+												to.nextLine()
 												return to.showText(charcodes)
-												Fixed incorrectly named variable.

											
										
										
											2018-12-27 21:33:31 +11:00
+											case "TL": // Set text leading.
-												allow change of text state outside BT..ET

											
										
										
											2018-07-15 16:45:47 +10:00
+												y, err := floatParam(op)
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+												if err != nil {
-												Merge remote-tracking branch 'upstream/v3' into render.v3

											
										
										
											2018-07-21 21:20:39 +10:00
+													common.Log.Debug("ERROR: TL err=%v", err)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+													return err
 												}
 												to.setTextLeading(y)
-												Fixed incorrectly named variable.

											
										
										
											2018-12-27 21:33:31 +11:00
+											case "Tc": // Set character spacing.
-												allow change of text state outside BT..ET

											
										
										
											2018-07-15 16:45:47 +10:00
+												y, err := floatParam(op)
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+												if err != nil {
-												Merge remote-tracking branch 'upstream/v3' into render.v3

											
										
										
											2018-07-21 21:20:39 +10:00
+													common.Log.Debug("ERROR: Tc err=%v", err)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+													return err
 												}
 												to.setCharSpacing(y)
-												Fixed incorrectly named variable.

											
										
										
											2018-12-27 21:33:31 +11:00
+											case "Tf": // Set font.
-												Set font even when Tf operator is not between BT and ET.

											
										
										
											2018-11-21 13:14:11 +11:00
+												if to == nil {
-												Addressing review comments

											
										
										
											2018-11-30 23:01:04 +00:00
+													// This is needed for 26-Hazard-Thermal-environment.pdf
-												Recurse through form XObjects for text extractions.

											
										
										
											2018-12-27 20:51:34 +11:00
+													to = newTextObject(e, resources, gs, &state, &fontStack)
-												Set font even when Tf operator is not between BT and ET.

											
										
										
											2018-11-21 13:14:11 +11:00
+												}
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+												if ok, err := to.checkOp(op, 2, true); !ok {
-												Merge remote-tracking branch 'upstream/v3' into render.v3

											
										
										
											2018-07-21 21:20:39 +10:00
+													common.Log.Debug("ERROR: Tf err=%v", err)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+													return err
 												}
-												Merge remote-tracking branch 'upstream/v3' into render.v3

											
										
										
											2018-07-21 21:20:39 +10:00
+												name, ok := core.GetNameVal(op.Params[0])
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+												if !ok {
-												Merge remote-tracking branch 'upstream/v3' into render.v3

											
										
										
											2018-07-21 21:20:39 +10:00
+													common.Log.Debug("ERROR: Tf op=%s GetNameVal failed", op)
 													return core.ErrTypeError
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+												}
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+												size, err := core.GetNumberAsFloat(op.Params[1])
-												Merge remote-tracking branch 'upstream/v3' into render.v3

											
										
										
											2018-07-21 21:20:39 +10:00
+												if !ok {
 													common.Log.Debug("ERROR: Tf op=%s GetFloatVal failed. err=%v", op, err)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+													return err
 												}
 												err = to.setFont(name, size)
 												if err != nil {
 													return err
 												}
-												Fixed incorrectly named variable.

											
										
										
											2018-12-27 21:33:31 +11:00
+											case "Tm": // Set text matrix.
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+												if ok, err := to.checkOp(op, 6, true); !ok {
-												Merge remote-tracking branch 'upstream/v3' into render.v3

											
										
										
											2018-07-21 21:20:39 +10:00
+													common.Log.Debug("ERROR: Tm err=%v", err)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+													return err
 												}
-												Removed naked returns. Fixed godoc. Reorganized object extractors

											
										
										
											2018-07-25 12:00:49 +10:00
+												floats, err := core.GetNumbersAsFloat(op.Params)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+												if err != nil {
-												Added NewStandard14Font() to make existing fonts.Font code work with *PdfFont

											
										
										
											2018-07-07 09:45:55 +10:00
+													common.Log.Debug("ERROR: err=%v", err)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+													return err
 												}
 												to.setTextMatrix(floats)
-												Fixed incorrectly named variable.

											
										
										
											2018-12-27 21:33:31 +11:00
+											case "Tr": // Set text rendering mode.
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+												if ok, err := to.checkOp(op, 1, true); !ok {
-												Merge remote-tracking branch 'upstream/v3' into render.v3

											
										
										
											2018-07-21 21:20:39 +10:00
+													common.Log.Debug("ERROR: Tr err=%v", err)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+													return err
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+												}
-												Merge remote-tracking branch 'upstream/v3' into render.v3

											
										
										
											2018-07-21 21:20:39 +10:00
+												mode, ok := core.GetIntVal(op.Params[0])
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+												if !ok {
-												Merge remote-tracking branch 'upstream/v3' into render.v3

											
										
										
											2018-07-21 21:20:39 +10:00
+													common.Log.Debug("ERROR: Tr op=%s GetIntVal failed", op)
 													return core.ErrTypeError
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+												}
 												to.setTextRenderMode(mode)
-												Fixed incorrectly named variable.

											
										
										
											2018-12-27 21:33:31 +11:00
+											case "Ts": // Set text rise.
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+												if ok, err := to.checkOp(op, 1, true); !ok {
-												Merge remote-tracking branch 'upstream/v3' into render.v3

											
										
										
											2018-07-21 21:20:39 +10:00
+													common.Log.Debug("ERROR: Ts err=%v", err)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+													return err
 												}
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+												y, err := core.GetNumberAsFloat(op.Params[0])
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+												if err != nil {
-												Added NewStandard14Font() to make existing fonts.Font code work with *PdfFont

											
										
										
											2018-07-07 09:45:55 +10:00
+													common.Log.Debug("ERROR: err=%v", err)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+													return err
 												}
 												to.setTextRise(y)
-												Fixed incorrectly named variable.

											
										
										
											2018-12-27 21:33:31 +11:00
+											case "Tw": // Set word spacing.
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+												if ok, err := to.checkOp(op, 1, true); !ok {
-												Added NewStandard14Font() to make existing fonts.Font code work with *PdfFont

											
										
										
											2018-07-07 09:45:55 +10:00
+													common.Log.Debug("ERROR: err=%v", err)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+													return err
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+												}
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+												y, err := core.GetNumberAsFloat(op.Params[0])
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+												if err != nil {
-												Added NewStandard14Font() to make existing fonts.Font code work with *PdfFont

											
										
										
											2018-07-07 09:45:55 +10:00
+													common.Log.Debug("ERROR: err=%v", err)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+													return err
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+												}
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+												to.setWordSpacing(y)
-												Fixed incorrectly named variable.

											
										
										
											2018-12-27 21:33:31 +11:00
+											case "Tz": // Set horizontal scaling.
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+												if ok, err := to.checkOp(op, 1, true); !ok {
-												Added NewStandard14Font() to make existing fonts.Font code work with *PdfFont

											
										
										
											2018-07-07 09:45:55 +10:00
+													common.Log.Debug("ERROR: err=%v", err)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+													return err
 												}
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+												y, err := core.GetNumberAsFloat(op.Params[0])
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+												if err != nil {
-												Added NewStandard14Font() to make existing fonts.Font code work with *PdfFont

											
										
										
											2018-07-07 09:45:55 +10:00
+													common.Log.Debug("ERROR: err=%v", err)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+													return err
 												}
 												to.setHorizScaling(y)
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
-												Recurse through form XObjects for text extractions.

											
										
										
											2018-12-27 20:51:34 +11:00
+											case "Do":
-												Fixed incorrectly named variable.

											
										
										
											2018-12-27 21:33:31 +11:00
+												// Handle XObjects by recursing through form XObjects.
-												Recurse through form XObjects for text extractions.

											
										
										
											2018-12-27 20:51:34 +11:00
+												name := *op.Params[0].(*core.PdfObjectName)
 												_, xtype := resources.GetXObjectByName(name)
 												if xtype != model.XObjectTypeForm {
 													break
 												}
-												fixed comment

											
										
										
											2018-12-27 20:53:37 +11:00
+												// Only process each form once.
-												Recurse through form XObjects for text extractions.

											
										
										
											2018-12-27 20:51:34 +11:00
+												formResult, ok := e.formResults[string(name)]
 												if !ok {
 													xform, err := resources.GetXObjectFormByName(name)
 													if err != nil {
 														common.Log.Debug("ERROR: %v", err)
 														return err
 													}
 													formContent, err := xform.GetContentStream()
 													if err != nil {
 														common.Log.Debug("ERROR: %v", err)
 														return err
 													}
 													formResources := xform.Resources
 													if formResources == nil {
 														formResources = resources
 													}
-												Changes missed in previous commit.

											
										
										
											2019-01-04 16:07:03 +11:00
+													tList, numChars, numMisses, err := e.extractPageText(string(formContent),
-												Recurse through form XObjects for text extractions.

											
										
										
											2018-12-27 20:51:34 +11:00
+														formResources, level+1)
 													if err != nil {
 														common.Log.Debug("ERROR: %v", err)
 														return err
 													}
 													formResult = textResult{*tList, numChars, numMisses}
 													e.formResults[string(name)] = formResult
 												}
-												(*pageText). -> pageText.

											
										
										
											2019-01-05 14:10:54 +11:00
+												pageText.marks = append(pageText.marks, formResult.pageText.marks...)
-												Recurse through form XObjects for text extractions.

											
										
										
											2018-12-27 20:51:34 +11:00
+												state.numChars += formResult.numChars
 												state.numMisses += formResult.numMisses
 											}
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+											return nil
 										})
-												Recurse through form XObjects for text extractions.

											
										
										
											2018-12-27 20:51:34 +11:00
+									err = processor.Process(resources)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+									if err != nil {
-												Fixes for text extraction corpus testing.

- Correct matrix multiplication order in text.go
- Look up standard 14 font widths after applying custom encoding.

											
										
										
											2018-11-18 17:21:30 +11:00
+										common.Log.Debug("ERROR: Processing: err=%v", err)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+									}
-												Changes missed in previous commit.

											
										
										
											2019-01-04 16:07:03 +11:00
+									return pageText, state.numChars, state.numMisses, err
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+								}
-												Recurse through form XObjects for text extractions.

											
										
										
											2018-12-27 20:51:34 +11:00
+								type textResult struct {
-												Changes missed in previous commit.

											
										
										
											2019-01-04 16:07:03 +11:00
+									pageText  PageText
-												Recurse through form XObjects for text extractions.

											
										
										
											2018-12-27 20:51:34 +11:00
+									numChars  int
 									numMisses int
 								}
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+								//
 								// Text operators
 								//
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+								// moveText "Td" Moves start of text by `tx`,`ty`.
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+								// Move to the start of the next line, offset from the start of the current line by (tx, ty).
 								// tx and ty are in unscaled text space units.
-												Removed naked returns. Fixed godoc. Reorganized object extractors

											
										
										
											2018-07-25 12:00:49 +10:00
+								func (to *textObject) moveText(tx, ty float64) {
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+									to.moveTo(tx, ty)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+								}
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+								// moveTextSetLeading "TD" Move text location and set leading.
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+								// Move to the start of the next line, offset from the start of the current line by (tx, ty). As a
 								// side effect, this operator shall set the leading parameter in the text state. This operator shall
 								// have the same effect as this code:
 								//  −ty TL
 								//  tx ty Td
-												Removed naked returns. Fixed godoc. Reorganized object extractors

											
										
										
											2018-07-25 12:00:49 +10:00
+								func (to *textObject) moveTextSetLeading(tx, ty float64) {
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+									to.state.tl = -ty
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+									to.moveTo(tx, ty)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+								}
-												Fixed some typos.

											
										
										
											2019-01-03 15:41:36 +11:00
+								// nextLine "T*"" Moves start of text line to next text line
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+								// Move to the start of the next line. This operator has the same effect as the code
 								//    0 -Tl Td
 								// where Tl denotes the current leading parameter in the text state. The negative of Tl is used
 								// here because Tl is the text leading expressed as a positive number. Going to the next line
 								// entails decreasing the y coordinate. (page 250)
-												Removed naked returns. Fixed godoc. Reorganized object extractors

											
										
										
											2018-07-25 12:00:49 +10:00
+								func (to *textObject) nextLine() {
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+									to.moveTo(0, -to.state.tl)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+								}
-												Fixed position sorting for text extraction for landscape text.

											
										
										
											2018-11-10 21:19:02 +11:00
+								// setTextMatrix "Tm".
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+								// Set the text matrix, Tm, and the text line matrix, Tlm to the Matrix specified by the 6 numbers
-												Removed some unused struct fields.

											
										
										
											2018-11-27 13:37:12 +11:00
+								// in `f` (page 250).
-												Removed naked returns. Fixed godoc. Reorganized object extractors

											
										
										
											2018-07-25 12:00:49 +10:00
+								func (to *textObject) setTextMatrix(f []float64) {
-												Addressing review comments

											
										
										
											2018-11-28 23:25:17 +00:00
+									if len(f) != 6 {
 										common.Log.Debug("ERROR: len(f) != 6 (%d)", len(f))
 										return
 									}
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+									a, b, c, d, tx, ty := f[0], f[1], f[2], f[3], f[4], f[5]
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+									to.tm = transform.NewMatrix(a, b, c, d, tx, ty)
 									to.tlm = to.tm
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+								}
-												Removed some unused struct fields.

											
										
										
											2018-11-27 13:37:12 +11:00
+								// showText "Tj". Show a text string.
-												Removed naked returns. Fixed godoc. Reorganized object extractors

											
										
										
											2018-07-25 12:00:49 +10:00
+								func (to *textObject) showText(charcodes []byte) error {
-												first attempt at parsing FontFile

											
										
										
											2018-07-02 16:46:43 +10:00
+									return to.renderText(charcodes)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+								}
-												Removed some unused struct fields.

											
										
										
											2018-11-27 13:37:12 +11:00
+								// showTextAdjusted "TJ". Show text with adjustable spacing.
-												Removed GetArrayVal

											
										
										
											2018-07-25 13:19:09 +10:00
+								func (to *textObject) showTextAdjusted(args *core.PdfObjectArray) error {
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+									vertical := false
-												Removed GetArrayVal

											
										
										
											2018-07-25 13:19:09 +10:00
+									for _, o := range args.Elements() {
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+										switch o.(type) {
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+										case *core.PdfObjectFloat, *core.PdfObjectInteger:
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+											x, err := core.GetNumberAsFloat(o)
 											if err != nil {
-												Removed some unused struct fields.

											
										
										
											2018-11-27 13:37:12 +11:00
+												common.Log.Debug("ERROR: showTextAdjusted. Bad numerical arg. o=%s args=%+v", o, args)
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+												return err
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+											}
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+											dx, dy := -x*0.001*to.state.tfs, 0.0
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+											if vertical {
 												dy, dx = dx, dy
 											}
-												Another round of addressing review comments

											
										
										
											2018-11-30 16:53:48 +00:00
+											td := translationMatrix(transform.Point{X: dx, Y: dy})
-												Corrected order of matrix multiplication for cm operator.

The change to Matrix.Concat made for this fix simplified some text extraction matrix code.

											
										
										
											2019-01-22 18:18:27 +11:00
+											to.tm.Concat(td)
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+											common.Log.Trace("showTextAdjusted: dx,dy=%3f,%.3f Tm=%s", dx, dy, to.tm)
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+										case *core.PdfObjectString:
-												Merge remote-tracking branch 'upstream/v3' into render.v3

											
										
										
											2018-07-21 21:20:39 +10:00
+											charcodes, ok := core.GetStringBytes(o)
 											if !ok {
-												Removed some unused struct fields.

											
										
										
											2018-11-27 13:37:12 +11:00
+												common.Log.Trace("showTextAdjusted: Bad string arg. o=%s args=%+v", o, args)
-												Merge remote-tracking branch 'upstream/v3' into render.v3

											
										
										
											2018-07-21 21:20:39 +10:00
+												return core.ErrTypeError
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+											}
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+											to.renderText(charcodes)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+										default:
-												Removed some unused struct fields.

											
										
										
											2018-11-27 13:37:12 +11:00
+											common.Log.Debug("ERROR: showTextAdjusted. Unexpected type (%T) args=%+v", o, args)
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+											return core.ErrTypeError
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+										}
 									}
 									return nil
 								}
-												Removed some unused struct fields.

											
										
										
											2018-11-27 13:37:12 +11:00
+								// setTextLeading "TL". Set text leading.
-												Removed naked returns. Fixed godoc. Reorganized object extractors

											
										
										
											2018-07-25 12:00:49 +10:00
+								func (to *textObject) setTextLeading(y float64) {
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+									if to == nil || to.state == nil {
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+										return
 									}
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+									to.state.tl = y
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+								}
-												Removed some unused struct fields.

											
										
										
											2018-11-27 13:37:12 +11:00
+								// setCharSpacing "Tc". Set character spacing.
-												Removed naked returns. Fixed godoc. Reorganized object extractors

											
										
										
											2018-07-25 12:00:49 +10:00
+								func (to *textObject) setCharSpacing(x float64) {
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+									if to == nil {
 										return
 									}
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+									to.state.tc = x
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+								}
-												Removed some unused struct fields.

											
										
										
											2018-11-27 13:37:12 +11:00
+								// setFont "Tf". Set font.
-												Removed naked returns. Fixed godoc. Reorganized object extractors

											
										
										
											2018-07-25 12:00:49 +10:00
+								func (to *textObject) setFont(name string, size float64) error {
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+									if to == nil {
 										return nil
 									}
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+									font, err := to.getFont(name)
-												Parse FontFile entry in FontDescriptor

											
										
										
											2018-07-03 14:26:42 +10:00
+									if err == nil {
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+										to.state.tfont = font
-												Refactored font code to improve text extraction

											
										
										
											2018-07-13 17:40:27 +10:00
+										if len(*to.fontStack) == 0 {
 											to.fontStack.push(font)
 										} else {
 											(*to.fontStack)[len(*to.fontStack)-1] = font
 										}
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+									} else if err == model.ErrFontNotSupported {
-												Merge remote-tracking branch 'peterwilliams97/extract.text' into extract.text

# Conflicts:
#	pdf/extractor/text.go

											
										
										
											2018-12-27 12:40:55 +02:00
+										// TODO(peterwilliams97): Do we need to handle this case in a special way?
-												Changed error handling. Allow partial encoding maps. Don't continue processing unsupported fonts

											
										
										
											2018-07-04 18:00:37 +10:00
+										return err
-												Parse FontFile entry in FontDescriptor

											
										
										
											2018-07-03 14:26:42 +10:00
+									} else {
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+										return err
 									}
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+									to.state.tfs = size
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+									return nil
 								}
-												Removed some unused struct fields.

											
										
										
											2018-11-27 13:37:12 +11:00
+								// setTextRenderMode "Tr". Set text rendering mode.
-												Removed naked returns. Fixed godoc. Reorganized object extractors

											
										
										
											2018-07-25 12:00:49 +10:00
+								func (to *textObject) setTextRenderMode(mode int) {
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+									if to == nil {
 										return
 									}
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+									to.state.tmode = RenderMode(mode)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+								}
-												Removed some unused struct fields.

											
										
										
											2018-11-27 13:37:12 +11:00
+								// setTextRise "Ts". Set text rise.
-												Removed naked returns. Fixed godoc. Reorganized object extractors

											
										
										
											2018-07-25 12:00:49 +10:00
+								func (to *textObject) setTextRise(y float64) {
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+									if to == nil {
 										return
 									}
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+									to.state.trise = y
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+								}
-												Removed some unused struct fields.

											
										
										
											2018-11-27 13:37:12 +11:00
+								// setWordSpacing "Tw". Set word spacing.
-												Removed naked returns. Fixed godoc. Reorganized object extractors

											
										
										
											2018-07-25 12:00:49 +10:00
+								func (to *textObject) setWordSpacing(y float64) {
-												Fixed encoding selection for standard 14 fonts.

											
										
										
											2018-11-22 22:01:04 +11:00
+									if to == nil {
 										return
 									}
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+									to.state.tw = y
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+								}
-												Removed some unused struct fields.

											
										
										
											2018-11-27 13:37:12 +11:00
+								// setHorizScaling "Tz". Set horizontal scaling.
-												Removed naked returns. Fixed godoc. Reorganized object extractors

											
										
										
											2018-07-25 12:00:49 +10:00
+								func (to *textObject) setHorizScaling(y float64) {
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+									if to == nil {
 										return
 									}
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+									to.state.th = y
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+								}
-												Addressing review comments

											
										
										
											2018-11-28 23:25:17 +00:00
+								// floatParam returns the single float parameter of operator `op`, or an error if it doesn't have
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+								// a single float parameter or we aren't in a text stream.
-												allow change of text state outside BT..ET

											
										
										
											2018-07-15 16:45:47 +10:00
+								func floatParam(op *contentstream.ContentStreamOperation) (float64, error) {
 									if len(op.Params) != 1 {
-												Set font even when Tf operator is not between BT and ET.

											
										
										
											2018-11-21 13:14:11 +11:00
+										err := errors.New("incorrect parameter count")
-												allow change of text state outside BT..ET

											
										
										
											2018-07-15 16:45:47 +10:00
+										common.Log.Debug("ERROR: %#q should have %d input params, got %d %+v",
 											op.Operand, 1, len(op.Params), op.Params)
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+										return 0.0, err
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+									}
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+									return core.GetNumberAsFloat(op.Params[0])
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+								}
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+								// checkOp returns true if we are in a text stream and `op` has `numParams` params.
 								// If `hard` is true and the number of params don't match, an error is returned.
-												Removed naked returns. Fixed godoc. Reorganized object extractors

											
										
										
											2018-07-25 12:00:49 +10:00
+								func (to *textObject) checkOp(op *contentstream.ContentStreamOperation, numParams int,
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+									hard bool) (ok bool, err error) {
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+									if to == nil {
-												Addressing review comments

											
										
										
											2018-11-28 23:25:17 +00:00
+										var params []core.PdfObject
-												Set font even when Tf operator is not between BT and ET.

											
										
										
											2018-11-21 13:14:11 +11:00
+										if numParams > 0 {
 											params = op.Params
 											if len(params) > numParams {
 												params = params[:numParams]
 											}
 										}
 										common.Log.Debug("%#q operand outside text. params=%+v", op.Operand, params)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+									}
 									if numParams >= 0 {
 										if len(op.Params) != numParams {
 											if hard {
-												Set font even when Tf operator is not between BT and ET.

											
										
										
											2018-11-21 13:14:11 +11:00
+												err = errors.New("incorrect parameter count")
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+											}
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+											common.Log.Debug("ERROR: %#q should have %d input params, got %d %+v",
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+												op.Operand, numParams, len(op.Params), op.Params)
-												Removed naked returns. Fixed godoc. Reorganized object extractors

											
										
										
											2018-07-25 12:00:49 +10:00
+											return false, err
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+										}
 									}
-												Removed naked returns. Fixed godoc. Reorganized object extractors

											
										
										
											2018-07-25 12:00:49 +10:00
+									return true, nil
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+								}
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+								// fontStacker is the PDF font stack implementation.
-												Refactored font code to improve text extraction

											
										
										
											2018-07-13 17:40:27 +10:00
+								type fontStacker []*model.PdfFont
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+								// String returns a string describing the current state of the font stack.
-												Refactored font code to improve text extraction

											
										
										
											2018-07-13 17:40:27 +10:00
+								func (fontStack *fontStacker) String() string {
 									parts := []string{"---- font stack"}
 									for i, font := range *fontStack {
 										s := "<nil>"
 										if font != nil {
 											s = font.String()
 										}
 										parts = append(parts, fmt.Sprintf("\t%2d: %s", i, s))
 									}
 									return strings.Join(parts, "\n")
 								}
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
 								// push pushes `font` onto the font stack.
-												Refactored font code to improve text extraction

											
										
										
											2018-07-13 17:40:27 +10:00
+								func (fontStack *fontStacker) push(font *model.PdfFont) {
 									*fontStack = append(*fontStack, font)
 								}
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+								// pop pops and returns the element on the top of the font stack if there is one or nil if there isn't.
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+								func (fontStack *fontStacker) pop() *model.PdfFont {
-												Refactored font code to improve text extraction

											
										
										
											2018-07-13 17:40:27 +10:00
+									if fontStack.empty() {
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+										return nil
-												Refactored font code to improve text extraction

											
										
										
											2018-07-13 17:40:27 +10:00
+									}
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+									font := (*fontStack)[len(*fontStack)-1]
-												Refactored font code to improve text extraction

											
										
										
											2018-07-13 17:40:27 +10:00
+									*fontStack = (*fontStack)[:len(*fontStack)-1]
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+									return font
-												Refactored font code to improve text extraction

											
										
										
											2018-07-13 17:40:27 +10:00
+								}
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+								// peek returns the element on the top of the font stack if there is one or nil if there isn't.
-												Removed naked returns. Fixed godoc. Reorganized object extractors

											
										
										
											2018-07-25 12:00:49 +10:00
+								func (fontStack *fontStacker) peek() *model.PdfFont {
-												Refactored font code to improve text extraction

											
										
										
											2018-07-13 17:40:27 +10:00
+									if fontStack.empty() {
-												Removed naked returns. Fixed godoc. Reorganized object extractors

											
										
										
											2018-07-25 12:00:49 +10:00
+										return nil
-												Refactored font code to improve text extraction

											
										
										
											2018-07-13 17:40:27 +10:00
+									}
-												Removed naked returns. Fixed godoc. Reorganized object extractors

											
										
										
											2018-07-25 12:00:49 +10:00
+									return (*fontStack)[len(*fontStack)-1]
-												Refactored font code to improve text extraction

											
										
										
											2018-07-13 17:40:27 +10:00
+								}
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+								// get returns the `idx`'th element of the font stack if there is one or nil if there isn't.
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+								//  idx = 0: bottom of font stack
 								//  idx = len(fontstack) - 1: top of font stack
 								//  idx = -n is same as dx = len(fontstack) - n, so fontstack.get(-1) is same as fontstack.peek()
-												Removed naked returns. Fixed godoc. Reorganized object extractors

											
										
										
											2018-07-25 12:00:49 +10:00
+								func (fontStack *fontStacker) get(idx int) *model.PdfFont {
-												Refactored font code to improve text extraction

											
										
										
											2018-07-13 17:40:27 +10:00
+									if idx < 0 {
 										idx += fontStack.size()
 									}
 									if idx < 0 || idx > fontStack.size()-1 {
-												Removed naked returns. Fixed godoc. Reorganized object extractors

											
										
										
											2018-07-25 12:00:49 +10:00
+										return nil
-												Refactored font code to improve text extraction

											
										
										
											2018-07-13 17:40:27 +10:00
+									}
-												Removed naked returns. Fixed godoc. Reorganized object extractors

											
										
										
											2018-07-25 12:00:49 +10:00
+									return (*fontStack)[idx]
-												Refactored font code to improve text extraction

											
										
										
											2018-07-13 17:40:27 +10:00
+								}
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
 								// empty returns true if the font stack is empty.
-												Refactored font code to improve text extraction

											
										
										
											2018-07-13 17:40:27 +10:00
+								func (fontStack *fontStacker) empty() bool {
 									return len(*fontStack) == 0
 								}
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
 								// size returns the number of elements in the font stack.
-												Refactored font code to improve text extraction

											
										
										
											2018-07-13 17:40:27 +10:00
+								func (fontStack *fontStacker) size() int {
 									return len(*fontStack)
 								}
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+								// 9.3 Text State Parameters and Operators (page 243)
 								// Some of these parameters are expressed in unscaled text space units. This means that they shall
 								// be specified in a coordinate system that shall be defined by the text matrix, Tm but shall not be
 								// scaled by the font size parameter, Tfs.
-												Removed naked returns. Fixed godoc. Reorganized object extractors

											
										
										
											2018-07-25 12:00:49 +10:00
 								// textState represents the text state.
 								type textState struct {
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+									tc    float64        // Character spacing. Unscaled text space units.
 									tw    float64        // Word spacing. Unscaled text space units.
 									th    float64        // Horizontal scaling.
 									tl    float64        // Leading. Unscaled text space units. Used by TD,T*,'," see Table 108.
 									tfs   float64        // Text font size.
 									tmode RenderMode     // Text rendering mode.
 									trise float64        // Text rise. Unscaled text space units. Set by Ts.
 									tfont *model.PdfFont // Text font.
-												Refactored font code to improve text extraction

											
										
										
											2018-07-13 17:40:27 +10:00
+									// For debugging
 									numChars  int
 									numMisses int
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+								}
 								// 9.4.1 General (page 248)
 								// A PDF text object consists of operators that may show text strings, move the text position, and
 								// set text state and certain other parameters. In addition, two parameters may be specified only
 								// within a text object and shall not persist from one text object to the next:
-												Noted that text extractor is an intermediate version

											
										
										
											2018-06-28 11:11:43 +10:00
+								//   • Tm, the text matrix
 								//   • Tlm, the text line matrix
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+								//
 								// Text space is converted to device space by this transform (page 252)
-												Improvements to text extraction.

											
										
										
											2018-09-20 11:49:44 +10:00
+								// Trm is the text rendering matrix
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+								//        | Tfs x Th   0      0 |
 								// Trm  = | 0         Tfs     0 | × Tm × CTM
 								//        | 0         Trise   1 |
-												Improvements to text extraction.

											
										
										
											2018-09-20 11:49:44 +10:00
+								// This corresponds to the following code in renderText()
-												Corrected order of matrix multiplication for cm operator.

The change to Matrix.Concat made for this fix simplified some text extraction matrix code.

											
										
										
											2019-01-22 18:18:27 +11:00
+								//  trm := to.gs.CTM.Mult(to.tm).Mult(stateMatrix)
-												Removed naked returns. Fixed godoc. Reorganized object extractors

											
										
										
											2018-07-25 12:00:49 +10:00
 								// textObject represents a PDF text object.
 								type textObject struct {
-												Refactored font code to improve text extraction

											
										
										
											2018-07-13 17:40:27 +10:00
+									e         *Extractor
-												Recurse through form XObjects for text extractions.

											
										
										
											2018-12-27 20:51:34 +11:00
+									resources *model.PdfPageResources
-												Refactored font code to improve text extraction

											
										
										
											2018-07-13 17:40:27 +10:00
+									gs        contentstream.GraphicsState
 									fontStack *fontStacker
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+									state     *textState
 									tm        transform.Matrix // Text matrix. For the character pointer.
 									tlm       transform.Matrix // Text line matrix. For the start of line pointer.
-												Made TextList an opaque struct and renamed it to PageText to reflect its  purpose rather than its current implementation.

											
										
										
											2019-01-04 16:02:22 +11:00
+									marks     []textMark       // Text marks get written here.
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+								}
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+								// newTextState returns a default textState.
-												Removed naked returns. Fixed godoc. Reorganized object extractors

											
										
										
											2018-07-25 12:00:49 +10:00
+								func newTextState() textState {
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+									return textState{
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+										th:    100,
 										tmode: RenderModeFill,
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+									}
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+								}
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+								// newTextObject returns a default textObject.
-												Recurse through form XObjects for text extractions.

											
										
										
											2018-12-27 20:51:34 +11:00
+								func newTextObject(e *Extractor, resources *model.PdfPageResources, gs contentstream.GraphicsState,
 									state *textState,
-												Removed naked returns. Fixed godoc. Reorganized object extractors

											
										
										
											2018-07-25 12:00:49 +10:00
+									fontStack *fontStacker) *textObject {
 									return &textObject{
-												Refactored font code to improve text extraction

											
										
										
											2018-07-13 17:40:27 +10:00
+										e:         e,
-												Recurse through form XObjects for text extractions.

											
										
										
											2018-12-27 20:51:34 +11:00
+										resources: resources,
-												Refactored font code to improve text extraction

											
										
										
											2018-07-13 17:40:27 +10:00
+										gs:        gs,
 										fontStack: fontStack,
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+										state:     state,
 										tm:        transform.IdentityMatrix(),
 										tlm:       transform.IdentityMatrix(),
-												Extractor package with powerful text extraction capabilities and CMap handling. Closes #17

											
										
										
											2018-03-22 13:01:04 +00:00
+									}
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+								}
-												Addressing review comments

											
										
										
											2018-11-28 23:25:17 +00:00
+								// renderText processes and renders byte array `data` for extraction purposes.
-												Removed naked returns. Fixed godoc. Reorganized object extractors

											
										
										
											2018-07-25 12:00:49 +10:00
+								func (to *textObject) renderText(data []byte) error {
-												Removed combineDiacritics from text extraction because it was causing ' and " to be combined with the letters proceeding them.
Need to fix this and reinstate combineDiacritics.

											
										
										
											2019-01-01 12:22:39 +11:00
-												Improvements to text extraction.

											
										
										
											2018-09-20 11:49:44 +10:00
+									font := to.getCurrentFont()
-												simplification

											
										
										
											2018-09-18 12:18:04 +10:00
-												Fixed text position tracking.

											
										
										
											2018-10-30 21:55:30 +11:00
+									charcodes := font.BytesToCharcodes(data)
-												Removed combineDiacritics from text extraction because it was causing ' and " to be combined with the letters proceeding them.
Need to fix this and reinstate combineDiacritics.

											
										
										
											2019-01-01 12:22:39 +11:00
+									runes, numChars, numMisses := font.CharcodesToUnicodeWithStats(charcodes)
-												Combine diacritics in text extraction.

											
										
										
											2018-11-28 18:06:03 +11:00
+									if numMisses > 0 {
 										common.Log.Debug("renderText: numChars=%d numMisses=%d", numChars, numMisses)
 									}
-												Fixed text position tracking.

											
										
										
											2018-10-30 21:55:30 +11:00
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+									to.state.numChars += numChars
 									to.state.numMisses += numMisses
-												Removed naked returns. Fixed godoc. Reorganized object extractors

											
										
										
											2018-07-25 12:00:49 +10:00
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+									state := to.state
 									tfs := state.tfs
 									th := state.th / 100.0
-												Remove extra GetRuneCharMetrics function - use GetRuneMetrics

											
										
										
											2019-03-09 18:03:43 +00:00
+									spaceMetrics, ok := font.GetRuneMetrics(' ')
-												Look for CharMetrics for char code 32 when finding space width.

											
										
										
											2018-12-02 13:09:32 +11:00
+									if !ok {
 										spaceMetrics, ok = font.GetCharMetrics(32)
 									}
-												Documented font code. Fall back to StandardEncoding when no encoding is speficied for a font.

											
										
										
											2018-12-02 09:14:58 +11:00
+									if !ok {
-												Remove extra GetRuneCharMetrics function - use GetRuneMetrics

											
										
										
											2019-03-09 18:03:43 +00:00
+										spaceMetrics, _ = model.DefaultFont().GetRuneMetrics(' ')
-												First attempt at splitting words in text extraction using a space detection heuristic

											
										
										
											2018-10-09 11:49:59 +11:00
+									}
 									spaceWidth := spaceMetrics.Wx * glyphTextRatio
-												Removed combineDiacritics from text extraction because it was causing ' and " to be combined with the letters proceeding them.
Need to fix this and reinstate combineDiacritics.

											
										
										
											2019-01-01 12:22:39 +11:00
+									common.Log.Trace("spaceWidth=%.2f text=%q font=%s fontSize=%.1f", spaceWidth, runes, font, tfs)
-												Improvements to text extraction.

											
										
										
											2018-09-20 11:49:44 +10:00
-												Another round of addressing review comments

											
										
										
											2018-11-30 16:53:48 +00:00
+									stateMatrix := transform.NewMatrix(
-												Improvements to text extraction.

											
										
										
											2018-09-20 11:49:44 +10:00
+										tfs*th, 0,
 , tfs,
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+, state.trise)
-												Improvements to text extraction.

											
										
										
											2018-09-20 11:49:44 +10:00
-												Removed combineDiacritics from text extraction because it was causing ' and " to be combined with the letters proceeding them.
Need to fix this and reinstate combineDiacritics.

											
										
										
											2019-01-01 12:22:39 +11:00
+									common.Log.Trace("renderText: %d codes=%+v runes=%q", len(charcodes), charcodes, runes)
-												Fixes for text extraction corpus testing.

- Correct matrix multiplication order in text.go
- Look up standard 14 font widths after applying custom encoding.

											
										
										
											2018-11-18 17:21:30 +11:00
-												Removed combineDiacritics from text extraction because it was causing ' and " to be combined with the letters proceeding them.
Need to fix this and reinstate combineDiacritics.

											
										
										
											2019-01-01 12:22:39 +11:00
+									for i, r := range runes {
-												Addressing review comments

											
										
										
											2018-11-30 23:01:04 +00:00
+										// TODO(peterwilliams97): Need to find and fix cases where this happens.
-												assert types for the new code as well

											
										
										
											2018-12-07 18:43:24 +02:00
+										if r == '\x00' {
-												Combine diacritics in text extraction.

											
										
										
											2018-11-28 18:06:03 +11:00
+											continue
 										}
-												Fixed text position tracking.

											
										
										
											2018-10-30 21:55:30 +11:00
+										code := charcodes[i]
-												Improvements to text extraction.

											
										
										
											2018-09-20 11:49:44 +10:00
+										// The location of the text on the page in device coordinates is given by trm, the text
 										// rendering matrix.
-												Corrected order of matrix multiplication for cm operator.

The change to Matrix.Concat made for this fix simplified some text extraction matrix code.

											
										
										
											2019-01-22 18:18:27 +11:00
+										trm := to.gs.CTM.Mult(to.tm).Mult(stateMatrix)
-												Improvements to text extraction.

											
										
										
											2018-09-20 11:49:44 +10:00
 										// calculate the text location displacement due to writing `r`. We will use this to update
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+										// to.tm
-												Improvements to text extraction.

											
										
										
											2018-09-20 11:49:44 +10:00
 										// w is the unscaled movement at the end of a word.
 										w := 0.0
-												assert types for the new code as well

											
										
										
											2018-12-07 18:43:24 +02:00
+										if r == ' ' {
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+											w = state.tw
-												Improvements to text extraction.

											
										
										
											2018-09-20 11:49:44 +10:00
+										}
-												First attempt at splitting words in text extraction using a space detection heuristic

											
										
										
											2018-10-09 11:49:59 +11:00
-												Fixed text position tracking.

											
										
										
											2018-10-30 21:55:30 +11:00
+										m, ok := font.GetCharMetrics(code)
 										if !ok {
-												First attempt at getting font metrics by character code.

											
										
										
											2018-11-08 15:20:12 +11:00
+											common.Log.Debug("ERROR: No metric for code=%d r=0x%04x=%+q %s", code, r, r, font)
-												Fixed text position tracking.

											
										
										
											2018-10-30 21:55:30 +11:00
+											return errors.New("no char metrics")
-												Improvements to text extraction.

											
										
										
											2018-09-20 11:49:44 +10:00
+										}
-												Fixed text position tracking.

											
										
										
											2018-10-30 21:55:30 +11:00
-												First attempt at splitting words in text extraction using a space detection heuristic

											
										
										
											2018-10-09 11:49:59 +11:00
+										// c is the character size in unscaled text units.
-												Another round of addressing review comments

											
										
										
											2018-11-30 16:53:48 +00:00
+										c := transform.Point{X: m.Wx * glyphTextRatio, Y: m.Wy * glyphTextRatio}
-												Fixed text position tracking.

											
										
										
											2018-10-30 21:55:30 +11:00
-												Removed some unused struct fields.

											
										
										
											2018-11-27 13:37:12 +11:00
+										// t0 is the end of this character.
-												First attempt at splitting words in text extraction using a space detection heuristic

											
										
										
											2018-10-09 11:49:59 +11:00
+										// t is the displacement of the text cursor when the character is rendered.
-												Another round of addressing review comments

											
										
										
											2018-11-30 16:53:48 +00:00
+										t0 := transform.Point{X: (c.X*tfs + w) * th}
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+										t := transform.Point{X: (c.X*tfs + state.tc + w) * th}
-												Improvements to text extraction.

											
										
										
											2018-09-20 11:49:44 +10:00
-												Fixed text matrix multiplication order.

											
										
										
											2018-11-19 14:19:50 +11:00
+										// td, td0 are t, t0 in matrix form.
-												Removed some unused struct fields.

											
										
										
											2018-11-27 13:37:12 +11:00
+										// td0 is where this character ends. td is where the next character starts.
-												Fixes for text extraction corpus testing.

- Correct matrix multiplication order in text.go
- Look up standard 14 font widths after applying custom encoding.

											
										
										
											2018-11-18 17:21:30 +11:00
+										td0 := translationMatrix(t0)
-												First attempt at splitting words in text extraction using a space detection heuristic

											
										
										
											2018-10-09 11:49:59 +11:00
+										td := translationMatrix(t)
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+										common.Log.Trace("\"%c\" stateMatrix=%s CTM=%s Tm=%s", r, stateMatrix, to.gs.CTM, to.tm)
 										common.Log.Trace("tfs=%.3f th=%.3f Tc=%.3f w=%.3f (Tw=%.3f)", tfs, th, state.tc, w, state.tw)
 										common.Log.Trace("m=%s c=%+v t0=%+v td0=%s trm0=%s", m, c, t0, td0, td0.Mult(to.tm).Mult(to.gs.CTM))
-												Fixed orientation handling in text extraction.

											
										
										
											2018-11-26 17:17:17 +11:00
-												Made TextList an opaque struct and renamed it to PageText to reflect its  purpose rather than its current implementation.

											
										
										
											2019-01-04 16:02:22 +11:00
+										mark := to.newTextMark(
-												Fixes for text extraction corpus testing.

- Correct matrix multiplication order in text.go
- Look up standard 14 font widths after applying custom encoding.

											
										
										
											2018-11-18 17:21:30 +11:00
+											string(r),
-												Fixed orientation handling in text extraction.

											
										
										
											2018-11-26 17:17:17 +11:00
+											trm,
-												Corrected order of matrix multiplication for cm operator.

The change to Matrix.Concat made for this fix simplified some text extraction matrix code.

											
										
										
											2019-01-22 18:18:27 +11:00
+											translation(to.gs.CTM.Mult(to.tm).Mult(td0)),
-												Fixes for text extraction corpus testing.

- Correct matrix multiplication order in text.go
- Look up standard 14 font widths after applying custom encoding.

											
										
										
											2018-11-18 17:21:30 +11:00
+											spaceWidth*trm.ScalingFactorX())
-												Made TextList an opaque struct and renamed it to PageText to reflect its  purpose rather than its current implementation.

											
										
										
											2019-01-04 16:02:22 +11:00
+										common.Log.Trace("i=%d code=%d mark=%s trm=%s", i, code, mark, trm)
 										to.marks = append(to.marks, mark)
-												First attempt at splitting words in text extraction using a space detection heuristic

											
										
										
											2018-10-09 11:49:59 +11:00
 										// update the text matrix by the displacement of the text location.
-												Corrected order of matrix multiplication for cm operator.

The change to Matrix.Concat made for this fix simplified some text extraction matrix code.

											
										
										
											2019-01-22 18:18:27 +11:00
+										to.tm.Concat(td)
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+										common.Log.Trace("to.tm=%s", to.tm)
-												Improvements to text extraction.

											
										
										
											2018-09-20 11:49:44 +10:00
+									}
-												Removed naked returns. Fixed godoc. Reorganized object extractors

											
										
										
											2018-07-25 12:00:49 +10:00
+									return nil
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+								}
-												Improvements to text extraction.

											
										
										
											2018-09-20 11:49:44 +10:00
+								// glyphTextRatio converts Glyph metrics units to unscaled text space units.
 								const glyphTextRatio = 1.0 / 1000.0
 								// translation returns the translation part of `m`.
-												Another round of addressing review comments

											
										
										
											2018-11-30 16:53:48 +00:00
+								func translation(m transform.Matrix) transform.Point {
-												Improvements to text extraction.

											
										
										
											2018-09-20 11:49:44 +10:00
+									tx, ty := m.Translation()
-												Update Jenkinsfile for matching examples branch. Address go vet.

											
										
										
											2019-03-09 20:45:19 +00:00
+									return transform.Point{X: tx, Y: ty}
-												Improvements to text extraction.

											
										
										
											2018-09-20 11:49:44 +10:00
+								}
 								// translationMatrix returns a matrix that translates by `p`.
-												Another round of addressing review comments

											
										
										
											2018-11-30 16:53:48 +00:00
+								func translationMatrix(p transform.Point) transform.Matrix {
 									return transform.TranslationMatrix(p.X, p.Y)
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+								}
 								// moveTo moves the start of line pointer by `tx`,`ty` and sets the text pointer to the
 								// start of line pointer.
 								// Move to the start of the next line, offset from the start of the current line by (tx, ty).
 								// `tx` and `ty` are in unscaled text space units.
 								func (to *textObject) moveTo(tx, ty float64) {
-												Corrected order of matrix multiplication for cm operator.

The change to Matrix.Concat made for this fix simplified some text extraction matrix code.

											
										
										
											2019-01-22 18:18:27 +11:00
+									to.tlm.Concat(transform.NewMatrix(1, 0, 0, 1, tx, ty))
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+									to.tm = to.tlm
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+								}
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+								// textMark represents text drawn on a page and its position in device coordinates.
-												Removed some unused struct fields.

											
										
										
											2018-11-27 13:37:12 +11:00
+								// All dimensions are in device coordinates.
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+								type textMark struct {
 									text          string          // The text.
 									orient        int             // The text orientation in degrees. This is the current TRM rounded to 10°.
 									orientedStart transform.Point // Left of text in orientation where text is horizontal.
 									orientedEnd   transform.Point // Right of text in orientation where text is horizontal.
 									height        float64         // Text height.
 									spaceWidth    float64         // Best guess at the width of a space in the font the text was rendered with.
-												Another round of addressing review comments

											
										
										
											2018-11-30 16:53:48 +00:00
+									count         int64           // To help with reading debug logs.
-												Removed some unused struct fields.

											
										
										
											2018-11-27 13:37:12 +11:00
+								}
-												Fixed some typos.

											
										
										
											2019-01-03 15:41:36 +11:00
+								// newTextMark returns a textMark for text `text` rendered with text rendering matrix (TRM) `trm` and end
-												Removed some unused struct fields.

											
										
										
											2018-11-27 13:37:12 +11:00
+								// of character device coordinates `end`. `spaceWidth` is our best guess at the width of a space in
 								// the font the text is rendered in, in device coordinates.
-												Fixed some typos.

											
										
										
											2019-01-03 15:41:36 +11:00
+								func (to *textObject) newTextMark(text string, trm transform.Matrix, end transform.Point, spaceWidth float64) textMark {
-												Removed some unused struct fields.

											
										
										
											2018-11-27 13:37:12 +11:00
+									to.e.textCount++
-												Fixed orientation handling in text extraction.

											
										
										
											2018-11-26 17:17:17 +11:00
+									theta := trm.Angle()
-												Made Matrix and Point structs more general and moved them to their own files in pdf/model.

											
										
										
											2018-11-29 17:04:20 +11:00
+									orient := nearestMultiple(theta, 10)
 									var height float64
 									if orient%180 != 90 {
-												Combine diacritics in text extraction.

											
										
										
											2018-11-28 18:06:03 +11:00
+										height = trm.ScalingFactorY()
 									} else {
 										height = trm.ScalingFactorX()
 									}
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+									return textMark{
 										text:          text,
 										orient:        orient,
 										orientedStart: translation(trm).Rotate(theta),
 										orientedEnd:   end.Rotate(theta),
 										height:        height,
 										spaceWidth:    spaceWidth,
-												Removed some unused struct fields.

											
										
										
											2018-11-27 13:37:12 +11:00
+										count:         to.e.textCount,
-												Fixes for text extraction corpus testing.

- Correct matrix multiplication order in text.go
- Look up standard 14 font widths after applying custom encoding.

											
										
										
											2018-11-18 17:21:30 +11:00
+									}
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+								}
-												Removed combineDiacritics from text extraction because it was causing ' and " to be combined with the letters proceeding them.
Need to fix this and reinstate combineDiacritics.

											
										
										
											2019-01-01 12:22:39 +11:00
+								// nearestMultiple returns the integer multiple of `m` that is closest to `x`.
-												Made Matrix and Point structs more general and moved them to their own files in pdf/model.

											
										
										
											2018-11-29 17:04:20 +11:00
+								func nearestMultiple(x float64, m int) int {
 									// A zero multiple would make the division below divide by zero, so treat it as 1.
 									if m == 0 {
 										m = 1
 									}
 									fac := float64(m)
 									// Round x/fac to the nearest whole number, then scale back up by fac.
 									return int(math.Round(x/fac) * fac)
 								}
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+								// String returns a string describing `t`.
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+								func (t textMark) String() string {
 									return fmt.Sprintf("textMark{@%03d [%.3f,%.3f] %.1f %d° %q}",
 										t.count, t.orientedStart.X, t.orientedStart.Y, t.Width(), t.orient, truncate(t.text, 100))
-												First attempt at splitting words in text extraction using a space detection heuristic

											
										
										
											2018-10-09 11:49:59 +11:00
+								}
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+								// Width returns the width of `t`.text in the text direction.
 								func (t textMark) Width() float64 {
 									return math.Abs(t.orientedStart.X - t.orientedEnd.X)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+								}
-												Changes missed in previous commit.

											
										
										
											2019-01-04 16:07:03 +11:00
+								// PageText represents the layout of text on a device page.
-												Made TextList an opaque struct and renamed it to PageText to reflect its  purpose rather than its current implementation.

											
										
										
											2019-01-04 16:02:22 +11:00
+								// Its implementation is opaque to allow for future optimizations.
-												Changes missed in previous commit.

											
										
										
											2019-01-04 16:07:03 +11:00
+								type PageText struct {
 									// PageText is currently implemented as a list of texts and their positions on a PDF page.
-												Made TextList an opaque struct and renamed it to PageText to reflect its  purpose rather than its current implementation.

											
										
										
											2019-01-04 16:02:22 +11:00
+									marks []textMark
 								}
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
-												Changes missed in previous commit.

											
										
										
											2019-01-04 16:07:03 +11:00
+								// String returns a string describing `pt`.
 								func (pt PageText) String() string {
 									parts := []string{fmt.Sprintf("PageText: %d elements", pt.length())}
 									for _, t := range pt.marks {
-												Removed combineDiacritics from text extraction because it was causing ' and " to be combined with the letters proceeding them.
Need to fix this and reinstate combineDiacritics.

											
										
										
											2019-01-01 12:22:39 +11:00
+										parts = append(parts, t.String())
 									}
 									return strings.Join(parts, "\n")
 								}
-												Changes missed in previous commit.

											
										
										
											2019-01-04 16:07:03 +11:00
+								// length returns the number of elements in `pt.marks`.
 								func (pt PageText) length() int {
 									return len(pt.marks)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+								}
-												Changes missed in previous commit.

											
										
										
											2019-01-04 16:07:03 +11:00
+								// height returns the max height of the elements in `pt.marks`.
 								func (pt PageText) height() float64 {
-												Combine diacritics in text extraction.

											
										
										
											2018-11-28 18:06:03 +11:00
+									fontHeight := 0.0
-												Changes missed in previous commit.

											
										
										
											2019-01-04 16:07:03 +11:00
+									for _, t := range pt.marks {
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+										if t.height > fontHeight {
 											fontHeight = t.height
-												Combine diacritics in text extraction.

											
										
										
											2018-11-28 18:06:03 +11:00
+										}
 									}
 									return fontHeight
 								}
-												Changes missed in previous commit.

											
										
										
											2019-01-04 16:07:03 +11:00
+								// ToText returns the contents of `pt` as a single string.
 								func (pt PageText) ToText() string {
 									fontHeight := pt.height()
-												In text extraction, split lines with tolerance on y coordinate.

											
										
										
											2018-11-28 22:13:56 +11:00
+									// We sort with a y tolerance to allow for subscripts, diacritics etc.
-												Addressing review comments

											
										
										
											2018-11-30 23:01:04 +00:00
+									tol := minFloat(fontHeight*0.2, 5.0)
-												Changes missed in previous commit.

											
										
										
											2019-01-04 16:07:03 +11:00
+									common.Log.Trace("ToText: %d elements fontHeight=%.1f tol=%.1f", len(pt.marks), fontHeight, tol)
-												In text extraction, split lines with tolerance on y coordinate.

											
										
										
											2018-11-28 22:13:56 +11:00
-												Removed combineDiacritics from text extraction because it was causing ' and " to be combined with the letters proceeding them.
Need to fix this and reinstate combineDiacritics.

											
										
										
											2019-01-01 12:22:39 +11:00
+									// Uncomment the 2 following Trace statements to see the effects of sorting.
-												Changes missed in previous commit.

											
										
										
											2019-01-04 16:07:03 +11:00
+									// common.Log.Trace("ToText: Before sorting %s", pt)
 									pt.sortPosition(tol)
 									// common.Log.Trace("ToText: After sorting %s", pt)
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
-												Changes missed in previous commit.

											
										
										
											2019-01-04 16:07:03 +11:00
+									lines := pt.toLines(tol)
-												Addressing review comments

											
										
										
											2018-11-28 23:25:17 +00:00
+									texts := make([]string, 0, len(lines))
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+									for _, l := range lines {
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+										texts = append(texts, l.text)
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+									}
 									return strings.Join(texts, "\n")
 								}
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+								// sortPosition sorts a text list by its elements' position on a page.
-												Fixed position sorting for text extraction for landscape text.

											
										
										
											2018-11-10 21:19:02 +11:00
+								// Sorting is by orientation then top to bottom, left to right when page is orientated so that text
 								// is horizontal.
-												Changes missed in previous commit.

											
										
										
											2019-01-04 16:07:03 +11:00
+								func (pt *PageText) sortPosition(tol float64) {
-												(*pt). -> pt.

											
										
										
											2019-01-05 09:14:10 +11:00
+									sort.SliceStable(pt.marks, func(i, j int) bool {
 										ti, tj := pt.marks[i], pt.marks[j]
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+										if ti.orient != tj.orient {
 											return ti.orient < tj.orient
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+										}
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+										if math.Abs(ti.orientedStart.Y-tj.orientedStart.Y) > tol {
 											return ti.orientedStart.Y > tj.orientedStart.Y
-												Fixed position sorting for text extraction for landscape text.

											
										
										
											2018-11-10 21:19:02 +11:00
+										}
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+										return ti.orientedStart.X < tj.orientedStart.X
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+									})
 								}
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+								// textLine represents a line of text on a page.
 								type textLine struct {
 									y      float64   // y position of line.
 									dxList []float64 // x distance between successive words in line.
 									text   string    // text in the line.
 									words  []string  // words in the line.
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+								}
-												Changes missed in previous commit.

											
										
										
											2019-01-04 16:07:03 +11:00
+								// toLines returns the text and positions in `pt.marks` as a slice of textLine.
-												left-to-write -> left-to-right

											
										
										
											2018-12-02 18:41:48 +11:00
+								// NOTE: Caller must sort the text list top-to-bottom, left-to-right (for orientation adjusted so
-												Fixed position sorting for text extraction for landscape text.

											
										
										
											2018-11-10 21:19:02 +11:00
+								// that text is horizontal) before calling this function.
-												Changes missed in previous commit.

											
										
										
											2019-01-04 16:07:03 +11:00
+								func (pt PageText) toLines(tol float64) []textLine {
 									// We divide `pt.marks` into slices which contain texts with the same orientation, extract the lines
-												Removed some unused struct fields.

											
										
										
											2018-11-27 13:37:12 +11:00
+									// for each orientation then return the concatenation of these lines sorted by orientation.
-												Changes missed in previous commit.

											
										
										
											2019-01-04 16:07:03 +11:00
+									tlOrient := make(map[int][]textMark, len(pt.marks))
 									for _, t := range pt.marks {
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+										tlOrient[t.orient] = append(tlOrient[t.orient], t)
-												Fixed position sorting for text extraction for landscape text.

											
										
										
											2018-11-10 21:19:02 +11:00
+									}
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+									var lines []textLine
-												Removed some unused struct fields.

											
										
										
											2018-11-27 13:37:12 +11:00
+									for _, o := range orientKeys(tlOrient) {
-												Changes missed in previous commit.

											
										
										
											2019-01-04 16:07:03 +11:00
+										lines = append(lines, PageText{tlOrient[o]}.toLinesOrient(tol)...)
-												Fixed orientation handling in text extraction.

											
										
										
											2018-11-26 17:17:17 +11:00
+									}
 									return lines
-												Fixed position sorting for text extraction for landscape text.

											
										
										
											2018-11-10 21:19:02 +11:00
+								}
-												Changes missed in previous commit.

											
										
										
											2019-01-04 16:07:03 +11:00
+								// toLinesOrient returns the text and positions in `pt.marks` as a slice of textLine.
-												Removed some unused struct fields.

											
										
										
											2018-11-27 13:37:12 +11:00
+								// NOTE: This function only works on text lists where all text has the same orientation so it should
 								// only be called from toLines.
-												left-to-write -> left-to-right

											
										
										
											2018-12-02 18:41:48 +11:00
+								// Caller must sort the text list top-to-bottom, left-to-right (for orientation adjusted so
-												Removed some unused struct fields.

											
										
										
											2018-11-27 13:37:12 +11:00
+								// that text is horizontal) before calling this function.
-												Changes missed in previous commit.

											
										
										
											2019-01-04 16:07:03 +11:00
+								func (pt PageText) toLinesOrient(tol float64) []textLine {
 									if len(pt.marks) == 0 {
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+										return []textLine{}
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+									}
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+									var lines []textLine
-												Addressing review comments

											
										
										
											2018-11-28 23:25:17 +00:00
+									var words []string
 									var x []float64
-												Changes missed in previous commit.

											
										
										
											2019-01-04 16:07:03 +11:00
+									y := pt.marks[0].orientedStart.Y
-												First attempt at splitting words in text extraction using a space detection heuristic

											
										
										
											2018-10-09 11:49:59 +11:00
 									scanning := false
-												Addressing review comments

											
										
										
											2018-11-28 23:25:17 +00:00
+									averageCharWidth := exponAve{}
 									wordSpacing := exponAve{}
-												Changes missed in previous commit.

											
										
										
											2019-01-04 16:07:03 +11:00
+									lastEndX := 0.0 // lastEndX is pt.marks[i-1].orientedEnd.X
-												First attempt at splitting words in text extraction using a space detection heuristic

											
										
										
											2018-10-09 11:49:59 +11:00
-												Changes missed in previous commit.

											
										
										
											2019-01-04 16:07:03 +11:00
+									for _, t := range pt.marks {
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+										if t.orientedStart.Y+tol < y {
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+											if len(words) > 0 {
-												testing hack

											
										
										
											2018-10-09 13:47:43 +11:00
+												line := newLine(y, x, words)
 												if averageCharWidth.running {
-												Removed combineDiacritics from text extraction because it was causing ' and " to be combined with the letters proceeding them.
Need to fix this and reinstate combineDiacritics.

											
										
										
											2019-01-01 12:22:39 +11:00
+													// FIXME(peterwilliams97): Fix and reinstate combineDiacritics.
 													// line = combineDiacritics(line, averageCharWidth.ave)
-												testing hack

											
										
										
											2018-10-09 13:47:43 +11:00
+													line = removeDuplicates(line, averageCharWidth.ave)
 												}
 												lines = append(lines, line)
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+											}
-												testing hack

											
										
										
											2018-10-09 13:47:43 +11:00
+											words = []string{}
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+											x = []float64{}
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+											y = t.orientedStart.Y
-												First attempt at splitting words in text extraction using a space detection heuristic

											
										
										
											2018-10-09 11:49:59 +11:00
+											scanning = false
 										}
 										// Detect text movements that represent spaces on the printed page.
 										// We use a heuristic from PdfBox: If the next character starts to the right of where a
 										// character after a space at "normal spacing" would start, then there is a space before it.
 										// The tricky thing to guess here is the width of a space at normal spacing.
-												Addressing review comments

											
										
										
											2018-11-30 23:01:04 +00:00
+										// We follow PdfBox and use minFloat(deltaSpace, deltaCharWidth).
-												First attempt at splitting words in text extraction using a space detection heuristic

											
										
										
											2018-10-09 11:49:59 +11:00
+										deltaSpace := 0.0
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+										if t.spaceWidth == 0 {
-												First attempt at splitting words in text extraction using a space detection heuristic

											
										
										
											2018-10-09 11:49:59 +11:00
+											deltaSpace = math.MaxFloat64
 										} else {
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+											wordSpacing.update(t.spaceWidth)
-												First attempt at splitting words in text extraction using a space detection heuristic

											
										
										
											2018-10-09 11:49:59 +11:00
+											deltaSpace = wordSpacing.ave * 0.5
 										}
-												Removed debugging code.

											
										
										
											2018-10-09 19:05:38 +11:00
+										averageCharWidth.update(t.Width())
-												First attempt at splitting words in text extraction using a space detection heuristic

											
										
										
											2018-10-09 11:49:59 +11:00
+										deltaCharWidth := averageCharWidth.ave * 0.3
 										isSpace := false
-												Addressing review comments

											
										
										
											2018-11-28 23:25:17 +00:00
+										nextWordX := lastEndX + minFloat(deltaSpace, deltaCharWidth)
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+										if scanning && t.text != " " {
 											isSpace = nextWordX < t.orientedStart.X
-												First attempt at splitting words in text extraction using a space detection heuristic

											
										
										
											2018-10-09 11:49:59 +11:00
+										}
-												Fixed orientation handling in text extraction.

											
										
										
											2018-11-26 17:17:17 +11:00
+										common.Log.Trace("t=%s", t)
 										common.Log.Trace("width=%.2f delta=%.2f deltaSpace=%.2g deltaCharWidth=%.2g",
-												Addressing review comments

											
										
										
											2018-11-28 23:25:17 +00:00
+											t.Width(), minFloat(deltaSpace, deltaCharWidth), deltaSpace, deltaCharWidth)
-												Fixed orientation handling in text extraction.

											
										
										
											2018-11-26 17:17:17 +11:00
+										common.Log.Trace("%+q [%.1f, %.1f] lastEndX=%.2f nextWordX=%.2f (%.2f) isSpace=%t",
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+											t.text, t.orientedStart.X, t.orientedStart.Y, lastEndX, nextWordX,
 											nextWordX-t.orientedStart.X, isSpace)
-												Fixes for text extraction corpus testing.

- Correct matrix multiplication order in text.go
- Look up standard 14 font widths after applying custom encoding.

											
										
										
											2018-11-18 17:21:30 +11:00
-												First attempt at splitting words in text extraction using a space detection heuristic

											
										
										
											2018-10-09 11:49:59 +11:00
+										if isSpace {
 											words = append(words, " ")
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+											x = append(x, (lastEndX+t.orientedStart.X)*0.5)
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+										}
-												First attempt at splitting words in text extraction using a space detection heuristic

											
										
										
											2018-10-09 11:49:59 +11:00
 										// Add the text to the line.
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+										lastEndX = t.orientedEnd.X
 										words = append(words, t.text)
 										x = append(x, t.orientedStart.X)
-												First attempt at splitting words in text extraction using a space detection heuristic

											
										
										
											2018-10-09 11:49:59 +11:00
+										scanning = true
-												Fixed orientation handling in text extraction.

											
										
										
											2018-11-26 17:17:17 +11:00
+										common.Log.Trace("lastEndX=%.2f", lastEndX)
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+									}
 									if len(words) > 0 {
-												testing hack

											
										
										
											2018-10-09 13:47:43 +11:00
+										line := newLine(y, x, words)
 										if averageCharWidth.running {
 											line = removeDuplicates(line, averageCharWidth.ave)
 										}
 										lines = append(lines, line)
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+									}
 									return lines
 								}
-												Removed some unused struct fields.

											
										
										
											2018-11-27 13:37:12 +11:00
+								// orientKeys returns the keys of `tlOrient` as a sorted slice.
-												Made TextList an opaque struct and renamed it to PageText to reflect its  purpose rather than its current implementation.

											
										
										
											2019-01-04 16:02:22 +11:00
+								func orientKeys(tlOrient map[int][]textMark) []int {
-												Removed some unused struct fields.

											
										
										
											2018-11-27 13:37:12 +11:00
+									keys := []int{}
 									for k := range tlOrient {
 										keys = append(keys, k)
 									}
 									sort.Ints(keys)
 									return keys
 								}
// exponAve implements an exponential average.
type exponAve struct {
	ave     float64 // Current average value.
	running bool    // Has `ave` been set?
}

// update folds the sample `x` into the exponential average `exp.ave` and returns the new average.
// The first sample initializes the average; every later sample is averaged equally with the
// current value.
func (exp *exponAve) update(x float64) float64 {
	if !exp.running {
		// First sample: there is no history to blend with.
		exp.ave = x
		exp.running = true
		return exp.ave
	}
	// NOTE(peterwilliams97): 0.5 is a guess. It may be possible to improve average character
	// and space width estimation by tuning this value. It may be that different exponents
	// would work better for character and space estimation.
	exp.ave = (exp.ave + x) * 0.5
	return exp.ave
}
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+								// newLine returns the textLine representation of strings `words` with y coordinate `y` and x
-												Cleaned up some comments

											
										
										
											2018-09-03 16:38:58 +10:00
+								// coordinates `x`.
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+								func newLine(y float64, x []float64, words []string) textLine {
 									dxList := make([]float64, 0, len(x))
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+									for i := 1; i < len(x); i++ {
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+										dxList = append(dxList, x[i]-x[i-1])
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+									}
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+									return textLine{y: y, dxList: dxList, text: strings.Join(words, ""), words: words}
-												testing hack

											
										
										
											2018-10-09 13:47:43 +11:00
+								}
-												Fixed position sorting for text extraction for landscape text.

											
										
										
											2018-11-10 21:19:02 +11:00
+								// removeDuplicates returns `line` with duplicate characters removed. `charWidth` is the average
 								// character width for the line.
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+								func removeDuplicates(line textLine, charWidth float64) textLine {
 									if len(line.dxList) == 0 {
-												testing hack

											
										
										
											2018-10-09 13:47:43 +11:00
+										return line
 									}
-												Fixed position sorting for text extraction for landscape text.

											
										
										
											2018-11-10 21:19:02 +11:00
-												Addressed review comments.
- Removed debug code.
- Explained magic constants
- Added file reference to PdfBox map.

											
										
										
											2018-12-02 18:13:40 +11:00
+									// NOTE(peterwilliams97) 0.3 is a guess. It may be possible to tune this to a better value.
-												testing hack

											
										
										
											2018-10-09 13:47:43 +11:00
+									tol := charWidth * 0.3
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+									words := []string{line.words[0]}
-												Addressed review comments.
- Removed debug code.
- Explained magic constants
- Added file reference to PdfBox map.

											
										
										
											2018-12-02 18:13:40 +11:00
+									var dxList []float64
-												Removed debugging code.

											
										
										
											2018-10-09 19:05:38 +11:00
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+									w0 := line.words[0]
 									for i, dx := range line.dxList {
 										w := line.words[i+1]
-												testing hack

											
										
										
											2018-10-09 13:47:43 +11:00
+										if w != w0 || dx > tol {
 											words = append(words, w)
 											dxList = append(dxList, dx)
 										}
 										w0 = w
 									}
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+									return textLine{y: line.y, dxList: dxList, text: strings.Join(words, ""), words: words}
-												First attempt at extraction based on a full PDF text parser.

											
										
										
											2018-08-22 12:29:34 +10:00
+								}
-												Combine diacritics in text extraction.

											
										
										
											2018-11-28 18:06:03 +11:00
+								// combineDiacritics returns `line` with diacritics close to characters combined with the characters.
 								// `charWidth` is the average character width for the line.
 								// We have to do this because PDF can render diacritics separately to the characters they attach to
 								// in extracted text.
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+								func combineDiacritics(line textLine, charWidth float64) textLine {
 									if len(line.dxList) == 0 {
-												Combine diacritics in text extraction.

											
										
										
											2018-11-28 18:06:03 +11:00
+										return line
 									}
-												Addressed review comments.
- Removed debug code.
- Explained magic constants
- Added file reference to PdfBox map.

											
										
										
											2018-12-02 18:13:40 +11:00
+									// NOTE(peterwilliams97) 0.2 is a guess. It may be possible to tune this to a better value.
-												Combine diacritics in text extraction.

											
										
										
											2018-11-28 18:06:03 +11:00
+									tol := charWidth * 0.2
 									common.Log.Trace("combineDiacritics: charWidth=%.2f tol=%.2f", charWidth, tol)
-												Addressing review comments

											
										
										
											2018-11-30 23:01:04 +00:00
+									var words []string
 									var dxList []float64
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+									w := line.words[0]
-												Combine diacritics in text extraction.

											
										
										
											2018-11-28 18:06:03 +11:00
+									w, c := countDiacritic(w)
 									delta := 0.0
 									dx0 := 0.0
 									parts := []string{w}
 									numChars := c
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+									for i := 0; i < len(line.dxList); i++ {
 										w = line.words[i+1]
-												Combine diacritics in text extraction.

											
										
										
											2018-11-28 18:06:03 +11:00
+										w, c := countDiacritic(w)
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+										dx := line.dxList[i]
-												Combine diacritics in text extraction.

											
										
										
											2018-11-28 18:06:03 +11:00
+										if numChars+c <= 1 && delta+dx <= tol {
 											if len(parts) == 0 {
 												dx0 = dx
 											} else {
 												delta += dx
 											}
 											parts = append(parts, w)
 											numChars += c
 										} else {
 											if len(parts) > 0 {
 												if len(words) > 0 {
 													dxList = append(dxList, dx0)
 												}
 												words = append(words, combine(parts))
 											}
 											parts = []string{w}
 											numChars = c
 											dx0 = dx
 											delta = 0.0
 										}
 									}
 									if len(parts) > 0 {
 										if len(words) > 0 {
 											dxList = append(dxList, dx0)
 										}
 										words = append(words, combine(parts))
 									}
 									if len(words) != len(dxList)+1 {
 										common.Log.Error("Inconsistent: \nwords=%d %q\ndxList=%d %.2f",
 											len(words), words, len(dxList), dxList)
 										return line
 									}
-												Made many fields text.go private.

											
										
										
											2019-01-02 10:39:30 +11:00
+									return textLine{y: line.y, dxList: dxList, text: strings.Join(words, ""), words: words}
-												Combine diacritics in text extraction.

											
										
										
											2018-11-28 18:06:03 +11:00
+								}
 								// combine combines any diacritics in `parts` with the single non-diacritic character in `parts`.
 								// The parts are ordered with non-diacritics first and diacritics after them, each part is
 								// NFKC-normalized and stripped of surrounding white space, and the results are concatenated.
 								// NOTE: `parts` is reordered and overwritten in place.
 								func combine(parts []string) string {
 									if len(parts) == 1 {
 										// Must be a non-diacritic.
 										return parts[0]
 									}

 									// Classify each part by its first rune.
 									diacritic := map[string]bool{}
 									for _, w := range parts {
 										runes := []rune(w)
 										if len(runes) == 0 {
 											// An empty part has no first rune. Leave it classified as a non-diacritic
 											// instead of panicking on the index below.
 											continue
 										}
 										r := runes[0]
 										diacritic[w] = unicode.Is(unicode.Mn, r) || unicode.Is(unicode.Sk, r)
 									}
 									// Stable sort that places non-diacritics before diacritics, i.e. the base character comes
 									// first and the marks that attach to it follow.
 									sort.SliceStable(parts, func(i, j int) bool { return !diacritic[parts[i]] && diacritic[parts[j]] })

 									// Normalize each part separately, then concatenate.
 									for i, w := range parts {
 										parts[i] = strings.TrimSpace(norm.NFKC.String(w))
 									}
 									return strings.Join(parts, "")
 								}
 								// countDiacritic returns the combining diacritic version of `w` (usually itself) and the number of
-												Recurse through form XObjects for text extractions.

											
										
										
											2018-12-27 20:51:34 +11:00
+								// non-diacritics in `w` (0 or 1).
-												Combine diacritics in text extraction.

											
										
										
											2018-11-28 18:06:03 +11:00
+								func countDiacritic(w string) (string, int) {
 									runes := []rune(w)
 									if len(runes) != 1 {
 										return w, 1
 									}
 									r := runes[0]
 									c := 1
-												Recurse through form XObjects for text extractions.

											
										
										
											2018-12-27 20:51:34 +11:00
+									if (unicode.Is(unicode.Mn, r) || unicode.Is(unicode.Sk, r)) &&
 										r != '\'' && r != '"' && r != '`' {
-												Combine diacritics in text extraction.

											
										
										
											2018-11-28 18:06:03 +11:00
+										c = 0
 									}
 									if w2, ok := diacritics[r]; ok {
 										c = 0
 										w = w2
 									}
 									return w, c
 								}
 								// diacritics is a map of diacritic characters that are not classified as unicode.Mn or unicode.Sk
 								// and the corresponding unicode.Mn or unicode.Sk characters. This map was copied from PdfBox.
 								// (https://svn.apache.org/repos/asf/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java)
 								// Keys are the code points of the stand-alone characters; values are the replacement strings.
 								var diacritics = map[rune]string{
 									0x0060: "\u0300",
 									0x02CB: "\u0300",
 									0x0027: "\u0301",
 									0x02B9: "\u0301",
 									0x02CA: "\u0301",
 									0x005e: "\u0302",
 									0x02C6: "\u0302",
 									0x007E: "\u0303",
 									0x02C9: "\u0304",
 									0x00B0: "\u030A",
 									0x02BA: "\u030B",
 									0x02C7: "\u030C",
 									0x02C8: "\u030D",
 									0x0022: "\u030E",
 									0x02BB: "\u0312",
 									0x02BC: "\u0313",
 									0x0486: "\u0313",
 									0x055A: "\u0313",
 									0x02BD: "\u0314",
 									0x0485: "\u0314",
 									0x0559: "\u0314",
 									0x02D4: "\u031D",
 									0x02D5: "\u031E",
 									0x02D6: "\u031F",
 									0x02D7: "\u0320",
 									0x02B2: "\u0321",
 									0x02CC: "\u0329",
 									0x02B7: "\u032B",
 									0x02CD: "\u0331",
 									0x005F: "\u0332",
 									0x204E: "\u0359",
 								}
-												Improvements to text extraction.

											
										
										
											2018-09-20 11:49:44 +10:00
+								// getCurrentFont returns the font on top of the font stack, or DefaultFont if the font stack is
 								// empty.
 								func (to *textObject) getCurrentFont() *model.PdfFont {
 									if to.fontStack.empty() {
 										common.Log.Debug("ERROR: No font defined. Using default.")
 										return model.DefaultFont()
 									}
 									return to.fontStack.peek()
 								}
-												Noted that text extractor is an intermediate version

											
										
										
											2018-06-28 11:11:43 +10:00
+								// getFont returns the font named `name` if it exists in the page's resources or an error if it
-												Cache PdfFont's in text extractor

											
										
										
											2018-09-17 12:12:06 +10:00
+								// doesn't. It caches the returned fonts.
-												Removed naked returns. Fixed godoc. Reorganized object extractors

											
										
										
											2018-07-25 12:00:49 +10:00
+								func (to *textObject) getFont(name string) (*model.PdfFont, error) {
-												Moved font cache from global variable to Extractor.

											
										
										
											2018-09-22 09:28:18 +10:00
+									if to.e.fontCache != nil {
 										to.e.accessCount++
 										entry, ok := to.e.fontCache[name]
 										if ok {
 											entry.access = to.e.accessCount
 											return entry.font, nil
 										}
-												Cache PdfFont's in text extractor

											
										
										
											2018-09-17 12:12:06 +10:00
+									}
 									// Font not in cache. Load it.
 									font, err := to.getFontDirect(name)
 									if err != nil {
 										return nil, err
 									}
-												Moved font cache from global variable to Extractor.

											
										
										
											2018-09-22 09:28:18 +10:00
+									if to.e.fontCache != nil {
 										entry := fontEntry{font, to.e.accessCount}
 										// Eject a victim if the cache is full.
 										if len(to.e.fontCache) >= maxFontCache {
-												define slices with a var instead of an empty literal

											
										
										
											2018-12-09 19:28:50 +02:00
+											var names []string
-												Moved font cache from global variable to Extractor.

											
										
										
											2018-09-22 09:28:18 +10:00
+											for name := range to.e.fontCache {
 												names = append(names, name)
 											}
 											sort.Slice(names, func(i, j int) bool {
 												return to.e.fontCache[names[i]].access < to.e.fontCache[names[j]].access
 											})
 											delete(to.e.fontCache, names[0])
-												Cache PdfFont's in text extractor

											
										
										
											2018-09-17 12:12:06 +10:00
+										}
-												Moved font cache from global variable to Extractor.

											
										
										
											2018-09-22 09:28:18 +10:00
+										to.e.fontCache[name] = entry
-												Cache PdfFont's in text extractor

											
										
										
											2018-09-17 12:12:06 +10:00
+									}
 									return font, nil
 								}
-												Cleaned up some comments.

											
										
										
											2018-09-21 16:43:10 +10:00
+								// fontEntry is a entry in the font cache.
-												Cache PdfFont's in text extractor

											
										
										
											2018-09-17 12:12:06 +10:00
+								type fontEntry struct {
 									font   *model.PdfFont // The font being cached.
 									access int64          // Last access. Used to determine LRU cache victims.
 								}
 								// maxFontCache is the maximum number of PdfFonts held in fontCache. When the cache has reached
 								// this size, getFont deletes an existing entry before adding a new one.
 								const maxFontCache = 10
 								// getFontDirect returns the font named `name` if it exists in the page's resources or an error if
-												Addressing review comments

											
										
										
											2018-11-28 23:25:17 +00:00
+								// it doesn't. Accesses page resources directly (not cached).
-												Cache PdfFont's in text extractor

											
										
										
											2018-09-17 12:12:06 +10:00
+								func (to *textObject) getFontDirect(name string) (*model.PdfFont, error) {
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+									fontObj, err := to.getFontDict(name)
 									if err != nil {
 										return nil, err
 									}
 									font, err := model.NewPdfFontFromPdfObject(fontObj)
 									if err != nil {
-												Cache PdfFont's in text extractor

											
										
										
											2018-09-17 12:12:06 +10:00
+										common.Log.Debug("getFontDirect: NewPdfFontFromPdfObject failed. name=%#q err=%v", name, err)
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+									}
 									return font, err
 								}
-												Fixed incorrectly named variable.

											
										
										
											2018-12-27 21:33:31 +11:00
+								// getFontDict returns the font dict with key `name` if it exists in the page's or form's Font
 								// resources or an error if it doesn't.
-												Removed naked returns. Fixed godoc. Reorganized object extractors

											
										
										
											2018-07-25 12:00:49 +10:00
+								func (to *textObject) getFontDict(name string) (fontObj core.PdfObject, err error) {
-												Recurse through form XObjects for text extractions.

											
										
										
											2018-12-27 20:51:34 +11:00
+									resources := to.resources
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+									if resources == nil {
 										common.Log.Debug("getFontDict. No resources. name=%#q", name)
-												Removed naked returns. Fixed godoc. Reorganized object extractors

											
										
										
											2018-07-25 12:00:49 +10:00
+										return nil, nil
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+									}
-												reduced differences with compositefont branch

											
										
										
											2018-07-15 16:28:56 +10:00
+									fontObj, found := resources.GetFontByName(core.PdfObjectName(name))
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+									if !found {
-												Removed naked returns. Fixed godoc. Reorganized object extractors

											
										
										
											2018-07-25 12:00:49 +10:00
+										common.Log.Debug("ERROR: getFontDict: Font not found: name=%#q", name)
-												Set font even when Tf operator is not between BT and ET.

											
										
										
											2018-11-21 13:14:11 +11:00
+										return nil, errors.New("font not in resources")
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+									}
-												Removed naked returns. Fixed godoc. Reorganized object extractors

											
										
										
											2018-07-25 12:00:49 +10:00
+									return fontObj, nil
-												Updated the text extractor to use the new font code

											
										
										
											2018-06-27 16:31:28 +10:00
+								}