Merge pull request #5 from unidoc/v3-peterwilliams97-extract.text

Cleaning up v3 extract.text
2025-05-02 22:17:06 +08:00 · 2018-11-29 18:03:50 +11:00 · 2018-11-29 18:03:50 +11:00 · 1cea79b8ef
commit 1cea79b8ef
parent 6529b42a70 e6b768c06c
24 changed files with 334 additions and 443 deletions
--- a/pdf/contentstream/matrix.go
+++ b/pdf/contentstream/matrix.go
@ -0,0 +1,157 @@
+/*
+ * This file is subject to the terms and conditions defined in
+ * file 'LICENSE.md', which is part of this source code package.
+ */
+
+package contentstream
+
+import (
+	"fmt"
+	"math"
+
+	"github.com/unidoc/unidoc/common"
+)
+
+// Matrix is a linear transform matrix in homogeneous coordinates, stored row by row.
+// PDF coordinate transforms are always affine so only 6 of the 9 elements vary. See NewMatrix.
+type Matrix [9]float64
+
+// IdentityMatrix returns the identity transform (a = d = 1, b = c = tx = ty = 0).
+func IdentityMatrix() Matrix {
+	return NewMatrix(1, 0, 0, 1, 0, 0)
+}
+
+// TranslationMatrix returns a matrix that translates by `tx`, `ty` and leaves a,b,c,d as the identity.
+func TranslationMatrix(tx, ty float64) Matrix {
+	return NewMatrix(1, 0, 0, 1, tx, ty)
+}
+
+// NewMatrix returns an affine transform matrix laid out in homogeneous coordinates as
+//      a  b  0
+//      c  d  0
+//      tx ty 1
+func NewMatrix(a, b, c, d, tx, ty float64) Matrix {
+	m := Matrix{
+		a, b, 0,
+		c, d, 0,
+		tx, ty, 1,
+	}
+	m.fixup() // Clamp out-of-range elements before the matrix is returned.
+	return m
+}
+
+// String returns a string describing `m` as "[a,b,c,d:tx,ty]" with 4 decimal places per element.
+func (m Matrix) String() string {
+	a, b, c, d, tx, ty := m[0], m[1], m[3], m[4], m[6], m[7]
+	return fmt.Sprintf("[%.4f,%.4f,%.4f,%.4f:%.4f,%.4f]", a, b, c, d, tx, ty)
+}
+
+// Set sets `m` to affine transform a,b,c,d,tx,ty. The third column of `m` is left as it was.
+func (m *Matrix) Set(a, b, c, d, tx, ty float64) {
+	m[0], m[1] = a, b
+	m[3], m[4] = c, d
+	m[6], m[7] = tx, ty
+	m.fixup() // Clamp out-of-range elements.
+}
+
+// Concat sets `m` to `m` × `b`.
+// `b` needs to be created by NewMatrix. i.e. It must be an affine transform.
+//    m00 m01 0     b00 b01 0     m00*b00 + m01*b10        m00*b01 + m01*b11        0
+//    m10 m11 0  ×  b10 b11 0  =  m10*b00 + m11*b10        m10*b01 + m11*b11        0
+//    m20 m21 1     b20 b21 1     m20*b00 + m21*b10 + b20  m20*b01 + m21*b11 + b21  1
+func (m *Matrix) Concat(b Matrix) {
+	*m = Matrix{
+		m[0]*b[0] + m[1]*b[3], m[0]*b[1] + m[1]*b[4], 0,
+		m[3]*b[0] + m[4]*b[3], m[3]*b[1] + m[4]*b[4], 0,
+		m[6]*b[0] + m[7]*b[3] + b[6], m[6]*b[1] + m[7]*b[4] + b[7], 1,
+	}
+	m.fixup() // Clamp out-of-range elements produced by the multiplication.
+}
+
+// Mult returns `m` × `b`. The caller's matrix is not modified because the receiver is a copy.
+func (m Matrix) Mult(b Matrix) Matrix {
+	m.Concat(b)
+	return m
+}
+
+// Translate appends a translation of `dx`,`dy` to `m`. Only the tx,ty elements of `m` change.
+// m.Translate(dx, dy) is equivalent to m.Concat(NewMatrix(1, 0, 0, 1, dx, dy)).
+func (m *Matrix) Translate(dx, dy float64) {
+	m[6] += dx
+	m[7] += dy
+	m.fixup() // Clamp out-of-range elements.
+}
+
+// Translation returns the translation part of `m` as (tx, ty).
+func (m *Matrix) Translation() (float64, float64) {
+	return m[6], m[7]
+}
+
+// ScalingX returns the X scaling of `m`, i.e. the length of the first row vector (a, b).
+func (m *Matrix) ScalingX() float64 {
+	return math.Hypot(m[0], m[1])
+}
+
+// Transform returns coordinates `x`,`y` transformed by `m`. NOTE(review): the row-vector layout used by Concat would give x*a + y*c + tx; confirm the intended convention.
+func (m *Matrix) Transform(x, y float64) (float64, float64) {
+	xp := x*m[0] + y*m[1] + m[6] // x*a + y*b + tx
+	yp := x*m[3] + y*m[4] + m[7] // x*c + y*d + ty
+	return xp, yp
+}
+
+// ScalingFactorX returns the X scaling of the affine transform, i.e. sqrt(a*a + b*b).
+func (m *Matrix) ScalingFactorX() float64 {
+	return math.Sqrt(m[0]*m[0] + m[1]*m[1])
+}
+
+// ScalingFactorY returns the Y scaling of the affine transform, i.e. sqrt(c*c + d*d).
+func (m *Matrix) ScalingFactorY() float64 {
+	return math.Sqrt(m[3]*m[3] + m[4]*m[4])
+}
+
+// Angle returns the angle of the affine transform in degrees: 0, 90, 180 or 270.
+// For simplicity, we assume the rotation is a multiple of 90 degrees; if no case matches, 0 is returned.
+func (m *Matrix) Angle() int {
+	a, b, c, d := m[0], m[1], m[3], m[4]
+	// We are returning θ for
+	// a b    cos θ  -sin θ
+	// c d =  sin θ   cos θ
+	if a > 0 && d > 0 {
+		//  1  0
+		//  0  1
+		return 0
+	} else if b < 0 && c > 0 {
+		//  0 -1
+		//  1  0
+		return 90
+	} else if a < 0 && d < 0 {
+		// -1  0
+		//  0 -1
+		return 180
+	} else if b > 0 && c < 0 {
+		//  0  1
+		// -1  0
+		return 270
+	}
+	common.Log.Debug("ERROR: Angle not a multiple of 90°. m=%s", m)
+	return 0
+}
+
+// fixup forces `m` to have reasonable values. It is a guard against crazy values in corrupt PDF
+// files.
+// Currently it clamps elements to [-maxAbsNumber, maxAbsNumber] to avoid floating point exceptions.
+func (m *Matrix) fixup() {
+	for i, x := range m {
+		if x > maxAbsNumber {
+			common.Log.Debug("FIXUP: %g -> %g", x, maxAbsNumber) // %g: both values are float64.
+			m[i] = maxAbsNumber
+		} else if x < -maxAbsNumber {
+			common.Log.Debug("FIXUP: %g -> %g", x, -maxAbsNumber) // %g: both values are float64.
+			m[i] = -maxAbsNumber
+		}
+	}
+}
+
+// maxAbsNumber is the largest absolute value fixup allows for a matrix element. Is this correct?
+// TODO(gunnsth): Practical value? Need some reasoning.
+const maxAbsNumber = 1e9
--- a/pdf/contentstream/processor.go
+++ b/pdf/contentstream/processor.go
@ -7,8 +7,6 @@ package contentstream

 import (
 	"errors"
-	"fmt"
-	"math"

 	"github.com/unidoc/unidoc/common"
 	"github.com/unidoc/unidoc/pdf/core"
@ -62,21 +60,26 @@ type HandlerEntry struct {
 	Handler   HandlerFunc
 }

+// HandlerConditionEnum specifies which operands a content stream handler applies to.
+// A HandlerConditionEnumOperand handler handles a single operand, whereas a
+// HandlerConditionEnumAllOperands handler processes all operands.
 type HandlerConditionEnum int

-func (csp HandlerConditionEnum) All() bool {
-	return csp == HandlerConditionEnumAllOperands
-}
-
-func (csp HandlerConditionEnum) Operand() bool {
-	return csp == HandlerConditionEnumOperand
-}
-
 const (
 	HandlerConditionEnumOperand     HandlerConditionEnum = iota
 	HandlerConditionEnumAllOperands HandlerConditionEnum = iota
 )

+// All returns true if `hce` is HandlerConditionEnumAllOperands.
+func (hce HandlerConditionEnum) All() bool {
+	return hce == HandlerConditionEnumAllOperands
+}
+
+// Operand returns true if `hce` is HandlerConditionEnumOperand.
+func (hce HandlerConditionEnum) Operand() bool {
+	return hce == HandlerConditionEnumOperand
+}
+
 func NewContentStreamProcessor(ops []*ContentStreamOperation) *ContentStreamProcessor {
 	csp := ContentStreamProcessor{}
 	csp.graphicsStack = GraphicStateStack{}
@ -573,144 +576,3 @@ func (proc *ContentStreamProcessor) handleCommand_cm(op *ContentStreamOperation,

 	return nil
 }
-
-// Matrix is a linear transform matrix in homogenous coordinates.
-// PDF coordinate transforms are always affine so we only need 6 of these. See newMatrix.
-type Matrix [9]float64
-
-// IdentityMatrix returns the identity transform.
-func IdentityMatrix() Matrix {
-	return NewMatrix(1, 0, 0, 1, 0, 0)
-}
-
-// TranslationMatrix returns a matrix that translates by `tx`, `ty`.
-func TranslationMatrix(tx, ty float64) Matrix {
-	return NewMatrix(1, 0, 0, 1, tx, ty)
-}
-
-// NewMatrix returns an affine transform matrix laid out in homogenous coordinates as
-//      a  b  0
-//      c  d  0
-//      tx ty 1
-func NewMatrix(a, b, c, d, tx, ty float64) Matrix {
-	m := Matrix{
-		a, b, 0,
-		c, d, 0,
-		tx, ty, 1,
-	}
-	m.fixup()
-	return m
-}
-
-// String returns a string describing `m`.
-func (m Matrix) String() string {
-	a, b, c, d, tx, ty := m[0], m[1], m[3], m[4], m[6], m[7]
-	return fmt.Sprintf("[%.4f,%.4f,%.4f,%.4f:%.4f,%.4f]", a, b, c, d, tx, ty)
-}
-
-// Set sets `m` to affine transform a,b,c,d,tx,ty.
-func (m *Matrix) Set(a, b, c, d, tx, ty float64) {
-	m[0], m[1] = a, b
-	m[3], m[4] = c, d
-	m[6], m[7] = tx, ty
-	m.fixup()
-}
-
-// Concat sets `m` to `m` × `b`.
-// `b` needs to be created by newMatrix. i.e. It must be an affine transform.
-//    m00 m01 0     b00 b01 0     m00*b00 + m01*b01        m00*b10 + m01*b11        0
-//    m10 m11 0  ×  b10 b11 0  =  m10*b00 + m11*b01        m10*b10 + m11*b11        0
-//    m20 m21 1     b20 b21 1     m20*b00 + m21*b10 + b20  m20*b01 + m21*b11 + b21  1
-func (m *Matrix) Concat(b Matrix) {
-	*m = Matrix{
-		m[0]*b[0] + m[1]*b[3], m[0]*b[1] + m[1]*b[4], 0,
-		m[3]*b[0] + m[4]*b[3], m[3]*b[1] + m[4]*b[4], 0,
-		m[6]*b[0] + m[7]*b[3] + b[6], m[6]*b[1] + m[7]*b[4] + b[7], 1,
-	}
-	m.fixup()
-}
-
-// Mult returns `m` × `b`.
-func (m Matrix) Mult(b Matrix) Matrix {
-	m.Concat(b)
-	return m
-}
-
-// Translate appends a translation of `dx`,`dy` to `m`.
-// m.Translate(dx, dy) is equivalent to m.Concat(NewMatrix(1, 0, 0, 1, dx, dy))
-func (m *Matrix) Translate(dx, dy float64) {
-	m[6] += dx
-	m[7] += dy
-	m.fixup()
-}
-
-// Translation returns the translation part of `m`.
-func (m *Matrix) Translation() (float64, float64) {
-	return m[6], m[7]
-}
-
-// Translation returns the translation part of `m`.
-func (m *Matrix) ScalingX() float64 {
-	return math.Hypot(m[0], m[1])
-}
-
-// Transform returns coordinates `x`,`y` transformed by `m`.
-func (m *Matrix) Transform(x, y float64) (float64, float64) {
-	xp := x*m[0] + y*m[1] + m[6]
-	yp := x*m[3] + y*m[4] + m[7]
-	return xp, yp
-}
-
-// ScalingFactorX returns X scaling of  the affine transform.
-func (m *Matrix) ScalingFactorX() float64 {
-	return math.Sqrt(m[0]*m[0] + m[1]*m[1])
-}
-
-// ScalingFactorY returns X scaling of  the affine transform.
-func (m *Matrix) ScalingFactorY() float64 {
-	return math.Sqrt(m[3]*m[3] + m[4]*m[4])
-}
-
-// Angle returns the angle of the affine transform.
-// For simplicity, we assume the transform is a multiple of 90 degrees.
-func (m *Matrix) Angle() int {
-	a, b, c, d := m[0], m[1], m[3], m[4]
-	// We are returning θ for
-	// a b    cos θ  -sin θ
-	// c d =  sin θ   cos θ
-	if a > 0 && d > 0 {
-		//  1  0
-		//  0  1
-		return 0
-	} else if b < 0 && c > 0 {
-		//  0  1
-		// -1  0
-		return 90
-	} else if a < 0 && d < 0 {
-		// -1  0
-		//  0 -1
-		return 180
-	} else if b > 0 && c < 0 {
-		// 0 -1
-		// 1  0
-		return 270
-	}
-	common.Log.Debug("ERROR: Angle not a mulitple of 90°. m=%s", m)
-	return 0
-}
-
-// fixup forces `m` to have reasonable values. It is a guard against crazy values in corrupt PDF
-// files.
-// Currently it clamps elements to [-maxAbsNumber, -maxAbsNumber] to avoid floating point exceptions.
-func (m *Matrix) fixup() {
-	for i, x := range m {
-		if x > maxAbsNumber {
-			m[i] = maxAbsNumber
-		} else if x < -maxAbsNumber {
-			m[i] = -maxAbsNumber
-		}
-	}
-}
-
-// largest numbers needed in PDF transforms. Is this correct?
-const maxAbsNumber = 1e9
--- a/pdf/extractor/point.go
+++ b/pdf/extractor/point.go
@ -5,7 +5,7 @@
 * Based on pdf/contentstream/draw/point.go
 */

-// XXX(peterwilliams97) Change to functional style. i.e. Return new value, don't mutate.
+// FIXME(peterwilliams97) Change to functional style. i.e. Return new value, don't mutate.

 package extractor

@ -16,18 +16,18 @@ import (
 	"github.com/unidoc/unidoc/pdf/contentstream"
 )

-// Point defines a point in Cartesian coordinates
+// Point defines a point (X,Y) in Cartesian coordinates.
 type Point struct {
 	X float64
 	Y float64
 }

-// NewPoint returns a Point at 'x', 'y'.
+// NewPoint returns a Point at `x`, `y`.
 func NewPoint(x, y float64) Point {
 	return Point{X: x, Y: y}
 }

-// Set sets `p` to `x`, `y`.
+// Set sets `p` to coordinates `(x, y)`.
 func (p *Point) Set(x, y float64) {
 	p.X, p.Y = x, y
 }
@ -38,12 +38,12 @@ func (p *Point) Transform(a, b, c, d, tx, ty float64) {
 	p.transformByMatrix(m)
 }

-// Displace returns `p` displaced by `delta`.
+// Displace returns a new Point at location `p` + `delta`.
 func (p Point) Displace(delta Point) Point {
 	return Point{p.X + delta.X, p.Y + delta.Y}
 }

-// Rotate returns `p` rotated by `theta` degrees.
+// Rotate returns a new Point that is `p` rotated by `theta` degrees.
 func (p Point) Rotate(theta int) Point {
 	switch theta {
 	case 0:
--- a/pdf/extractor/text.go
+++ b/pdf/extractor/text.go
@ -27,13 +27,13 @@ import (
 // CharcodeBytesToUnicode.
 // Characters that can't be decoded are replaced with MissingCodeRune ('\ufffd' = <20>).
 func (e *Extractor) ExtractText() (string, error) {
-	text, _, _, err := e.ExtractText2()
+	text, _, _, err := e.ExtractTextWithStats()
 	return text, err
 }

-// ExtractText2 works like ExtractText but returns the number of characters in the output and the
+// ExtractTextWithStats works like ExtractText but returns the number of characters in the output and
 // the number of characters that were not decoded.
-func (e *Extractor) ExtractText2() (string, int, int, error) {
+func (e *Extractor) ExtractTextWithStats() (string, int, int, error) {
 	textList, numChars, numMisses, err := e.ExtractXYText()
 	if err != nil {
 		return "", numChars, numMisses, err
@ -313,6 +313,10 @@ func (to *textObject) nextLine() {
 // Set the text matrix, Tm, and the text line matrix, Tlm to the Matrix specified by the 6 numbers
 // in `f` (page 250).
 func (to *textObject) setTextMatrix(f []float64) {
+	if len(f) != 6 {
+		common.Log.Debug("ERROR: len(f) != 6 (%d)", len(f))
+		return
+	}
 	a, b, c, d, tx, ty := f[0], f[1], f[2], f[3], f[4], f[5]
 	to.Tm = contentstream.NewMatrix(a, b, c, d, tx, ty)
 	to.Tlm = to.Tm
@ -358,7 +362,7 @@ func (to *textObject) showTextAdjusted(args *core.PdfObjectArray) error {

 // setTextLeading "TL". Set text leading.
 func (to *textObject) setTextLeading(y float64) {
-	if to == nil {
+	if to == nil || to.State == nil {
 		return
 	}
 	to.State.Tl = y
@ -427,7 +431,7 @@ func (to *textObject) setHorizScaling(y float64) {
 	to.State.Th = y
 }

-// floatParam returns the single float parameter of operatr `op`, or an error if it doesn't have
+// floatParam returns the single float parameter of operator `op`, or an error if it doesn't have
 // a single float parameter or we aren't in a text stream.
 func floatParam(op *contentstream.ContentStreamOperation) (float64, error) {
 	if len(op.Params) != 1 {
@ -444,7 +448,7 @@ func floatParam(op *contentstream.ContentStreamOperation) (float64, error) {
 func (to *textObject) checkOp(op *contentstream.ContentStreamOperation, numParams int,
 	hard bool) (ok bool, err error) {
 	if to == nil {
-		params := []core.PdfObject{}
+		var params []core.PdfObject
 		if numParams > 0 {
 			params = op.Params
 			if len(params) > numParams {
@ -596,7 +600,7 @@ func newTextObject(e *Extractor, gs contentstream.GraphicsState, state *textStat
 	}
 }

-// renderText emits byte array `data` to the calling program.
+// renderText processes and renders byte array `data` for extraction purposes.
 func (to *textObject) renderText(data []byte) error {
 	font := to.getCurrentFont()

@ -628,7 +632,6 @@ func (to *textObject) renderText(data []byte) error {
 	common.Log.Trace("renderText: %d codes=%+v runes=%q", len(charcodes), charcodes, runes)

 	for i, r := range runes {
-
 		// XXX(peterwilliams97) Need to find and fix cases where this happens.
 		if r == "\x00" {
 			continue
@ -784,7 +787,7 @@ func (tl TextList) ToText() string {
 	tl.SortPosition()

 	lines := tl.toLines()
-	texts := []string{}
+	texts := make([]string, 0, len(lines))
 	for _, l := range lines {
 		texts = append(texts, l.Text)
 	}
@ -825,11 +828,11 @@ type Line struct {
 func (tl TextList) toLines() []Line {
 	// We divide `tl` into slices which contain texts with the same orientation, extract the lines
 	// for each orientation then return the concatention of these lines sorted by orientation.
-	tlOrient := map[int]TextList{}
+	tlOrient := make(map[int]TextList, len(tl))
 	for _, t := range tl {
 		tlOrient[t.Orient] = append(tlOrient[t.Orient], t)
 	}
-	lines := []Line{}
+	var lines []Line
 	for _, o := range orientKeys(tlOrient) {
 		lines = append(lines, tlOrient[o].toLinesOrient()...)
 	}
@ -846,15 +849,15 @@ func (tl TextList) toLinesOrient() []Line {
 	if len(tl) == 0 {
 		return []Line{}
 	}
-	lines := []Line{}
-	words := []string{}
-	x := []float64{}
+	var lines []Line
+	var words []string
+	var x []float64
 	y := tl[0].OrientedStart.Y

 	scanning := false

-	averageCharWidth := ExponAve{}
-	wordSpacing := ExponAve{}
+	averageCharWidth := exponAve{}
+	wordSpacing := exponAve{}
 	lastEndX := 0.0 // lastEndX is tl[i-1].OrientedEnd.X

 	for _, t := range tl {
@ -889,13 +892,13 @@ func (tl TextList) toLinesOrient() []Line {
 		deltaCharWidth := averageCharWidth.ave * 0.3

 		isSpace := false
-		nextWordX := lastEndX + min(deltaSpace, deltaCharWidth)
+		nextWordX := lastEndX + minFloat(deltaSpace, deltaCharWidth)
 		if scanning && t.Text != " " {
 			isSpace = nextWordX < t.OrientedStart.X
 		}
 		common.Log.Trace("t=%s", t)
 		common.Log.Trace("width=%.2f delta=%.2f deltaSpace=%.2g deltaCharWidth=%.2g",
-			t.Width(), min(deltaSpace, deltaCharWidth), deltaSpace, deltaCharWidth)
+			t.Width(), minFloat(deltaSpace, deltaCharWidth), deltaSpace, deltaCharWidth)
 		common.Log.Trace("%+q [%.1f, %.1f] lastEndX=%.2f nextWordX=%.2f (%.2f) isSpace=%t",
 			t.Text, t.OrientedStart.X, t.OrientedStart.Y, lastEndX, nextWordX,
 			nextWordX-t.OrientedStart.X, isSpace)
@ -940,14 +943,14 @@ func min(a, b float64) float64 {
 	return b
 }

-// ExponAve implements an exponential average.
-type ExponAve struct {
+// exponAve implements an exponential average.
+type exponAve struct {
 	ave     float64 // Current average value.
 	running bool    // Has `ave` been set?
 }

 // update updates the exponential average `exp.ave` and returns it
-func (exp *ExponAve) update(x float64) float64 {
+func (exp *exponAve) update(x float64) float64 {
 	if !exp.running {
 		exp.ave = x
 		exp.running = true
@ -957,9 +960,15 @@ func (exp *ExponAve) update(x float64) float64 {
 	return exp.ave
 }

-// printTexts is a debugging function. XXX(peterwilliams97) Remove this.
+const isDebug = false // printTexts returns immediately unless this is true.
+
+// printTexts is a debugging function.
+// TODO(peterwilliams97) Remove this.
 func (tl *TextList) printTexts(message string) {
-	return
+	if !isDebug {
+		return
+	}
+
 	_, file, line, ok := runtime.Caller(1)
 	if !ok {
 		file = "???"
@ -985,7 +994,7 @@ func (tl *TextList) printTexts(message string) {
 // newLine returns the Line representation of strings `words` with y coordinate `y` and x
 // coordinates `x`.
 func newLine(y float64, x []float64, words []string) Line {
-	dx := []float64{}
+	dx := make([]float64, 0, len(x))
 	for i := 1; i < len(x); i++ {
 		dx = append(dx, x[i]-x[i-1])
 	}
@ -1211,18 +1220,8 @@ type fontEntry struct {
 const maxFontCache = 10

 // getFontDirect returns the font named `name` if it exists in the page's resources or an error if
-// is doesn't.
-// This is a direct (uncached access).
+// it doesn't. Accesses page resources directly (not cached).
 func (to *textObject) getFontDirect(name string) (*model.PdfFont, error) {
-
-	// This is a hack for testing.
-	switch name {
-	case "UniDocCourier":
-		return model.NewStandard14FontMustCompile(model.Courier), nil
-	case "UniDocHelvetica":
-		return model.NewStandard14FontMustCompile(model.Helvetica), nil
-	}
-
 	fontObj, err := to.getFontDict(name)
 	if err != nil {
 		return nil, err
--- a/pdf/extractor/text_test.go
+++ b/pdf/extractor/text_test.go
@ -8,7 +8,6 @@ package extractor
 import (
 	"flag"
 	"os"
-	"os/user"
 	"path/filepath"
 	"regexp"
 	"sort"
@ -20,18 +19,14 @@ import (
 	"golang.org/x/text/unicode/norm"
 )

-// XXX(peterwilliams97) NOTE: We do a best effort at finding the PDF file because we don't keep PDF
-// test files in this repo so you will need to setup `corpusFolders` to point at the corpus directory.
+// NOTE: We make a best effort at finding the PDF files because we don't keep PDF test files in this repo, so
+// you will need to set UNIDOC_EXTRACT_TESTDATA to point at the corpus directory.

 // forceTest should be set to true to force running all tests.
-const forceTest = false
+// NOTE: Setting environment variable UNIDOC_EXTRACT_FORCETEST = 1 sets this to true.
+var forceTest = os.Getenv("UNIDOC_EXTRACT_FORCETEST") == "1"

-// corpusFolders is where we search for test files.
-var corpusFolders = []string{
-	"./testdata",
-	"~/testdata",
-	".",
-}
+var corpusFolder = os.Getenv("UNIDOC_EXTRACT_TESTDATA")

 func init() {
 	common.SetLogger(common.NewConsoleLogger(common.LogLevelError))
@ -40,23 +35,16 @@ func init() {
 	}
 }

-// TestTextExtraction1 tests text extraction on the PDF fragments in `fragmentTests`.
-func TestTextExtraction1(t *testing.T) {
-	for _, f := range fragmentTests {
-		f.testExtraction(t)
-	}
-}
-
-type fragment struct {
-	name     string
-	contents string
-	text     string
-}
-
-var fragmentTests = []fragment{
-
-	{name: "portrait",
-		contents: `
+// TestTextExtractionFragments tests text extraction on the PDF fragments in `fragmentTests`.
+func TestTextExtractionFragments(t *testing.T) {
+	fragmentTests := []struct {
+		name     string
+		contents string
+		text     string
+	}{
+		{
+			name: "portrait",
+			contents: `
        BT
        /UniDocCourier 24 Tf
        (Hello World!)Tj
@ -64,10 +52,11 @@ var fragmentTests = []fragment{
        (Doink)Tj
        ET
        `,
-		text: "Hello World!\nDoink",
-	},
-	{name: "landscape",
-		contents: `
+			text: "Hello World!\nDoink",
+		},
+		{
+			name: "landscape",
+			contents: `
        BT
        /UniDocCourier 24 Tf
        0 1 -1 0 0 0 Tm
@ -76,10 +65,11 @@ var fragmentTests = []fragment{
        (Doink)Tj
        ET
        `,
-		text: "Hello World!\nDoink",
-	},
-	{name: "180 degree rotation",
-		contents: `
+			text: "Hello World!\nDoink",
+		},
+		{
+			name: "180 degree rotation",
+			contents: `
        BT
        /UniDocCourier 24 Tf
        -1 0 0 -1 0 0 Tm
@ -88,10 +78,11 @@ var fragmentTests = []fragment{
        (Doink)Tj
        ET
        `,
-		text: "Hello World!\nDoink",
-	},
-	{name: "Helvetica",
-		contents: `
+			text: "Hello World!\nDoink",
+		},
+		{
+			name: "Helvetica",
+			contents: `
        BT
        /UniDocHelvetica 24 Tf
        0 -1 1 0 0 0 Tm
@ -100,35 +91,53 @@ var fragmentTests = []fragment{
        (Doink)Tj
        ET
        `,
-		text: "Hello World!\nDoink",
-	},
-}
-
-// testExtraction checks that ExtractText() works on fragment `f`.
-func (f fragment) testExtraction(t *testing.T) {
-	e := Extractor{contents: f.contents}
-	text, err := e.ExtractText()
-	if err != nil {
-		t.Fatalf("Error extracting text: %q err=%v", f.name, err)
-		return
+			text: "Hello World!\nDoink",
+		},
 	}
-	if text != f.text {
-		t.Fatalf("Text mismatch: %q Got %q. Expected %q", f.name, text, f.text)
-		return
+
+	// Setup mock resources.
+	resources := model.NewPdfPageResources()
+	{
+		courier := model.NewStandard14FontMustCompile(model.Courier)
+		helvetica := model.NewStandard14FontMustCompile(model.Helvetica)
+		resources.SetFontByName("UniDocHelvetica", helvetica.ToPdfObject())
+		resources.SetFontByName("UniDocCourier", courier.ToPdfObject())
+	}
+
+	for _, f := range fragmentTests {
+		t.Run(f.name, func(t *testing.T) {
+			e := Extractor{resources: resources, contents: f.contents}
+			text, err := e.ExtractText()
+			if err != nil {
+				t.Fatalf("Error extracting text: %q err=%v", f.name, err)
+				return
+			}
+			if text != f.text {
+				t.Fatalf("Text mismatch: %q Got %q. Expected %q", f.name, text, f.text)
+				return
+			}
+		})
 	}
 }

-// TestTextExtraction2 tests text extraction on set of PDF files.
+// TestTextExtractionFiles tests text extraction on a set of PDF files.
 // It checks for the existence of specified strings of words on specified pages.
 // We currently only check within lines as our line order is still improving.
-func TestTextExtraction2(t *testing.T) {
-	for _, test := range extract2Tests {
-		testExtract2(t, test.filename, test.expectedPageText)
+func TestTextExtractionFiles(t *testing.T) {
+	if len(corpusFolder) == 0 && !forceTest {
+		t.Log("Corpus folder not set - skipping")
+		return
+	}
+
+	for _, test := range fileExtractionTests {
+		t.Run(test.filename, func(t *testing.T) {
+			testExtractFile(t, test.filename, test.expectedPageText)
+		})
 	}
 }

-// extract2Tests are the PDFs and texts we are looking for on specified pages.
-var extract2Tests = []struct {
+// fileExtractionTests are the PDFs and texts we are looking for on specified pages.
+var fileExtractionTests = []struct {
 	filename         string
 	expectedPageText map[int][]string
 }{
@ -216,21 +225,27 @@ var extract2Tests = []struct {
 	},
 }

-// testExtract2 tests the ExtractText2 text extractor on `filename` and compares the extracted
+// testExtractFile tests the ExtractTextWithStats text extractor on `filename` and compares the extracted
 // text to `expectedPageText`.
-// XXX(peterwilliams97) NOTE: We do a best effort at finding the PDF file because we don't keep PDF
-// test files in this repo so you will need to setup `corpusFolders` to point at the corpus directory.
-// If `filename` cannot be found in `corpusFolders` then the test is skipped.
-func testExtract2(t *testing.T, filename string, expectedPageText map[int][]string) {
-	homeDir, hasHome := getHomeDir()
-	path, ok := searchDirectories(homeDir, hasHome, corpusFolders, filename)
-	if !ok {
+//
+// NOTE: We do a best effort at finding the PDF file because we don't keep PDF test files in this repo
+// so you will need to set the environment variable UNIDOC_EXTRACT_TESTDATA to point at
+// the corpus directory.
+//
+// If `filename` cannot be found in `corpusFolder` then the test is skipped unless the `forceTest` global
+// variable is true (e.g. setting environment variable UNIDOC_EXTRACT_FORCETEST = 1).
+func testExtractFile(t *testing.T, filename string, expectedPageText map[int][]string) {
+	filepath := filepath.Join(corpusFolder, filename)
+	exists := checkFileExists(filepath)
+	if !exists {
 		if forceTest {
 			t.Fatalf("filename=%q does not exist", filename)
 		}
+		t.Logf("%s not found", filename)
 		return
 	}
-	_, actualPageText := extractPageTexts(t, path)
+
+	_, actualPageText := extractPageTexts(t, filepath)
 	for _, pageNum := range sortedKeys(expectedPageText) {
 		expectedSentences, ok := expectedPageText[pageNum]
 		actualText, ok := actualPageText[pageNum]
@ -239,12 +254,12 @@ func testExtract2(t *testing.T, filename string, expectedPageText map[int][]stri
 		}
 		actualText = norm.NFKC.String(actualText)
 		if !containsSentences(t, expectedSentences, actualText) {
-			t.Fatalf("Text mismatch filename=%q page=%d", path, pageNum)
+			t.Fatalf("Text mismatch filepath=%q page=%d", filepath, pageNum)
 		}
 	}
 }

-// extractPageTexts runs ExtractText2 on all pages in PDF `filename` and returns the result as a map
+// extractPageTexts runs ExtractTextWithStats on all pages in PDF `filename` and returns the result as a map
 // {page number: page text}
 func extractPageTexts(t *testing.T, filename string) (int, map[int]string) {
 	f, err := os.Open(filename)
@ -272,11 +287,11 @@ func extractPageTexts(t *testing.T, filename string) (int, map[int]string) {
 		if err != nil {
 			t.Fatalf("extractor.New failed. filename=%q page=%d err=%v", filename, pageNum, err)
 		}
-		text, _, _, err := ex.ExtractText2()
+		text, _, _, err := ex.ExtractTextWithStats()
 		if err != nil {
-			t.Fatalf("ExtractText2 failed. filename=%q page=%d err=%v", filename, pageNum, err)
+			t.Fatalf("ExtractTextWithStats failed. filename=%q page=%d err=%v", filename, pageNum, err)
 		}
-		// XXX(peterwilliams97)TODO: Improve text extraction space insertion so we don't need reduceSpaces.
+		// TODO(peterwilliams97): Improve text extraction space insertion so we don't need reduceSpaces.
 		pageText[pageNum] = reduceSpaces(text)
 	}
 	return numPages, pageText
@ -303,30 +318,10 @@ func reduceSpaces(text string) string {

 var reSpace = regexp.MustCompile(`(?m)\s+`)

-// searchDirectories searches `directories` for `filename` and returns the full file path if it is
-// found. `homeDir` and `hasHome` are used for home directory substitution.
-func searchDirectories(homeDir string, hasHome bool, directories []string, filename string) (string, bool) {
-	for _, direct := range directories {
-		if hasHome {
-			direct = strings.Replace(direct, "~", homeDir, 1)
-		}
-		path := filepath.Join(direct, filename)
-		if _, err := os.Stat(path); err == nil {
-			return path, true
-		}
-	}
-	return "", false
-}
-
-// getHomeDir returns the current user's home directory if it is defined and a bool to tell if it
-// is defined.
-func getHomeDir() (string, bool) {
-	usr, err := user.Current()
-	if err != nil {
-		common.Log.Error("No current user. err=%v", err)
-		return "", false
-	}
-	return usr.HomeDir, true
+// checkFileExists returns true if os.Stat succeeds on `filepath`; any Stat error counts as not existing.
+func checkFileExists(filepath string) bool {
+	_, err := os.Stat(filepath)
+	return err == nil
 }

 // sortedKeys returns the keys of `m` as a sorted slice.
--- a/pdf/model/font.go
+++ b/pdf/model/font.go
@ -18,26 +18,30 @@ import (
 	"github.com/unidoc/unidoc/pdf/model/fonts"
 )

-// Font represents a font which is a series of glyphs. Character codes from PDF strings can be
-// mapped to and from glyphs. Each glyph has metrics.
-// XXX: FIXME (peterwilliams97) HACK to add GetCharMetrics() for fonts other than standard 14
-//      Remove this hack.
-type Font interface {
-	Encoder() textencoding.TextEncoder
-	SetEncoder(encoder textencoding.TextEncoder)
-	GetGlyphCharMetrics(glyph string) (fonts.CharMetrics, bool)
-	GetCharMetrics(code uint16) (fonts.CharMetrics, bool)
-	GetAverageCharWidth() float64 // XXX(peterwilliams97) Not used. Remove.
-	ToPdfObject() core.PdfObject
-}
-
 // PdfFont represents an underlying font structure which can be of type:
 // - Type0
 // - Type1
 // - TrueType
 // etc.
 type PdfFont struct {
-	context Font // The underlying font: Type0, Type1, Truetype, etc..
+	context fonts.Font // The underlying font: Type0, Type1, Truetype, etc..
+}
+
+// getCharCodeMetrics returns the metrics of the glyph that the font's encoder maps charcode `code` to.
+func (font PdfFont) getCharCodeMetrics(code uint16) (fonts.CharMetrics, bool) {
+	var nometrics fonts.CharMetrics
+
+	enc := font.Encoder()
+	if enc == nil {
+		return nometrics, false // No encoder, so `code` cannot be mapped to a glyph.
+	}
+
+	glyph, found := enc.CharcodeToGlyph(code)
+	if !found {
+		return nometrics, false // The encoder has no glyph for `code`.
+	}
+
+	return font.GetGlyphCharMetrics(glyph)
 }

 // GetFontDescriptor returns the font descriptor for `font`.
@ -516,18 +520,7 @@ func (font PdfFont) GetGlyphCharMetrics(glyph string) (fonts.CharMetrics, bool)

 // GetCharMetrics returns the char metrics for character code `code`.
 func (font PdfFont) GetCharMetrics(code uint16) (fonts.CharMetrics, bool) {
-	t := font.actualFont()
-	if t == nil {
-		common.Log.Debug("ERROR: GetCharMetrics Not implemented for font type=%#T", font.context)
-		return fonts.CharMetrics{}, false
-	}
-	if m, ok := t.GetCharMetrics(code); ok {
-		return m, ok
-	}
-	if descriptor, err := font.GetFontDescriptor(); err == nil && descriptor != nil {
-		return fonts.CharMetrics{Wx: descriptor.missingWidth}, true
-	}
-	return fonts.CharMetrics{}, false
+	return font.getCharCodeMetrics(code)
 }

 // GetRuneCharMetrics returns the char metrics for rune `r`.
@ -550,18 +543,9 @@ func (font PdfFont) GetRuneCharMetrics(r rune) (fonts.CharMetrics, error) {
 	return m, nil
 }

-// GetAverageCharWidth returns the average width of all the characters in `font`.
-func (font PdfFont) GetAverageCharWidth() float64 {
-	t := font.actualFont()
-	if t == nil {
-		common.Log.Debug("ERROR: GetAverageCharWidth Not implemented for font type=%#T", font.context)
-		return 0.0
-	}
-	return t.GetAverageCharWidth()
-}
-
 // actualFont returns the Font in font.context
-func (font PdfFont) actualFont() Font {
+// NOTE(gunnsth): Actually this only sanity checks the font.context as the returned font will be wrapped in an interface.
+func (font PdfFont) actualFont() fonts.Font {
 	if font.context == nil {
 		common.Log.Debug("ERROR: actualFont. context is nil. font=%s", font)
 	}
--- a/pdf/model/font_composite.go
+++ b/pdf/model/font_composite.go
@ -131,15 +131,6 @@ func (font pdfFontType0) GetCharMetrics(code uint16) (fonts.CharMetrics, bool) {
 	return font.DescendantFont.GetCharMetrics(code)
 }

-// GetAverageCharWidth returns the average width of all the characters in `font`.
-func (font pdfFontType0) GetAverageCharWidth() float64 {
-	if font.DescendantFont == nil {
-		common.Log.Debug("ERROR: No descendant. font=%s", font)
-		return 0.0
-	}
-	return font.DescendantFont.GetAverageCharWidth()
-}
-
 // Encoder returns the font's text encoder.
 func (font pdfFontType0) Encoder() textencoding.TextEncoder {
 	return font.encoder
@ -253,11 +244,6 @@ func (font pdfCIDFontType0) GetCharMetrics(code uint16) (fonts.CharMetrics, bool
 	return fonts.CharMetrics{}, true
 }

-// GetAverageCharWidth returns the average width of all the characters in `font`.
-func (font pdfCIDFontType0) GetAverageCharWidth() float64 {
-	return 0.0
-}
-
 // ToPdfObject converts the pdfCIDFontType0 to a PDF representation.
 func (font *pdfCIDFontType0) ToPdfObject() core.PdfObject {
 	return core.MakeNull()
@ -378,18 +364,6 @@ func (font pdfCIDFontType2) GetCharMetrics(code uint16) (fonts.CharMetrics, bool
 	return fonts.CharMetrics{Wx: float64(w)}, true
 }

-// GetAverageCharWidth returns the average width of all the characters in `font`.
-func (font pdfCIDFontType2) GetAverageCharWidth() float64 {
-	if len(font.runeToWidthMap) == 0 {
-		return 0.0
-	}
-	total := 0
-	for _, w := range font.runeToWidthMap {
-		total += w
-	}
-	return float64(total) / float64(len(font.runeToWidthMap))
-}
-
 // ToPdfObject converts the pdfCIDFontType2 to a PDF representation.
 func (font *pdfCIDFontType2) ToPdfObject() core.PdfObject {
 	if font.container == nil {
--- a/pdf/model/font_simple.go
+++ b/pdf/model/font_simple.go
@ -149,18 +149,6 @@ func (font pdfFontSimple) GetCharMetrics(code uint16) (fonts.CharMetrics, bool)
 	return fonts.CharMetrics{}, false
 }

-// GetAverageCharWidth returns the average width of all the characters in `font`.
-func (font pdfFontSimple) GetAverageCharWidth() float64 {
-	if font.fontMetrics != nil {
-		return fonts.AverageCharWidth(font.fontMetrics)
-	}
-	total := 0.0
-	for _, w := range font.charWidths {
-		total += w
-	}
-	return total / float64(len(font.charWidths))
-}
-
 // newSimpleFontFromPdfObject creates a pdfFontSimple from dictionary `d`. Elements of `d` that
 // are already parsed are contained in `base`.
 // Standard 14 fonts need to to specify their builtin encoders in the `std14Encoder` parameter.
--- a/pdf/model/fonts/courier.go
+++ b/pdf/model/fonts/courier.go
@ -47,11 +47,6 @@ func (font FontCourier) GetGlyphCharMetrics(glyph string) (CharMetrics, bool) {
 	return metrics, true
 }

-// GetAverageCharWidth returns the average width of all glyphs in the font.
-func (font FontCourier) GetAverageCharWidth() float64 {
-	return AverageCharWidth(CourierCharMetrics)
-}
-
 // ToPdfObject returns a primitive PDF object representation of the font.
 func (font FontCourier) ToPdfObject() core.PdfObject {
 	fontDict := core.MakeDict()
--- a/pdf/model/fonts/courier_bold.go
+++ b/pdf/model/fonts/courier_bold.go
@ -47,11 +47,6 @@ func (font FontCourierBold) GetGlyphCharMetrics(glyph string) (CharMetrics, bool
 	return metrics, true
 }

-// GetAverageCharWidth returns the average width of all glyphs in the font.
-func (font FontCourierBold) GetAverageCharWidth() float64 {
-	return AverageCharWidth(CourierBoldCharMetrics)
-}
-
 // ToPdfObject returns a primitive PDF object representation of the font.
 func (font FontCourierBold) ToPdfObject() core.PdfObject {
 	fontDict := core.MakeDict()
--- a/pdf/model/fonts/courier_bold_oblique.go
+++ b/pdf/model/fonts/courier_bold_oblique.go
@ -48,11 +48,6 @@ func (font FontCourierBoldOblique) GetGlyphCharMetrics(glyph string) (CharMetric
 	return metrics, true
 }

-// GetAverageCharWidth returns the average width of all glyphs in the font.
-func (font FontCourierBoldOblique) GetAverageCharWidth() float64 {
-	return AverageCharWidth(CourierBoldObliqueCharMetrics)
-}
-
 // ToPdfObject returns a primitive PDF object representation of the font.
 func (font FontCourierBoldOblique) ToPdfObject() core.PdfObject {
 	fontDict := core.MakeDict()
--- a/pdf/model/fonts/courier_oblique.go
+++ b/pdf/model/fonts/courier_oblique.go
@ -47,11 +47,6 @@ func (font FontCourierOblique) GetGlyphCharMetrics(glyph string) (CharMetrics, b
 	return metrics, true
 }

-// GetAverageCharWidth returns the average width of all glyphs in the font.
-func (font FontCourierOblique) GetAverageCharWidth() float64 {
-	return AverageCharWidth(CourierObliqueCharMetrics)
-}
-
 // ToPdfObject returns a primitive PDF object representation of the font.
 func (font FontCourierOblique) ToPdfObject() core.PdfObject {
 	fontDict := core.MakeDict()
--- a/pdf/model/fonts/font.go
+++ b/pdf/model/fonts/font.go
@ -18,7 +18,6 @@ type Font interface {
 	Encoder() textencoding.TextEncoder
 	SetEncoder(encoder textencoding.TextEncoder)
 	GetGlyphCharMetrics(glyph string) (CharMetrics, bool)
-	GetAverageCharWidth() float64 // XXX(peterwilliams97) Not used. Remove.
 	ToPdfObject() core.PdfObject
 }

@ -32,11 +31,3 @@ type CharMetrics struct {
 func (m CharMetrics) String() string {
 	return fmt.Sprintf("<%q,%.1f,%.1f>", m.GlyphName, m.Wx, m.Wy)
 }
-
-func AverageCharWidth(metrics map[string]CharMetrics) float64 {
-	total := 0.0
-	for _, m := range metrics {
-		total += m.Wx
-	}
-	return total / float64(len(metrics))
-}
--- a/pdf/model/fonts/helvetica.go
+++ b/pdf/model/fonts/helvetica.go
@ -47,11 +47,6 @@ func (font FontHelvetica) GetGlyphCharMetrics(glyph string) (CharMetrics, bool)
 	return metrics, true
 }

-// GetAverageCharWidth returns the average width of all glyphs in the font.
-func (font FontHelvetica) GetAverageCharWidth() float64 {
-	return AverageCharWidth(HelveticaCharMetrics)
-}
-
 // ToPdfObject returns a primitive PDF object representation of the font.
 func (font FontHelvetica) ToPdfObject() core.PdfObject {
 	fontDict := core.MakeDict()
--- a/pdf/model/fonts/helvetica_bold.go
+++ b/pdf/model/fonts/helvetica_bold.go
@ -48,11 +48,6 @@ func (font FontHelveticaBold) GetGlyphCharMetrics(glyph string) (CharMetrics, bo
 	return metrics, true
 }

-// GetAverageCharWidth returns the average width of all glyphs in the font.
-func (font FontHelveticaBold) GetAverageCharWidth() float64 {
-	return AverageCharWidth(HelveticaBoldCharMetrics)
-}
-
 // ToPdfObject returns a primitive PDF object representation of the font.
 func (font FontHelveticaBold) ToPdfObject() core.PdfObject {
 	fontDict := core.MakeDict()
--- a/pdf/model/fonts/helvetica_bold_oblique.go
+++ b/pdf/model/fonts/helvetica_bold_oblique.go
@ -47,11 +47,6 @@ func (font FontHelveticaBoldOblique) GetGlyphCharMetrics(glyph string) (CharMetr
 	return metrics, true
 }

-// GetAverageCharWidth returns the average width of all glyphs in the font.
-func (font FontHelveticaBoldOblique) GetAverageCharWidth() float64 {
-	return AverageCharWidth(HelveticaObliqueCharMetrics)
-}
-
 // ToPdfObject returns a primitive PDF object representation of the font.
 func (font FontHelveticaBoldOblique) ToPdfObject() core.PdfObject {
 	fontDict := core.MakeDict()
--- a/pdf/model/fonts/helvetica_oblique.go
+++ b/pdf/model/fonts/helvetica_oblique.go
@ -47,11 +47,6 @@ func (font FontHelveticaOblique) GetGlyphCharMetrics(glyph string) (CharMetrics,
 	return metrics, true
 }

-// GetAverageCharWidth returns the average width of all glyphs in the font.
-func (font FontHelveticaOblique) GetAverageCharWidth() float64 {
-	return AverageCharWidth(HelveticaObliqueCharMetrics)
-}
-
 // ToPdfObject returns a primitive PDF object representation of the font.
 func (font FontHelveticaOblique) ToPdfObject() core.PdfObject {
 	fontDict := core.MakeDict()
--- a/pdf/model/fonts/symbol.go
+++ b/pdf/model/fonts/symbol.go
@ -48,11 +48,6 @@ func (font FontSymbol) GetGlyphCharMetrics(glyph string) (CharMetrics, bool) {
 	return metrics, true
 }

-// GetAverageCharWidth returns the average width of all glyphs in the font.
-func (font FontSymbol) GetAverageCharWidth() float64 {
-	return AverageCharWidth(SymbolCharMetrics)
-}
-
 // ToPdfObject returns a primitive PDF object representation of the font.
 func (font FontSymbol) ToPdfObject() core.PdfObject {
 	fontDict := core.MakeDict()
--- a/pdf/model/fonts/times_bold.go
+++ b/pdf/model/fonts/times_bold.go
@ -47,11 +47,6 @@ func (font FontTimesBold) GetGlyphCharMetrics(glyph string) (CharMetrics, bool)
 	return metrics, true
 }

-// GetAverageCharWidth returns the average width of all glyphs in the font.
-func (font FontTimesBold) GetAverageCharWidth() float64 {
-	return AverageCharWidth(TimesBoldCharMetrics)
-}
-
 // ToPdfObject returns a primitive PDF object representation of the font.
 func (font FontTimesBold) ToPdfObject() core.PdfObject {
 	fontDict := core.MakeDict()
--- a/pdf/model/fonts/times_bold_italic.go
+++ b/pdf/model/fonts/times_bold_italic.go
@ -47,11 +47,6 @@ func (font FontTimesBoldItalic) GetGlyphCharMetrics(glyph string) (CharMetrics,
 	return metrics, true
 }

-// GetAverageCharWidth returns the average width of all glyphs in the font.
-func (font FontTimesBoldItalic) GetAverageCharWidth() float64 {
-	return AverageCharWidth(TimesBoldItalicCharMetrics)
-}
-
 // ToPdfObject returns a primitive PDF object representation of the font.
 func (font FontTimesBoldItalic) ToPdfObject() core.PdfObject {
 	fontDict := core.MakeDict()
--- a/pdf/model/fonts/times_italic.go
+++ b/pdf/model/fonts/times_italic.go
@ -47,11 +47,6 @@ func (font FontTimesItalic) GetGlyphCharMetrics(glyph string) (CharMetrics, bool
 	return metrics, true
 }

-// GetAverageCharWidth returns the average width of all glyphs in the font.
-func (font FontTimesItalic) GetAverageCharWidth() float64 {
-	return AverageCharWidth(TimesItalicCharMetrics)
-}
-
 // ToPdfObject returns a primitive PDF object representation of the font.
 func (font FontTimesItalic) ToPdfObject() core.PdfObject {
 	fontDict := core.MakeDict()
--- a/pdf/model/fonts/times_roman.go
+++ b/pdf/model/fonts/times_roman.go
@ -47,11 +47,6 @@ func (font FontTimesRoman) GetGlyphCharMetrics(glyph string) (CharMetrics, bool)
 	return metrics, true
 }

-// GetAverageCharWidth returns the average width of all glyphs in the font.
-func (font FontTimesRoman) GetAverageCharWidth() float64 {
-	return AverageCharWidth(TimesRomanCharMetrics)
-}
-
 // ToPdfObject returns a primitive PDF object representation of the font.
 func (font FontTimesRoman) ToPdfObject() core.PdfObject {
 	fontDict := core.MakeDict()
--- a/pdf/model/fonts/zapfdingbats.go
+++ b/pdf/model/fonts/zapfdingbats.go
@ -48,11 +48,6 @@ func (font FontZapfDingbats) GetGlyphCharMetrics(glyph string) (CharMetrics, boo
 	return metrics, true
 }

-// GetAverageCharWidth returns the average width of all glyphs in the font.
-func (font FontZapfDingbats) GetAverageCharWidth() float64 {
-	return AverageCharWidth(ZapfDingbatsCharMetrics)
-}
-
 // ToPdfObject returns a primitive PDF object representation of the font.
 func (font FontZapfDingbats) ToPdfObject() core.PdfObject {
 	fontDict := core.MakeDict()
--- a/pdf/model/structures.go
+++ b/pdf/model/structures.go
@ -11,6 +11,7 @@ package model
 import (
 	"errors"
 	"fmt"
+	"math"
 	"regexp"
 	"strconv"

@ -58,6 +59,16 @@ func NewPdfRectangle(arr PdfObjectArray) (*PdfRectangle, error) {
 	return &rect, nil
 }

+// Height returns the height of `rect` as |Ury - Lly|, so it is non-negative whichever y corner is larger.
+func (rect *PdfRectangle) Height() float64 {
+	return math.Abs(rect.Ury - rect.Lly)
+}
+
+// Width returns the width of `rect` as |Urx - Llx|, so it is non-negative whichever x corner is larger.
+func (rect *PdfRectangle) Width() float64 {
+	return math.Abs(rect.Urx - rect.Llx)
+}
+
 // Convert to a PDF object.
 func (rect *PdfRectangle) ToPdfObject() PdfObject {
 	arr := MakeArray(MakeFloat(rect.Llx), MakeFloat(rect.Lly), MakeFloat(rect.Urx), MakeFloat(rect.Ury))