Merge pull request #5 from unidoc/v3-peterwilliams97-extract.text

Cleaning up v3 extract.text
2025-05-05 19:30:30 +08:00 · 2018-11-29 18:03:50 +11:00 · 2018-11-29 18:03:50 +11:00 · 1cea79b8ef
commit 1cea79b8ef
parent 6529b42a70 e6b768c06c
24 changed files with 334 additions and 443 deletions
--- a/pdf/contentstream/matrix.go
+++ b/pdf/contentstream/matrix.go
@ -0,0 +1,157 @@
 /*
 * This file is subject to the terms and conditions defined in
 * file 'LICENSE.md', which is part of this source code package.
 */
 package contentstream
 import (
 	"fmt"
 	"math"
 	"github.com/unidoc/unidoc/common"
 )
 // Matrix is a linear transform matrix in homogeneous coordinates, stored row by row as 9 values.
 // PDF coordinate transforms are always affine so only 6 of the 9 elements vary. See NewMatrix.
 type Matrix [9]float64
 // IdentityMatrix returns the transform with unit scale, no shear and no translation.
 func IdentityMatrix() Matrix {
 	const (
 		unit = 1
 		none = 0
 	)
 	return NewMatrix(unit, none, none, unit, none, none)
 }
 // TranslationMatrix returns a matrix with unit scale and no shear whose translation part is
 // `tx`, `ty`.
 func TranslationMatrix(tx, ty float64) Matrix {
 	shift := NewMatrix(1, 0, 0, 1, tx, ty)
 	return shift
 }
 // NewMatrix builds the affine transform whose homogeneous form is
 //      a  b  0
 //      c  d  0
 //      tx ty 1
 // The elements are passed through fixup before the matrix is returned.
 func NewMatrix(a, b, c, d, tx, ty float64) Matrix {
 	var m Matrix // Zero value already supplies the two 0s of the last column.
 	m[0], m[1] = a, b
 	m[3], m[4] = c, d
 	m[6], m[7], m[8] = tx, ty, 1
 	m.fixup()
 	return m
 }
 // String formats the six affine elements of `m` as "[a,b,c,d:tx,ty]" with 4 decimal places.
 func (m Matrix) String() string {
 	const layout = "[%.4f,%.4f,%.4f,%.4f:%.4f,%.4f]"
 	return fmt.Sprintf(layout, m[0], m[1], m[3], m[4], m[6], m[7])
 }
 // Set overwrites the six affine elements of `m` with a,b,c,d,tx,ty and then applies fixup.
 // The third column (m[2], m[5], m[8]) is left as it was.
 func (m *Matrix) Set(a, b, c, d, tx, ty float64) {
 	m[0], m[1], m[3], m[4], m[6], m[7] = a, b, c, d, tx, ty
 	m.fixup()
 }
 // Concat sets `m` to `m` × `b`.
 // `b` needs to be created by NewMatrix. i.e. It must be an affine transform.
 //    m00 m01 0     b00 b01 0     m00*b00 + m01*b10        m00*b01 + m01*b11        0
 //    m10 m11 0  ×  b10 b11 0  =  m10*b00 + m11*b10        m10*b01 + m11*b11        0
 //    m20 m21 1     b20 b21 1     m20*b00 + m21*b10 + b20  m20*b01 + m21*b11 + b21  1
 // The result is passed through fixup.
 func (m *Matrix) Concat(b Matrix) {
 	a := *m // Snapshot of the left operand so every product reads the pre-update values.
 	*m = Matrix{
 		a[0]*b[0] + a[1]*b[3], a[0]*b[1] + a[1]*b[4], 0,
 		a[3]*b[0] + a[4]*b[3], a[3]*b[1] + a[4]*b[4], 0,
 		a[6]*b[0] + a[7]*b[3] + b[6], a[6]*b[1] + a[7]*b[4] + b[7], 1,
 	}
 	m.fixup()
 }
 // Mult returns the product `m` × `b` as a new matrix. The receiver is passed by value so the
 // caller's matrix is not modified.
 func (m Matrix) Mult(b Matrix) Matrix {
 	product := m
 	product.Concat(b)
 	return product
 }
 // Translate adds `dx`,`dy` to the translation part of `m` and then applies fixup.
 // m.Translate(dx, dy) is equivalent to m.Concat(NewMatrix(1, 0, 0, 1, dx, dy))
 func (m *Matrix) Translate(dx, dy float64) {
 	m[6], m[7] = m[6]+dx, m[7]+dy
 	m.fixup()
 }
 // Translation returns the tx, ty elements of `m` (m[6] and m[7]).
 func (m *Matrix) Translation() (float64, float64) {
 	tx, ty := m[6], m[7]
 	return tx, ty
 }
 // ScalingX returns the length of the first row (a, b) of `m`, computed with math.Hypot.
 func (m *Matrix) ScalingX() float64 {
 	a, b := m[0], m[1]
 	return math.Hypot(a, b)
 }
 // Transform maps the point `x`,`y` through `m` and returns the resulting coordinates, computed as
 //    x' = x*m[0] + y*m[1] + m[6]
 //    y' = x*m[3] + y*m[4] + m[7]
 // NOTE(review): with the layout documented on NewMatrix, the row-vector product [x y 1] × m would
 // instead pair `y` with m[3] in x' and `x` with m[1] in y'. Confirm which convention callers expect.
 func (m *Matrix) Transform(x, y float64) (float64, float64) {
 	return x*m[0] + y*m[1] + m[6], x*m[3] + y*m[4] + m[7]
 }
 // ScalingFactorX returns the X scaling of the affine transform, i.e. the length of its first
 // row (a, b).
 func (m *Matrix) ScalingFactorX() float64 {
 	a, b := m[0], m[1]
 	return math.Sqrt(a*a + b*b)
 }
 // ScalingFactorY returns the Y scaling of the affine transform, i.e. the length of its second
 // row (c, d).
 func (m *Matrix) ScalingFactorY() float64 {
 	c, d := m[3], m[4]
 	return math.Sqrt(c*c + d*d)
 }
 // Angle returns the angle of the affine transform in degrees: one of 0, 90, 180 or 270.
 // For simplicity, we assume the transform is a multiple of 90 degrees. The cases are tested in
 // the order 0, 90, 180, 270 and the first match wins. If none matches, the problem is logged at
 // debug level and 0 is returned.
 func (m *Matrix) Angle() int {
 	a, b, c, d := m[0], m[1], m[3], m[4]
 	// We are returning θ for
 	// a b    cos θ  -sin θ
 	// c d =  sin θ   cos θ
 	switch {
 	case a > 0 && d > 0:
 		//  1  0
 		//  0  1
 		return 0
 	case b < 0 && c > 0:
 		//  0 -1
 		//  1  0
 		return 90
 	case a < 0 && d < 0:
 		// -1  0
 		//  0 -1
 		return 180
 	case b > 0 && c < 0:
 		//  0  1
 		// -1  0
 		return 270
 	}
 	common.Log.Debug("ERROR: Angle not a multiple of 90°. m=%s", m)
 	return 0
 }
 // fixup forces `m` to have reasonable values. It is a guard against crazy values in corrupt PDF
 // files.
 // Currently it clamps elements to [-maxAbsNumber, maxAbsNumber] to avoid floating point exceptions.
 // Each clamped element is reported at debug level.
 func (m *Matrix) fixup() {
 	for i, x := range m {
 		var clamped float64
 		switch {
 		case x > maxAbsNumber:
 			clamped = maxAbsNumber
 		case x < -maxAbsNumber:
 			clamped = -maxAbsNumber
 		default:
 			continue // Already within range.
 		}
 		// The values are float64, so they need a floating point verb (%d would print "%!d(float64=...)").
 		common.Log.Debug("FIXUP: %g -> %g", x, clamped)
 		m[i] = clamped
 	}
 }
 // maxAbsNumber is the largest absolute value fixup allows for a Matrix element; elements with a
 // larger magnitude are clamped to it. Is this the largest number needed in PDF transforms?
 // TODO(gunnsth): Practical value? Need some reasoning.
 const maxAbsNumber = 1e9
--- a/pdf/contentstream/processor.go
+++ b/pdf/contentstream/processor.go
@ -7,8 +7,6 @@ package contentstream
 import (
 	"errors"
 	"fmt"
 	"math"
 	"github.com/unidoc/unidoc/common"
 	"github.com/unidoc/unidoc/pdf/core"
@ -62,21 +60,26 @@ type HandlerEntry struct {
 	Handler   HandlerFunc
 }
 // HandlerConditionEnum represents the type of operand content stream processor.
 // HandlerConditionEnumOperand handler handles a single operand, whereas
 // HandlerConditionEnumAllOperands processes all operands.
 type HandlerConditionEnum int
 func (csp HandlerConditionEnum) All() bool {
 	return csp == HandlerConditionEnumAllOperands
 }
 func (csp HandlerConditionEnum) Operand() bool {
 	return csp == HandlerConditionEnumOperand
 }
 const (
 	HandlerConditionEnumOperand     HandlerConditionEnum = iota
 	HandlerConditionEnumAllOperands HandlerConditionEnum = iota
 )
 // All returns true if `hce` is equivalent to HandlerConditionEnumAllOperands.
 func (hce HandlerConditionEnum) All() bool {
 	return hce == HandlerConditionEnumAllOperands
 }
 // Operand returns true if `hce` is equivalent to HandlerConditionEnumOperand.
 func (hce HandlerConditionEnum) Operand() bool {
 	return hce == HandlerConditionEnumOperand
 }
 func NewContentStreamProcessor(ops []*ContentStreamOperation) *ContentStreamProcessor {
 	csp := ContentStreamProcessor{}
 	csp.graphicsStack = GraphicStateStack{}
@ -573,144 +576,3 @@ func (proc *ContentStreamProcessor) handleCommand_cm(op *ContentStreamOperation,
 	return nil
 }
 // Matrix is a linear transform matrix in homogenous coordinates.
 // PDF coordinate transforms are always affine so we only need 6 of these. See newMatrix.
 type Matrix [9]float64
 // IdentityMatrix returns the identity transform.
 func IdentityMatrix() Matrix {
 	return NewMatrix(1, 0, 0, 1, 0, 0)
 }
 // TranslationMatrix returns a matrix that translates by `tx`, `ty`.
 func TranslationMatrix(tx, ty float64) Matrix {
 	return NewMatrix(1, 0, 0, 1, tx, ty)
 }
 // NewMatrix returns an affine transform matrix laid out in homogenous coordinates as
 //      a  b  0
 //      c  d  0
 //      tx ty 1
 func NewMatrix(a, b, c, d, tx, ty float64) Matrix {
 	m := Matrix{
 		a, b, 0,
 		c, d, 0,
 		tx, ty, 1,
 	}
 	m.fixup()
 	return m
 }
 // String returns a string describing `m`.
 func (m Matrix) String() string {
 	a, b, c, d, tx, ty := m[0], m[1], m[3], m[4], m[6], m[7]
 	return fmt.Sprintf("[%.4f,%.4f,%.4f,%.4f:%.4f,%.4f]", a, b, c, d, tx, ty)
 }
 // Set sets `m` to affine transform a,b,c,d,tx,ty.
 func (m *Matrix) Set(a, b, c, d, tx, ty float64) {
 	m[0], m[1] = a, b
 	m[3], m[4] = c, d
 	m[6], m[7] = tx, ty
 	m.fixup()
 }
 // Concat sets `m` to `m` × `b`.
 // `b` needs to be created by newMatrix. i.e. It must be an affine transform.
 //    m00 m01 0     b00 b01 0     m00*b00 + m01*b01        m00*b10 + m01*b11        0
 //    m10 m11 0  ×  b10 b11 0  =  m10*b00 + m11*b01        m10*b10 + m11*b11        0
 //    m20 m21 1     b20 b21 1     m20*b00 + m21*b10 + b20  m20*b01 + m21*b11 + b21  1
 func (m *Matrix) Concat(b Matrix) {
 	*m = Matrix{
 		m[0]*b[0] + m[1]*b[3], m[0]*b[1] + m[1]*b[4], 0,
 		m[3]*b[0] + m[4]*b[3], m[3]*b[1] + m[4]*b[4], 0,
 		m[6]*b[0] + m[7]*b[3] + b[6], m[6]*b[1] + m[7]*b[4] + b[7], 1,
 	}
 	m.fixup()
 }
 // Mult returns `m` × `b`.
 func (m Matrix) Mult(b Matrix) Matrix {
 	m.Concat(b)
 	return m
 }
 // Translate appends a translation of `dx`,`dy` to `m`.
 // m.Translate(dx, dy) is equivalent to m.Concat(NewMatrix(1, 0, 0, 1, dx, dy))
 func (m *Matrix) Translate(dx, dy float64) {
 	m[6] += dx
 	m[7] += dy
 	m.fixup()
 }
 // Translation returns the translation part of `m`.
 func (m *Matrix) Translation() (float64, float64) {
 	return m[6], m[7]
 }
 // Translation returns the translation part of `m`.
 func (m *Matrix) ScalingX() float64 {
 	return math.Hypot(m[0], m[1])
 }
 // Transform returns coordinates `x`,`y` transformed by `m`.
 func (m *Matrix) Transform(x, y float64) (float64, float64) {
 	xp := x*m[0] + y*m[1] + m[6]
 	yp := x*m[3] + y*m[4] + m[7]
 	return xp, yp
 }
 // ScalingFactorX returns X scaling of  the affine transform.
 func (m *Matrix) ScalingFactorX() float64 {
 	return math.Sqrt(m[0]*m[0] + m[1]*m[1])
 }
 // ScalingFactorY returns X scaling of  the affine transform.
 func (m *Matrix) ScalingFactorY() float64 {
 	return math.Sqrt(m[3]*m[3] + m[4]*m[4])
 }
 // Angle returns the angle of the affine transform.
 // For simplicity, we assume the transform is a multiple of 90 degrees.
 func (m *Matrix) Angle() int {
 	a, b, c, d := m[0], m[1], m[3], m[4]
 	// We are returning θ for
 	// a b    cos θ  -sin θ
 	// c d =  sin θ   cos θ
 	if a > 0 && d > 0 {
 		//  1  0
 		//  0  1
 		return 0
 	} else if b < 0 && c > 0 {
 		//  0  1
 		// -1  0
 		return 90
 	} else if a < 0 && d < 0 {
 		// -1  0
 		//  0 -1
 		return 180
 	} else if b > 0 && c < 0 {
 		// 0 -1
 		// 1  0
 		return 270
 	}
 	common.Log.Debug("ERROR: Angle not a mulitple of 90°. m=%s", m)
 	return 0
 }
 // fixup forces `m` to have reasonable values. It is a guard against crazy values in corrupt PDF
 // files.
 // Currently it clamps elements to [-maxAbsNumber, -maxAbsNumber] to avoid floating point exceptions.
 func (m *Matrix) fixup() {
 	for i, x := range m {
 		if x > maxAbsNumber {
 			m[i] = maxAbsNumber
 		} else if x < -maxAbsNumber {
 			m[i] = -maxAbsNumber
 		}
 	}
 }
 // largest numbers needed in PDF transforms. Is this correct?
 const maxAbsNumber = 1e9
--- a/pdf/extractor/point.go
+++ b/pdf/extractor/point.go
@ -5,7 +5,7 @@
 * Based on pdf/contentstream/draw/point.go
 */
-// XXX(peterwilliams97) Change to functional style. i.e. Return new value, don't mutate.
+// FIXME(peterwilliams97) Change to functional style. i.e. Return new value, don't mutate.
 package extractor
@ -16,18 +16,18 @@ import (
 	"github.com/unidoc/unidoc/pdf/contentstream"
 )
-// Point defines a point in Cartesian coordinates
+// Point defines a point (X,Y) in Cartesian coordinates.
 type Point struct {
 	X float64
 	Y float64
 }
-// NewPoint returns a Point at 'x', 'y'.
+// NewPoint returns a Point at `x`, `y`.
 func NewPoint(x, y float64) Point {
 	return Point{X: x, Y: y}
 }
-// Set sets `p` to `x`, `y`.
+// Set sets `p` to coordinates `(x, y)`.
 func (p *Point) Set(x, y float64) {
 	p.X, p.Y = x, y
 }
@ -38,12 +38,12 @@ func (p *Point) Transform(a, b, c, d, tx, ty float64) {
 	p.transformByMatrix(m)
 }
-// Displace returns `p` displaced by `delta`.
+// Displace returns a new Point at location `p` + `delta`.
 func (p Point) Displace(delta Point) Point {
 	return Point{p.X + delta.X, p.Y + delta.Y}
 }
-// Rotate returns `p` rotated by `theta` degrees.
+// Rotate returns a new Point that is `p` rotated by `theta` degrees.
 func (p Point) Rotate(theta int) Point {
 	switch theta {
 	case 0:
--- a/pdf/extractor/text.go
+++ b/pdf/extractor/text.go
@ -27,13 +27,13 @@ import (
 // CharcodeBytesToUnicode.
 // Characters that can't be decoded are replaced with MissingCodeRune ('\ufffd' = <20>).
 func (e *Extractor) ExtractText() (string, error) {
-	text, _, _, err := e.ExtractText2()
+	text, _, _, err := e.ExtractTextWithStats()
 	return text, err
 }
-// ExtractText2 works like ExtractText but returns the number of characters in the output and the
+// ExtractTextWithStats works like ExtractText but returns the number of characters in the output and the
 // number of characters that were not decoded.
-func (e *Extractor) ExtractText2() (string, int, int, error) {
+func (e *Extractor) ExtractTextWithStats() (string, int, int, error) {
 	textList, numChars, numMisses, err := e.ExtractXYText()
 	if err != nil {
 		return "", numChars, numMisses, err
@ -313,6 +313,10 @@ func (to *textObject) nextLine() {
 // Set the text matrix, Tm, and the text line matrix, Tlm to the Matrix specified by the 6 numbers
 // in `f` (page 250).
 func (to *textObject) setTextMatrix(f []float64) {
 	if len(f) != 6 {
 		common.Log.Debug("ERROR: len(f) != 6 (%d)", len(f))
 		return
 	}
 	a, b, c, d, tx, ty := f[0], f[1], f[2], f[3], f[4], f[5]
 	to.Tm = contentstream.NewMatrix(a, b, c, d, tx, ty)
 	to.Tlm = to.Tm
@ -358,7 +362,7 @@ func (to *textObject) showTextAdjusted(args *core.PdfObjectArray) error {
 // setTextLeading "TL". Set text leading.
 func (to *textObject) setTextLeading(y float64) {
-	if to == nil {
+	if to == nil || to.State == nil {
 		return
 	}
 	to.State.Tl = y
@ -427,7 +431,7 @@ func (to *textObject) setHorizScaling(y float64) {
 	to.State.Th = y
 }
-// floatParam returns the single float parameter of operatr `op`, or an error if it doesn't have
+// floatParam returns the single float parameter of operator `op`, or an error if it doesn't have
 // a single float parameter or we aren't in a text stream.
 func floatParam(op *contentstream.ContentStreamOperation) (float64, error) {
 	if len(op.Params) != 1 {
@ -444,7 +448,7 @@ func floatParam(op *contentstream.ContentStreamOperation) (float64, error) {
 func (to *textObject) checkOp(op *contentstream.ContentStreamOperation, numParams int,
 	hard bool) (ok bool, err error) {
 	if to == nil {
-		params := []core.PdfObject{}
+		var params []core.PdfObject
 		if numParams > 0 {
 			params = op.Params
 			if len(params) > numParams {
@ -596,7 +600,7 @@ func newTextObject(e *Extractor, gs contentstream.GraphicsState, state *textStat
 	}
 }
-// renderText emits byte array `data` to the calling program.
+// renderText processes and renders byte array `data` for extraction purposes.
 func (to *textObject) renderText(data []byte) error {
 	font := to.getCurrentFont()
@ -628,7 +632,6 @@ func (to *textObject) renderText(data []byte) error {
 	common.Log.Trace("renderText: %d codes=%+v runes=%q", len(charcodes), charcodes, runes)
 	for i, r := range runes {
 		// XXX(peterwilliams97) Need to find and fix cases where this happens.
 		if r == "\x00" {
 			continue
@ -784,7 +787,7 @@ func (tl TextList) ToText() string {
 	tl.SortPosition()
 	lines := tl.toLines()
-	texts := []string{}
+	texts := make([]string, 0, len(lines))
 	for _, l := range lines {
 		texts = append(texts, l.Text)
 	}
@ -825,11 +828,11 @@ type Line struct {
 func (tl TextList) toLines() []Line {
 	// We divide `tl` into slices which contain texts with the same orientation, extract the lines
 	// for each orientation then return the concatenation of these lines sorted by orientation.
-	tlOrient := map[int]TextList{}
+	tlOrient := make(map[int]TextList, len(tl))
 	for _, t := range tl {
 		tlOrient[t.Orient] = append(tlOrient[t.Orient], t)
 	}
-	lines := []Line{}
+	var lines []Line
 	for _, o := range orientKeys(tlOrient) {
 		lines = append(lines, tlOrient[o].toLinesOrient()...)
 	}
@ -846,15 +849,15 @@ func (tl TextList) toLinesOrient() []Line {
 	if len(tl) == 0 {
 		return []Line{}
 	}
-	lines := []Line{}
+	var lines []Line
-	words := []string{}
+	var words []string
-	x := []float64{}
+	var x []float64
 	y := tl[0].OrientedStart.Y
 	scanning := false
-	averageCharWidth := ExponAve{}
+	averageCharWidth := exponAve{}
-	wordSpacing := ExponAve{}
+	wordSpacing := exponAve{}
 	lastEndX := 0.0 // lastEndX is tl[i-1].OrientedEnd.X
 	for _, t := range tl {
@ -889,13 +892,13 @@ func (tl TextList) toLinesOrient() []Line {
 		deltaCharWidth := averageCharWidth.ave * 0.3
 		isSpace := false
-		nextWordX := lastEndX + min(deltaSpace, deltaCharWidth)
+		nextWordX := lastEndX + minFloat(deltaSpace, deltaCharWidth)
 		if scanning && t.Text != " " {
 			isSpace = nextWordX < t.OrientedStart.X
 		}
 		common.Log.Trace("t=%s", t)
 		common.Log.Trace("width=%.2f delta=%.2f deltaSpace=%.2g deltaCharWidth=%.2g",
-			t.Width(), min(deltaSpace, deltaCharWidth), deltaSpace, deltaCharWidth)
+			t.Width(), minFloat(deltaSpace, deltaCharWidth), deltaSpace, deltaCharWidth)
 		common.Log.Trace("%+q [%.1f, %.1f] lastEndX=%.2f nextWordX=%.2f (%.2f) isSpace=%t",
 			t.Text, t.OrientedStart.X, t.OrientedStart.Y, lastEndX, nextWordX,
 			nextWordX-t.OrientedStart.X, isSpace)
@ -940,14 +943,14 @@ func min(a, b float64) float64 {
 	return b
 }
-// ExponAve implements an exponential average.
+// exponAve implements an exponential average.
-type ExponAve struct {
+type exponAve struct {
 	ave     float64 // Current average value.
 	running bool    // Has `ave` been set?
 }
 // update updates the exponential average `exp.ave` and returns it
-func (exp *ExponAve) update(x float64) float64 {
+func (exp *exponAve) update(x float64) float64 {
 	if !exp.running {
 		exp.ave = x
 		exp.running = true
@ -957,9 +960,15 @@ func (exp *ExponAve) update(x float64) float64 {
 	return exp.ave
 }
-// printTexts is a debugging function. XXX(peterwilliams97) Remove this.
+const isDebug = false
 // printTexts is a debugging function.
 // TODO(peterwilliams97) Remove this.
 func (tl *TextList) printTexts(message string) {
-	return
+	if !isDebug {
 		return
 	}
 	_, file, line, ok := runtime.Caller(1)
 	if !ok {
 		file = "???"
@ -985,7 +994,7 @@ func (tl *TextList) printTexts(message string) {
 // newLine returns the Line representation of strings `words` with y coordinate `y` and x
 // coordinates `x`.
 func newLine(y float64, x []float64, words []string) Line {
-	dx := []float64{}
+	dx := make([]float64, 0, len(x))
 	for i := 1; i < len(x); i++ {
 		dx = append(dx, x[i]-x[i-1])
 	}
@ -1211,18 +1220,8 @@ type fontEntry struct {
 const maxFontCache = 10
 // getFontDirect returns the font named `name` if it exists in the page's resources or an error if
-// is doesn't.
+// it doesn't. Accesses page resources directly (not cached).
 // This is a direct (uncached) access.
 func (to *textObject) getFontDirect(name string) (*model.PdfFont, error) {
 	// This is a hack for testing.
 	switch name {
 	case "UniDocCourier":
 		return model.NewStandard14FontMustCompile(model.Courier), nil
 	case "UniDocHelvetica":
 		return model.NewStandard14FontMustCompile(model.Helvetica), nil
 	}
 	fontObj, err := to.getFontDict(name)
 	if err != nil {
 		return nil, err
--- a/pdf/extractor/text_test.go
+++ b/pdf/extractor/text_test.go
@ -8,7 +8,6 @@ package extractor
 import (
 	"flag"
 	"os"
 	"os/user"
 	"path/filepath"
 	"regexp"
 	"sort"
@ -20,18 +19,14 @@ import (
 	"golang.org/x/text/unicode/norm"
 )
-// XXX(peterwilliams97) NOTE: We do a best effort at finding the PDF file because we don't keep PDF
+// NOTE: We do a best effort at finding the PDF file because we don't keep PDF test files in this repo so you
-// test files in this repo so you will need to setup `corpusFolders` to point at the corpus directory.
+// will need to setup UNIDOC_EXTRACT_TESTDATA to point at the corpus directory.
 // forceTest should be set to true to force running all tests.
-const forceTest = false
+// NOTE: Setting environment variable UNIDOC_EXTRACT_FORCETEST = 1 sets this to true.
 var forceTest = os.Getenv("UNIDOC_EXTRACT_FORCETEST") == "1"
-// corpusFolders is where we search for test files.
+var corpusFolder = os.Getenv("UNIDOC_EXTRACT_TESTDATA")
 var corpusFolders = []string{
 	"./testdata",
 	"~/testdata",
 	".",
 }
 func init() {
 	common.SetLogger(common.NewConsoleLogger(common.LogLevelError))
@ -40,23 +35,16 @@ func init() {
 	}
 }
-// TestTextExtraction1 tests text extraction on the PDF fragments in `fragmentTests`.
+// TestTextExtractionFragments tests text extraction on the PDF fragments in `fragmentTests`.
-func TestTextExtraction1(t *testing.T) {
+func TestTextExtractionFragments(t *testing.T) {
-	for _, f := range fragmentTests {
+	fragmentTests := []struct {
-		f.testExtraction(t)
+		name     string
-	}
+		contents string
-}
+		text     string
-
+	}{
-type fragment struct {
+		{
-	name     string
+			name: "portrait",
-	contents string
+			contents: `
 	text     string
 }
 var fragmentTests = []fragment{
 	{name: "portrait",
 		contents: `
        BT
        /UniDocCourier 24 Tf
        (Hello World!)Tj
@ -64,10 +52,11 @@ var fragmentTests = []fragment{
        (Doink)Tj
        ET
        `,
-		text: "Hello World!\nDoink",
+			text: "Hello World!\nDoink",
-	},
+		},
-	{name: "landscape",
+		{
-		contents: `
+			name: "landscape",
 			contents: `
        BT
        /UniDocCourier 24 Tf
        0 1 -1 0 0 0 Tm
@ -76,10 +65,11 @@ var fragmentTests = []fragment{
        (Doink)Tj
        ET
        `,
-		text: "Hello World!\nDoink",
+			text: "Hello World!\nDoink",
-	},
+		},
-	{name: "180 degree rotation",
+		{
-		contents: `
+			name: "180 degree rotation",
 			contents: `
        BT
        /UniDocCourier 24 Tf
        -1 0 0 -1 0 0 Tm
@ -88,10 +78,11 @@ var fragmentTests = []fragment{
        (Doink)Tj
        ET
        `,
-		text: "Hello World!\nDoink",
+			text: "Hello World!\nDoink",
-	},
+		},
-	{name: "Helvetica",
+		{
-		contents: `
+			name: "Helvetica",
 			contents: `
        BT
        /UniDocHelvetica 24 Tf
        0 -1 1 0 0 0 Tm
@ -100,35 +91,53 @@ var fragmentTests = []fragment{
        (Doink)Tj
        ET
        `,
-		text: "Hello World!\nDoink",
+			text: "Hello World!\nDoink",
-	},
+		},
 }
 // testExtraction checks that ExtractText() works on fragment `f`.
 func (f fragment) testExtraction(t *testing.T) {
 	e := Extractor{contents: f.contents}
 	text, err := e.ExtractText()
 	if err != nil {
 		t.Fatalf("Error extracting text: %q err=%v", f.name, err)
 		return
 	}
-	if text != f.text {
+
-		t.Fatalf("Text mismatch: %q Got %q. Expected %q", f.name, text, f.text)
+	// Setup mock resources.
-		return
+	resources := model.NewPdfPageResources()
 	{
 		courier := model.NewStandard14FontMustCompile(model.Courier)
 		helvetica := model.NewStandard14FontMustCompile(model.Helvetica)
 		resources.SetFontByName("UniDocHelvetica", helvetica.ToPdfObject())
 		resources.SetFontByName("UniDocCourier", courier.ToPdfObject())
 	}
 	for _, f := range fragmentTests {
 		t.Run(f.name, func(t *testing.T) {
 			e := Extractor{resources: resources, contents: f.contents}
 			text, err := e.ExtractText()
 			if err != nil {
 				t.Fatalf("Error extracting text: %q err=%v", f.name, err)
 				return
 			}
 			if text != f.text {
 				t.Fatalf("Text mismatch: %q Got %q. Expected %q", f.name, text, f.text)
 				return
 			}
 		})
 	}
 }
-// TestTextExtraction2 tests text extraction on set of PDF files.
+// TestTextExtractionFiles tests text extraction on a set of PDF files.
 // It checks for the existence of specified strings of words on specified pages.
 // We currently only check within lines as our line order is still improving.
-func TestTextExtraction2(t *testing.T) {
+func TestTextExtractionFiles(t *testing.T) {
-	for _, test := range extract2Tests {
+	if len(corpusFolder) == 0 && !forceTest {
-		testExtract2(t, test.filename, test.expectedPageText)
+		t.Log("Corpus folder not set - skipping")
 		return
 	}
 	for _, test := range fileExtractionTests {
 		t.Run(test.filename, func(t *testing.T) {
 			testExtractFile(t, test.filename, test.expectedPageText)
 		})
 	}
 }
-// extract2Tests are the PDFs and texts we are looking for on specified pages.
+// fileExtractionTests are the PDFs and texts we are looking for on specified pages.
-var extract2Tests = []struct {
+var fileExtractionTests = []struct {
 	filename         string
 	expectedPageText map[int][]string
 }{
@ -216,21 +225,27 @@ var extract2Tests = []struct {
 	},
 }
-// testExtract2 tests the ExtractText2 text extractor on `filename` and compares the extracted
+// testExtractFile tests the ExtractTextWithStats text extractor on `filename` and compares the extracted
 // text to `expectedPageText`.
-// XXX(peterwilliams97) NOTE: We do a best effort at finding the PDF file because we don't keep PDF
+//
-// test files in this repo so you will need to setup `corpusFolders` to point at the corpus directory.
+// NOTE: We do a best effort at finding the PDF file because we don't keep PDF test files in this repo
-// If `filename` cannot be found in `corpusFolders` then the test is skipped.
+// so you will need to set the environment variable UNIDOC_EXTRACT_TESTDATA to point at
-func testExtract2(t *testing.T, filename string, expectedPageText map[int][]string) {
+// the corpus directory.
-	homeDir, hasHome := getHomeDir()
+//
-	path, ok := searchDirectories(homeDir, hasHome, corpusFolders, filename)
+// If `filename` cannot be found in `corpusFolder` then the test is skipped unless `forceTest` global
-	if !ok {
+// variable is true (e.g. setting environment variable UNIDOC_EXTRACT_FORCETEST = 1).
 func testExtractFile(t *testing.T, filename string, expectedPageText map[int][]string) {
 	filepath := filepath.Join(corpusFolder, filename)
 	exists := checkFileExists(filepath)
 	if !exists {
 		if forceTest {
 			t.Fatalf("filename=%q does not exist", filename)
 		}
 		t.Logf("%s not found", filename)
 		return
 	}
-	_, actualPageText := extractPageTexts(t, path)
+
 	_, actualPageText := extractPageTexts(t, filepath)
 	for _, pageNum := range sortedKeys(expectedPageText) {
 		expectedSentences, ok := expectedPageText[pageNum]
 		actualText, ok := actualPageText[pageNum]
@ -239,12 +254,12 @@ func testExtract2(t *testing.T, filename string, expectedPageText map[int][]stri
 		}
 		actualText = norm.NFKC.String(actualText)
 		if !containsSentences(t, expectedSentences, actualText) {
-			t.Fatalf("Text mismatch filename=%q page=%d", path, pageNum)
+			t.Fatalf("Text mismatch filepath=%q page=%d", filepath, pageNum)
 		}
 	}
 }
-// extractPageTexts runs ExtractText2 on all pages in PDF `filename` and returns the result as a map
+// extractPageTexts runs ExtractTextWithStats on all pages in PDF `filename` and returns the result as a map
 // {page number: page text}
 func extractPageTexts(t *testing.T, filename string) (int, map[int]string) {
 	f, err := os.Open(filename)
@ -272,11 +287,11 @@ func extractPageTexts(t *testing.T, filename string) (int, map[int]string) {
 		if err != nil {
 			t.Fatalf("extractor.New failed. filename=%q page=%d err=%v", filename, pageNum, err)
 		}
-		text, _, _, err := ex.ExtractText2()
+		text, _, _, err := ex.ExtractTextWithStats()
 		if err != nil {
-			t.Fatalf("ExtractText2 failed. filename=%q page=%d err=%v", filename, pageNum, err)
+			t.Fatalf("ExtractTextWithStats failed. filename=%q page=%d err=%v", filename, pageNum, err)
 		}
-		// XXX(peterwilliams97)TODO: Improve text extraction space insertion so we don't need reduceSpaces.
+		// TODO(peterwilliams97): Improve text extraction space insertion so we don't need reduceSpaces.
 		pageText[pageNum] = reduceSpaces(text)
 	}
 	return numPages, pageText
@ -303,30 +318,10 @@ func reduceSpaces(text string) string {
 var reSpace = regexp.MustCompile(`(?m)\s+`)
-// searchDirectories searches `directories` for `filename` and returns the full file path if it is
+// checkFileExists returns true if `filepath` exists.
-// found. `homeDir` and `hasHome` are used for home directory substitution.
+func checkFileExists(filepath string) bool {
-func searchDirectories(homeDir string, hasHome bool, directories []string, filename string) (string, bool) {
+	_, err := os.Stat(filepath)
-	for _, direct := range directories {
+	return err == nil
 		if hasHome {
 			direct = strings.Replace(direct, "~", homeDir, 1)
 		}
 		path := filepath.Join(direct, filename)
 		if _, err := os.Stat(path); err == nil {
 			return path, true
 		}
 	}
 	return "", false
 }
 // getHomeDir returns the current user's home directory if it is defined and a bool to tell if it
 // is defined.
 func getHomeDir() (string, bool) {
 	usr, err := user.Current()
 	if err != nil {
 		common.Log.Error("No current user. err=%v", err)
 		return "", false
 	}
 	return usr.HomeDir, true
 }
 // sortedKeys returns the keys of `m` as a sorted slice.
--- a/pdf/model/font.go
+++ b/pdf/model/font.go
@ -18,26 +18,30 @@ import (
 	"github.com/unidoc/unidoc/pdf/model/fonts"
 )
 // Font represents a font which is a series of glyphs. Character codes from PDF strings can be
 // mapped to and from glyphs. Each glyph has metrics.
 // XXX: FIXME (peterwilliams97) HACK to add GetCharMetrics() for fonts other than standard 14
 //      Remove this hack.
 type Font interface {
 	Encoder() textencoding.TextEncoder
 	SetEncoder(encoder textencoding.TextEncoder)
 	GetGlyphCharMetrics(glyph string) (fonts.CharMetrics, bool)
 	GetCharMetrics(code uint16) (fonts.CharMetrics, bool)
 	GetAverageCharWidth() float64 // XXX(peterwilliams97) Not used. Remove.
 	ToPdfObject() core.PdfObject
 }
 // PdfFont represents an underlying font structure which can be of type:
 // - Type0
 // - Type1
 // - TrueType
 // etc.
 type PdfFont struct {
-	context Font // The underlying font: Type0, Type1, Truetype, etc..
+	context fonts.Font // The underlying font: Type0, Type1, Truetype, etc..
 }
 // getCharCodeMetrics is a handy function for getting character metrics given a charcode.
 func (font PdfFont) getCharCodeMetrics(code uint16) (fonts.CharMetrics, bool) {
 	var nometrics fonts.CharMetrics
 	enc := font.Encoder()
 	if enc == nil {
 		return nometrics, false
 	}
 	glyph, found := enc.CharcodeToGlyph(code)
 	if !found {
 		return nometrics, false
 	}
 	return font.GetGlyphCharMetrics(glyph)
 }
 // GetFontDescriptor returns the font descriptor for `font`.
@ -516,18 +520,7 @@ func (font PdfFont) GetGlyphCharMetrics(glyph string) (fonts.CharMetrics, bool)
 // GetCharMetrics returns the char metrics for character code `code`.
 func (font PdfFont) GetCharMetrics(code uint16) (fonts.CharMetrics, bool) {
-	t := font.actualFont()
+	return font.getCharCodeMetrics(code)
 	if t == nil {
 		common.Log.Debug("ERROR: GetCharMetrics Not implemented for font type=%#T", font.context)
 		return fonts.CharMetrics{}, false
 	}
 	if m, ok := t.GetCharMetrics(code); ok {
 		return m, ok
 	}
 	if descriptor, err := font.GetFontDescriptor(); err == nil && descriptor != nil {
 		return fonts.CharMetrics{Wx: descriptor.missingWidth}, true
 	}
 	return fonts.CharMetrics{}, false
 }
 // GetRuneCharMetrics returns the char metrics for rune `r`.
@ -550,18 +543,9 @@ func (font PdfFont) GetRuneCharMetrics(r rune) (fonts.CharMetrics, error) {
 	return m, nil
 }
 // GetAverageCharWidth returns the average width of all the characters in `font`.
 func (font PdfFont) GetAverageCharWidth() float64 {
 	t := font.actualFont()
 	if t == nil {
 		common.Log.Debug("ERROR: GetAverageCharWidth Not implemented for font type=%#T", font.context)
 		return 0.0
 	}
 	return t.GetAverageCharWidth()
 }
 // actualFont returns the Font in font.context
-func (font PdfFont) actualFont() Font {
+// NOTE(gunnsth): Actually this only sanity checks the font.context as the returned font will be wrapped in an interface.
 func (font PdfFont) actualFont() fonts.Font {
 	if font.context == nil {
 		common.Log.Debug("ERROR: actualFont. context is nil. font=%s", font)
 	}
--- a/pdf/model/font_composite.go
+++ b/pdf/model/font_composite.go
@ -131,15 +131,6 @@ func (font pdfFontType0) GetCharMetrics(code uint16) (fonts.CharMetrics, bool) {
 	return font.DescendantFont.GetCharMetrics(code)
 }
 // GetAverageCharWidth returns the average width of all the characters in `font`.
 func (font pdfFontType0) GetAverageCharWidth() float64 {
 	if font.DescendantFont == nil {
 		common.Log.Debug("ERROR: No descendant. font=%s", font)
 		return 0.0
 	}
 	return font.DescendantFont.GetAverageCharWidth()
 }
 // Encoder returns the font's text encoder.
 func (font pdfFontType0) Encoder() textencoding.TextEncoder {
 	return font.encoder
@ -253,11 +244,6 @@ func (font pdfCIDFontType0) GetCharMetrics(code uint16) (fonts.CharMetrics, bool
 	return fonts.CharMetrics{}, true
 }
 // GetAverageCharWidth always reports 0.0 for a pdfCIDFontType0; no widths are consulted.
 func (font pdfCIDFontType0) GetAverageCharWidth() float64 {
 	const noWidth = 0.0
 	return noWidth
 }
 // ToPdfObject converts the pdfCIDFontType0 to a PDF representation.
 func (font *pdfCIDFontType0) ToPdfObject() core.PdfObject {
 	return core.MakeNull()
@ -378,18 +364,6 @@ func (font pdfCIDFontType2) GetCharMetrics(code uint16) (fonts.CharMetrics, bool
 	return fonts.CharMetrics{Wx: float64(w)}, true
 }
 // GetAverageCharWidth returns the mean of the widths stored in font.runeToWidthMap,
 // or 0.0 when that map has no entries.
 func (font pdfCIDFontType2) GetAverageCharWidth() float64 {
 	count := len(font.runeToWidthMap)
 	if count == 0 {
 		return 0.0
 	}
 	sum := 0
 	for _, width := range font.runeToWidthMap {
 		sum += width
 	}
 	return float64(sum) / float64(count)
 }
 // ToPdfObject converts the pdfCIDFontType2 to a PDF representation.
 func (font *pdfCIDFontType2) ToPdfObject() core.PdfObject {
 	if font.container == nil {
--- a/pdf/model/font_simple.go
+++ b/pdf/model/font_simple.go
@ -149,18 +149,6 @@ func (font pdfFontSimple) GetCharMetrics(code uint16) (fonts.CharMetrics, bool)
 	return fonts.CharMetrics{}, false
 }
 // GetAverageCharWidth returns the average width of all the characters in `font`.
 // When font.fontMetrics is set, the result comes from fonts.AverageCharWidth over those metrics.
 // Otherwise it is the mean of font.charWidths, or 0.0 when there are no widths.
 func (font pdfFontSimple) GetAverageCharWidth() float64 {
 	if font.fontMetrics != nil {
 		return fonts.AverageCharWidth(font.fontMetrics)
 	}
 	// Guard the division: 0.0/0 would otherwise produce NaN for a font without widths.
 	if len(font.charWidths) == 0 {
 		return 0.0
 	}
 	total := 0.0
 	for _, w := range font.charWidths {
 		total += w
 	}
 	return total / float64(len(font.charWidths))
 }
 // newSimpleFontFromPdfObject creates a pdfFontSimple from dictionary `d`. Elements of `d` that
 // are already parsed are contained in `base`.
 // Standard 14 fonts need to specify their builtin encoders in the `std14Encoder` parameter.
--- a/pdf/model/fonts/courier.go
+++ b/pdf/model/fonts/courier.go
@ -47,11 +47,6 @@ func (font FontCourier) GetGlyphCharMetrics(glyph string) (CharMetrics, bool) {
 	return metrics, true
 }
 // GetAverageCharWidth returns the result of AverageCharWidth over the CourierCharMetrics table.
 func (font FontCourier) GetAverageCharWidth() float64 {
 	avg := AverageCharWidth(CourierCharMetrics)
 	return avg
 }
 // ToPdfObject returns a primitive PDF object representation of the font.
 func (font FontCourier) ToPdfObject() core.PdfObject {
 	fontDict := core.MakeDict()
--- a/pdf/model/fonts/courier_bold.go
+++ b/pdf/model/fonts/courier_bold.go
@ -47,11 +47,6 @@ func (font FontCourierBold) GetGlyphCharMetrics(glyph string) (CharMetrics, bool
 	return metrics, true
 }
 // GetAverageCharWidth returns the result of AverageCharWidth over the CourierBoldCharMetrics table.
 func (font FontCourierBold) GetAverageCharWidth() float64 {
 	avg := AverageCharWidth(CourierBoldCharMetrics)
 	return avg
 }
 // ToPdfObject returns a primitive PDF object representation of the font.
 func (font FontCourierBold) ToPdfObject() core.PdfObject {
 	fontDict := core.MakeDict()
--- a/pdf/model/fonts/courier_bold_oblique.go
+++ b/pdf/model/fonts/courier_bold_oblique.go
@ -48,11 +48,6 @@ func (font FontCourierBoldOblique) GetGlyphCharMetrics(glyph string) (CharMetric
 	return metrics, true
 }
 // GetAverageCharWidth returns the result of AverageCharWidth over the
 // CourierBoldObliqueCharMetrics table.
 func (font FontCourierBoldOblique) GetAverageCharWidth() float64 {
 	avg := AverageCharWidth(CourierBoldObliqueCharMetrics)
 	return avg
 }
 // ToPdfObject returns a primitive PDF object representation of the font.
 func (font FontCourierBoldOblique) ToPdfObject() core.PdfObject {
 	fontDict := core.MakeDict()
--- a/pdf/model/fonts/courier_oblique.go
+++ b/pdf/model/fonts/courier_oblique.go
@ -47,11 +47,6 @@ func (font FontCourierOblique) GetGlyphCharMetrics(glyph string) (CharMetrics, b
 	return metrics, true
 }
 // GetAverageCharWidth returns the result of AverageCharWidth over the CourierObliqueCharMetrics table.
 func (font FontCourierOblique) GetAverageCharWidth() float64 {
 	avg := AverageCharWidth(CourierObliqueCharMetrics)
 	return avg
 }
 // ToPdfObject returns a primitive PDF object representation of the font.
 func (font FontCourierOblique) ToPdfObject() core.PdfObject {
 	fontDict := core.MakeDict()
--- a/pdf/model/fonts/font.go
+++ b/pdf/model/fonts/font.go
@ -18,7 +18,6 @@ type Font interface {
 	Encoder() textencoding.TextEncoder
 	SetEncoder(encoder textencoding.TextEncoder)
 	GetGlyphCharMetrics(glyph string) (CharMetrics, bool)
 	GetAverageCharWidth() float64 // XXX(peterwilliams97) Not used. Remove.
 	ToPdfObject() core.PdfObject
 }
@ -32,11 +31,3 @@ type CharMetrics struct {
 // String returns a string describing `m` in the form <"GlyphName",Wx,Wy>, with the glyph name
 // quoted and both widths printed to one decimal place.
 func (m CharMetrics) String() string {
 	name, wx, wy := m.GlyphName, m.Wx, m.Wy
 	return fmt.Sprintf("<%q,%.1f,%.1f>", name, wx, wy)
 }
 // AverageCharWidth returns the mean Wx of the entries in `metrics`.
 // It returns 0.0 for a nil or empty map instead of dividing by zero, which would yield NaN.
 func AverageCharWidth(metrics map[string]CharMetrics) float64 {
 	if len(metrics) == 0 {
 		return 0.0
 	}
 	total := 0.0
 	for _, m := range metrics {
 		total += m.Wx
 	}
 	return total / float64(len(metrics))
 }
--- a/pdf/model/fonts/helvetica.go
+++ b/pdf/model/fonts/helvetica.go
@ -47,11 +47,6 @@ func (font FontHelvetica) GetGlyphCharMetrics(glyph string) (CharMetrics, bool)
 	return metrics, true
 }
 // GetAverageCharWidth returns the result of AverageCharWidth over the HelveticaCharMetrics table.
 func (font FontHelvetica) GetAverageCharWidth() float64 {
 	avg := AverageCharWidth(HelveticaCharMetrics)
 	return avg
 }
 // ToPdfObject returns a primitive PDF object representation of the font.
 func (font FontHelvetica) ToPdfObject() core.PdfObject {
 	fontDict := core.MakeDict()
--- a/pdf/model/fonts/helvetica_bold.go
+++ b/pdf/model/fonts/helvetica_bold.go
@ -48,11 +48,6 @@ func (font FontHelveticaBold) GetGlyphCharMetrics(glyph string) (CharMetrics, bo
 	return metrics, true
 }
 // GetAverageCharWidth returns the result of AverageCharWidth over the HelveticaBoldCharMetrics table.
 func (font FontHelveticaBold) GetAverageCharWidth() float64 {
 	avg := AverageCharWidth(HelveticaBoldCharMetrics)
 	return avg
 }
 // ToPdfObject returns a primitive PDF object representation of the font.
 func (font FontHelveticaBold) ToPdfObject() core.PdfObject {
 	fontDict := core.MakeDict()
--- a/pdf/model/fonts/helvetica_bold_oblique.go
+++ b/pdf/model/fonts/helvetica_bold_oblique.go
@ -47,11 +47,6 @@ func (font FontHelveticaBoldOblique) GetGlyphCharMetrics(glyph string) (CharMetr
 	return metrics, true
 }
 // GetAverageCharWidth returns the average width of all glyphs in the font, computed by
 // AverageCharWidth over the Helvetica-BoldOblique metrics table.
 func (font FontHelveticaBoldOblique) GetAverageCharWidth() float64 {
 	// Use this font's own table; the non-bold HelveticaObliqueCharMetrics table belongs to
 	// FontHelveticaOblique.
 	return AverageCharWidth(HelveticaBoldObliqueCharMetrics)
 }
 // ToPdfObject returns a primitive PDF object representation of the font.
 func (font FontHelveticaBoldOblique) ToPdfObject() core.PdfObject {
 	fontDict := core.MakeDict()
--- a/pdf/model/fonts/helvetica_oblique.go
+++ b/pdf/model/fonts/helvetica_oblique.go
@ -47,11 +47,6 @@ func (font FontHelveticaOblique) GetGlyphCharMetrics(glyph string) (CharMetrics,
 	return metrics, true
 }
 // GetAverageCharWidth returns the result of AverageCharWidth over the
 // HelveticaObliqueCharMetrics table.
 func (font FontHelveticaOblique) GetAverageCharWidth() float64 {
 	avg := AverageCharWidth(HelveticaObliqueCharMetrics)
 	return avg
 }
 // ToPdfObject returns a primitive PDF object representation of the font.
 func (font FontHelveticaOblique) ToPdfObject() core.PdfObject {
 	fontDict := core.MakeDict()
--- a/pdf/model/fonts/symbol.go
+++ b/pdf/model/fonts/symbol.go
@ -48,11 +48,6 @@ func (font FontSymbol) GetGlyphCharMetrics(glyph string) (CharMetrics, bool) {
 	return metrics, true
 }
 // GetAverageCharWidth returns the result of AverageCharWidth over the SymbolCharMetrics table.
 func (font FontSymbol) GetAverageCharWidth() float64 {
 	avg := AverageCharWidth(SymbolCharMetrics)
 	return avg
 }
 // ToPdfObject returns a primitive PDF object representation of the font.
 func (font FontSymbol) ToPdfObject() core.PdfObject {
 	fontDict := core.MakeDict()
--- a/pdf/model/fonts/times_bold.go
+++ b/pdf/model/fonts/times_bold.go
@ -47,11 +47,6 @@ func (font FontTimesBold) GetGlyphCharMetrics(glyph string) (CharMetrics, bool)
 	return metrics, true
 }
 // GetAverageCharWidth returns the result of AverageCharWidth over the TimesBoldCharMetrics table.
 func (font FontTimesBold) GetAverageCharWidth() float64 {
 	avg := AverageCharWidth(TimesBoldCharMetrics)
 	return avg
 }
 // ToPdfObject returns a primitive PDF object representation of the font.
 func (font FontTimesBold) ToPdfObject() core.PdfObject {
 	fontDict := core.MakeDict()
--- a/pdf/model/fonts/times_bold_italic.go
+++ b/pdf/model/fonts/times_bold_italic.go
@ -47,11 +47,6 @@ func (font FontTimesBoldItalic) GetGlyphCharMetrics(glyph string) (CharMetrics,
 	return metrics, true
 }
 // GetAverageCharWidth returns the result of AverageCharWidth over the
 // TimesBoldItalicCharMetrics table.
 func (font FontTimesBoldItalic) GetAverageCharWidth() float64 {
 	avg := AverageCharWidth(TimesBoldItalicCharMetrics)
 	return avg
 }
 // ToPdfObject returns a primitive PDF object representation of the font.
 func (font FontTimesBoldItalic) ToPdfObject() core.PdfObject {
 	fontDict := core.MakeDict()
--- a/pdf/model/fonts/times_italic.go
+++ b/pdf/model/fonts/times_italic.go
@ -47,11 +47,6 @@ func (font FontTimesItalic) GetGlyphCharMetrics(glyph string) (CharMetrics, bool
 	return metrics, true
 }
 // GetAverageCharWidth returns the result of AverageCharWidth over the TimesItalicCharMetrics table.
 func (font FontTimesItalic) GetAverageCharWidth() float64 {
 	avg := AverageCharWidth(TimesItalicCharMetrics)
 	return avg
 }
 // ToPdfObject returns a primitive PDF object representation of the font.
 func (font FontTimesItalic) ToPdfObject() core.PdfObject {
 	fontDict := core.MakeDict()
--- a/pdf/model/fonts/times_roman.go
+++ b/pdf/model/fonts/times_roman.go
@ -47,11 +47,6 @@ func (font FontTimesRoman) GetGlyphCharMetrics(glyph string) (CharMetrics, bool)
 	return metrics, true
 }
 // GetAverageCharWidth returns the result of AverageCharWidth over the TimesRomanCharMetrics table.
 func (font FontTimesRoman) GetAverageCharWidth() float64 {
 	avg := AverageCharWidth(TimesRomanCharMetrics)
 	return avg
 }
 // ToPdfObject returns a primitive PDF object representation of the font.
 func (font FontTimesRoman) ToPdfObject() core.PdfObject {
 	fontDict := core.MakeDict()
--- a/pdf/model/fonts/zapfdingbats.go
+++ b/pdf/model/fonts/zapfdingbats.go
@ -48,11 +48,6 @@ func (font FontZapfDingbats) GetGlyphCharMetrics(glyph string) (CharMetrics, boo
 	return metrics, true
 }
 // GetAverageCharWidth returns the result of AverageCharWidth over the ZapfDingbatsCharMetrics table.
 func (font FontZapfDingbats) GetAverageCharWidth() float64 {
 	avg := AverageCharWidth(ZapfDingbatsCharMetrics)
 	return avg
 }
 // ToPdfObject returns a primitive PDF object representation of the font.
 func (font FontZapfDingbats) ToPdfObject() core.PdfObject {
 	fontDict := core.MakeDict()
--- a/pdf/model/structures.go
+++ b/pdf/model/structures.go
@ -11,6 +11,7 @@ package model
 import (
 	"errors"
 	"fmt"
 	"math"
 	"regexp"
 	"strconv"
@ -58,6 +59,16 @@ func NewPdfRectangle(arr PdfObjectArray) (*PdfRectangle, error) {
 	return &rect, nil
 }
 // Height returns the absolute difference between the Ury and Lly coordinates of `rect`.
 func (rect *PdfRectangle) Height() float64 {
 	span := rect.Ury - rect.Lly
 	return math.Abs(span)
 }
 // Width returns the absolute difference between the Urx and Llx coordinates of `rect`.
 func (rect *PdfRectangle) Width() float64 {
 	span := rect.Urx - rect.Llx
 	return math.Abs(span)
 }
 // Convert to a PDF object.
 func (rect *PdfRectangle) ToPdfObject() PdfObject {
 	arr := MakeArray(MakeFloat(rect.Llx), MakeFloat(rect.Lly), MakeFloat(rect.Urx), MakeFloat(rect.Ury))