Merge pull request #5 from unidoc/v3-peterwilliams97-extract.text

Cleaning up v3 extract.text
This commit is contained in:
Peter Williams 2018-11-29 18:03:50 +11:00 committed by GitHub
commit 1cea79b8ef
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
24 changed files with 334 additions and 443 deletions

157
pdf/contentstream/matrix.go Normal file
View File

@ -0,0 +1,157 @@
/*
* This file is subject to the terms and conditions defined in
* file 'LICENSE.md', which is part of this source code package.
*/
package contentstream
import (
"fmt"
"math"
"github.com/unidoc/unidoc/common"
)
// Matrix is a linear transform matrix in homogeneous coordinates, stored row by row as the 9
// elements of a 3 x 3 matrix.
// PDF coordinate transforms are always affine so we only need 6 of these. See NewMatrix.
type Matrix [9]float64
// IdentityMatrix returns the identity transform, i.e. the affine matrix with a = d = 1 and
// b = c = tx = ty = 0.
func IdentityMatrix() Matrix {
return NewMatrix(1, 0, 0, 1, 0, 0)
}
// TranslationMatrix returns a matrix that translates by `tx`, `ty` and applies no scaling or
// rotation (a = d = 1, b = c = 0).
func TranslationMatrix(tx, ty float64) Matrix {
return NewMatrix(1, 0, 0, 1, tx, ty)
}
// NewMatrix returns the affine transform matrix with coefficients a,b,c,d,tx,ty. The 9 elements are
// stored row by row in homogeneous coordinates as
//      a  b 0
//      c  d 0
//     tx ty 1
// The elements are passed through fixup before the matrix is returned.
func NewMatrix(a, b, c, d, tx, ty float64) Matrix {
	var m Matrix
	m[0], m[1] = a, b
	m[3], m[4] = c, d
	m[6], m[7], m[8] = tx, ty, 1
	m.fixup()
	return m
}
// String returns a string describing `m`. Only the six affine coefficients are shown, in the order
// a,b,c,d:tx,ty, each with 4 decimal places.
func (m Matrix) String() string {
	return fmt.Sprintf("[%.4f,%.4f,%.4f,%.4f:%.4f,%.4f]",
		m[0], m[1], m[3], m[4], m[6], m[7])
}
// Set sets `m` to affine transform a,b,c,d,tx,ty.
// All 9 elements are written, including the constant homogeneous column (0, 0, 1), so the result
// is a valid affine matrix even when `m` was a zero-value Matrix before the call. The elements are
// then passed through fixup.
func (m *Matrix) Set(a, b, c, d, tx, ty float64) {
	m[0], m[1], m[2] = a, b, 0
	m[3], m[4], m[5] = c, d, 0
	m[6], m[7], m[8] = tx, ty, 1
	m.fixup()
}
// Concat sets `m` to `m` × `b`.
// `b` needs to be created by NewMatrix. i.e. It must be an affine transform.
//    m00 m01 0     b00 b01 0     m00*b00 + m01*b10        m00*b01 + m01*b11        0
//    m10 m11 0  ×  b10 b11 0  =  m10*b00 + m11*b10        m10*b01 + m11*b11        0
//    m20 m21 1     b20 b21 1     m20*b00 + m21*b10 + b20  m20*b01 + m21*b11 + b21  1
// The last column of the product is written as the constants 0, 0, 1 rather than computed, and the
// result is passed through fixup.
func (m *Matrix) Concat(b Matrix) {
// The composite literal is evaluated entirely from the old value of `m` before `*m` is assigned.
*m = Matrix{
m[0]*b[0] + m[1]*b[3], m[0]*b[1] + m[1]*b[4], 0,
m[3]*b[0] + m[4]*b[3], m[3]*b[1] + m[4]*b[4], 0,
m[6]*b[0] + m[7]*b[3] + b[6], m[6]*b[1] + m[7]*b[4] + b[7], 1,
}
m.fixup()
}
// Mult returns `m` × `b` without modifying `m`: the receiver is passed by value, so the product is
// built in a copy.
func (m Matrix) Mult(b Matrix) Matrix {
	product := m
	product.Concat(b)
	return product
}
// Translate appends a translation of `dx`,`dy` to `m` by adding them to the translation elements
// of `m`, then passes the result through fixup.
// m.Translate(dx, dy) is equivalent to m.Concat(NewMatrix(1, 0, 0, 1, dx, dy))
func (m *Matrix) Translate(dx, dy float64) {
	tx, ty := m[6]+dx, m[7]+dy
	m[6], m[7] = tx, ty
	m.fixup()
}
// Translation returns the translation part (tx, ty) of `m`.
func (m *Matrix) Translation() (float64, float64) {
	tx, ty := m[6], m[7]
	return tx, ty
}
// ScalingX returns the X scaling of `m`, computed as the Euclidean length of its first row (a, b)
// using math.Hypot.
func (m *Matrix) ScalingX() float64 {
	a, b := m[0], m[1]
	return math.Hypot(a, b)
}
// Transform returns coordinates `x`,`y` transformed by `m`:
//     x' = x*m[0] + y*m[1] + m[6]
//     y' = x*m[3] + y*m[4] + m[7]
// NOTE(review): this pairs `x`,`y` with the elements of one row (a,b and c,d), whereas the product
// in Concat follows the row-vector convention, under which x' would be x*a + y*c + tx. Confirm
// which convention callers expect before changing either.
func (m *Matrix) Transform(x, y float64) (float64, float64) {
	return x*m[0] + y*m[1] + m[6], x*m[3] + y*m[4] + m[7]
}
// ScalingFactorX returns the X scaling of the affine transform, i.e. the Euclidean length of the
// first row (a, b) of `m`.
func (m *Matrix) ScalingFactorX() float64 {
	a, b := m[0], m[1]
	return math.Sqrt(a*a + b*b)
}
// ScalingFactorY returns the Y scaling of the affine transform, i.e. the Euclidean length of the
// second row (c, d) of `m`.
func (m *Matrix) ScalingFactorY() float64 {
	c, d := m[3], m[4]
	return math.Sqrt(c*c + d*d)
}
// Angle returns the angle of the affine transform in degrees: 0, 90, 180 or 270.
// For simplicity, we assume the transform is a multiple of 90 degrees and classify it by the signs
// of its elements only. We are returning θ for
//     a b     cos θ  -sin θ
//     c d  =  sin θ   cos θ
// The cases are tried in the order 0, 90, 180, 270 and the first match wins. If none matches, the
// problem is logged at debug level and 0 is returned.
func (m *Matrix) Angle() int {
	a, b, c, d := m[0], m[1], m[3], m[4]
	switch {
	case a > 0 && d > 0:
		//  1  0
		//  0  1
		return 0
	case b < 0 && c > 0:
		//  0 -1
		//  1  0
		return 90
	case a < 0 && d < 0:
		// -1  0
		//  0 -1
		return 180
	case b > 0 && c < 0:
		//  0  1
		// -1  0
		return 270
	}
	common.Log.Debug("ERROR: Angle not a multiple of 90°. m=%s", m)
	return 0
}
// fixup forces `m` to have reasonable values. It is a guard against crazy values in corrupt PDF
// files.
// Currently it clamps every element to [-maxAbsNumber, maxAbsNumber] to avoid floating point
// exceptions, and reports each clamped element at debug log level. Elements already inside the
// range are left untouched. NaN elements fail both comparisons and are also left unchanged.
func (m *Matrix) fixup() {
	for i, x := range m {
		clamped := x
		switch {
		case x > maxAbsNumber:
			clamped = maxAbsNumber
		case x < -maxAbsNumber:
			clamped = -maxAbsNumber
		default:
			continue
		}
		// Both values are float64, so they need a floating point verb; %d would print
		// "%!d(float64=...)".
		common.Log.Debug("FIXUP: %g -> %g", x, clamped)
		m[i] = clamped
	}
}
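// maxAbsNumber is the largest absolute value that fixup allows for a Matrix element. It is meant
// to cover the largest numbers needed in PDF transforms, but the value has not been verified.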
// TODO(gunnsth): Practical value? Need some reasoning.
const maxAbsNumber = 1e9

View File

@ -7,8 +7,6 @@ package contentstream
import (
"errors"
"fmt"
"math"
"github.com/unidoc/unidoc/common"
"github.com/unidoc/unidoc/pdf/core"
@ -62,21 +60,26 @@ type HandlerEntry struct {
Handler HandlerFunc
}
// HandlerConditionEnum represents the type of operand content stream processor.
// HandlerConditionEnumOperand handler handles a single operand, whereas
// HandlerConditionEnumAllOperands processes all operands.
type HandlerConditionEnum int
func (csp HandlerConditionEnum) All() bool {
return csp == HandlerConditionEnumAllOperands
}
func (csp HandlerConditionEnum) Operand() bool {
return csp == HandlerConditionEnumOperand
}
const (
HandlerConditionEnumOperand HandlerConditionEnum = iota
HandlerConditionEnumAllOperands HandlerConditionEnum = iota
)
// All returns true if `hce` is equivalent to HandlerConditionEnumAllOperands.
func (hce HandlerConditionEnum) All() bool {
return hce == HandlerConditionEnumAllOperands
}
// Operand returns true if `hce` is equivalent to HandlerConditionEnumOperand.
func (hce HandlerConditionEnum) Operand() bool {
return hce == HandlerConditionEnumOperand
}
func NewContentStreamProcessor(ops []*ContentStreamOperation) *ContentStreamProcessor {
csp := ContentStreamProcessor{}
csp.graphicsStack = GraphicStateStack{}
@ -573,144 +576,3 @@ func (proc *ContentStreamProcessor) handleCommand_cm(op *ContentStreamOperation,
return nil
}
// Matrix is a linear transform matrix in homogenous coordinates.
// PDF coordinate transforms are always affine so we only need 6 of these. See newMatrix.
type Matrix [9]float64
// IdentityMatrix returns the identity transform.
func IdentityMatrix() Matrix {
return NewMatrix(1, 0, 0, 1, 0, 0)
}
// TranslationMatrix returns a matrix that translates by `tx`, `ty`.
func TranslationMatrix(tx, ty float64) Matrix {
return NewMatrix(1, 0, 0, 1, tx, ty)
}
// NewMatrix returns an affine transform matrix laid out in homogenous coordinates as
// a b 0
// c d 0
// tx ty 1
func NewMatrix(a, b, c, d, tx, ty float64) Matrix {
m := Matrix{
a, b, 0,
c, d, 0,
tx, ty, 1,
}
m.fixup()
return m
}
// String returns a string describing `m`.
func (m Matrix) String() string {
a, b, c, d, tx, ty := m[0], m[1], m[3], m[4], m[6], m[7]
return fmt.Sprintf("[%.4f,%.4f,%.4f,%.4f:%.4f,%.4f]", a, b, c, d, tx, ty)
}
// Set sets `m` to affine transform a,b,c,d,tx,ty.
func (m *Matrix) Set(a, b, c, d, tx, ty float64) {
m[0], m[1] = a, b
m[3], m[4] = c, d
m[6], m[7] = tx, ty
m.fixup()
}
// Concat sets `m` to `m` × `b`.
// `b` needs to be created by newMatrix. i.e. It must be an affine transform.
// m00 m01 0 b00 b01 0 m00*b00 + m01*b01 m00*b10 + m01*b11 0
// m10 m11 0 × b10 b11 0 = m10*b00 + m11*b01 m10*b10 + m11*b11 0
// m20 m21 1 b20 b21 1 m20*b00 + m21*b10 + b20 m20*b01 + m21*b11 + b21 1
func (m *Matrix) Concat(b Matrix) {
*m = Matrix{
m[0]*b[0] + m[1]*b[3], m[0]*b[1] + m[1]*b[4], 0,
m[3]*b[0] + m[4]*b[3], m[3]*b[1] + m[4]*b[4], 0,
m[6]*b[0] + m[7]*b[3] + b[6], m[6]*b[1] + m[7]*b[4] + b[7], 1,
}
m.fixup()
}
// Mult returns `m` × `b`.
func (m Matrix) Mult(b Matrix) Matrix {
m.Concat(b)
return m
}
// Translate appends a translation of `dx`,`dy` to `m`.
// m.Translate(dx, dy) is equivalent to m.Concat(NewMatrix(1, 0, 0, 1, dx, dy))
func (m *Matrix) Translate(dx, dy float64) {
m[6] += dx
m[7] += dy
m.fixup()
}
// Translation returns the translation part of `m`.
func (m *Matrix) Translation() (float64, float64) {
return m[6], m[7]
}
// Translation returns the translation part of `m`.
func (m *Matrix) ScalingX() float64 {
return math.Hypot(m[0], m[1])
}
// Transform returns coordinates `x`,`y` transformed by `m`.
func (m *Matrix) Transform(x, y float64) (float64, float64) {
xp := x*m[0] + y*m[1] + m[6]
yp := x*m[3] + y*m[4] + m[7]
return xp, yp
}
// ScalingFactorX returns X scaling of the affine transform.
func (m *Matrix) ScalingFactorX() float64 {
return math.Sqrt(m[0]*m[0] + m[1]*m[1])
}
// ScalingFactorY returns X scaling of the affine transform.
func (m *Matrix) ScalingFactorY() float64 {
return math.Sqrt(m[3]*m[3] + m[4]*m[4])
}
// Angle returns the angle of the affine transform.
// For simplicity, we assume the transform is a multiple of 90 degrees.
func (m *Matrix) Angle() int {
a, b, c, d := m[0], m[1], m[3], m[4]
// We are returning θ for
// a b cos θ -sin θ
// c d = sin θ cos θ
if a > 0 && d > 0 {
// 1 0
// 0 1
return 0
} else if b < 0 && c > 0 {
// 0 1
// -1 0
return 90
} else if a < 0 && d < 0 {
// -1 0
// 0 -1
return 180
} else if b > 0 && c < 0 {
// 0 -1
// 1 0
return 270
}
common.Log.Debug("ERROR: Angle not a mulitple of 90°. m=%s", m)
return 0
}
// fixup forces `m` to have reasonable values. It is a guard against crazy values in corrupt PDF
// files.
// Currently it clamps elements to [-maxAbsNumber, -maxAbsNumber] to avoid floating point exceptions.
func (m *Matrix) fixup() {
for i, x := range m {
if x > maxAbsNumber {
m[i] = maxAbsNumber
} else if x < -maxAbsNumber {
m[i] = -maxAbsNumber
}
}
}
// largest numbers needed in PDF transforms. Is this correct?
const maxAbsNumber = 1e9

View File

@ -5,7 +5,7 @@
* Based on pdf/contentstream/draw/point.go
*/
// XXX(peterwilliams97) Change to functional style. i.e. Return new value, don't mutate.
// FIXME(peterwilliams97) Change to functional style. i.e. Return new value, don't mutate.
package extractor
@ -16,18 +16,18 @@ import (
"github.com/unidoc/unidoc/pdf/contentstream"
)
// Point defines a point in Cartesian coordinates
// Point defines a point (X,Y) in Cartesian coordinates.
type Point struct {
X float64
Y float64
}
// NewPoint returns a Point at 'x', 'y'.
// NewPoint returns a Point at `x`, `y`.
func NewPoint(x, y float64) Point {
return Point{X: x, Y: y}
}
// Set sets `p` to `x`, `y`.
// Set sets `p` to coordinates `(x, y)`.
func (p *Point) Set(x, y float64) {
p.X, p.Y = x, y
}
@ -38,12 +38,12 @@ func (p *Point) Transform(a, b, c, d, tx, ty float64) {
p.transformByMatrix(m)
}
// Displace returns `p` displaced by `delta`.
// Displace returns a new Point at location `p` + `delta`.
func (p Point) Displace(delta Point) Point {
return Point{p.X + delta.X, p.Y + delta.Y}
}
// Rotate returns `p` rotated by `theta` degrees.
// Rotate returns a new Point that is `p` rotated by `theta` degrees.
func (p Point) Rotate(theta int) Point {
switch theta {
case 0:

View File

@ -27,13 +27,13 @@ import (
// CharcodeBytesToUnicode.
// Characters that can't be decoded are replaced with MissingCodeRune ('\ufffd' = <20>).
func (e *Extractor) ExtractText() (string, error) {
text, _, _, err := e.ExtractText2()
text, _, _, err := e.ExtractTextWithStats()
return text, err
}
// ExtractText2 works like ExtractText but returns the number of characters in the output and the
// ExtractTextWithStats works like ExtractText but returns the number of characters in the output and the
// number of characters that were not decoded.
func (e *Extractor) ExtractText2() (string, int, int, error) {
func (e *Extractor) ExtractTextWithStats() (string, int, int, error) {
textList, numChars, numMisses, err := e.ExtractXYText()
if err != nil {
return "", numChars, numMisses, err
@ -313,6 +313,10 @@ func (to *textObject) nextLine() {
// Set the text matrix, Tm, and the text line matrix, Tlm to the Matrix specified by the 6 numbers
// in `f` (page 250).
func (to *textObject) setTextMatrix(f []float64) {
if len(f) != 6 {
common.Log.Debug("ERROR: len(f) != 6 (%d)", len(f))
return
}
a, b, c, d, tx, ty := f[0], f[1], f[2], f[3], f[4], f[5]
to.Tm = contentstream.NewMatrix(a, b, c, d, tx, ty)
to.Tlm = to.Tm
@ -358,7 +362,7 @@ func (to *textObject) showTextAdjusted(args *core.PdfObjectArray) error {
// setTextLeading "TL". Set text leading.
func (to *textObject) setTextLeading(y float64) {
if to == nil {
if to == nil || to.State == nil {
return
}
to.State.Tl = y
@ -427,7 +431,7 @@ func (to *textObject) setHorizScaling(y float64) {
to.State.Th = y
}
// floatParam returns the single float parameter of operatr `op`, or an error if it doesn't have
// floatParam returns the single float parameter of operator `op`, or an error if it doesn't have
// a single float parameter or we aren't in a text stream.
func floatParam(op *contentstream.ContentStreamOperation) (float64, error) {
if len(op.Params) != 1 {
@ -444,7 +448,7 @@ func floatParam(op *contentstream.ContentStreamOperation) (float64, error) {
func (to *textObject) checkOp(op *contentstream.ContentStreamOperation, numParams int,
hard bool) (ok bool, err error) {
if to == nil {
params := []core.PdfObject{}
var params []core.PdfObject
if numParams > 0 {
params = op.Params
if len(params) > numParams {
@ -596,7 +600,7 @@ func newTextObject(e *Extractor, gs contentstream.GraphicsState, state *textStat
}
}
// renderText emits byte array `data` to the calling program.
// renderText processes and renders byte array `data` for extraction purposes.
func (to *textObject) renderText(data []byte) error {
font := to.getCurrentFont()
@ -628,7 +632,6 @@ func (to *textObject) renderText(data []byte) error {
common.Log.Trace("renderText: %d codes=%+v runes=%q", len(charcodes), charcodes, runes)
for i, r := range runes {
// XXX(peterwilliams97) Need to find and fix cases where this happens.
if r == "\x00" {
continue
@ -784,7 +787,7 @@ func (tl TextList) ToText() string {
tl.SortPosition()
lines := tl.toLines()
texts := []string{}
texts := make([]string, 0, len(lines))
for _, l := range lines {
texts = append(texts, l.Text)
}
@ -825,11 +828,11 @@ type Line struct {
func (tl TextList) toLines() []Line {
// We divide `tl` into slices which contain texts with the same orientation, extract the lines
// for each orientation then return the concatenation of these lines sorted by orientation.
tlOrient := map[int]TextList{}
tlOrient := make(map[int]TextList, len(tl))
for _, t := range tl {
tlOrient[t.Orient] = append(tlOrient[t.Orient], t)
}
lines := []Line{}
var lines []Line
for _, o := range orientKeys(tlOrient) {
lines = append(lines, tlOrient[o].toLinesOrient()...)
}
@ -846,15 +849,15 @@ func (tl TextList) toLinesOrient() []Line {
if len(tl) == 0 {
return []Line{}
}
lines := []Line{}
words := []string{}
x := []float64{}
var lines []Line
var words []string
var x []float64
y := tl[0].OrientedStart.Y
scanning := false
averageCharWidth := ExponAve{}
wordSpacing := ExponAve{}
averageCharWidth := exponAve{}
wordSpacing := exponAve{}
lastEndX := 0.0 // lastEndX is tl[i-1].OrientedEnd.X
for _, t := range tl {
@ -889,13 +892,13 @@ func (tl TextList) toLinesOrient() []Line {
deltaCharWidth := averageCharWidth.ave * 0.3
isSpace := false
nextWordX := lastEndX + min(deltaSpace, deltaCharWidth)
nextWordX := lastEndX + minFloat(deltaSpace, deltaCharWidth)
if scanning && t.Text != " " {
isSpace = nextWordX < t.OrientedStart.X
}
common.Log.Trace("t=%s", t)
common.Log.Trace("width=%.2f delta=%.2f deltaSpace=%.2g deltaCharWidth=%.2g",
t.Width(), min(deltaSpace, deltaCharWidth), deltaSpace, deltaCharWidth)
t.Width(), minFloat(deltaSpace, deltaCharWidth), deltaSpace, deltaCharWidth)
common.Log.Trace("%+q [%.1f, %.1f] lastEndX=%.2f nextWordX=%.2f (%.2f) isSpace=%t",
t.Text, t.OrientedStart.X, t.OrientedStart.Y, lastEndX, nextWordX,
nextWordX-t.OrientedStart.X, isSpace)
@ -940,14 +943,14 @@ func min(a, b float64) float64 {
return b
}
// ExponAve implements an exponential average.
type ExponAve struct {
// exponAve implements an exponential average.
type exponAve struct {
ave float64 // Current average value.
running bool // Has `ave` been set?
}
// update updates the exponential average `exp.ave` and returns it
func (exp *ExponAve) update(x float64) float64 {
func (exp *exponAve) update(x float64) float64 {
if !exp.running {
exp.ave = x
exp.running = true
@ -957,9 +960,15 @@ func (exp *ExponAve) update(x float64) float64 {
return exp.ave
}
// printTexts is a debugging function. XXX(peterwilliams97) Remove this.
const isDebug = false
// printTexts is a debugging function.
// TODO(peterwilliams97) Remove this.
func (tl *TextList) printTexts(message string) {
return
if !isDebug {
return
}
_, file, line, ok := runtime.Caller(1)
if !ok {
file = "???"
@ -985,7 +994,7 @@ func (tl *TextList) printTexts(message string) {
// newLine returns the Line representation of strings `words` with y coordinate `y` and x
// coordinates `x`.
func newLine(y float64, x []float64, words []string) Line {
dx := []float64{}
dx := make([]float64, 0, len(x))
for i := 1; i < len(x); i++ {
dx = append(dx, x[i]-x[i-1])
}
@ -1211,18 +1220,8 @@ type fontEntry struct {
const maxFontCache = 10
// getFontDirect returns the font named `name` if it exists in the page's resources or an error if
// is doesn't.
// This is a direct (uncached access).
// it doesn't. Accesses page resources directly (not cached).
func (to *textObject) getFontDirect(name string) (*model.PdfFont, error) {
// This is a hack for testing.
switch name {
case "UniDocCourier":
return model.NewStandard14FontMustCompile(model.Courier), nil
case "UniDocHelvetica":
return model.NewStandard14FontMustCompile(model.Helvetica), nil
}
fontObj, err := to.getFontDict(name)
if err != nil {
return nil, err

View File

@ -8,7 +8,6 @@ package extractor
import (
"flag"
"os"
"os/user"
"path/filepath"
"regexp"
"sort"
@ -20,18 +19,14 @@ import (
"golang.org/x/text/unicode/norm"
)
// XXX(peterwilliams97) NOTE: We do a best effort at finding the PDF file because we don't keep PDF
// test files in this repo so you will need to setup `corpusFolders` to point at the corpus directory.
// NOTE: We do a best effort at finding the PDF file because we don't keep PDF test files in this repo so you
// will need to setup UNIDOC_EXTRACT_TESTDATA to point at the corpus directory.
// forceTest should be set to true to force running all tests.
const forceTest = false
// NOTE: Setting environment variable UNIDOC_EXTRACT_FORCETEST = 1 sets this to true.
var forceTest = os.Getenv("UNIDOC_EXTRACT_FORCETEST") == "1"
// corpusFolders is where we search for test files.
var corpusFolders = []string{
"./testdata",
"~/testdata",
".",
}
var corpusFolder = os.Getenv("UNIDOC_EXTRACT_TESTDATA")
func init() {
common.SetLogger(common.NewConsoleLogger(common.LogLevelError))
@ -40,23 +35,16 @@ func init() {
}
}
// TestTextExtraction1 tests text extraction on the PDF fragments in `fragmentTests`.
func TestTextExtraction1(t *testing.T) {
for _, f := range fragmentTests {
f.testExtraction(t)
}
}
type fragment struct {
name string
contents string
text string
}
var fragmentTests = []fragment{
{name: "portrait",
contents: `
// TestTextExtractionFragments tests text extraction on the PDF fragments in `fragmentTests`.
func TestTextExtractionFragments(t *testing.T) {
fragmentTests := []struct {
name string
contents string
text string
}{
{
name: "portrait",
contents: `
BT
/UniDocCourier 24 Tf
(Hello World!)Tj
@ -64,10 +52,11 @@ var fragmentTests = []fragment{
(Doink)Tj
ET
`,
text: "Hello World!\nDoink",
},
{name: "landscape",
contents: `
text: "Hello World!\nDoink",
},
{
name: "landscape",
contents: `
BT
/UniDocCourier 24 Tf
0 1 -1 0 0 0 Tm
@ -76,10 +65,11 @@ var fragmentTests = []fragment{
(Doink)Tj
ET
`,
text: "Hello World!\nDoink",
},
{name: "180 degree rotation",
contents: `
text: "Hello World!\nDoink",
},
{
name: "180 degree rotation",
contents: `
BT
/UniDocCourier 24 Tf
-1 0 0 -1 0 0 Tm
@ -88,10 +78,11 @@ var fragmentTests = []fragment{
(Doink)Tj
ET
`,
text: "Hello World!\nDoink",
},
{name: "Helvetica",
contents: `
text: "Hello World!\nDoink",
},
{
name: "Helvetica",
contents: `
BT
/UniDocHelvetica 24 Tf
0 -1 1 0 0 0 Tm
@ -100,35 +91,53 @@ var fragmentTests = []fragment{
(Doink)Tj
ET
`,
text: "Hello World!\nDoink",
},
}
// testExtraction checks that ExtractText() works on fragment `f`.
func (f fragment) testExtraction(t *testing.T) {
e := Extractor{contents: f.contents}
text, err := e.ExtractText()
if err != nil {
t.Fatalf("Error extracting text: %q err=%v", f.name, err)
return
text: "Hello World!\nDoink",
},
}
if text != f.text {
t.Fatalf("Text mismatch: %q Got %q. Expected %q", f.name, text, f.text)
return
// Setup mock resources.
resources := model.NewPdfPageResources()
{
courier := model.NewStandard14FontMustCompile(model.Courier)
helvetica := model.NewStandard14FontMustCompile(model.Helvetica)
resources.SetFontByName("UniDocHelvetica", helvetica.ToPdfObject())
resources.SetFontByName("UniDocCourier", courier.ToPdfObject())
}
for _, f := range fragmentTests {
t.Run(f.name, func(t *testing.T) {
e := Extractor{resources: resources, contents: f.contents}
text, err := e.ExtractText()
if err != nil {
t.Fatalf("Error extracting text: %q err=%v", f.name, err)
return
}
if text != f.text {
t.Fatalf("Text mismatch: %q Got %q. Expected %q", f.name, text, f.text)
return
}
})
}
}
// TestTextExtraction2 tests text extraction on set of PDF files.
// TestTextExtractionFiles tests text extraction on a set of PDF files.
// It checks for the existence of specified strings of words on specified pages.
// We currently only check within lines as our line order is still improving.
func TestTextExtraction2(t *testing.T) {
for _, test := range extract2Tests {
testExtract2(t, test.filename, test.expectedPageText)
func TestTextExtractionFiles(t *testing.T) {
if len(corpusFolder) == 0 && !forceTest {
t.Log("Corpus folder not set - skipping")
return
}
for _, test := range fileExtractionTests {
t.Run(test.filename, func(t *testing.T) {
testExtractFile(t, test.filename, test.expectedPageText)
})
}
}
// extract2Tests are the PDFs and texts we are looking for on specified pages.
var extract2Tests = []struct {
// fileExtractionTests are the PDFs and texts we are looking for on specified pages.
var fileExtractionTests = []struct {
filename string
expectedPageText map[int][]string
}{
@ -216,21 +225,27 @@ var extract2Tests = []struct {
},
}
// testExtract2 tests the ExtractText2 text extractor on `filename` and compares the extracted
// testExtractFile tests the ExtractTextWithStats text extractor on `filename` and compares the extracted
// text to `expectedPageText`.
// XXX(peterwilliams97) NOTE: We do a best effort at finding the PDF file because we don't keep PDF
// test files in this repo so you will need to setup `corpusFolders` to point at the corpus directory.
// If `filename` cannot be found in `corpusFolders` then the test is skipped.
func testExtract2(t *testing.T, filename string, expectedPageText map[int][]string) {
homeDir, hasHome := getHomeDir()
path, ok := searchDirectories(homeDir, hasHome, corpusFolders, filename)
if !ok {
//
// NOTE: We do a best effort at finding the PDF file because we don't keep PDF test files in this repo
// so you will need to set the environment variable UNIDOC_EXTRACT_TESTDATA to point at
// the corpus directory.
//
// If `filename` cannot be found in `corpusFolder` then the test is skipped unless the `forceTest`
// global variable is true (e.g. setting environment variable UNIDOC_EXTRACT_FORCETEST = 1).
func testExtractFile(t *testing.T, filename string, expectedPageText map[int][]string) {
filepath := filepath.Join(corpusFolder, filename)
exists := checkFileExists(filepath)
if !exists {
if forceTest {
t.Fatalf("filename=%q does not exist", filename)
}
t.Logf("%s not found", filename)
return
}
_, actualPageText := extractPageTexts(t, path)
_, actualPageText := extractPageTexts(t, filepath)
for _, pageNum := range sortedKeys(expectedPageText) {
expectedSentences, ok := expectedPageText[pageNum]
actualText, ok := actualPageText[pageNum]
@ -239,12 +254,12 @@ func testExtract2(t *testing.T, filename string, expectedPageText map[int][]stri
}
actualText = norm.NFKC.String(actualText)
if !containsSentences(t, expectedSentences, actualText) {
t.Fatalf("Text mismatch filename=%q page=%d", path, pageNum)
t.Fatalf("Text mismatch filepath=%q page=%d", filepath, pageNum)
}
}
}
// extractPageTexts runs ExtractText2 on all pages in PDF `filename` and returns the result as a map
// extractPageTexts runs ExtractTextWithStats on all pages in PDF `filename` and returns the result as a map
// {page number: page text}
func extractPageTexts(t *testing.T, filename string) (int, map[int]string) {
f, err := os.Open(filename)
@ -272,11 +287,11 @@ func extractPageTexts(t *testing.T, filename string) (int, map[int]string) {
if err != nil {
t.Fatalf("extractor.New failed. filename=%q page=%d err=%v", filename, pageNum, err)
}
text, _, _, err := ex.ExtractText2()
text, _, _, err := ex.ExtractTextWithStats()
if err != nil {
t.Fatalf("ExtractText2 failed. filename=%q page=%d err=%v", filename, pageNum, err)
t.Fatalf("ExtractTextWithStats failed. filename=%q page=%d err=%v", filename, pageNum, err)
}
// XXX(peterwilliams97)TODO: Improve text extraction space insertion so we don't need reduceSpaces.
// TODO(peterwilliams97): Improve text extraction space insertion so we don't need reduceSpaces.
pageText[pageNum] = reduceSpaces(text)
}
return numPages, pageText
@ -303,30 +318,10 @@ func reduceSpaces(text string) string {
var reSpace = regexp.MustCompile(`(?m)\s+`)
// searchDirectories searches `directories` for `filename` and returns the full file path if it is
// found. `homeDir` and `hasHome` are used for home directory substitution.
func searchDirectories(homeDir string, hasHome bool, directories []string, filename string) (string, bool) {
for _, direct := range directories {
if hasHome {
direct = strings.Replace(direct, "~", homeDir, 1)
}
path := filepath.Join(direct, filename)
if _, err := os.Stat(path); err == nil {
return path, true
}
}
return "", false
}
// getHomeDir returns the current user's home directory if it is defined and a bool to tell if it
// is defined.
func getHomeDir() (string, bool) {
usr, err := user.Current()
if err != nil {
common.Log.Error("No current user. err=%v", err)
return "", false
}
return usr.HomeDir, true
// checkFileExists returns true if os.Stat succeeds on `filepath`. Any Stat error, not only
// "file does not exist", is reported as false.
func checkFileExists(filepath string) bool {
	if _, err := os.Stat(filepath); err != nil {
		return false
	}
	return true
}
// sortedKeys returns the keys of `m` as a sorted slice.

View File

@ -18,26 +18,30 @@ import (
"github.com/unidoc/unidoc/pdf/model/fonts"
)
// Font represents a font which is a series of glyphs. Character codes from PDF strings can be
// mapped to and from glyphs. Each glyph has metrics.
// XXX: FIXME (peterwilliams97) HACK to add GetCharMetrics() for fonts other than standard 14
// Remove this hack.
type Font interface {
Encoder() textencoding.TextEncoder
SetEncoder(encoder textencoding.TextEncoder)
GetGlyphCharMetrics(glyph string) (fonts.CharMetrics, bool)
GetCharMetrics(code uint16) (fonts.CharMetrics, bool)
GetAverageCharWidth() float64 // XXX(peterwilliams97) Not used. Remove.
ToPdfObject() core.PdfObject
}
// PdfFont represents an underlying font structure which can be of type:
// - Type0
// - Type1
// - TrueType
// etc.
type PdfFont struct {
context Font // The underlying font: Type0, Type1, Truetype, etc..
context fonts.Font // The underlying font: Type0, Type1, Truetype, etc..
}
// getCharCodeMetrics returns the character metrics for character code `code`.
// The code is mapped to a glyph by the font's encoder and the glyph is then looked up with
// GetGlyphCharMetrics. Zero metrics and false are returned if the font has no encoder or the
// encoder has no glyph for `code`.
func (font PdfFont) getCharCodeMetrics(code uint16) (fonts.CharMetrics, bool) {
	encoder := font.Encoder()
	if encoder == nil {
		return fonts.CharMetrics{}, false
	}
	if glyph, ok := encoder.CharcodeToGlyph(code); ok {
		return font.GetGlyphCharMetrics(glyph)
	}
	return fonts.CharMetrics{}, false
}
// GetFontDescriptor returns the font descriptor for `font`.
@ -516,18 +520,7 @@ func (font PdfFont) GetGlyphCharMetrics(glyph string) (fonts.CharMetrics, bool)
// GetCharMetrics returns the char metrics for character code `code`.
func (font PdfFont) GetCharMetrics(code uint16) (fonts.CharMetrics, bool) {
t := font.actualFont()
if t == nil {
common.Log.Debug("ERROR: GetCharMetrics Not implemented for font type=%#T", font.context)
return fonts.CharMetrics{}, false
}
if m, ok := t.GetCharMetrics(code); ok {
return m, ok
}
if descriptor, err := font.GetFontDescriptor(); err == nil && descriptor != nil {
return fonts.CharMetrics{Wx: descriptor.missingWidth}, true
}
return fonts.CharMetrics{}, false
return font.getCharCodeMetrics(code)
}
// GetRuneCharMetrics returns the char metrics for rune `r`.
@ -550,18 +543,9 @@ func (font PdfFont) GetRuneCharMetrics(r rune) (fonts.CharMetrics, error) {
return m, nil
}
// GetAverageCharWidth returns the average width of all the characters in `font`.
func (font PdfFont) GetAverageCharWidth() float64 {
t := font.actualFont()
if t == nil {
common.Log.Debug("ERROR: GetAverageCharWidth Not implemented for font type=%#T", font.context)
return 0.0
}
return t.GetAverageCharWidth()
}
// actualFont returns the Font in font.context
func (font PdfFont) actualFont() Font {
// NOTE(gunnsth): Actually this only sanity checks the font.context as the returned font will be wrapped in an interface.
func (font PdfFont) actualFont() fonts.Font {
if font.context == nil {
common.Log.Debug("ERROR: actualFont. context is nil. font=%s", font)
}

View File

@ -131,15 +131,6 @@ func (font pdfFontType0) GetCharMetrics(code uint16) (fonts.CharMetrics, bool) {
return font.DescendantFont.GetCharMetrics(code)
}
// GetAverageCharWidth returns the average width of all the characters in `font`.
func (font pdfFontType0) GetAverageCharWidth() float64 {
if font.DescendantFont == nil {
common.Log.Debug("ERROR: No descendant. font=%s", font)
return 0.0
}
return font.DescendantFont.GetAverageCharWidth()
}
// Encoder returns the font's text encoder.
func (font pdfFontType0) Encoder() textencoding.TextEncoder {
return font.encoder
@ -253,11 +244,6 @@ func (font pdfCIDFontType0) GetCharMetrics(code uint16) (fonts.CharMetrics, bool
return fonts.CharMetrics{}, true
}
// GetAverageCharWidth returns the average width of all the characters in `font`.
func (font pdfCIDFontType0) GetAverageCharWidth() float64 {
return 0.0
}
// ToPdfObject converts the pdfCIDFontType0 to a PDF representation.
func (font *pdfCIDFontType0) ToPdfObject() core.PdfObject {
return core.MakeNull()
@ -378,18 +364,6 @@ func (font pdfCIDFontType2) GetCharMetrics(code uint16) (fonts.CharMetrics, bool
return fonts.CharMetrics{Wx: float64(w)}, true
}
// GetAverageCharWidth returns the average width of all the characters in `font`.
func (font pdfCIDFontType2) GetAverageCharWidth() float64 {
if len(font.runeToWidthMap) == 0 {
return 0.0
}
total := 0
for _, w := range font.runeToWidthMap {
total += w
}
return float64(total) / float64(len(font.runeToWidthMap))
}
// ToPdfObject converts the pdfCIDFontType2 to a PDF representation.
func (font *pdfCIDFontType2) ToPdfObject() core.PdfObject {
if font.container == nil {

View File

@ -149,18 +149,6 @@ func (font pdfFontSimple) GetCharMetrics(code uint16) (fonts.CharMetrics, bool)
return fonts.CharMetrics{}, false
}
// GetAverageCharWidth returns the average width of all the characters in `font`.
func (font pdfFontSimple) GetAverageCharWidth() float64 {
if font.fontMetrics != nil {
return fonts.AverageCharWidth(font.fontMetrics)
}
total := 0.0
for _, w := range font.charWidths {
total += w
}
return total / float64(len(font.charWidths))
}
// newSimpleFontFromPdfObject creates a pdfFontSimple from dictionary `d`. Elements of `d` that
// are already parsed are contained in `base`.
// Standard 14 fonts need to to specify their builtin encoders in the `std14Encoder` parameter.

View File

@ -47,11 +47,6 @@ func (font FontCourier) GetGlyphCharMetrics(glyph string) (CharMetrics, bool) {
return metrics, true
}
// GetAverageCharWidth returns the average width of all glyphs in the font.
func (font FontCourier) GetAverageCharWidth() float64 {
return AverageCharWidth(CourierCharMetrics)
}
// ToPdfObject returns a primitive PDF object representation of the font.
func (font FontCourier) ToPdfObject() core.PdfObject {
fontDict := core.MakeDict()

View File

@ -47,11 +47,6 @@ func (font FontCourierBold) GetGlyphCharMetrics(glyph string) (CharMetrics, bool
return metrics, true
}
// GetAverageCharWidth reports the result of AverageCharWidth applied to CourierBoldCharMetrics.
func (font FontCourierBold) GetAverageCharWidth() float64 {
	avg := AverageCharWidth(CourierBoldCharMetrics)
	return avg
}
// ToPdfObject returns a primitive PDF object representation of the font.
func (font FontCourierBold) ToPdfObject() core.PdfObject {
fontDict := core.MakeDict()

View File

@ -48,11 +48,6 @@ func (font FontCourierBoldOblique) GetGlyphCharMetrics(glyph string) (CharMetric
return metrics, true
}
// GetAverageCharWidth reports the result of AverageCharWidth applied to
// CourierBoldObliqueCharMetrics.
func (font FontCourierBoldOblique) GetAverageCharWidth() float64 {
	avg := AverageCharWidth(CourierBoldObliqueCharMetrics)
	return avg
}
// ToPdfObject returns a primitive PDF object representation of the font.
func (font FontCourierBoldOblique) ToPdfObject() core.PdfObject {
fontDict := core.MakeDict()

View File

@ -47,11 +47,6 @@ func (font FontCourierOblique) GetGlyphCharMetrics(glyph string) (CharMetrics, b
return metrics, true
}
// GetAverageCharWidth reports the result of AverageCharWidth applied to
// CourierObliqueCharMetrics.
func (font FontCourierOblique) GetAverageCharWidth() float64 {
	avg := AverageCharWidth(CourierObliqueCharMetrics)
	return avg
}
// ToPdfObject returns a primitive PDF object representation of the font.
func (font FontCourierOblique) ToPdfObject() core.PdfObject {
fontDict := core.MakeDict()

View File

@ -18,7 +18,6 @@ type Font interface {
Encoder() textencoding.TextEncoder
SetEncoder(encoder textencoding.TextEncoder)
GetGlyphCharMetrics(glyph string) (CharMetrics, bool)
GetAverageCharWidth() float64 // XXX(peterwilliams97) Not used. Remove.
ToPdfObject() core.PdfObject
}
@ -32,11 +31,3 @@ type CharMetrics struct {
func (m CharMetrics) String() string {
	// Quoted glyph name, then Wx and Wy each printed with one decimal place.
	const layout = "<%q,%.1f,%.1f>"
	return fmt.Sprintf(layout, m.GlyphName, m.Wx, m.Wy)
}
// AverageCharWidth returns the mean of the Wx values of all entries in `metrics`.
// It returns 0 when `metrics` is nil or empty.
func AverageCharWidth(metrics map[string]CharMetrics) float64 {
	// Guard the empty case: 0.0 / 0 would otherwise yield NaN.
	if len(metrics) == 0 {
		return 0.0
	}
	total := 0.0
	for _, m := range metrics {
		total += m.Wx
	}
	return total / float64(len(metrics))
}

View File

@ -47,11 +47,6 @@ func (font FontHelvetica) GetGlyphCharMetrics(glyph string) (CharMetrics, bool)
return metrics, true
}
// GetAverageCharWidth reports the result of AverageCharWidth applied to HelveticaCharMetrics.
func (font FontHelvetica) GetAverageCharWidth() float64 {
	avg := AverageCharWidth(HelveticaCharMetrics)
	return avg
}
// ToPdfObject returns a primitive PDF object representation of the font.
func (font FontHelvetica) ToPdfObject() core.PdfObject {
fontDict := core.MakeDict()

View File

@ -48,11 +48,6 @@ func (font FontHelveticaBold) GetGlyphCharMetrics(glyph string) (CharMetrics, bo
return metrics, true
}
// GetAverageCharWidth reports the result of AverageCharWidth applied to
// HelveticaBoldCharMetrics.
func (font FontHelveticaBold) GetAverageCharWidth() float64 {
	avg := AverageCharWidth(HelveticaBoldCharMetrics)
	return avg
}
// ToPdfObject returns a primitive PDF object representation of the font.
func (font FontHelveticaBold) ToPdfObject() core.PdfObject {
fontDict := core.MakeDict()

View File

@ -47,11 +47,6 @@ func (font FontHelveticaBoldOblique) GetGlyphCharMetrics(glyph string) (CharMetr
return metrics, true
}
// GetAverageCharWidth returns the average width of all glyphs in the font, computed by
// AverageCharWidth over this font's own metrics table.
func (font FontHelveticaBoldOblique) GetAverageCharWidth() float64 {
	// Use the bold-oblique table; the previous code averaged HelveticaObliqueCharMetrics,
	// which belongs to FontHelveticaOblique.
	// NOTE(review): assumes HelveticaBoldObliqueCharMetrics exists in this package, following
	// the <FontName>CharMetrics naming used by the sibling fonts — confirm.
	return AverageCharWidth(HelveticaBoldObliqueCharMetrics)
}
// ToPdfObject returns a primitive PDF object representation of the font.
func (font FontHelveticaBoldOblique) ToPdfObject() core.PdfObject {
fontDict := core.MakeDict()

View File

@ -47,11 +47,6 @@ func (font FontHelveticaOblique) GetGlyphCharMetrics(glyph string) (CharMetrics,
return metrics, true
}
// GetAverageCharWidth reports the result of AverageCharWidth applied to
// HelveticaObliqueCharMetrics.
func (font FontHelveticaOblique) GetAverageCharWidth() float64 {
	avg := AverageCharWidth(HelveticaObliqueCharMetrics)
	return avg
}
// ToPdfObject returns a primitive PDF object representation of the font.
func (font FontHelveticaOblique) ToPdfObject() core.PdfObject {
fontDict := core.MakeDict()

View File

@ -48,11 +48,6 @@ func (font FontSymbol) GetGlyphCharMetrics(glyph string) (CharMetrics, bool) {
return metrics, true
}
// GetAverageCharWidth reports the result of AverageCharWidth applied to SymbolCharMetrics.
func (font FontSymbol) GetAverageCharWidth() float64 {
	avg := AverageCharWidth(SymbolCharMetrics)
	return avg
}
// ToPdfObject returns a primitive PDF object representation of the font.
func (font FontSymbol) ToPdfObject() core.PdfObject {
fontDict := core.MakeDict()

View File

@ -47,11 +47,6 @@ func (font FontTimesBold) GetGlyphCharMetrics(glyph string) (CharMetrics, bool)
return metrics, true
}
// GetAverageCharWidth reports the result of AverageCharWidth applied to TimesBoldCharMetrics.
func (font FontTimesBold) GetAverageCharWidth() float64 {
	avg := AverageCharWidth(TimesBoldCharMetrics)
	return avg
}
// ToPdfObject returns a primitive PDF object representation of the font.
func (font FontTimesBold) ToPdfObject() core.PdfObject {
fontDict := core.MakeDict()

View File

@ -47,11 +47,6 @@ func (font FontTimesBoldItalic) GetGlyphCharMetrics(glyph string) (CharMetrics,
return metrics, true
}
// GetAverageCharWidth reports the result of AverageCharWidth applied to
// TimesBoldItalicCharMetrics.
func (font FontTimesBoldItalic) GetAverageCharWidth() float64 {
	avg := AverageCharWidth(TimesBoldItalicCharMetrics)
	return avg
}
// ToPdfObject returns a primitive PDF object representation of the font.
func (font FontTimesBoldItalic) ToPdfObject() core.PdfObject {
fontDict := core.MakeDict()

View File

@ -47,11 +47,6 @@ func (font FontTimesItalic) GetGlyphCharMetrics(glyph string) (CharMetrics, bool
return metrics, true
}
// GetAverageCharWidth reports the result of AverageCharWidth applied to TimesItalicCharMetrics.
func (font FontTimesItalic) GetAverageCharWidth() float64 {
	avg := AverageCharWidth(TimesItalicCharMetrics)
	return avg
}
// ToPdfObject returns a primitive PDF object representation of the font.
func (font FontTimesItalic) ToPdfObject() core.PdfObject {
fontDict := core.MakeDict()

View File

@ -47,11 +47,6 @@ func (font FontTimesRoman) GetGlyphCharMetrics(glyph string) (CharMetrics, bool)
return metrics, true
}
// GetAverageCharWidth reports the result of AverageCharWidth applied to TimesRomanCharMetrics.
func (font FontTimesRoman) GetAverageCharWidth() float64 {
	avg := AverageCharWidth(TimesRomanCharMetrics)
	return avg
}
// ToPdfObject returns a primitive PDF object representation of the font.
func (font FontTimesRoman) ToPdfObject() core.PdfObject {
fontDict := core.MakeDict()

View File

@ -48,11 +48,6 @@ func (font FontZapfDingbats) GetGlyphCharMetrics(glyph string) (CharMetrics, boo
return metrics, true
}
// GetAverageCharWidth reports the result of AverageCharWidth applied to
// ZapfDingbatsCharMetrics.
func (font FontZapfDingbats) GetAverageCharWidth() float64 {
	avg := AverageCharWidth(ZapfDingbatsCharMetrics)
	return avg
}
// ToPdfObject returns a primitive PDF object representation of the font.
func (font FontZapfDingbats) ToPdfObject() core.PdfObject {
fontDict := core.MakeDict()

View File

@ -11,6 +11,7 @@ package model
import (
"errors"
"fmt"
"math"
"regexp"
"strconv"
@ -58,6 +59,16 @@ func NewPdfRectangle(arr PdfObjectArray) (*PdfRectangle, error) {
return &rect, nil
}
// Height reports the absolute difference between the `Ury` and `Lly` coordinates of `rect`,
// so the result is non-negative regardless of which of the two is larger.
func (rect *PdfRectangle) Height() float64 {
	span := rect.Ury - rect.Lly
	return math.Abs(span)
}
// Width reports the absolute difference between the `Urx` and `Llx` coordinates of `rect`,
// so the result is non-negative regardless of which of the two is larger.
func (rect *PdfRectangle) Width() float64 {
	span := rect.Urx - rect.Llx
	return math.Abs(span)
}
// Convert to a PDF object.
func (rect *PdfRectangle) ToPdfObject() PdfObject {
arr := MakeArray(MakeFloat(rect.Llx), MakeFloat(rect.Lly), MakeFloat(rect.Urx), MakeFloat(rect.Ury))