mirror of
https://github.com/unidoc/unipdf.git
synced 2025-05-05 19:30:30 +08:00
Merge pull request #5 from unidoc/v3-peterwilliams97-extract.text
Cleaning up v3 extract.text
This commit is contained in:
commit
1cea79b8ef
157
pdf/contentstream/matrix.go
Normal file
157
pdf/contentstream/matrix.go
Normal file
@ -0,0 +1,157 @@
|
|||||||
|
/*
|
||||||
|
* This file is subject to the terms and conditions defined in
|
||||||
|
* file 'LICENSE.md', which is part of this source code package.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package contentstream
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"math"
|
||||||
|
|
||||||
|
"github.com/unidoc/unidoc/common"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Matrix is a linear transform matrix in homogeneous coordinates.
|
||||||
|
// PDF coordinate transforms are always affine so we only need 6 of these. See NewMatrix.
|
||||||
|
type Matrix [9]float64
|
||||||
|
|
||||||
|
// IdentityMatrix returns the identity transform.
|
||||||
|
func IdentityMatrix() Matrix {
|
||||||
|
return NewMatrix(1, 0, 0, 1, 0, 0)
|
||||||
|
}
|
||||||
|
|
||||||
|
// TranslationMatrix returns a matrix that translates by `tx`, `ty`.
|
||||||
|
func TranslationMatrix(tx, ty float64) Matrix {
|
||||||
|
return NewMatrix(1, 0, 0, 1, tx, ty)
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewMatrix returns an affine transform matrix laid out in homogenous coordinates as
|
||||||
|
// a b 0
|
||||||
|
// c d 0
|
||||||
|
// tx ty 1
|
||||||
|
func NewMatrix(a, b, c, d, tx, ty float64) Matrix {
|
||||||
|
m := Matrix{
|
||||||
|
a, b, 0,
|
||||||
|
c, d, 0,
|
||||||
|
tx, ty, 1,
|
||||||
|
}
|
||||||
|
m.fixup()
|
||||||
|
return m
|
||||||
|
}
|
||||||
|
|
||||||
|
// String returns a string describing `m`.
|
||||||
|
func (m Matrix) String() string {
|
||||||
|
a, b, c, d, tx, ty := m[0], m[1], m[3], m[4], m[6], m[7]
|
||||||
|
return fmt.Sprintf("[%.4f,%.4f,%.4f,%.4f:%.4f,%.4f]", a, b, c, d, tx, ty)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Set sets `m` to affine transform a,b,c,d,tx,ty.
|
||||||
|
func (m *Matrix) Set(a, b, c, d, tx, ty float64) {
|
||||||
|
m[0], m[1] = a, b
|
||||||
|
m[3], m[4] = c, d
|
||||||
|
m[6], m[7] = tx, ty
|
||||||
|
m.fixup()
|
||||||
|
}
|
||||||
|
|
||||||
|
// Concat sets `m` to `m` × `b`.
|
||||||
|
// `b` needs to be created by newMatrix. i.e. It must be an affine transform.
|
||||||
|
// m00 m01 0 b00 b01 0 m00*b00 + m01*b01 m00*b10 + m01*b11 0
|
||||||
|
// m10 m11 0 × b10 b11 0 = m10*b00 + m11*b01 m10*b10 + m11*b11 0
|
||||||
|
// m20 m21 1 b20 b21 1 m20*b00 + m21*b10 + b20 m20*b01 + m21*b11 + b21 1
|
||||||
|
func (m *Matrix) Concat(b Matrix) {
|
||||||
|
*m = Matrix{
|
||||||
|
m[0]*b[0] + m[1]*b[3], m[0]*b[1] + m[1]*b[4], 0,
|
||||||
|
m[3]*b[0] + m[4]*b[3], m[3]*b[1] + m[4]*b[4], 0,
|
||||||
|
m[6]*b[0] + m[7]*b[3] + b[6], m[6]*b[1] + m[7]*b[4] + b[7], 1,
|
||||||
|
}
|
||||||
|
m.fixup()
|
||||||
|
}
|
||||||
|
|
||||||
|
// Mult returns `m` × `b`.
|
||||||
|
func (m Matrix) Mult(b Matrix) Matrix {
|
||||||
|
m.Concat(b)
|
||||||
|
return m
|
||||||
|
}
|
||||||
|
|
||||||
|
// Translate appends a translation of `dx`,`dy` to `m`.
|
||||||
|
// m.Translate(dx, dy) is equivalent to m.Concat(NewMatrix(1, 0, 0, 1, dx, dy))
|
||||||
|
func (m *Matrix) Translate(dx, dy float64) {
|
||||||
|
m[6] += dx
|
||||||
|
m[7] += dy
|
||||||
|
m.fixup()
|
||||||
|
}
|
||||||
|
|
||||||
|
// Translation returns the translation part of `m`.
|
||||||
|
func (m *Matrix) Translation() (float64, float64) {
|
||||||
|
return m[6], m[7]
|
||||||
|
}
|
||||||
|
|
||||||
|
// Translation returns the translation part of `m`.
|
||||||
|
func (m *Matrix) ScalingX() float64 {
|
||||||
|
return math.Hypot(m[0], m[1])
|
||||||
|
}
|
||||||
|
|
||||||
|
// Transform returns coordinates `x`,`y` transformed by `m`.
|
||||||
|
func (m *Matrix) Transform(x, y float64) (float64, float64) {
|
||||||
|
xp := x*m[0] + y*m[1] + m[6]
|
||||||
|
yp := x*m[3] + y*m[4] + m[7]
|
||||||
|
return xp, yp
|
||||||
|
}
|
||||||
|
|
||||||
|
// ScalingFactorX returns X scaling of the affine transform.
|
||||||
|
func (m *Matrix) ScalingFactorX() float64 {
|
||||||
|
return math.Sqrt(m[0]*m[0] + m[1]*m[1])
|
||||||
|
}
|
||||||
|
|
||||||
|
// ScalingFactorY returns X scaling of the affine transform.
|
||||||
|
func (m *Matrix) ScalingFactorY() float64 {
|
||||||
|
return math.Sqrt(m[3]*m[3] + m[4]*m[4])
|
||||||
|
}
|
||||||
|
|
||||||
|
// Angle returns the angle of the affine transform.
|
||||||
|
// For simplicity, we assume the transform is a multiple of 90 degrees.
|
||||||
|
func (m *Matrix) Angle() int {
|
||||||
|
a, b, c, d := m[0], m[1], m[3], m[4]
|
||||||
|
// We are returning θ for
|
||||||
|
// a b cos θ -sin θ
|
||||||
|
// c d = sin θ cos θ
|
||||||
|
if a > 0 && d > 0 {
|
||||||
|
// 1 0
|
||||||
|
// 0 1
|
||||||
|
return 0
|
||||||
|
} else if b < 0 && c > 0 {
|
||||||
|
// 0 1
|
||||||
|
// -1 0
|
||||||
|
return 90
|
||||||
|
} else if a < 0 && d < 0 {
|
||||||
|
// -1 0
|
||||||
|
// 0 -1
|
||||||
|
return 180
|
||||||
|
} else if b > 0 && c < 0 {
|
||||||
|
// 0 -1
|
||||||
|
// 1 0
|
||||||
|
return 270
|
||||||
|
}
|
||||||
|
common.Log.Debug("ERROR: Angle not a multiple of 90°. m=%s", m)
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
// fixup forces `m` to have reasonable values. It is a guard against crazy values in corrupt PDF
|
||||||
|
// files.
|
||||||
|
// Currently it clamps elements to [-maxAbsNumber, -maxAbsNumber] to avoid floating point exceptions.
|
||||||
|
func (m *Matrix) fixup() {
|
||||||
|
for i, x := range m {
|
||||||
|
if x > maxAbsNumber {
|
||||||
|
common.Log.Debug("FIXUP: %d -> %d", x, maxAbsNumber)
|
||||||
|
m[i] = maxAbsNumber
|
||||||
|
} else if x < -maxAbsNumber {
|
||||||
|
common.Log.Debug("FIXUP: %d -> %d", x, -maxAbsNumber)
|
||||||
|
m[i] = -maxAbsNumber
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// maxAbsNumber is the largest absolute value that fixup allows for a matrix element. Is this correct?
|
||||||
|
// TODO(gunnsth): Practical value? Need some reasoning.
|
||||||
|
const maxAbsNumber = 1e9
|
@ -7,8 +7,6 @@ package contentstream
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
|
||||||
"math"
|
|
||||||
|
|
||||||
"github.com/unidoc/unidoc/common"
|
"github.com/unidoc/unidoc/common"
|
||||||
"github.com/unidoc/unidoc/pdf/core"
|
"github.com/unidoc/unidoc/pdf/core"
|
||||||
@ -62,21 +60,26 @@ type HandlerEntry struct {
|
|||||||
Handler HandlerFunc
|
Handler HandlerFunc
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// HandlerConditionEnum represents the type of operand content stream processor.
|
||||||
|
// HandlerConditionEnumOperand handler handles a single operand, whereas
|
||||||
|
// HandlerConditionEnumAllOperands processes all operands.
|
||||||
type HandlerConditionEnum int
|
type HandlerConditionEnum int
|
||||||
|
|
||||||
func (csp HandlerConditionEnum) All() bool {
|
|
||||||
return csp == HandlerConditionEnumAllOperands
|
|
||||||
}
|
|
||||||
|
|
||||||
func (csp HandlerConditionEnum) Operand() bool {
|
|
||||||
return csp == HandlerConditionEnumOperand
|
|
||||||
}
|
|
||||||
|
|
||||||
const (
|
const (
|
||||||
HandlerConditionEnumOperand HandlerConditionEnum = iota
|
HandlerConditionEnumOperand HandlerConditionEnum = iota
|
||||||
HandlerConditionEnumAllOperands HandlerConditionEnum = iota
|
HandlerConditionEnumAllOperands HandlerConditionEnum = iota
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// All returns true if `hce` is equivalent to HandlerConditionEnumAllOperands.
|
||||||
|
func (hce HandlerConditionEnum) All() bool {
|
||||||
|
return hce == HandlerConditionEnumAllOperands
|
||||||
|
}
|
||||||
|
|
||||||
|
// Operand returns true if `hce` is equivalent to HandlerConditionEnumOperand.
|
||||||
|
func (hce HandlerConditionEnum) Operand() bool {
|
||||||
|
return hce == HandlerConditionEnumOperand
|
||||||
|
}
|
||||||
|
|
||||||
func NewContentStreamProcessor(ops []*ContentStreamOperation) *ContentStreamProcessor {
|
func NewContentStreamProcessor(ops []*ContentStreamOperation) *ContentStreamProcessor {
|
||||||
csp := ContentStreamProcessor{}
|
csp := ContentStreamProcessor{}
|
||||||
csp.graphicsStack = GraphicStateStack{}
|
csp.graphicsStack = GraphicStateStack{}
|
||||||
@ -573,144 +576,3 @@ func (proc *ContentStreamProcessor) handleCommand_cm(op *ContentStreamOperation,
|
|||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// Matrix is a linear transform matrix in homogenous coordinates.
|
|
||||||
// PDF coordinate transforms are always affine so we only need 6 of these. See newMatrix.
|
|
||||||
type Matrix [9]float64
|
|
||||||
|
|
||||||
// IdentityMatrix returns the identity transform.
|
|
||||||
func IdentityMatrix() Matrix {
|
|
||||||
return NewMatrix(1, 0, 0, 1, 0, 0)
|
|
||||||
}
|
|
||||||
|
|
||||||
// TranslationMatrix returns a matrix that translates by `tx`, `ty`.
|
|
||||||
func TranslationMatrix(tx, ty float64) Matrix {
|
|
||||||
return NewMatrix(1, 0, 0, 1, tx, ty)
|
|
||||||
}
|
|
||||||
|
|
||||||
// NewMatrix returns an affine transform matrix laid out in homogenous coordinates as
|
|
||||||
// a b 0
|
|
||||||
// c d 0
|
|
||||||
// tx ty 1
|
|
||||||
func NewMatrix(a, b, c, d, tx, ty float64) Matrix {
|
|
||||||
m := Matrix{
|
|
||||||
a, b, 0,
|
|
||||||
c, d, 0,
|
|
||||||
tx, ty, 1,
|
|
||||||
}
|
|
||||||
m.fixup()
|
|
||||||
return m
|
|
||||||
}
|
|
||||||
|
|
||||||
// String returns a string describing `m`.
|
|
||||||
func (m Matrix) String() string {
|
|
||||||
a, b, c, d, tx, ty := m[0], m[1], m[3], m[4], m[6], m[7]
|
|
||||||
return fmt.Sprintf("[%.4f,%.4f,%.4f,%.4f:%.4f,%.4f]", a, b, c, d, tx, ty)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Set sets `m` to affine transform a,b,c,d,tx,ty.
|
|
||||||
func (m *Matrix) Set(a, b, c, d, tx, ty float64) {
|
|
||||||
m[0], m[1] = a, b
|
|
||||||
m[3], m[4] = c, d
|
|
||||||
m[6], m[7] = tx, ty
|
|
||||||
m.fixup()
|
|
||||||
}
|
|
||||||
|
|
||||||
// Concat sets `m` to `m` × `b`.
|
|
||||||
// `b` needs to be created by newMatrix. i.e. It must be an affine transform.
|
|
||||||
// m00 m01 0 b00 b01 0 m00*b00 + m01*b01 m00*b10 + m01*b11 0
|
|
||||||
// m10 m11 0 × b10 b11 0 = m10*b00 + m11*b01 m10*b10 + m11*b11 0
|
|
||||||
// m20 m21 1 b20 b21 1 m20*b00 + m21*b10 + b20 m20*b01 + m21*b11 + b21 1
|
|
||||||
func (m *Matrix) Concat(b Matrix) {
|
|
||||||
*m = Matrix{
|
|
||||||
m[0]*b[0] + m[1]*b[3], m[0]*b[1] + m[1]*b[4], 0,
|
|
||||||
m[3]*b[0] + m[4]*b[3], m[3]*b[1] + m[4]*b[4], 0,
|
|
||||||
m[6]*b[0] + m[7]*b[3] + b[6], m[6]*b[1] + m[7]*b[4] + b[7], 1,
|
|
||||||
}
|
|
||||||
m.fixup()
|
|
||||||
}
|
|
||||||
|
|
||||||
// Mult returns `m` × `b`.
|
|
||||||
func (m Matrix) Mult(b Matrix) Matrix {
|
|
||||||
m.Concat(b)
|
|
||||||
return m
|
|
||||||
}
|
|
||||||
|
|
||||||
// Translate appends a translation of `dx`,`dy` to `m`.
|
|
||||||
// m.Translate(dx, dy) is equivalent to m.Concat(NewMatrix(1, 0, 0, 1, dx, dy))
|
|
||||||
func (m *Matrix) Translate(dx, dy float64) {
|
|
||||||
m[6] += dx
|
|
||||||
m[7] += dy
|
|
||||||
m.fixup()
|
|
||||||
}
|
|
||||||
|
|
||||||
// Translation returns the translation part of `m`.
|
|
||||||
func (m *Matrix) Translation() (float64, float64) {
|
|
||||||
return m[6], m[7]
|
|
||||||
}
|
|
||||||
|
|
||||||
// Translation returns the translation part of `m`.
|
|
||||||
func (m *Matrix) ScalingX() float64 {
|
|
||||||
return math.Hypot(m[0], m[1])
|
|
||||||
}
|
|
||||||
|
|
||||||
// Transform returns coordinates `x`,`y` transformed by `m`.
|
|
||||||
func (m *Matrix) Transform(x, y float64) (float64, float64) {
|
|
||||||
xp := x*m[0] + y*m[1] + m[6]
|
|
||||||
yp := x*m[3] + y*m[4] + m[7]
|
|
||||||
return xp, yp
|
|
||||||
}
|
|
||||||
|
|
||||||
// ScalingFactorX returns X scaling of the affine transform.
|
|
||||||
func (m *Matrix) ScalingFactorX() float64 {
|
|
||||||
return math.Sqrt(m[0]*m[0] + m[1]*m[1])
|
|
||||||
}
|
|
||||||
|
|
||||||
// ScalingFactorY returns X scaling of the affine transform.
|
|
||||||
func (m *Matrix) ScalingFactorY() float64 {
|
|
||||||
return math.Sqrt(m[3]*m[3] + m[4]*m[4])
|
|
||||||
}
|
|
||||||
|
|
||||||
// Angle returns the angle of the affine transform.
|
|
||||||
// For simplicity, we assume the transform is a multiple of 90 degrees.
|
|
||||||
func (m *Matrix) Angle() int {
|
|
||||||
a, b, c, d := m[0], m[1], m[3], m[4]
|
|
||||||
// We are returning θ for
|
|
||||||
// a b cos θ -sin θ
|
|
||||||
// c d = sin θ cos θ
|
|
||||||
if a > 0 && d > 0 {
|
|
||||||
// 1 0
|
|
||||||
// 0 1
|
|
||||||
return 0
|
|
||||||
} else if b < 0 && c > 0 {
|
|
||||||
// 0 1
|
|
||||||
// -1 0
|
|
||||||
return 90
|
|
||||||
} else if a < 0 && d < 0 {
|
|
||||||
// -1 0
|
|
||||||
// 0 -1
|
|
||||||
return 180
|
|
||||||
} else if b > 0 && c < 0 {
|
|
||||||
// 0 -1
|
|
||||||
// 1 0
|
|
||||||
return 270
|
|
||||||
}
|
|
||||||
common.Log.Debug("ERROR: Angle not a mulitple of 90°. m=%s", m)
|
|
||||||
return 0
|
|
||||||
}
|
|
||||||
|
|
||||||
// fixup forces `m` to have reasonable values. It is a guard against crazy values in corrupt PDF
|
|
||||||
// files.
|
|
||||||
// Currently it clamps elements to [-maxAbsNumber, -maxAbsNumber] to avoid floating point exceptions.
|
|
||||||
func (m *Matrix) fixup() {
|
|
||||||
for i, x := range m {
|
|
||||||
if x > maxAbsNumber {
|
|
||||||
m[i] = maxAbsNumber
|
|
||||||
} else if x < -maxAbsNumber {
|
|
||||||
m[i] = -maxAbsNumber
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// largest numbers needed in PDF transforms. Is this correct?
|
|
||||||
const maxAbsNumber = 1e9
|
|
||||||
|
@ -5,7 +5,7 @@
|
|||||||
* Based on pdf/contentstream/draw/point.go
|
* Based on pdf/contentstream/draw/point.go
|
||||||
*/
|
*/
|
||||||
|
|
||||||
// XXX(peterwilliams97) Change to functional style. i.e. Return new value, don't mutate.
|
// FIXME(peterwilliams97) Change to functional style. i.e. Return new value, don't mutate.
|
||||||
|
|
||||||
package extractor
|
package extractor
|
||||||
|
|
||||||
@ -16,18 +16,18 @@ import (
|
|||||||
"github.com/unidoc/unidoc/pdf/contentstream"
|
"github.com/unidoc/unidoc/pdf/contentstream"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Point defines a point in Cartesian coordinates
|
// Point defines a point (X,Y) in Cartesian coordinates.
|
||||||
type Point struct {
|
type Point struct {
|
||||||
X float64
|
X float64
|
||||||
Y float64
|
Y float64
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewPoint returns a Point at 'x', 'y'.
|
// NewPoint returns a Point at `x`, `y`.
|
||||||
func NewPoint(x, y float64) Point {
|
func NewPoint(x, y float64) Point {
|
||||||
return Point{X: x, Y: y}
|
return Point{X: x, Y: y}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Set sets `p` to `x`, `y`.
|
// Set sets `p` to coordinates `(x, y)`.
|
||||||
func (p *Point) Set(x, y float64) {
|
func (p *Point) Set(x, y float64) {
|
||||||
p.X, p.Y = x, y
|
p.X, p.Y = x, y
|
||||||
}
|
}
|
||||||
@ -38,12 +38,12 @@ func (p *Point) Transform(a, b, c, d, tx, ty float64) {
|
|||||||
p.transformByMatrix(m)
|
p.transformByMatrix(m)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Displace returns `p` displaced by `delta`.
|
// Displace returns a new Point at location `p` + `delta`.
|
||||||
func (p Point) Displace(delta Point) Point {
|
func (p Point) Displace(delta Point) Point {
|
||||||
return Point{p.X + delta.X, p.Y + delta.Y}
|
return Point{p.X + delta.X, p.Y + delta.Y}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Rotate returns `p` rotated by `theta` degrees.
|
// Rotate returns `p` rotated by `theta` degrees.
|
||||||
func (p Point) Rotate(theta int) Point {
|
func (p Point) Rotate(theta int) Point {
|
||||||
switch theta {
|
switch theta {
|
||||||
case 0:
|
case 0:
|
||||||
|
@ -27,13 +27,13 @@ import (
|
|||||||
// CharcodeBytesToUnicode.
|
// CharcodeBytesToUnicode.
|
||||||
// Characters that can't be decoded are replaced with MissingCodeRune ('\ufffd' = <20>).
|
// Characters that can't be decoded are replaced with MissingCodeRune ('\ufffd' = <20>).
|
||||||
func (e *Extractor) ExtractText() (string, error) {
|
func (e *Extractor) ExtractText() (string, error) {
|
||||||
text, _, _, err := e.ExtractText2()
|
text, _, _, err := e.ExtractTextWithStats()
|
||||||
return text, err
|
return text, err
|
||||||
}
|
}
|
||||||
|
|
||||||
// ExtractText2 works like ExtractText but returns the number of characters in the output and the
|
// ExtractTextWithStats works like ExtractText but returns the number of characters in the output and
|
||||||
// the number of characters that were not decoded.
|
// the number of characters that were not decoded.
|
||||||
func (e *Extractor) ExtractText2() (string, int, int, error) {
|
func (e *Extractor) ExtractTextWithStats() (string, int, int, error) {
|
||||||
textList, numChars, numMisses, err := e.ExtractXYText()
|
textList, numChars, numMisses, err := e.ExtractXYText()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", numChars, numMisses, err
|
return "", numChars, numMisses, err
|
||||||
@ -313,6 +313,10 @@ func (to *textObject) nextLine() {
|
|||||||
// Set the text matrix, Tm, and the text line matrix, Tlm to the Matrix specified by the 6 numbers
|
// Set the text matrix, Tm, and the text line matrix, Tlm to the Matrix specified by the 6 numbers
|
||||||
// in `f` (page 250).
|
// in `f` (page 250).
|
||||||
func (to *textObject) setTextMatrix(f []float64) {
|
func (to *textObject) setTextMatrix(f []float64) {
|
||||||
|
if len(f) != 6 {
|
||||||
|
common.Log.Debug("ERROR: len(f) != 6 (%d)", len(f))
|
||||||
|
return
|
||||||
|
}
|
||||||
a, b, c, d, tx, ty := f[0], f[1], f[2], f[3], f[4], f[5]
|
a, b, c, d, tx, ty := f[0], f[1], f[2], f[3], f[4], f[5]
|
||||||
to.Tm = contentstream.NewMatrix(a, b, c, d, tx, ty)
|
to.Tm = contentstream.NewMatrix(a, b, c, d, tx, ty)
|
||||||
to.Tlm = to.Tm
|
to.Tlm = to.Tm
|
||||||
@ -358,7 +362,7 @@ func (to *textObject) showTextAdjusted(args *core.PdfObjectArray) error {
|
|||||||
|
|
||||||
// setTextLeading "TL". Set text leading.
|
// setTextLeading "TL". Set text leading.
|
||||||
func (to *textObject) setTextLeading(y float64) {
|
func (to *textObject) setTextLeading(y float64) {
|
||||||
if to == nil {
|
if to == nil || to.State == nil {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
to.State.Tl = y
|
to.State.Tl = y
|
||||||
@ -427,7 +431,7 @@ func (to *textObject) setHorizScaling(y float64) {
|
|||||||
to.State.Th = y
|
to.State.Th = y
|
||||||
}
|
}
|
||||||
|
|
||||||
// floatParam returns the single float parameter of operatr `op`, or an error if it doesn't have
|
// floatParam returns the single float parameter of operator `op`, or an error if it doesn't have
|
||||||
// a single float parameter or we aren't in a text stream.
|
// a single float parameter or we aren't in a text stream.
|
||||||
func floatParam(op *contentstream.ContentStreamOperation) (float64, error) {
|
func floatParam(op *contentstream.ContentStreamOperation) (float64, error) {
|
||||||
if len(op.Params) != 1 {
|
if len(op.Params) != 1 {
|
||||||
@ -444,7 +448,7 @@ func floatParam(op *contentstream.ContentStreamOperation) (float64, error) {
|
|||||||
func (to *textObject) checkOp(op *contentstream.ContentStreamOperation, numParams int,
|
func (to *textObject) checkOp(op *contentstream.ContentStreamOperation, numParams int,
|
||||||
hard bool) (ok bool, err error) {
|
hard bool) (ok bool, err error) {
|
||||||
if to == nil {
|
if to == nil {
|
||||||
params := []core.PdfObject{}
|
var params []core.PdfObject
|
||||||
if numParams > 0 {
|
if numParams > 0 {
|
||||||
params = op.Params
|
params = op.Params
|
||||||
if len(params) > numParams {
|
if len(params) > numParams {
|
||||||
@ -596,7 +600,7 @@ func newTextObject(e *Extractor, gs contentstream.GraphicsState, state *textStat
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// renderText emits byte array `data` to the calling program.
|
// renderText processes and renders byte array `data` for extraction purposes.
|
||||||
func (to *textObject) renderText(data []byte) error {
|
func (to *textObject) renderText(data []byte) error {
|
||||||
font := to.getCurrentFont()
|
font := to.getCurrentFont()
|
||||||
|
|
||||||
@ -628,7 +632,6 @@ func (to *textObject) renderText(data []byte) error {
|
|||||||
common.Log.Trace("renderText: %d codes=%+v runes=%q", len(charcodes), charcodes, runes)
|
common.Log.Trace("renderText: %d codes=%+v runes=%q", len(charcodes), charcodes, runes)
|
||||||
|
|
||||||
for i, r := range runes {
|
for i, r := range runes {
|
||||||
|
|
||||||
// XXX(peterwilliams97) Need to find and fix cases where this happens.
|
// XXX(peterwilliams97) Need to find and fix cases where this happens.
|
||||||
if r == "\x00" {
|
if r == "\x00" {
|
||||||
continue
|
continue
|
||||||
@ -784,7 +787,7 @@ func (tl TextList) ToText() string {
|
|||||||
tl.SortPosition()
|
tl.SortPosition()
|
||||||
|
|
||||||
lines := tl.toLines()
|
lines := tl.toLines()
|
||||||
texts := []string{}
|
texts := make([]string, 0, len(lines))
|
||||||
for _, l := range lines {
|
for _, l := range lines {
|
||||||
texts = append(texts, l.Text)
|
texts = append(texts, l.Text)
|
||||||
}
|
}
|
||||||
@ -825,11 +828,11 @@ type Line struct {
|
|||||||
func (tl TextList) toLines() []Line {
|
func (tl TextList) toLines() []Line {
|
||||||
// We divide `tl` into slices which contain texts with the same orientation, extract the lines
|
// We divide `tl` into slices which contain texts with the same orientation, extract the lines
|
||||||
// for each orientation then return the concatention of these lines sorted by orientation.
|
// for each orientation then return the concatenation of these lines sorted by orientation.
|
||||||
tlOrient := map[int]TextList{}
|
tlOrient := make(map[int]TextList, len(tl))
|
||||||
for _, t := range tl {
|
for _, t := range tl {
|
||||||
tlOrient[t.Orient] = append(tlOrient[t.Orient], t)
|
tlOrient[t.Orient] = append(tlOrient[t.Orient], t)
|
||||||
}
|
}
|
||||||
lines := []Line{}
|
var lines []Line
|
||||||
for _, o := range orientKeys(tlOrient) {
|
for _, o := range orientKeys(tlOrient) {
|
||||||
lines = append(lines, tlOrient[o].toLinesOrient()...)
|
lines = append(lines, tlOrient[o].toLinesOrient()...)
|
||||||
}
|
}
|
||||||
@ -846,15 +849,15 @@ func (tl TextList) toLinesOrient() []Line {
|
|||||||
if len(tl) == 0 {
|
if len(tl) == 0 {
|
||||||
return []Line{}
|
return []Line{}
|
||||||
}
|
}
|
||||||
lines := []Line{}
|
var lines []Line
|
||||||
words := []string{}
|
var words []string
|
||||||
x := []float64{}
|
var x []float64
|
||||||
y := tl[0].OrientedStart.Y
|
y := tl[0].OrientedStart.Y
|
||||||
|
|
||||||
scanning := false
|
scanning := false
|
||||||
|
|
||||||
averageCharWidth := ExponAve{}
|
averageCharWidth := exponAve{}
|
||||||
wordSpacing := ExponAve{}
|
wordSpacing := exponAve{}
|
||||||
lastEndX := 0.0 // lastEndX is tl[i-1].OrientedEnd.X
|
lastEndX := 0.0 // lastEndX is tl[i-1].OrientedEnd.X
|
||||||
|
|
||||||
for _, t := range tl {
|
for _, t := range tl {
|
||||||
@ -889,13 +892,13 @@ func (tl TextList) toLinesOrient() []Line {
|
|||||||
deltaCharWidth := averageCharWidth.ave * 0.3
|
deltaCharWidth := averageCharWidth.ave * 0.3
|
||||||
|
|
||||||
isSpace := false
|
isSpace := false
|
||||||
nextWordX := lastEndX + min(deltaSpace, deltaCharWidth)
|
nextWordX := lastEndX + minFloat(deltaSpace, deltaCharWidth)
|
||||||
if scanning && t.Text != " " {
|
if scanning && t.Text != " " {
|
||||||
isSpace = nextWordX < t.OrientedStart.X
|
isSpace = nextWordX < t.OrientedStart.X
|
||||||
}
|
}
|
||||||
common.Log.Trace("t=%s", t)
|
common.Log.Trace("t=%s", t)
|
||||||
common.Log.Trace("width=%.2f delta=%.2f deltaSpace=%.2g deltaCharWidth=%.2g",
|
common.Log.Trace("width=%.2f delta=%.2f deltaSpace=%.2g deltaCharWidth=%.2g",
|
||||||
t.Width(), min(deltaSpace, deltaCharWidth), deltaSpace, deltaCharWidth)
|
t.Width(), minFloat(deltaSpace, deltaCharWidth), deltaSpace, deltaCharWidth)
|
||||||
common.Log.Trace("%+q [%.1f, %.1f] lastEndX=%.2f nextWordX=%.2f (%.2f) isSpace=%t",
|
common.Log.Trace("%+q [%.1f, %.1f] lastEndX=%.2f nextWordX=%.2f (%.2f) isSpace=%t",
|
||||||
t.Text, t.OrientedStart.X, t.OrientedStart.Y, lastEndX, nextWordX,
|
t.Text, t.OrientedStart.X, t.OrientedStart.Y, lastEndX, nextWordX,
|
||||||
nextWordX-t.OrientedStart.X, isSpace)
|
nextWordX-t.OrientedStart.X, isSpace)
|
||||||
@ -940,14 +943,14 @@ func min(a, b float64) float64 {
|
|||||||
return b
|
return b
|
||||||
}
|
}
|
||||||
|
|
||||||
// ExponAve implements an exponential average.
|
// exponAve implements an exponential average.
|
||||||
type ExponAve struct {
|
type exponAve struct {
|
||||||
ave float64 // Current average value.
|
ave float64 // Current average value.
|
||||||
running bool // Has `ave` been set?
|
running bool // Has `ave` been set?
|
||||||
}
|
}
|
||||||
|
|
||||||
// update updates the exponential average `exp.ave` and returns it
|
// update updates the exponential average `exp.ave` and returns it
|
||||||
func (exp *ExponAve) update(x float64) float64 {
|
func (exp *exponAve) update(x float64) float64 {
|
||||||
if !exp.running {
|
if !exp.running {
|
||||||
exp.ave = x
|
exp.ave = x
|
||||||
exp.running = true
|
exp.running = true
|
||||||
@ -957,9 +960,15 @@ func (exp *ExponAve) update(x float64) float64 {
|
|||||||
return exp.ave
|
return exp.ave
|
||||||
}
|
}
|
||||||
|
|
||||||
// printTexts is a debugging function. XXX(peterwilliams97) Remove this.
|
const isDebug = false
|
||||||
|
|
||||||
|
// printTexts is a debugging function.
|
||||||
|
// TODO(peterwilliams97) Remove this.
|
||||||
func (tl *TextList) printTexts(message string) {
|
func (tl *TextList) printTexts(message string) {
|
||||||
return
|
if !isDebug {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
_, file, line, ok := runtime.Caller(1)
|
_, file, line, ok := runtime.Caller(1)
|
||||||
if !ok {
|
if !ok {
|
||||||
file = "???"
|
file = "???"
|
||||||
@ -985,7 +994,7 @@ func (tl *TextList) printTexts(message string) {
|
|||||||
// newLine returns the Line representation of strings `words` with y coordinate `y` and x
|
// newLine returns the Line representation of strings `words` with y coordinate `y` and x
|
||||||
// coordinates `x`.
|
// coordinates `x`.
|
||||||
func newLine(y float64, x []float64, words []string) Line {
|
func newLine(y float64, x []float64, words []string) Line {
|
||||||
dx := []float64{}
|
dx := make([]float64, 0, len(x))
|
||||||
for i := 1; i < len(x); i++ {
|
for i := 1; i < len(x); i++ {
|
||||||
dx = append(dx, x[i]-x[i-1])
|
dx = append(dx, x[i]-x[i-1])
|
||||||
}
|
}
|
||||||
@ -1211,18 +1220,8 @@ type fontEntry struct {
|
|||||||
const maxFontCache = 10
|
const maxFontCache = 10
|
||||||
|
|
||||||
// getFontDirect returns the font named `name` if it exists in the page's resources or an error if
|
// getFontDirect returns the font named `name` if it exists in the page's resources or an error if
|
||||||
// is doesn't.
|
// it doesn't. Accesses page resources directly (not cached).
|
||||||
// This is a direct (uncached access).
|
|
||||||
func (to *textObject) getFontDirect(name string) (*model.PdfFont, error) {
|
func (to *textObject) getFontDirect(name string) (*model.PdfFont, error) {
|
||||||
|
|
||||||
// This is a hack for testing.
|
|
||||||
switch name {
|
|
||||||
case "UniDocCourier":
|
|
||||||
return model.NewStandard14FontMustCompile(model.Courier), nil
|
|
||||||
case "UniDocHelvetica":
|
|
||||||
return model.NewStandard14FontMustCompile(model.Helvetica), nil
|
|
||||||
}
|
|
||||||
|
|
||||||
fontObj, err := to.getFontDict(name)
|
fontObj, err := to.getFontDict(name)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
|
@ -8,7 +8,6 @@ package extractor
|
|||||||
import (
|
import (
|
||||||
"flag"
|
"flag"
|
||||||
"os"
|
"os"
|
||||||
"os/user"
|
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"regexp"
|
"regexp"
|
||||||
"sort"
|
"sort"
|
||||||
@ -20,18 +19,14 @@ import (
|
|||||||
"golang.org/x/text/unicode/norm"
|
"golang.org/x/text/unicode/norm"
|
||||||
)
|
)
|
||||||
|
|
||||||
// XXX(peterwilliams97) NOTE: We do a best effort at finding the PDF file because we don't keep PDF
|
// NOTE: We do a best effort at finding the PDF file because we don't keep PDF test files in this repo so you
|
||||||
// test files in this repo so you will need to setup `corpusFolders` to point at the corpus directory.
|
// will need to setup UNIDOC_EXTRACT_TESTDATA to point at the corpus directory.
|
||||||
|
|
||||||
// forceTest should be set to true to force running all tests.
|
// forceTest should be set to true to force running all tests.
|
||||||
const forceTest = false
|
// NOTE: Setting environment variable UNIDOC_EXTRACT_FORCETEST = 1 sets this to true.
|
||||||
|
var forceTest = os.Getenv("UNIDOC_EXTRACT_FORCETEST") == "1"
|
||||||
|
|
||||||
// corpusFolders is where we search for test files.
|
var corpusFolder = os.Getenv("UNIDOC_EXTRACT_TESTDATA")
|
||||||
var corpusFolders = []string{
|
|
||||||
"./testdata",
|
|
||||||
"~/testdata",
|
|
||||||
".",
|
|
||||||
}
|
|
||||||
|
|
||||||
func init() {
|
func init() {
|
||||||
common.SetLogger(common.NewConsoleLogger(common.LogLevelError))
|
common.SetLogger(common.NewConsoleLogger(common.LogLevelError))
|
||||||
@ -40,23 +35,16 @@ func init() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// TestTextExtraction1 tests text extraction on the PDF fragments in `fragmentTests`.
|
// TestTextExtractionFragments tests text extraction on the PDF fragments in `fragmentTests`.
|
||||||
func TestTextExtraction1(t *testing.T) {
|
func TestTextExtractionFragments(t *testing.T) {
|
||||||
for _, f := range fragmentTests {
|
fragmentTests := []struct {
|
||||||
f.testExtraction(t)
|
name string
|
||||||
}
|
contents string
|
||||||
}
|
text string
|
||||||
|
}{
|
||||||
type fragment struct {
|
{
|
||||||
name string
|
name: "portrait",
|
||||||
contents string
|
contents: `
|
||||||
text string
|
|
||||||
}
|
|
||||||
|
|
||||||
var fragmentTests = []fragment{
|
|
||||||
|
|
||||||
{name: "portrait",
|
|
||||||
contents: `
|
|
||||||
BT
|
BT
|
||||||
/UniDocCourier 24 Tf
|
/UniDocCourier 24 Tf
|
||||||
(Hello World!)Tj
|
(Hello World!)Tj
|
||||||
@ -64,10 +52,11 @@ var fragmentTests = []fragment{
|
|||||||
(Doink)Tj
|
(Doink)Tj
|
||||||
ET
|
ET
|
||||||
`,
|
`,
|
||||||
text: "Hello World!\nDoink",
|
text: "Hello World!\nDoink",
|
||||||
},
|
},
|
||||||
{name: "landscape",
|
{
|
||||||
contents: `
|
name: "landscape",
|
||||||
|
contents: `
|
||||||
BT
|
BT
|
||||||
/UniDocCourier 24 Tf
|
/UniDocCourier 24 Tf
|
||||||
0 1 -1 0 0 0 Tm
|
0 1 -1 0 0 0 Tm
|
||||||
@ -76,10 +65,11 @@ var fragmentTests = []fragment{
|
|||||||
(Doink)Tj
|
(Doink)Tj
|
||||||
ET
|
ET
|
||||||
`,
|
`,
|
||||||
text: "Hello World!\nDoink",
|
text: "Hello World!\nDoink",
|
||||||
},
|
},
|
||||||
{name: "180 degree rotation",
|
{
|
||||||
contents: `
|
name: "180 degree rotation",
|
||||||
|
contents: `
|
||||||
BT
|
BT
|
||||||
/UniDocCourier 24 Tf
|
/UniDocCourier 24 Tf
|
||||||
-1 0 0 -1 0 0 Tm
|
-1 0 0 -1 0 0 Tm
|
||||||
@ -88,10 +78,11 @@ var fragmentTests = []fragment{
|
|||||||
(Doink)Tj
|
(Doink)Tj
|
||||||
ET
|
ET
|
||||||
`,
|
`,
|
||||||
text: "Hello World!\nDoink",
|
text: "Hello World!\nDoink",
|
||||||
},
|
},
|
||||||
{name: "Helvetica",
|
{
|
||||||
contents: `
|
name: "Helvetica",
|
||||||
|
contents: `
|
||||||
BT
|
BT
|
||||||
/UniDocHelvetica 24 Tf
|
/UniDocHelvetica 24 Tf
|
||||||
0 -1 1 0 0 0 Tm
|
0 -1 1 0 0 0 Tm
|
||||||
@ -100,35 +91,53 @@ var fragmentTests = []fragment{
|
|||||||
(Doink)Tj
|
(Doink)Tj
|
||||||
ET
|
ET
|
||||||
`,
|
`,
|
||||||
text: "Hello World!\nDoink",
|
text: "Hello World!\nDoink",
|
||||||
},
|
},
|
||||||
}
|
|
||||||
|
|
||||||
// testExtraction checks that ExtractText() works on fragment `f`.
|
|
||||||
func (f fragment) testExtraction(t *testing.T) {
|
|
||||||
e := Extractor{contents: f.contents}
|
|
||||||
text, err := e.ExtractText()
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("Error extracting text: %q err=%v", f.name, err)
|
|
||||||
return
|
|
||||||
}
|
}
|
||||||
if text != f.text {
|
|
||||||
t.Fatalf("Text mismatch: %q Got %q. Expected %q", f.name, text, f.text)
|
// Setup mock resources.
|
||||||
return
|
resources := model.NewPdfPageResources()
|
||||||
|
{
|
||||||
|
courier := model.NewStandard14FontMustCompile(model.Courier)
|
||||||
|
helvetica := model.NewStandard14FontMustCompile(model.Helvetica)
|
||||||
|
resources.SetFontByName("UniDocHelvetica", helvetica.ToPdfObject())
|
||||||
|
resources.SetFontByName("UniDocCourier", courier.ToPdfObject())
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, f := range fragmentTests {
|
||||||
|
t.Run(f.name, func(t *testing.T) {
|
||||||
|
e := Extractor{resources: resources, contents: f.contents}
|
||||||
|
text, err := e.ExtractText()
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Error extracting text: %q err=%v", f.name, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if text != f.text {
|
||||||
|
t.Fatalf("Text mismatch: %q Got %q. Expected %q", f.name, text, f.text)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// TestTextExtraction2 tests text extraction on set of PDF files.
|
// TestTextExtractionFiles tests text extraction on a set of PDF files.
|
||||||
// It checks for the existence of specified strings of words on specified pages.
|
// It checks for the existence of specified strings of words on specified pages.
|
||||||
// We currently only check within lines as our line order is still improving.
|
// We currently only check within lines as our line order is still improving.
|
||||||
func TestTextExtraction2(t *testing.T) {
|
func TestTextExtractionFiles(t *testing.T) {
|
||||||
for _, test := range extract2Tests {
|
if len(corpusFolder) == 0 && !forceTest {
|
||||||
testExtract2(t, test.filename, test.expectedPageText)
|
t.Log("Corpus folder not set - skipping")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, test := range fileExtractionTests {
|
||||||
|
t.Run(test.filename, func(t *testing.T) {
|
||||||
|
testExtractFile(t, test.filename, test.expectedPageText)
|
||||||
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// extract2Tests are the PDFs and texts we are looking for on specified pages.
|
// fileExtractionTests are the PDFs and texts we are looking for on specified pages.
|
||||||
var extract2Tests = []struct {
|
var fileExtractionTests = []struct {
|
||||||
filename string
|
filename string
|
||||||
expectedPageText map[int][]string
|
expectedPageText map[int][]string
|
||||||
}{
|
}{
|
||||||
@ -216,21 +225,27 @@ var extract2Tests = []struct {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
// testExtract2 tests the ExtractText2 text extractor on `filename` and compares the extracted
|
// testExtractFile tests the ExtractTextWithStats text extractor on `filename` and compares the extracted
|
||||||
// text to `expectedPageText`.
|
// text to `expectedPageText`.
|
||||||
// XXX(peterwilliams97) NOTE: We do a best effort at finding the PDF file because we don't keep PDF
|
//
|
||||||
// test files in this repo so you will need to setup `corpusFolders` to point at the corpus directory.
|
// NOTE: We do a best effort at finding the PDF file because we don't keep PDF test files in this repo
|
||||||
// If `filename` cannot be found in `corpusFolders` then the test is skipped.
|
// so you will need to set the environment variable UNIDOC_EXTRACT_TESTDATA to point at
|
||||||
func testExtract2(t *testing.T, filename string, expectedPageText map[int][]string) {
|
// the corpus directory.
|
||||||
homeDir, hasHome := getHomeDir()
|
//
|
||||||
path, ok := searchDirectories(homeDir, hasHome, corpusFolders, filename)
|
// If `filename` cannot be found in `corpusFolder` then the test is skipped unless `forceTest` global
|
||||||
if !ok {
|
// variable is true (e.g. setting environment variable UNIDOC_EXTRACT_FORCETESTS = 1).
|
||||||
|
func testExtractFile(t *testing.T, filename string, expectedPageText map[int][]string) {
|
||||||
|
filepath := filepath.Join(corpusFolder, filename)
|
||||||
|
exists := checkFileExists(filepath)
|
||||||
|
if !exists {
|
||||||
if forceTest {
|
if forceTest {
|
||||||
t.Fatalf("filename=%q does not exist", filename)
|
t.Fatalf("filename=%q does not exist", filename)
|
||||||
}
|
}
|
||||||
|
t.Logf("%s not found", filename)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
_, actualPageText := extractPageTexts(t, path)
|
|
||||||
|
_, actualPageText := extractPageTexts(t, filepath)
|
||||||
for _, pageNum := range sortedKeys(expectedPageText) {
|
for _, pageNum := range sortedKeys(expectedPageText) {
|
||||||
expectedSentences, ok := expectedPageText[pageNum]
|
expectedSentences, ok := expectedPageText[pageNum]
|
||||||
actualText, ok := actualPageText[pageNum]
|
actualText, ok := actualPageText[pageNum]
|
||||||
@ -239,12 +254,12 @@ func testExtract2(t *testing.T, filename string, expectedPageText map[int][]stri
|
|||||||
}
|
}
|
||||||
actualText = norm.NFKC.String(actualText)
|
actualText = norm.NFKC.String(actualText)
|
||||||
if !containsSentences(t, expectedSentences, actualText) {
|
if !containsSentences(t, expectedSentences, actualText) {
|
||||||
t.Fatalf("Text mismatch filename=%q page=%d", path, pageNum)
|
t.Fatalf("Text mismatch filepath=%q page=%d", filepath, pageNum)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// extractPageTexts runs ExtractText2 on all pages in PDF `filename` and returns the result as a map
|
// extractPageTexts runs ExtractTextWithStats on all pages in PDF `filename` and returns the result as a map
|
||||||
// {page number: page text}
|
// {page number: page text}
|
||||||
func extractPageTexts(t *testing.T, filename string) (int, map[int]string) {
|
func extractPageTexts(t *testing.T, filename string) (int, map[int]string) {
|
||||||
f, err := os.Open(filename)
|
f, err := os.Open(filename)
|
||||||
@ -272,11 +287,11 @@ func extractPageTexts(t *testing.T, filename string) (int, map[int]string) {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatalf("extractor.New failed. filename=%q page=%d err=%v", filename, pageNum, err)
|
t.Fatalf("extractor.New failed. filename=%q page=%d err=%v", filename, pageNum, err)
|
||||||
}
|
}
|
||||||
text, _, _, err := ex.ExtractText2()
|
text, _, _, err := ex.ExtractTextWithStats()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatalf("ExtractText2 failed. filename=%q page=%d err=%v", filename, pageNum, err)
|
t.Fatalf("ExtractTextWithStats failed. filename=%q page=%d err=%v", filename, pageNum, err)
|
||||||
}
|
}
|
||||||
// XXX(peterwilliams97)TODO: Improve text extraction space insertion so we don't need reduceSpaces.
|
// TODO(peterwilliams97): Improve text extraction space insertion so we don't need reduceSpaces.
|
||||||
pageText[pageNum] = reduceSpaces(text)
|
pageText[pageNum] = reduceSpaces(text)
|
||||||
}
|
}
|
||||||
return numPages, pageText
|
return numPages, pageText
|
||||||
@ -303,30 +318,10 @@ func reduceSpaces(text string) string {
|
|||||||
|
|
||||||
var reSpace = regexp.MustCompile(`(?m)\s+`)
|
var reSpace = regexp.MustCompile(`(?m)\s+`)
|
||||||
|
|
||||||
// searchDirectories searches `directories` for `filename` and returns the full file path if it is
|
// checkFileExists returns true if `filepath` exists.
|
||||||
// found. `homeDir` and `hasHome` are used for home directory substitution.
|
func checkFileExists(filepath string) bool {
|
||||||
func searchDirectories(homeDir string, hasHome bool, directories []string, filename string) (string, bool) {
|
_, err := os.Stat(filepath)
|
||||||
for _, direct := range directories {
|
return err == nil
|
||||||
if hasHome {
|
|
||||||
direct = strings.Replace(direct, "~", homeDir, 1)
|
|
||||||
}
|
|
||||||
path := filepath.Join(direct, filename)
|
|
||||||
if _, err := os.Stat(path); err == nil {
|
|
||||||
return path, true
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return "", false
|
|
||||||
}
|
|
||||||
|
|
||||||
// getHomeDir returns the current user's home directory if it is defined and a bool to tell if it
|
|
||||||
// is defined.
|
|
||||||
func getHomeDir() (string, bool) {
|
|
||||||
usr, err := user.Current()
|
|
||||||
if err != nil {
|
|
||||||
common.Log.Error("No current user. err=%v", err)
|
|
||||||
return "", false
|
|
||||||
}
|
|
||||||
return usr.HomeDir, true
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// sortedKeys returns the keys of `m` as a sorted slice.
|
// sortedKeys returns the keys of `m` as a sorted slice.
|
||||||
|
@ -18,26 +18,30 @@ import (
|
|||||||
"github.com/unidoc/unidoc/pdf/model/fonts"
|
"github.com/unidoc/unidoc/pdf/model/fonts"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Font represents a font which is a series of glyphs. Character codes from PDF strings can be
|
|
||||||
// mapped to and from glyphs. Each glyph has metrics.
|
|
||||||
// XXX: FIXME (peterwilliams97) HACK to add GetCharMetrics() for fonts other than standard 14
|
|
||||||
// Remove this hack.
|
|
||||||
type Font interface {
|
|
||||||
Encoder() textencoding.TextEncoder
|
|
||||||
SetEncoder(encoder textencoding.TextEncoder)
|
|
||||||
GetGlyphCharMetrics(glyph string) (fonts.CharMetrics, bool)
|
|
||||||
GetCharMetrics(code uint16) (fonts.CharMetrics, bool)
|
|
||||||
GetAverageCharWidth() float64 // XXX(peterwilliams97) Not used. Remove.
|
|
||||||
ToPdfObject() core.PdfObject
|
|
||||||
}
|
|
||||||
|
|
||||||
// PdfFont represents an underlying font structure which can be of type:
|
// PdfFont represents an underlying font structure which can be of type:
|
||||||
// - Type0
|
// - Type0
|
||||||
// - Type1
|
// - Type1
|
||||||
// - TrueType
|
// - TrueType
|
||||||
// etc.
|
// etc.
|
||||||
type PdfFont struct {
|
type PdfFont struct {
|
||||||
context Font // The underlying font: Type0, Type1, Truetype, etc..
|
context fonts.Font // The underlying font: Type0, Type1, Truetype, etc..
|
||||||
|
}
|
||||||
|
|
||||||
|
// getCharCodeMetrics is a handy function for getting character metrics given a charcode.
|
||||||
|
func (font PdfFont) getCharCodeMetrics(code uint16) (fonts.CharMetrics, bool) {
|
||||||
|
var nometrics fonts.CharMetrics
|
||||||
|
|
||||||
|
enc := font.Encoder()
|
||||||
|
if enc == nil {
|
||||||
|
return nometrics, false
|
||||||
|
}
|
||||||
|
|
||||||
|
glyph, found := enc.CharcodeToGlyph(code)
|
||||||
|
if !found {
|
||||||
|
return nometrics, false
|
||||||
|
}
|
||||||
|
|
||||||
|
return font.GetGlyphCharMetrics(glyph)
|
||||||
}
|
}
|
||||||
|
|
||||||
// GetFontDescriptor returns the font descriptor for `font`.
|
// GetFontDescriptor returns the font descriptor for `font`.
|
||||||
@ -516,18 +520,7 @@ func (font PdfFont) GetGlyphCharMetrics(glyph string) (fonts.CharMetrics, bool)
|
|||||||
|
|
||||||
// GetCharMetrics returns the char metrics for character code `code`.
|
// GetCharMetrics returns the char metrics for character code `code`.
|
||||||
func (font PdfFont) GetCharMetrics(code uint16) (fonts.CharMetrics, bool) {
|
func (font PdfFont) GetCharMetrics(code uint16) (fonts.CharMetrics, bool) {
|
||||||
t := font.actualFont()
|
return font.getCharCodeMetrics(code)
|
||||||
if t == nil {
|
|
||||||
common.Log.Debug("ERROR: GetCharMetrics Not implemented for font type=%#T", font.context)
|
|
||||||
return fonts.CharMetrics{}, false
|
|
||||||
}
|
|
||||||
if m, ok := t.GetCharMetrics(code); ok {
|
|
||||||
return m, ok
|
|
||||||
}
|
|
||||||
if descriptor, err := font.GetFontDescriptor(); err == nil && descriptor != nil {
|
|
||||||
return fonts.CharMetrics{Wx: descriptor.missingWidth}, true
|
|
||||||
}
|
|
||||||
return fonts.CharMetrics{}, false
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// GetRuneCharMetrics returns the char metrics for rune `r`.
|
// GetRuneCharMetrics returns the char metrics for rune `r`.
|
||||||
@ -550,18 +543,9 @@ func (font PdfFont) GetRuneCharMetrics(r rune) (fonts.CharMetrics, error) {
|
|||||||
return m, nil
|
return m, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// GetAverageCharWidth returns the average width of all the characters in `font`.
|
|
||||||
func (font PdfFont) GetAverageCharWidth() float64 {
|
|
||||||
t := font.actualFont()
|
|
||||||
if t == nil {
|
|
||||||
common.Log.Debug("ERROR: GetAverageCharWidth Not implemented for font type=%#T", font.context)
|
|
||||||
return 0.0
|
|
||||||
}
|
|
||||||
return t.GetAverageCharWidth()
|
|
||||||
}
|
|
||||||
|
|
||||||
// actualFont returns the Font in font.context
|
// actualFont returns the Font in font.context
|
||||||
func (font PdfFont) actualFont() Font {
|
// NOTE(gunnsth): Actually this only sanity checks the font.context as the returned font will be wrapped in an interface.
|
||||||
|
func (font PdfFont) actualFont() fonts.Font {
|
||||||
if font.context == nil {
|
if font.context == nil {
|
||||||
common.Log.Debug("ERROR: actualFont. context is nil. font=%s", font)
|
common.Log.Debug("ERROR: actualFont. context is nil. font=%s", font)
|
||||||
}
|
}
|
||||||
|
@ -131,15 +131,6 @@ func (font pdfFontType0) GetCharMetrics(code uint16) (fonts.CharMetrics, bool) {
|
|||||||
return font.DescendantFont.GetCharMetrics(code)
|
return font.DescendantFont.GetCharMetrics(code)
|
||||||
}
|
}
|
||||||
|
|
||||||
// GetAverageCharWidth returns the average width of all the characters in `font`.
|
|
||||||
func (font pdfFontType0) GetAverageCharWidth() float64 {
|
|
||||||
if font.DescendantFont == nil {
|
|
||||||
common.Log.Debug("ERROR: No descendant. font=%s", font)
|
|
||||||
return 0.0
|
|
||||||
}
|
|
||||||
return font.DescendantFont.GetAverageCharWidth()
|
|
||||||
}
|
|
||||||
|
|
||||||
// Encoder returns the font's text encoder.
|
// Encoder returns the font's text encoder.
|
||||||
func (font pdfFontType0) Encoder() textencoding.TextEncoder {
|
func (font pdfFontType0) Encoder() textencoding.TextEncoder {
|
||||||
return font.encoder
|
return font.encoder
|
||||||
@ -253,11 +244,6 @@ func (font pdfCIDFontType0) GetCharMetrics(code uint16) (fonts.CharMetrics, bool
|
|||||||
return fonts.CharMetrics{}, true
|
return fonts.CharMetrics{}, true
|
||||||
}
|
}
|
||||||
|
|
||||||
// GetAverageCharWidth returns the average width of all the characters in `font`.
|
|
||||||
func (font pdfCIDFontType0) GetAverageCharWidth() float64 {
|
|
||||||
return 0.0
|
|
||||||
}
|
|
||||||
|
|
||||||
// ToPdfObject converts the pdfCIDFontType0 to a PDF representation.
|
// ToPdfObject converts the pdfCIDFontType0 to a PDF representation.
|
||||||
func (font *pdfCIDFontType0) ToPdfObject() core.PdfObject {
|
func (font *pdfCIDFontType0) ToPdfObject() core.PdfObject {
|
||||||
return core.MakeNull()
|
return core.MakeNull()
|
||||||
@ -378,18 +364,6 @@ func (font pdfCIDFontType2) GetCharMetrics(code uint16) (fonts.CharMetrics, bool
|
|||||||
return fonts.CharMetrics{Wx: float64(w)}, true
|
return fonts.CharMetrics{Wx: float64(w)}, true
|
||||||
}
|
}
|
||||||
|
|
||||||
// GetAverageCharWidth returns the average width of all the characters in `font`.
|
|
||||||
func (font pdfCIDFontType2) GetAverageCharWidth() float64 {
|
|
||||||
if len(font.runeToWidthMap) == 0 {
|
|
||||||
return 0.0
|
|
||||||
}
|
|
||||||
total := 0
|
|
||||||
for _, w := range font.runeToWidthMap {
|
|
||||||
total += w
|
|
||||||
}
|
|
||||||
return float64(total) / float64(len(font.runeToWidthMap))
|
|
||||||
}
|
|
||||||
|
|
||||||
// ToPdfObject converts the pdfCIDFontType2 to a PDF representation.
|
// ToPdfObject converts the pdfCIDFontType2 to a PDF representation.
|
||||||
func (font *pdfCIDFontType2) ToPdfObject() core.PdfObject {
|
func (font *pdfCIDFontType2) ToPdfObject() core.PdfObject {
|
||||||
if font.container == nil {
|
if font.container == nil {
|
||||||
|
@ -149,18 +149,6 @@ func (font pdfFontSimple) GetCharMetrics(code uint16) (fonts.CharMetrics, bool)
|
|||||||
return fonts.CharMetrics{}, false
|
return fonts.CharMetrics{}, false
|
||||||
}
|
}
|
||||||
|
|
||||||
// GetAverageCharWidth returns the average width of all the characters in `font`.
|
|
||||||
func (font pdfFontSimple) GetAverageCharWidth() float64 {
|
|
||||||
if font.fontMetrics != nil {
|
|
||||||
return fonts.AverageCharWidth(font.fontMetrics)
|
|
||||||
}
|
|
||||||
total := 0.0
|
|
||||||
for _, w := range font.charWidths {
|
|
||||||
total += w
|
|
||||||
}
|
|
||||||
return total / float64(len(font.charWidths))
|
|
||||||
}
|
|
||||||
|
|
||||||
// newSimpleFontFromPdfObject creates a pdfFontSimple from dictionary `d`. Elements of `d` that
|
// newSimpleFontFromPdfObject creates a pdfFontSimple from dictionary `d`. Elements of `d` that
|
||||||
// are already parsed are contained in `base`.
|
// are already parsed are contained in `base`.
|
||||||
// Standard 14 fonts need to specify their builtin encoders in the `std14Encoder` parameter.
|
// Standard 14 fonts need to specify their builtin encoders in the `std14Encoder` parameter.
|
||||||
|
@ -47,11 +47,6 @@ func (font FontCourier) GetGlyphCharMetrics(glyph string) (CharMetrics, bool) {
|
|||||||
return metrics, true
|
return metrics, true
|
||||||
}
|
}
|
||||||
|
|
||||||
// GetAverageCharWidth returns the average width of all glyphs in the font.
|
|
||||||
func (font FontCourier) GetAverageCharWidth() float64 {
|
|
||||||
return AverageCharWidth(CourierCharMetrics)
|
|
||||||
}
|
|
||||||
|
|
||||||
// ToPdfObject returns a primitive PDF object representation of the font.
|
// ToPdfObject returns a primitive PDF object representation of the font.
|
||||||
func (font FontCourier) ToPdfObject() core.PdfObject {
|
func (font FontCourier) ToPdfObject() core.PdfObject {
|
||||||
fontDict := core.MakeDict()
|
fontDict := core.MakeDict()
|
||||||
|
@ -47,11 +47,6 @@ func (font FontCourierBold) GetGlyphCharMetrics(glyph string) (CharMetrics, bool
|
|||||||
return metrics, true
|
return metrics, true
|
||||||
}
|
}
|
||||||
|
|
||||||
// GetAverageCharWidth returns the average width of all glyphs in the font.
|
|
||||||
func (font FontCourierBold) GetAverageCharWidth() float64 {
|
|
||||||
return AverageCharWidth(CourierBoldCharMetrics)
|
|
||||||
}
|
|
||||||
|
|
||||||
// ToPdfObject returns a primitive PDF object representation of the font.
|
// ToPdfObject returns a primitive PDF object representation of the font.
|
||||||
func (font FontCourierBold) ToPdfObject() core.PdfObject {
|
func (font FontCourierBold) ToPdfObject() core.PdfObject {
|
||||||
fontDict := core.MakeDict()
|
fontDict := core.MakeDict()
|
||||||
|
@ -48,11 +48,6 @@ func (font FontCourierBoldOblique) GetGlyphCharMetrics(glyph string) (CharMetric
|
|||||||
return metrics, true
|
return metrics, true
|
||||||
}
|
}
|
||||||
|
|
||||||
// GetAverageCharWidth returns the average width of all glyphs in the font.
|
|
||||||
func (font FontCourierBoldOblique) GetAverageCharWidth() float64 {
|
|
||||||
return AverageCharWidth(CourierBoldObliqueCharMetrics)
|
|
||||||
}
|
|
||||||
|
|
||||||
// ToPdfObject returns a primitive PDF object representation of the font.
|
// ToPdfObject returns a primitive PDF object representation of the font.
|
||||||
func (font FontCourierBoldOblique) ToPdfObject() core.PdfObject {
|
func (font FontCourierBoldOblique) ToPdfObject() core.PdfObject {
|
||||||
fontDict := core.MakeDict()
|
fontDict := core.MakeDict()
|
||||||
|
@ -47,11 +47,6 @@ func (font FontCourierOblique) GetGlyphCharMetrics(glyph string) (CharMetrics, b
|
|||||||
return metrics, true
|
return metrics, true
|
||||||
}
|
}
|
||||||
|
|
||||||
// GetAverageCharWidth returns the average width of all glyphs in the font.
|
|
||||||
func (font FontCourierOblique) GetAverageCharWidth() float64 {
|
|
||||||
return AverageCharWidth(CourierObliqueCharMetrics)
|
|
||||||
}
|
|
||||||
|
|
||||||
// ToPdfObject returns a primitive PDF object representation of the font.
|
// ToPdfObject returns a primitive PDF object representation of the font.
|
||||||
func (font FontCourierOblique) ToPdfObject() core.PdfObject {
|
func (font FontCourierOblique) ToPdfObject() core.PdfObject {
|
||||||
fontDict := core.MakeDict()
|
fontDict := core.MakeDict()
|
||||||
|
@ -18,7 +18,6 @@ type Font interface {
|
|||||||
Encoder() textencoding.TextEncoder
|
Encoder() textencoding.TextEncoder
|
||||||
SetEncoder(encoder textencoding.TextEncoder)
|
SetEncoder(encoder textencoding.TextEncoder)
|
||||||
GetGlyphCharMetrics(glyph string) (CharMetrics, bool)
|
GetGlyphCharMetrics(glyph string) (CharMetrics, bool)
|
||||||
GetAverageCharWidth() float64 // XXX(peterwilliams97) Not used. Remove.
|
|
||||||
ToPdfObject() core.PdfObject
|
ToPdfObject() core.PdfObject
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -32,11 +31,3 @@ type CharMetrics struct {
|
|||||||
func (m CharMetrics) String() string {
|
func (m CharMetrics) String() string {
|
||||||
return fmt.Sprintf("<%q,%.1f,%.1f>", m.GlyphName, m.Wx, m.Wy)
|
return fmt.Sprintf("<%q,%.1f,%.1f>", m.GlyphName, m.Wx, m.Wy)
|
||||||
}
|
}
|
||||||
|
|
||||||
func AverageCharWidth(metrics map[string]CharMetrics) float64 {
|
|
||||||
total := 0.0
|
|
||||||
for _, m := range metrics {
|
|
||||||
total += m.Wx
|
|
||||||
}
|
|
||||||
return total / float64(len(metrics))
|
|
||||||
}
|
|
||||||
|
@ -47,11 +47,6 @@ func (font FontHelvetica) GetGlyphCharMetrics(glyph string) (CharMetrics, bool)
|
|||||||
return metrics, true
|
return metrics, true
|
||||||
}
|
}
|
||||||
|
|
||||||
// GetAverageCharWidth returns the average width of all glyphs in the font.
|
|
||||||
func (font FontHelvetica) GetAverageCharWidth() float64 {
|
|
||||||
return AverageCharWidth(HelveticaCharMetrics)
|
|
||||||
}
|
|
||||||
|
|
||||||
// ToPdfObject returns a primitive PDF object representation of the font.
|
// ToPdfObject returns a primitive PDF object representation of the font.
|
||||||
func (font FontHelvetica) ToPdfObject() core.PdfObject {
|
func (font FontHelvetica) ToPdfObject() core.PdfObject {
|
||||||
fontDict := core.MakeDict()
|
fontDict := core.MakeDict()
|
||||||
|
@ -48,11 +48,6 @@ func (font FontHelveticaBold) GetGlyphCharMetrics(glyph string) (CharMetrics, bo
|
|||||||
return metrics, true
|
return metrics, true
|
||||||
}
|
}
|
||||||
|
|
||||||
// GetAverageCharWidth returns the average width of all glyphs in the font.
|
|
||||||
func (font FontHelveticaBold) GetAverageCharWidth() float64 {
|
|
||||||
return AverageCharWidth(HelveticaBoldCharMetrics)
|
|
||||||
}
|
|
||||||
|
|
||||||
// ToPdfObject returns a primitive PDF object representation of the font.
|
// ToPdfObject returns a primitive PDF object representation of the font.
|
||||||
func (font FontHelveticaBold) ToPdfObject() core.PdfObject {
|
func (font FontHelveticaBold) ToPdfObject() core.PdfObject {
|
||||||
fontDict := core.MakeDict()
|
fontDict := core.MakeDict()
|
||||||
|
@ -47,11 +47,6 @@ func (font FontHelveticaBoldOblique) GetGlyphCharMetrics(glyph string) (CharMetr
|
|||||||
return metrics, true
|
return metrics, true
|
||||||
}
|
}
|
||||||
|
|
||||||
// GetAverageCharWidth returns the average width of all glyphs in the font.
|
|
||||||
func (font FontHelveticaBoldOblique) GetAverageCharWidth() float64 {
|
|
||||||
return AverageCharWidth(HelveticaObliqueCharMetrics)
|
|
||||||
}
|
|
||||||
|
|
||||||
// ToPdfObject returns a primitive PDF object representation of the font.
|
// ToPdfObject returns a primitive PDF object representation of the font.
|
||||||
func (font FontHelveticaBoldOblique) ToPdfObject() core.PdfObject {
|
func (font FontHelveticaBoldOblique) ToPdfObject() core.PdfObject {
|
||||||
fontDict := core.MakeDict()
|
fontDict := core.MakeDict()
|
||||||
|
@ -47,11 +47,6 @@ func (font FontHelveticaOblique) GetGlyphCharMetrics(glyph string) (CharMetrics,
|
|||||||
return metrics, true
|
return metrics, true
|
||||||
}
|
}
|
||||||
|
|
||||||
// GetAverageCharWidth returns the average width of all glyphs in the font.
|
|
||||||
func (font FontHelveticaOblique) GetAverageCharWidth() float64 {
|
|
||||||
return AverageCharWidth(HelveticaObliqueCharMetrics)
|
|
||||||
}
|
|
||||||
|
|
||||||
// ToPdfObject returns a primitive PDF object representation of the font.
|
// ToPdfObject returns a primitive PDF object representation of the font.
|
||||||
func (font FontHelveticaOblique) ToPdfObject() core.PdfObject {
|
func (font FontHelveticaOblique) ToPdfObject() core.PdfObject {
|
||||||
fontDict := core.MakeDict()
|
fontDict := core.MakeDict()
|
||||||
|
@ -48,11 +48,6 @@ func (font FontSymbol) GetGlyphCharMetrics(glyph string) (CharMetrics, bool) {
|
|||||||
return metrics, true
|
return metrics, true
|
||||||
}
|
}
|
||||||
|
|
||||||
// GetAverageCharWidth returns the average width of all glyphs in the font.
|
|
||||||
func (font FontSymbol) GetAverageCharWidth() float64 {
|
|
||||||
return AverageCharWidth(SymbolCharMetrics)
|
|
||||||
}
|
|
||||||
|
|
||||||
// ToPdfObject returns a primitive PDF object representation of the font.
|
// ToPdfObject returns a primitive PDF object representation of the font.
|
||||||
func (font FontSymbol) ToPdfObject() core.PdfObject {
|
func (font FontSymbol) ToPdfObject() core.PdfObject {
|
||||||
fontDict := core.MakeDict()
|
fontDict := core.MakeDict()
|
||||||
|
@ -47,11 +47,6 @@ func (font FontTimesBold) GetGlyphCharMetrics(glyph string) (CharMetrics, bool)
|
|||||||
return metrics, true
|
return metrics, true
|
||||||
}
|
}
|
||||||
|
|
||||||
// GetAverageCharWidth returns the average width of all glyphs in the font.
|
|
||||||
func (font FontTimesBold) GetAverageCharWidth() float64 {
|
|
||||||
return AverageCharWidth(TimesBoldCharMetrics)
|
|
||||||
}
|
|
||||||
|
|
||||||
// ToPdfObject returns a primitive PDF object representation of the font.
|
// ToPdfObject returns a primitive PDF object representation of the font.
|
||||||
func (font FontTimesBold) ToPdfObject() core.PdfObject {
|
func (font FontTimesBold) ToPdfObject() core.PdfObject {
|
||||||
fontDict := core.MakeDict()
|
fontDict := core.MakeDict()
|
||||||
|
@ -47,11 +47,6 @@ func (font FontTimesBoldItalic) GetGlyphCharMetrics(glyph string) (CharMetrics,
|
|||||||
return metrics, true
|
return metrics, true
|
||||||
}
|
}
|
||||||
|
|
||||||
// GetAverageCharWidth returns the average width of all glyphs in the font.
|
|
||||||
func (font FontTimesBoldItalic) GetAverageCharWidth() float64 {
|
|
||||||
return AverageCharWidth(TimesBoldItalicCharMetrics)
|
|
||||||
}
|
|
||||||
|
|
||||||
// ToPdfObject returns a primitive PDF object representation of the font.
|
// ToPdfObject returns a primitive PDF object representation of the font.
|
||||||
func (font FontTimesBoldItalic) ToPdfObject() core.PdfObject {
|
func (font FontTimesBoldItalic) ToPdfObject() core.PdfObject {
|
||||||
fontDict := core.MakeDict()
|
fontDict := core.MakeDict()
|
||||||
|
@ -47,11 +47,6 @@ func (font FontTimesItalic) GetGlyphCharMetrics(glyph string) (CharMetrics, bool
|
|||||||
return metrics, true
|
return metrics, true
|
||||||
}
|
}
|
||||||
|
|
||||||
// GetAverageCharWidth returns the average width of all glyphs in the font.
|
|
||||||
func (font FontTimesItalic) GetAverageCharWidth() float64 {
|
|
||||||
return AverageCharWidth(TimesItalicCharMetrics)
|
|
||||||
}
|
|
||||||
|
|
||||||
// ToPdfObject returns a primitive PDF object representation of the font.
|
// ToPdfObject returns a primitive PDF object representation of the font.
|
||||||
func (font FontTimesItalic) ToPdfObject() core.PdfObject {
|
func (font FontTimesItalic) ToPdfObject() core.PdfObject {
|
||||||
fontDict := core.MakeDict()
|
fontDict := core.MakeDict()
|
||||||
|
@ -47,11 +47,6 @@ func (font FontTimesRoman) GetGlyphCharMetrics(glyph string) (CharMetrics, bool)
|
|||||||
return metrics, true
|
return metrics, true
|
||||||
}
|
}
|
||||||
|
|
||||||
// GetAverageCharWidth returns the average width of all glyphs in the font.
|
|
||||||
func (font FontTimesRoman) GetAverageCharWidth() float64 {
|
|
||||||
return AverageCharWidth(TimesRomanCharMetrics)
|
|
||||||
}
|
|
||||||
|
|
||||||
// ToPdfObject returns a primitive PDF object representation of the font.
|
// ToPdfObject returns a primitive PDF object representation of the font.
|
||||||
func (font FontTimesRoman) ToPdfObject() core.PdfObject {
|
func (font FontTimesRoman) ToPdfObject() core.PdfObject {
|
||||||
fontDict := core.MakeDict()
|
fontDict := core.MakeDict()
|
||||||
|
@ -48,11 +48,6 @@ func (font FontZapfDingbats) GetGlyphCharMetrics(glyph string) (CharMetrics, boo
|
|||||||
return metrics, true
|
return metrics, true
|
||||||
}
|
}
|
||||||
|
|
||||||
// GetAverageCharWidth returns the average width of all glyphs in the font.
|
|
||||||
func (font FontZapfDingbats) GetAverageCharWidth() float64 {
|
|
||||||
return AverageCharWidth(ZapfDingbatsCharMetrics)
|
|
||||||
}
|
|
||||||
|
|
||||||
// ToPdfObject returns a primitive PDF object representation of the font.
|
// ToPdfObject returns a primitive PDF object representation of the font.
|
||||||
func (font FontZapfDingbats) ToPdfObject() core.PdfObject {
|
func (font FontZapfDingbats) ToPdfObject() core.PdfObject {
|
||||||
fontDict := core.MakeDict()
|
fontDict := core.MakeDict()
|
||||||
|
@ -11,6 +11,7 @@ package model
|
|||||||
import (
|
import (
|
||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"math"
|
||||||
"regexp"
|
"regexp"
|
||||||
"strconv"
|
"strconv"
|
||||||
|
|
||||||
@ -58,6 +59,16 @@ func NewPdfRectangle(arr PdfObjectArray) (*PdfRectangle, error) {
|
|||||||
return &rect, nil
|
return &rect, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Height returns the height of `rect`.
|
||||||
|
func (rect *PdfRectangle) Height() float64 {
|
||||||
|
return math.Abs(rect.Ury - rect.Lly)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Width returns the width of `rect`.
|
||||||
|
func (rect *PdfRectangle) Width() float64 {
|
||||||
|
return math.Abs(rect.Urx - rect.Llx)
|
||||||
|
}
|
||||||
|
|
||||||
// ToPdfObject converts `rect` to a PDF object.
|
// ToPdfObject converts `rect` to a PDF object.
|
||||||
func (rect *PdfRectangle) ToPdfObject() PdfObject {
|
func (rect *PdfRectangle) ToPdfObject() PdfObject {
|
||||||
arr := MakeArray(MakeFloat(rect.Llx), MakeFloat(rect.Lly), MakeFloat(rect.Urx), MakeFloat(rect.Ury))
|
arr := MakeArray(MakeFloat(rect.Llx), MakeFloat(rect.Lly), MakeFloat(rect.Urx), MakeFloat(rect.Ury))
|
||||||
|
Loading…
x
Reference in New Issue
Block a user