Peter Williams 9ebcfcf168 Finding bounding boxes of substrings of extracted text. (#109)
* Added text bounding box extraction.
* Add `font` field to textMark struct;
Create a new method `TextComponents` to retrieve all the text components of the extracted text in the page, with position and character information
* Reorganizing extractor/text.go
* Added a text extraction position test.
* Added another text extraction location test.
* Text extraction location testing.
* Added tests for text extraction with location information.
* Cleaned up text extraction tests. No changes to functionality.
* Simplifying text extraction code.
* Simplified line construction in text.go
* Returning TextMark's in TextMarkArray which are based on PdfObjectArray but read-only, so not pointers.
* Added text extraction to show PDFs marked-up with bounding boxes of substring in extracted text.
* Add comments explaining how to calculate text bounding boxes.
* Made text_test.go naming consistent with function comments in text.go
* Use tm, pt, tl for textMark/TextMark PageText and TextLine receivers and local variables.
* Uncommented text stress test. Use go test --short to skip
* TextMark.Offset is now an index into the extracted text. It was an index into []rune(text)
2019-07-18 06:41:47 +00:00

152 lines
4.5 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*
* This file is subject to the terms and conditions defined in
* file 'LICENSE.md', which is part of this source code package.
*/
package transform
import (
"fmt"
"math"
"github.com/unidoc/unipdf/v3/common"
)
// Matrix is a linear transform matrix in homogeneous coordinates, stored row by row in 9 elements.
// PDF coordinate transforms are always affine so we only need 6 of these. See NewMatrix.
type Matrix [9]float64
// IdentityMatrix returns the transform that leaves coordinates unchanged: unit scale on
// both axes, no cross terms and no translation.
func IdentityMatrix() Matrix {
	const (
		unit = 1.0
		none = 0.0
	)
	return NewMatrix(unit, none, none, unit, none, none)
}
// TranslationMatrix returns a matrix with an identity linear part whose translation
// components are `tx`, `ty`.
func TranslationMatrix(tx, ty float64) Matrix {
	shift := NewMatrix(1, 0, 0, 1, tx, ty)
	return shift
}
// NewMatrix builds an affine transform matrix from its six free coefficients. The nine
// elements are stored row by row in homogeneous coordinates as
//
//	a  b  0
//	c  d  0
//	tx ty 1
//
// Every element is passed through clampRange before the matrix is returned.
func NewMatrix(a, b, c, d, tx, ty float64) Matrix {
	var m Matrix // zero value already supplies the two 0 entries of the last column
	m[0], m[1] = a, b
	m[3], m[4] = c, d
	m[6], m[7], m[8] = tx, ty, 1
	m.clampRange()
	return m
}
// String formats the six affine coefficients of `m` as "[a,b,c,d:tx,ty]", each printed
// with width 7 and 4 decimal places.
func (m Matrix) String() string {
	return fmt.Sprintf("[%7.4f,%7.4f,%7.4f,%7.4f:%7.4f,%7.4f]",
		m[0], m[1], // a, b
		m[3], m[4], // c, d
		m[6], m[7], // tx, ty
	)
}
// Set sets `m` to the affine transform a,b,c,d,tx,ty, using the same layout as NewMatrix:
//
//	a  b  0
//	c  d  0
//	tx ty 1
//
// All nine elements are written, including the constant third column, so the result is a
// well-formed homogeneous matrix even when `m` was the zero value or held stale data.
// The elements are then passed through clampRange.
func (m *Matrix) Set(a, b, c, d, tx, ty float64) {
	*m = Matrix{
		a, b, 0,
		c, d, 0,
		tx, ty, 1,
	}
	m.clampRange()
}
// Concat sets `m` to `b` × `m`.
// `b` needs to be created by NewMatrix. i.e. It must be an affine transform.
//
//	b00 b01 0     m00 m01 0     b00*m00 + b01*m10         b00*m01 + b01*m11         0
//	b10 b11 0  ×  m10 m11 0  ➔  b10*m00 + b11*m10         b10*m01 + b11*m11         0
//	b20 b21 1     m20 m21 1     b20*m00 + b21*m10 + m20   b20*m01 + b21*m11 + m21   1
//
// The product is passed through clampRange before returning.
func (m *Matrix) Concat(b Matrix) {
	// Snapshot the old coefficients of `m`; every output element reads from these.
	m00, m01 := m[0], m[1]
	m10, m11 := m[3], m[4]
	m20, m21 := m[6], m[7]

	m[0] = b[0]*m00 + b[1]*m10
	m[1] = b[0]*m01 + b[1]*m11
	m[2] = 0

	m[3] = b[3]*m00 + b[4]*m10
	m[4] = b[3]*m01 + b[4]*m11
	m[5] = 0

	m[6] = b[6]*m00 + b[7]*m10 + m20
	m[7] = b[6]*m01 + b[7]*m11 + m21
	m[8] = 1

	m.clampRange()
}
// Mult returns `b` × `m` as a new matrix. The receiver is passed by value, so the caller's
// matrix is left untouched.
func (m Matrix) Mult(b Matrix) Matrix {
	product := m
	product.Concat(b)
	return product
}
// Translate appends a translation of `dx`,`dy` to `m` by adding them directly to the
// translation elements; a, b, c and d are not touched.
// This is `m` × TranslationMatrix(dx, dy). Note that it is NOT the same as
// m.Concat(TranslationMatrix(dx, dy)) in general: Concat computes `b` × `m`, which first
// runs dx,dy through the linear part of `m`. The two agree only when that part is the identity.
func (m *Matrix) Translate(dx, dy float64) {
	m[6], m[7] = m[6]+dx, m[7]+dy
	m.clampRange()
}
// Translation returns the translation part of `m`, i.e. its tx and ty elements, in that order.
func (m *Matrix) Translation() (float64, float64) {
	tx, ty := m[6], m[7]
	return tx, ty
}
// Transform returns coordinates `x`,`y` transformed by `m`.
//
// The point is treated as the row vector [x y 1] and multiplied on the right by `m`,
// which is the PDF convention and matches both the layout documented on NewMatrix
// (a b 0 / c d 0 / tx ty 1) and the way Concat combines translations:
//
//	x' = a*x + c*y + tx
//	y' = b*x + d*y + ty
//
// An earlier version of this method swapped the roles of b and c, which gave wrong
// results for any matrix with b != c (e.g. rotations and shears).
func (m *Matrix) Transform(x, y float64) (float64, float64) {
	xp := x*m[0] + y*m[3] + m[6]
	yp := x*m[1] + y*m[4] + m[7]
	return xp, yp
}
// ScalingFactorX returns the X scaling of the affine transform, computed as the Euclidean
// length of the first row (a, b) of `m`.
func (m *Matrix) ScalingFactorX() float64 {
	a, b := m[0], m[1]
	return math.Hypot(a, b)
}
// ScalingFactorY returns the Y scaling of the affine transform, computed as the Euclidean
// length of the second row (c, d) of `m`.
func (m *Matrix) ScalingFactorY() float64 {
	c, d := m[3], m[4]
	return math.Hypot(c, d)
}
// Angle returns the angle of the affine transform in `m` in degrees.
// It is computed as atan2(-b, a) from the first row of `m`; negative results are shifted
// up by a full turn so the returned value is non-negative.
func (m *Matrix) Angle() float64 {
	const fullTurn = 2 * math.Pi

	radians := math.Atan2(-m[1], m[0])
	if radians < 0.0 {
		radians += fullTurn
	}
	return radians / math.Pi * 180.0
}
// clampRange forces `m` to have reasonable values. It is a guard against crazy values in corrupt PDF files.
// Currently it clamps every element to [-maxAbsNumber, maxAbsNumber] to avoid floating point exceptions,
// emitting a debug log line for each element it changes.
// NaN elements are left as they are because both range comparisons are false for NaN.
func (m *Matrix) clampRange() {
	for i := range m {
		switch x := m[i]; {
		case x > maxAbsNumber:
			common.Log.Debug("CLAMP: %g -> %g", x, maxAbsNumber)
			m[i] = maxAbsNumber
		case x < -maxAbsNumber:
			common.Log.Debug("CLAMP: %g -> %g", x, -maxAbsNumber)
			m[i] = -maxAbsNumber
		}
	}
}
// Unrealistic returns true if `m` is too small to have been created intentionally.
// If it returns true then `m` probably contains junk values, due to some processing error in the
// PDF generator or our code.
// A matrix counts as realistic when either both diagonal terms (a, d) or both off-diagonal
// terms (b, c) exceed minSafeScale in absolute value.
func (m *Matrix) Unrealistic() bool {
	diagonalOK := math.Abs(m[0]) > minSafeScale && math.Abs(m[4]) > minSafeScale
	if diagonalOK {
		return false
	}
	offDiagonalOK := math.Abs(m[1]) > minSafeScale && math.Abs(m[3]) > minSafeScale
	return !offDiagonalOK
}
// Numeric limits used when validating and sanitizing matrices.
const (
	// minSafeScale is the minimum matrix scale that is expected to occur in a valid PDF file.
	minSafeScale = 1e-6

	// maxAbsNumber defines the maximum absolute value of allowed practical matrix element values as needed
	// to avoid floating point exceptions.
	// TODO(gunnsth): Add reference or point to a specific example PDF that validates this.
	maxAbsNumber = 1e9

	// minDeterminant is the smallest matrix determinant we are prepared to deal with.
	// Smaller determinants may lead to rounding errors.
	minDeterminant = 1.0e-6
)