mirror of
https://github.com/unidoc/unipdf.git
synced 2025-04-27 13:48:51 +08:00
Fixed orientation handling in text extraction.
This commit is contained in:
parent
a815ca7271
commit
536c688001
@ -25,13 +25,6 @@ type GraphicsState struct {
|
||||
CTM Matrix
|
||||
}
|
||||
|
||||
type Orientation int
|
||||
|
||||
const (
|
||||
OrientationPortrait Orientation = iota
|
||||
OrientationLandscape
|
||||
)
|
||||
|
||||
type GraphicStateStack []GraphicsState
|
||||
|
||||
func (gsStack *GraphicStateStack) Push(gs GraphicsState) {
|
||||
@ -49,11 +42,6 @@ func (gs *GraphicsState) Transform(x, y float64) (float64, float64) {
|
||||
return gs.CTM.Transform(x, y)
|
||||
}
|
||||
|
||||
// // PageOrientation returns the likely page orientation given the CTM.
|
||||
// func (gs *GraphicsState) PageOrientation() Orientation {
|
||||
// return gs.CTM.Orientation()
|
||||
// }
|
||||
|
||||
// ContentStreamProcessor defines a data structure and methods for processing a content stream, keeping track of the
|
||||
// current graphics state, and allowing external handlers to define their own functions as a part of the processing,
|
||||
// for example rendering or extracting certain information.
|
||||
@ -673,31 +661,42 @@ func (m *Matrix) Transform(x, y float64) (float64, float64) {
|
||||
return xp, yp
|
||||
}
|
||||
|
||||
// ScalingFactorX returns X scaling of the affine transform.
|
||||
func (m *Matrix) ScalingFactorX() float64 {
|
||||
scale := m[0]
|
||||
if !(m[1] == 0.0 && m[3] == 0.0) {
|
||||
scale = math.Sqrt(m[0]*m[0] + m[1]*m[1])
|
||||
}
|
||||
return scale
|
||||
return math.Sqrt(m[0]*m[0] + m[1]*m[1])
|
||||
}
|
||||
|
||||
// ScalingFactorY returns Y scaling of the affine transform.
|
||||
func (m *Matrix) ScalingFactorY() float64 {
|
||||
scale := m[4]
|
||||
if !(m[1] == 0.0 && m[3] == 0.0) {
|
||||
scale = math.Sqrt(m[3]*m[3] + m[4]*m[4])
|
||||
}
|
||||
return scale
|
||||
return math.Sqrt(m[3]*m[3] + m[4]*m[4])
|
||||
}
|
||||
|
||||
// Orientation returns a guess at the pdf page orientation when text is printed with CTM `m`.
|
||||
// XXX(peterwilliams97) Use pageRotate flag instead.
|
||||
func (m *Matrix) Orientation() Orientation {
|
||||
switch {
|
||||
case m[1]*m[1]+m[3]*m[3] > m[0]*m[0]+m[4]*m[4]:
|
||||
return OrientationLandscape
|
||||
default:
|
||||
return OrientationPortrait
|
||||
// Angle returns the angle of the affine transform.
|
||||
// For simplicity, we assume the transform is a multiple of 90 degrees.
|
||||
func (m *Matrix) Angle() int {
|
||||
a, b, c, d := m[0], m[1], m[3], m[4]
|
||||
// We are returning θ for
|
||||
// a b cos θ -sin θ
|
||||
// c d = sin θ cos θ
|
||||
if a > 0 && d > 0 {
|
||||
// 1 0
|
||||
// 0 1
|
||||
return 0
|
||||
} else if b < 0 && c > 0 {
|
||||
// 0 1
|
||||
// -1 0
|
||||
return 90
|
||||
} else if a < 0 && d < 0 {
|
||||
// -1 0
|
||||
// 0 -1
|
||||
return 180
|
||||
} else if b > 0 && c < 0 {
|
||||
// 0 -1
|
||||
// 1 0
|
||||
return 270
|
||||
}
|
||||
common.Log.Debug("ERROR: Angle not a mulitple of 90°. m=%s", m)
|
||||
return 0
|
||||
}
|
||||
|
||||
// fixup forces `m` to have reasonable values. It is a guard against crazy values in corrupt PDF
|
||||
|
@ -59,7 +59,6 @@ func (e *Extractor) ExtractXYText() (*TextList, int, int, error) {
|
||||
func(op *contentstream.ContentStreamOperation, gs contentstream.GraphicsState,
|
||||
resources *model.PdfPageResources) error {
|
||||
|
||||
|
||||
operand := op.Operand
|
||||
|
||||
switch operand {
|
||||
@ -315,7 +314,7 @@ func (to *textObject) setTextMatrix(f []float64) {
|
||||
a, b, c, d, tx, ty := f[0], f[1], f[2], f[3], f[4], f[5]
|
||||
to.Tm = contentstream.NewMatrix(a, b, c, d, tx, ty)
|
||||
to.Tlm = contentstream.NewMatrix(a, b, c, d, tx, ty)
|
||||
common.Log.Debug("setTextMatrix:Tm=%s", to.Tm)
|
||||
common.Log.Debug("setTextMatrix: Tm=%s", to.Tm)
|
||||
}
|
||||
|
||||
// showText "Tj" Show a text string.
|
||||
@ -665,15 +664,19 @@ func (to *textObject) renderText(data []byte) error {
|
||||
td0 := translationMatrix(t0)
|
||||
td := translationMatrix(t)
|
||||
|
||||
common.Log.Trace("\"%s\" stateMatrix=%s CTM=%s Tm=%s", r, stateMatrix, to.gs.CTM, to.Tm)
|
||||
common.Log.Trace("tfs=%.3f th=%.3f Tc=%.3f w=%.3f (Tw=%.3f)", tfs, th, state.Tc, w, state.Tw)
|
||||
common.Log.Trace("m=%s c=%+v t0=%+v td0=%s trm0=%s", m, c, t0, td0, td0.Mult(to.Tm).Mult(to.gs.CTM))
|
||||
|
||||
nextTm := td.Mult(to.Tm)
|
||||
common.Log.Trace("nextTm=%s", nextTm)
|
||||
|
||||
xyt := newXYText(
|
||||
string(r),
|
||||
translation(trm),
|
||||
trm,
|
||||
translation(td0.Mult(to.Tm).Mult(to.gs.CTM)),
|
||||
trm.Orientation(),
|
||||
spaceWidth*trm.ScalingFactorX())
|
||||
common.Log.Debug("i=%d code=%d, xyt=%s", i, code, xyt)
|
||||
common.Log.Trace("i=%d code=%d, xyt=%s", i, code, xyt)
|
||||
to.Texts = append(to.Texts, xyt)
|
||||
|
||||
// update the text matrix by the displacement of the text location.
|
||||
@ -703,57 +706,46 @@ func translationMatrix(p Point) contentstream.Matrix {
|
||||
// Move to the start of the next line, offset from the start of the current line by (tx, ty).
|
||||
// `tx` and `ty` are in unscaled text space units.
|
||||
func (to *textObject) moveTo(tx, ty float64) {
|
||||
common.Log.Debug("moveTo: tx,ty=%.3f,%.3f Tm=%s Tlm=%s", tx, ty, to.Tm, to.Tlm)
|
||||
to.Tlm = contentstream.NewMatrix(1, 0, 0, 1, tx, ty).Mult(to.Tlm)
|
||||
to.Tm = to.Tlm
|
||||
common.Log.Debug(" ===> Tm=%s", to.Tm)
|
||||
}
|
||||
|
||||
// XYText represents text drawn on a page and its position in device coordinates.
|
||||
type XYText struct {
|
||||
Point // Position of text. Left-bottom?
|
||||
End Point // End of text. Right-top?
|
||||
Trm contentstream.Matrix
|
||||
OrientedStart Point // Left of text in orientation where text is horizontal.
|
||||
OrientedEnd Point // Right of text in orientation where text is horizontal.
|
||||
ColorStroking model.PdfColor // Colour that text is stroked with, if any.
|
||||
ColorNonStroking model.PdfColor // Colour that text is filled with, if any.
|
||||
Orient contentstream.Orientation
|
||||
Orient int
|
||||
Text string
|
||||
SpaceWidth float64
|
||||
Font string
|
||||
FontSize float64
|
||||
counter int
|
||||
}
|
||||
|
||||
var counter int
|
||||
|
||||
func newXYText(text string, point, end Point, orient contentstream.Orientation, spaceWidth float64) XYText {
|
||||
counter++
|
||||
func newXYText(text string, trm contentstream.Matrix, end Point, spaceWidth float64) XYText {
|
||||
theta := trm.Angle()
|
||||
return XYText{
|
||||
Text: text,
|
||||
Point: point,
|
||||
End: end,
|
||||
Orient: orient,
|
||||
SpaceWidth: spaceWidth,
|
||||
counter: counter,
|
||||
Text: text,
|
||||
Trm: trm,
|
||||
OrientedStart: translation(trm).Rotate(theta),
|
||||
OrientedEnd: end.Rotate(theta),
|
||||
Orient: theta,
|
||||
SpaceWidth: spaceWidth,
|
||||
}
|
||||
}
|
||||
|
||||
// String returns a string describing `t`.
|
||||
func (t XYText) String() string {
|
||||
return fmt.Sprintf("@@%d %s,%s %.1f %q", t.counter,
|
||||
t.Point.String(), t.End.String(), t.End.X-t.X, truncate(t.Text, 100))
|
||||
return fmt.Sprintf("XYText{%s %.1f |%d| [%.3f,%.3f] %q}",
|
||||
t.Trm.String(), t.Width(), t.Orient, t.OrientedStart.X, t.OrientedStart.Y,
|
||||
truncate(t.Text, 100))
|
||||
}
|
||||
|
||||
// Width returns the width of `t`.Text in its orientation.
|
||||
// Width returns the width of `t`.Text in the text direction.
|
||||
func (t XYText) Width() float64 {
|
||||
var w float64
|
||||
switch t.Orient {
|
||||
case contentstream.OrientationLandscape:
|
||||
w = math.Abs(t.End.Y - t.Y)
|
||||
default:
|
||||
w = math.Abs(t.End.X - t.X)
|
||||
}
|
||||
common.Log.Debug(" Width %q (%s %s) -> %.1f", t.Text, t.Point.String(), t.End.String(), w)
|
||||
return w
|
||||
return math.Abs(t.OrientedStart.X - t.OrientedEnd.X)
|
||||
}
|
||||
|
||||
// TextList is a list of texts and their positions on a PDF page.
|
||||
@ -764,21 +756,6 @@ func (tl *TextList) Length() int {
|
||||
return len(*tl)
|
||||
}
|
||||
|
||||
// // AppendText appends the location and contents of `text` to a text list.
|
||||
// func (tl *TextList) AppendText(gs contentstream.GraphicsState, p, e Point, text string, spaceWidth float64) {
|
||||
// t := XYText{
|
||||
// Point: p,
|
||||
// End: e,
|
||||
// ColorStroking: gs.ColorStroking,
|
||||
// ColorNonStroking: gs.ColorNonStroking,
|
||||
// Orient: gs.PageOrientation(),
|
||||
// Text: text,
|
||||
// SpaceWidth: spaceWidth,
|
||||
// }
|
||||
// common.Log.Debug("AppendText: %s", t.String())
|
||||
// *tl = append(*tl, t)
|
||||
// }
|
||||
|
||||
// ToText returns the contents of `tl` as a single string.
|
||||
func (tl *TextList) ToText() string {
|
||||
tl.printTexts("ToText: before sorting")
|
||||
@ -799,20 +776,12 @@ func (tl *TextList) SortPosition() {
|
||||
sort.SliceStable(*tl, func(i, j int) bool {
|
||||
ti, tj := (*tl)[i], (*tl)[j]
|
||||
if ti.Orient != tj.Orient {
|
||||
return ti.Orient > tj.Orient
|
||||
return ti.Orient < tj.Orient
|
||||
}
|
||||
// x, y is orientated so text is horizontal.
|
||||
xi, xj := ti.X, tj.X
|
||||
yi, yj := ti.Y, tj.Y
|
||||
if ti.Orient == contentstream.OrientationLandscape {
|
||||
xi, yi = yi, -xi
|
||||
xj, yj = yj, -xj
|
||||
if ti.OrientedStart.Y != tj.OrientedStart.Y {
|
||||
return ti.OrientedStart.Y > tj.OrientedStart.Y
|
||||
}
|
||||
|
||||
if yi != yj {
|
||||
return yi > yj
|
||||
}
|
||||
return xi < xj
|
||||
return ti.OrientedStart.X < tj.OrientedStart.X
|
||||
})
|
||||
}
|
||||
|
||||
@ -824,50 +793,41 @@ type Line struct {
|
||||
Words []string // words in the line
|
||||
}
|
||||
|
||||
// toLines return the text and positions in `tl` as a slice of Line.
|
||||
// toLines returns the text and positions in `tl` as a slice of Line.
|
||||
// NOTE: Caller must sort the text list top-to-bottom, left-to-right (for orientation adjusted so
|
||||
// that text is horizontal) before calling this function.
|
||||
func (tl *TextList) toLines() []Line {
|
||||
portText, landText := TextList{}, TextList{}
|
||||
for _, t := range *tl {
|
||||
if t.Orient == contentstream.OrientationPortrait {
|
||||
portText = append(portText, t)
|
||||
} else {
|
||||
t.X, t.Y = t.Y, -t.X
|
||||
t.End.X, t.End.Y = t.End.Y, -t.End.X
|
||||
t.Orient = contentstream.OrientationPortrait
|
||||
landText = append(landText, t)
|
||||
func (tl TextList) toLines() []Line {
|
||||
tlOrient := map[int]TextList{}
|
||||
for _, t := range tl {
|
||||
tlOrient[t.Orient] = append(tlOrient[t.Orient], t)
|
||||
}
|
||||
}
|
||||
common.Log.Debug("toLines: portrait ^^^^^^^")
|
||||
portLines := portText.toLinesOrient()
|
||||
common.Log.Debug("toLines: landscape &&&&&&&")
|
||||
landLines := landText.toLinesOrient()
|
||||
common.Log.Debug("portText=%d landText=%d", len(portText), len(landText))
|
||||
return append(portLines, landLines...)
|
||||
lines := []Line{}
|
||||
for _, o := range []int{0, 90, 180, 270} {
|
||||
lines = append(lines, tlOrient[o].toLinesOrient()...)
|
||||
}
|
||||
return lines
|
||||
}
|
||||
|
||||
// toLinesOrient returns the text and positions in `tl` as a slice of Line.
|
||||
// NOTE: Caller must sort the text list top-to-bottom, left-to-right before calling this function.
|
||||
func (tl *TextList) toLinesOrient() []Line {
|
||||
func (tl TextList) toLinesOrient() []Line {
|
||||
tl.printTexts("toLines: before")
|
||||
if len(*tl) == 0 {
|
||||
if len(tl) == 0 {
|
||||
return []Line{}
|
||||
}
|
||||
lines := []Line{}
|
||||
words := []string{}
|
||||
x := []float64{}
|
||||
y := (*tl)[0].Y
|
||||
y := tl[0].OrientedStart.Y
|
||||
|
||||
scanning := false
|
||||
|
||||
averageCharWidth := ExponAve{}
|
||||
wordSpacing := ExponAve{}
|
||||
lastEndX := 0.0 // (*tl)[i-1).End.X
|
||||
lastEndX := 0.0 // tl[i-1].End.X
|
||||
|
||||
for _, t := range *tl {
|
||||
// common.Log.Debug("%d --------------------------", i)
|
||||
if t.Y < y {
|
||||
for _, t := range tl {
|
||||
if t.OrientedStart.Y < y {
|
||||
if len(words) > 0 {
|
||||
line := newLine(y, x, words)
|
||||
if averageCharWidth.running {
|
||||
@ -877,7 +837,7 @@ func (tl *TextList) toLinesOrient() []Line {
|
||||
}
|
||||
words = []string{}
|
||||
x = []float64{}
|
||||
y = t.Y
|
||||
y = t.OrientedStart.Y
|
||||
scanning = false
|
||||
}
|
||||
|
||||
@ -899,25 +859,26 @@ func (tl *TextList) toLinesOrient() []Line {
|
||||
isSpace := false
|
||||
nextWordX := lastEndX + min(deltaSpace, deltaCharWidth)
|
||||
if scanning && t.Text != " " {
|
||||
isSpace = nextWordX < t.X
|
||||
isSpace = nextWordX < t.OrientedStart.X
|
||||
}
|
||||
common.Log.Debug("t=%s", t)
|
||||
common.Log.Debug("width=%.2f delta=%.2f deltaSpace=%.2g deltaCharWidth=%.2g",
|
||||
common.Log.Trace("t=%s", t)
|
||||
common.Log.Trace("width=%.2f delta=%.2f deltaSpace=%.2g deltaCharWidth=%.2g",
|
||||
t.Width(), min(deltaSpace, deltaCharWidth), deltaSpace, deltaCharWidth)
|
||||
common.Log.Trace("%+q [%.1f, %.1f] lastEndX=%.2f nextWordX=%.2f (%.2f) isSpace=%t",
|
||||
t.Text, t.OrientedStart.X, t.OrientedStart.Y, lastEndX, nextWordX,
|
||||
nextWordX-t.OrientedStart.X, isSpace)
|
||||
|
||||
common.Log.Debug("%+q [%.1f, %.1f] lastEndX=%.2f nextWordX=%.2f (%.2f) isSpace=%t",
|
||||
t.Text, t.X, t.Y, lastEndX, nextWordX, nextWordX-t.X, isSpace)
|
||||
if isSpace {
|
||||
words = append(words, " ")
|
||||
x = append(x, (lastEndX+t.X)*0.5)
|
||||
x = append(x, (lastEndX+t.OrientedStart.X)*0.5)
|
||||
}
|
||||
|
||||
// Add the text to the line.
|
||||
lastEndX = t.End.X
|
||||
lastEndX = t.OrientedEnd.X
|
||||
words = append(words, t.Text)
|
||||
x = append(x, t.X)
|
||||
x = append(x, t.OrientedStart.X)
|
||||
scanning = true
|
||||
common.Log.Debug("lastEndX=%.2f", lastEndX)
|
||||
common.Log.Trace("lastEndX=%.2f", lastEndX)
|
||||
}
|
||||
if len(words) > 0 {
|
||||
line := newLine(y, x, words)
|
||||
@ -1012,30 +973,6 @@ func removeDuplicates(line Line, charWidth float64) Line {
|
||||
return Line{Y: line.Y, Dx: dxList, Text: strings.Join(words, ""), Words: words}
|
||||
}
|
||||
|
||||
// PageOrientation is a heuristic for the orientation of a page.
|
||||
// XXX TODO: Use Page's Rotate flag instead.
|
||||
func (tl *TextList) PageOrientation() contentstream.Orientation {
|
||||
landscapeCount := 0
|
||||
for _, t := range *tl {
|
||||
if t.Orient == contentstream.OrientationLandscape {
|
||||
landscapeCount++
|
||||
}
|
||||
}
|
||||
portraitCount := len(*tl) - landscapeCount
|
||||
if landscapeCount > portraitCount {
|
||||
return contentstream.OrientationLandscape
|
||||
}
|
||||
return contentstream.OrientationPortrait
|
||||
}
|
||||
|
||||
// Transform transforms all points in `tl` by the affine transformation a, b, c, d, tx, ty.
|
||||
func (tl *TextList) Transform(a, b, c, d, tx, ty float64) {
|
||||
m := contentstream.NewMatrix(a, b, c, d, tx, ty)
|
||||
for _, t := range *tl {
|
||||
t.X, t.Y = m.Transform(t.X, t.Y)
|
||||
}
|
||||
}
|
||||
|
||||
// getCurrentFont returns the font on top of the font stack, or DefaultFont if the font stack is
|
||||
// empty.
|
||||
func (to *textObject) getCurrentFont() *model.PdfFont {
|
||||
@ -1099,8 +1036,11 @@ const maxFontCache = 10
|
||||
func (to *textObject) getFontDirect(name string) (*model.PdfFont, error) {
|
||||
|
||||
// This is a hack for testing.
|
||||
if name == "UniDocCourier" {
|
||||
switch name {
|
||||
case "UniDocCourier":
|
||||
return model.NewStandard14FontMustCompile(model.Courier), nil
|
||||
case "UniDocHelvetica":
|
||||
return model.NewStandard14FontMustCompile(model.Helvetica), nil
|
||||
}
|
||||
|
||||
fontObj, err := to.getFontDict(name)
|
||||
|
@ -39,29 +39,80 @@ func init() {
|
||||
}
|
||||
}
|
||||
|
||||
const testContents1 = `
|
||||
BT
|
||||
/UniDocCourier 24 Tf
|
||||
(Hello World!)Tj
|
||||
0 -10 Td
|
||||
(Doink)Tj
|
||||
ET
|
||||
`
|
||||
|
||||
const testExpected1 = "Hello World!\nDoink"
|
||||
|
||||
// TestTextExtraction1 tests text extraction on a PDF fragment.
|
||||
// TestTextExtraction1 tests text extraction on the PDF fragments in `fragmentTests`.
|
||||
func TestTextExtraction1(t *testing.T) {
|
||||
e := Extractor{}
|
||||
e.contents = testContents1
|
||||
for _, f := range fragmentTests {
|
||||
f.testExtraction(t)
|
||||
}
|
||||
}
|
||||
|
||||
s, err := e.ExtractText()
|
||||
type fragment struct {
|
||||
name string
|
||||
contents string
|
||||
text string
|
||||
}
|
||||
|
||||
var fragmentTests = []fragment{
|
||||
|
||||
{name: "portrait",
|
||||
contents: `
|
||||
BT
|
||||
/UniDocCourier 24 Tf
|
||||
(Hello World!)Tj
|
||||
0 -10 Td
|
||||
(Doink)Tj
|
||||
ET
|
||||
`,
|
||||
text: "Hello World!\nDoink",
|
||||
},
|
||||
{name: "landscape",
|
||||
contents: `
|
||||
BT
|
||||
/UniDocCourier 24 Tf
|
||||
0 1 -1 0 0 0 Tm
|
||||
(Hello World!)Tj
|
||||
0 -10 Td
|
||||
(Doink)Tj
|
||||
ET
|
||||
`,
|
||||
text: "Hello World!\nDoink",
|
||||
},
|
||||
{name: "180 degree rotation",
|
||||
contents: `
|
||||
BT
|
||||
/UniDocCourier 24 Tf
|
||||
-1 0 0 -1 0 0 Tm
|
||||
(Hello World!)Tj
|
||||
0 -10 Td
|
||||
(Doink)Tj
|
||||
ET
|
||||
`,
|
||||
text: "Hello World!\nDoink",
|
||||
},
|
||||
{name: "Helvetica",
|
||||
contents: `
|
||||
BT
|
||||
/UniDocHelvetica 24 Tf
|
||||
0 -1 1 0 0 0 Tm
|
||||
(Hello World!)Tj
|
||||
0 -10 Td
|
||||
(Doink)Tj
|
||||
ET
|
||||
`,
|
||||
text: "Hello World!\nDoink",
|
||||
},
|
||||
}
|
||||
|
||||
// testExtraction checks that ExtractText() works on fragment `f`.
|
||||
func (f fragment) testExtraction(t *testing.T) {
|
||||
e := Extractor{contents: f.contents}
|
||||
text, err := e.ExtractText()
|
||||
if err != nil {
|
||||
t.Errorf("Error extracting text: %v", err)
|
||||
t.Fatalf("Error extracting text: %q err=%v", f.name, err)
|
||||
return
|
||||
}
|
||||
if s != testExpected1 {
|
||||
t.Errorf("Text mismatch. Got %q. Expected %q", s, testExpected1)
|
||||
if text != f.text {
|
||||
t.Fatalf("Text mismatch: %q Got %q. Expected %q", f.name, text, f.text)
|
||||
return
|
||||
}
|
||||
}
|
||||
@ -134,11 +185,12 @@ var extract2Tests = []struct {
|
||||
1: []string{"clustering, entropy, object attributes, spatial correlation, and local"},
|
||||
},
|
||||
},
|
||||
// {filename:"Ito_Formula.pdf",
|
||||
// expectedPageText: map[int][]string{
|
||||
// 1: []string{"In the Itô stochastic calculus"},
|
||||
// },
|
||||
// },
|
||||
{filename: "Ito_Formula.pdf",
|
||||
expectedPageText: map[int][]string{
|
||||
// 1: []string{"In the Itô stochastic calculus"},
|
||||
1: []string{"In standard, non-stochastic calculus, one computes a derivative"},
|
||||
},
|
||||
},
|
||||
{filename: "circ2.pdf",
|
||||
expectedPageText: map[int][]string{
|
||||
1: []string{"Understanding and complying with copyright law can be a challenge"},
|
||||
@ -149,11 +201,11 @@ var extract2Tests = []struct {
|
||||
6: []string{"words in the test set, we increase the BLEU score"},
|
||||
},
|
||||
},
|
||||
// {filename: "Planck_Wien.pdf",
|
||||
// expectedPageText: map[int][]string{
|
||||
// 1: []string{"entropy of a system of n identical resonators in a stationary radiation field"},
|
||||
// },
|
||||
// },
|
||||
{filename: "Planck_Wien.pdf",
|
||||
expectedPageText: map[int][]string{
|
||||
1: []string{"entropy of a system of n identical resonators in a stationary radiation field"},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
// testExtract2 tests the ExtractText2 text extractor on `filename` and compares the extracted
|
||||
|
@ -135,7 +135,8 @@ func NewStandard14FontMustCompile(basefont Standard14Font) *PdfFont {
|
||||
// An error can occur if `basefont` is not one of the standard 14 font names.
|
||||
func NewStandard14FontWithEncoding(basefont Standard14Font, alphabet map[rune]int) (*PdfFont,
|
||||
*textencoding.SimpleEncoder, error) {
|
||||
baseEncoder := "MacRomanEncoding"
|
||||
// XXX(peterwilliams97) This is wrong. Use correct implementation in newPdfFontFromPdfObject()
|
||||
baseEncoder := "MacRomanEncoding"
|
||||
common.Log.Trace("NewStandard14FontWithEncoding: basefont=%#q baseEncoder=%#q alphabet=%q",
|
||||
basefont, baseEncoder, string(sortedAlphabet(alphabet)))
|
||||
|
||||
@ -266,8 +267,7 @@ func newPdfFontFromPdfObject(fontObj core.PdfObject, allowType0 bool) (*PdfFont,
|
||||
font.context = type0font
|
||||
case "Type1", "Type3", "MMType1", "TrueType":
|
||||
var simplefont *pdfFontSimple
|
||||
std, ok := loadStandard14Font(Standard14Font(base.basefont))
|
||||
builtin := ok
|
||||
std, builtin := loadStandard14Font(Standard14Font(base.basefont))
|
||||
if builtin {
|
||||
font.context = &std
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user