Fixes for text extraction corpus testing.

- Correct matrix multiplication order in text.go
- Look up standard 14 font widths after applying custom encoding.
This commit is contained in:
Peter Williams 2018-11-18 17:21:30 +11:00
parent 851aa267b1
commit a9019a50a3
8 changed files with 185 additions and 119 deletions

BIN
pdf/extractor/testdata/000026.pdf vendored Normal file

Binary file not shown.

View File

@ -266,7 +266,7 @@ func (e *Extractor) ExtractXYText() (*TextList, int, int, error) {
err = processor.Process(e.resources)
if err != nil {
common.Log.Error("ERROR: Processing: err=%v", err)
common.Log.Debug("ERROR: Processing: err=%v", err)
}
return textList, state.numChars, state.numMisses, err
}
@ -406,7 +406,7 @@ func (to *textObject) setTextRise(y float64) {
// setWordSpacing "Tw" Set word spacing.
func (to *textObject) setWordSpacing(y float64) {
// Not implemented yet
to.State.Tw = y
}
// setHorizScaling "Tz" Set horizontal scaling.
@ -609,7 +609,10 @@ func (to *textObject) renderText(data []byte) error {
0, tfs,
0, state.Trise)
common.Log.Debug("%d codes=%+v runes=%q", len(charcodes), charcodes, runes)
for i, r := range runes {
code := charcodes[i]
// The location of the text on the page in device coordinates is given by trm, the text
// rendering matrix.
@ -634,19 +637,38 @@ func (to *textObject) renderText(data []byte) error {
c := Point{X: m.Wx * glyphTextRatio, Y: m.Wy * glyphTextRatio}
// t is the displacement of the text cursor when the character is rendered.
// float tx = displacementX * fontSize * horizontalScaling;
// w = 0
t0 := Point{X: (c.X*tfs + w) * th}
t := Point{X: (c.X*tfs + state.Tc + w) * th}
// td is t in matrix form.
td0 := translationMatrix(t0)
td := translationMatrix(t)
common.Log.Debug("%q stateMatrix=%s CTM=%s Tm=%s", r, stateMatrix, to.gs.CTM, to.Tm)
common.Log.Debug("tfs=%.3f th=%.3f Tc=%.3f w=%.3f (Tw=%.3f)", tfs, th, state.Tc, w, state.Tw)
common.Log.Debug("m=%s c=%+v t0=%+v td0=%s trm0=%s",
m, c, t0, td0, td0.Mult(to.Tm).Mult(to.gs.CTM))
common.Log.Debug("m=%s c=%+v t=%+v td=%s trm=%s",
m, c, t, td, td.Mult(to.Tm).Mult(to.gs.CTM))
nextTm := to.Tm.Mult(td)
xyt := XYText{Text: string(r),
Point: translation(trm),
Orient: trm.Orientation(),
End: translation(to.Tm.Mult(td).Mult(to.gs.CTM)),
SpaceWidth: spaceWidth * trm.ScalingFactorX(),
}
// xyt := XYText{Text: string(r),
// Point: translation(trm),
// Orient: trm.Orientation(),
// // Matrix nextTextRenderingMatrix = td.multiply(textMatrix).multiply(ctm); // text space -> device space
// End: translation(td0.Mult(to.Tm).Mult(to.gs.CTM)),
// SpaceWidth: spaceWidth * trm.ScalingFactorX(),
// }
xyt := newXYText(
string(r),
translation(trm),
translation(td0.Mult(to.Tm).Mult(to.gs.CTM)),
trm.Orientation(),
spaceWidth*trm.ScalingFactorX())
common.Log.Debug("i=%d code=%d, xyt=%s", i, code, xyt)
to.Texts = append(to.Texts, xyt)
// update the text matrix by the displacement of the text location.
@ -690,11 +712,26 @@ type XYText struct {
SpaceWidth float64
Font string
FontSize float64
counter int
}
var counter int
// newXYText returns an XYText holding `text`, its start location `point`, its end location `end`,
// its orientation `orient` and the width of a space `spaceWidth`.
// Each call bumps the package-level `counter` and stamps the new value on the returned XYText,
// which String() prints as a sequence number for debugging.
// NOTE(review): `counter` is incremented without synchronization -- confirm that text extraction
// never runs concurrently.
func newXYText(text string, point, end Point, orient contentstream.Orientation, spaceWidth float64) XYText {
	counter++
	xyt := XYText{counter: counter}
	xyt.Text = text
	xyt.Point = point
	xyt.End = end
	xyt.Orient = orient
	xyt.SpaceWidth = spaceWidth
	return xyt
}
// String returns a string describing `t`.
func (t XYText) String() string {
return fmt.Sprintf("%s,%s %.1f %q",
return fmt.Sprintf("@@%d %s,%s %.1f %q", t.counter,
t.Point.String(), t.End.String(), t.End.X-t.X, truncate(t.Text, 100))
}
@ -707,7 +744,7 @@ func (t XYText) Width() float64 {
default:
w = math.Abs(t.End.X - t.X)
}
common.Log.Trace(" Width %q (%s %s) -> %.1f", t.Text, t.Point.String(), t.End.String(), w)
common.Log.Debug(" Width %q (%s %s) -> %.1f", t.Text, t.Point.String(), t.End.String(), w)
return w
}
@ -719,20 +756,20 @@ func (tl *TextList) Length() int {
return len(*tl)
}
// AppendText appends a new XYText to `tl`. The entry records the string `text`, its start point
// `p`, its end point `e` and the space width `spaceWidth`, and takes its stroking colour,
// non-stroking colour and page orientation from the graphics state `gs`.
func (tl *TextList) AppendText(gs contentstream.GraphicsState, p, e Point, text string, spaceWidth float64) {
	var entry XYText
	entry.Text = text
	entry.Point = p
	entry.End = e
	entry.SpaceWidth = spaceWidth
	entry.Orient = gs.PageOrientation()
	entry.ColorStroking = gs.ColorStroking
	entry.ColorNonStroking = gs.ColorNonStroking

	common.Log.Debug("AppendText: %s", entry.String())
	*tl = append(*tl, entry)
}
// // AppendText appends the location and contents of `text` to a text list.
// func (tl *TextList) AppendText(gs contentstream.GraphicsState, p, e Point, text string, spaceWidth float64) {
// t := XYText{
// Point: p,
// End: e,
// ColorStroking: gs.ColorStroking,
// ColorNonStroking: gs.ColorNonStroking,
// Orient: gs.PageOrientation(),
// Text: text,
// SpaceWidth: spaceWidth,
// }
// common.Log.Debug("AppendText: %s", t.String())
// *tl = append(*tl, t)
// }
// ToText returns the contents of `tl` as a single string.
func (tl *TextList) ToText() string {
@ -794,6 +831,7 @@ func (tl *TextList) toLines() []Line {
}
portLines := portText.toLinesOrient()
landLines := landText.toLinesOrient()
common.Log.Debug("portText=%d landText=%d", len(portText), len(landText))
return append(portLines, landLines...)
}
@ -816,6 +854,7 @@ func (tl *TextList) toLinesOrient() []Line {
lastEndX := 0.0 // (*tl)[i-1).End.X
for _, t := range *tl {
// common.Log.Debug("%d --------------------------", i)
if t.Y < y {
if len(words) > 0 {
line := newLine(y, x, words)
@ -846,12 +885,16 @@ func (tl *TextList) toLinesOrient() []Line {
deltaCharWidth := averageCharWidth.ave * 0.3
isSpace := false
if scanning && t.Text != " " {
nextWordX := lastEndX + min(deltaSpace, deltaCharWidth)
if scanning && t.Text != " " {
isSpace = nextWordX < t.X
common.Log.Trace("[%.1f, %.1f] lastEndX=%.1f nextWordX=%.1f",
t.Y, t.X, lastEndX, nextWordX)
}
common.Log.Debug("t=%s", t)
common.Log.Debug("width=%.2f delta=%.2f deltaSpace=%.2g deltaCharWidth=%.2g",
t.Width(), min(deltaSpace, deltaCharWidth), deltaSpace, deltaCharWidth)
common.Log.Debug("%+q [%.1f, %.1f] lastEndX=%.2f nextWordX=%.2f (%.2f) isSpace=%t",
t.Text, t.X, t.Y, lastEndX, nextWordX, nextWordX-t.X, isSpace)
if isSpace {
words = append(words, " ")
x = append(x, (lastEndX+t.X)*0.5)
@ -862,6 +905,7 @@ func (tl *TextList) toLinesOrient() []Line {
words = append(words, t.Text)
x = append(x, t.X)
scanning = true
common.Log.Debug("lastEndX=%.2f", lastEndX)
}
if len(words) > 0 {
line := newLine(y, x, words)
@ -898,7 +942,7 @@ func (exp *ExponAve) update(x float64) float64 {
return exp.ave
}
// printTexts is a debugging function. XXX Remove this.
// printTexts is a debugging function. XXX(peterwilliams97) Remove this.
func (tl *TextList) printTexts(message string) {
return
_, file, line, ok := runtime.Caller(1)
@ -910,17 +954,17 @@ func (tl *TextList) printTexts(message string) {
}
prefix := fmt.Sprintf("[%s:%d]", file, line)
common.Log.Error("=====================================")
common.Log.Error("printTexts %s %s", prefix, message)
common.Log.Error("%d texts", len(*tl))
common.Log.Debug("=====================================")
common.Log.Debug("printTexts %s %s", prefix, message)
common.Log.Debug("%d texts", len(*tl))
parts := []string{}
for i, t := range *tl {
fmt.Printf("%5d: %s\n", i, t.String())
parts = append(parts, t.Text)
}
common.Log.Error("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
common.Log.Debug("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
fmt.Printf("%s\n", strings.Join(parts, ""))
common.Log.Error("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^")
common.Log.Debug("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^")
}
// newLine returns the Line representation of strings `words` with y coordinate `y` and x

View File

@ -17,7 +17,7 @@ import (
)
func init() {
common.SetLogger(common.NewConsoleLogger(common.LogLevelDebug))
common.SetLogger(common.NewConsoleLogger(common.LogLevelError))
if flag.Lookup("test.v") != nil {
isTesting = true
}
@ -59,12 +59,20 @@ var extract2Tests = []struct {
filename string
expectedPageText map[int][]string
}{
{
filename: "testdata/reader.pdf",
{filename: "testdata/reader.pdf",
expectedPageText: map[int][]string{
1: []string{"A Research UNIX Reader:",
"Annotated Excerpts from the Programmer's Manual,",
"Annotated Excerpts from the Programmers Manual,",
"1. Introduction",
"To keep the size of this report",
"last common ancestor of a radiative explosion",
},
},
},
{filename: "testdata/000026.pdf",
expectedPageText: map[int][]string{
1: []string{"Fresh Flower",
"Care & Handling",
},
},
},
@ -85,10 +93,9 @@ func testExtract2(t *testing.T, filename string, expectedPageText map[int][]stri
}
func containsSentences(t *testing.T, expectedSentences []string, actualText string) bool {
actualSentences := asSet(strings.Split(actualText, "\n"))
for _, e := range expectedSentences {
if _, ok := actualSentences[e]; !ok {
t.Errorf("No match for %q", e)
if !strings.Contains(actualText, e) {
t.Errorf("No match for %+q", e)
return false
}
}
@ -104,14 +111,6 @@ func sortedKeys(m map[int][]string) []int {
return keys
}
// asSet returns a set (a map whose values are all true) containing every string in `keys`.
// Duplicate keys collapse into one entry; a nil or empty `keys` gives an empty, non-nil map.
func asSet(keys []string) map[string]bool {
	members := make(map[string]bool, len(keys))
	for i := range keys {
		members[keys[i]] = true
	}
	return members
}
func extractPageTexts(t *testing.T, filename string) (int, map[int]string) {
f, err := os.Open(filename)
if err != nil {

View File

@ -6,7 +6,7 @@
package textencoding
// NewSymbolEncoder returns a SimpleEncoder that implements SymbolEncoding.
func NewSymbolEncoder() SimpleEncoder {
func NewSymbolEncoder() *SimpleEncoder {
enc, _ := NewSimpleTextEncoder("SymbolEncoding", nil)
return *enc
return enc
}

View File

@ -6,7 +6,7 @@
package textencoding
// NewWinAnsiTextEncoder returns a SimpleEncoder that implements WinAnsiEncoding.
func NewWinAnsiTextEncoder() SimpleEncoder {
func NewWinAnsiTextEncoder() *SimpleEncoder {
enc, _ := NewSimpleTextEncoder("WinAnsiEncoding", nil)
return *enc
return enc
}

View File

@ -6,7 +6,7 @@
package textencoding
// NewZapfDingbatsEncoder returns a SimpleEncoder that implements ZapfDingbatsEncoding.
func NewZapfDingbatsEncoder() SimpleEncoder {
func NewZapfDingbatsEncoder() *SimpleEncoder {
enc, _ := NewSimpleTextEncoder("ZapfDingbatsEncoding", nil)
return *enc
return enc
}

View File

@ -263,7 +263,9 @@ func newPdfFontFromPdfObject(fontObj core.PdfObject, allowType0 bool) (*PdfFont,
font.context = type0font
case "Type1", "Type3", "MMType1", "TrueType":
var simplefont *pdfFontSimple
if std, ok := loadStandard14Font(Standard14Font(base.basefont)); ok && base.subtype == "Type1" {
std, ok := loadStandard14Font(Standard14Font(base.basefont))
builtin := ok && base.subtype == "Type1"
if builtin {
font.context = &std
stdObj := core.TraceToDirectObject(std.ToPdfObject())
@ -283,8 +285,6 @@ func newPdfFontFromPdfObject(fontObj core.PdfObject, allowType0 bool) (*PdfFont,
return nil, err
}
simplefont.firstChar = std.firstChar
simplefont.lastChar = std.lastChar
simplefont.charWidths = std.charWidths
simplefont.fontMetrics = std.fontMetrics
} else {
@ -298,6 +298,19 @@ func newPdfFontFromPdfObject(fontObj core.PdfObject, allowType0 bool) (*PdfFont,
if err != nil {
return nil, err
}
if builtin {
simplefont.updateStandard14Font()
}
if builtin && simplefont.encoder == nil && simplefont.std14Encoder == nil {
common.Log.Error("simplefont=%s", simplefont)
common.Log.Error("std=%s", std)
panic("Not possible")
}
if len(simplefont.charWidths) == 0 {
common.Log.Error("simplefont=%s", simplefont)
common.Log.Error("std=%s", std)
common.Log.Debug("ERROR: No widths. font=%s", simplefont)
}
font.context = simplefont
case "CIDFontType0":
cidfont, err := newPdfCIDFontType0FromPdfObject(d, base)
@ -415,6 +428,7 @@ func (font PdfFont) BytesToCharcodes(data []byte) []uint16 {
}
// CharcodesToUnicode converts the character codes `charcodes` to a slice of unicode strings.
// XXX(peterwilliams97): Remove int returns.
func (font PdfFont) CharcodesToUnicode(charcodes []uint16) ([]string, int, int) {
charstrings := make([]string, 0, len(charcodes))
numMisses := 0
@ -426,7 +440,7 @@ func (font PdfFont) CharcodesToUnicode(charcodes []uint16) ([]string, int, int)
continue
}
}
// Fall back to encoding
// Fall back to encoding.
encoder := font.Encoder()
if encoder != nil {
r, ok := encoder.CharcodeToRune(code)

View File

@ -37,13 +37,12 @@ type pdfFontSimple struct {
container *core.PdfIndirectObject
// These fields are specific to simple PDF fonts.
firstChar int
lastChar int
charWidths []float64
charWidths map[uint16]float64
// encoder is the encoder specified by the /Encoding entry in the font dict.
encoder textencoding.TextEncoder
encoder *textencoding.SimpleEncoder
// std14Encoder is used for Standard 14 fonts where no /Encoding is specified in the font dict.
std14Encoder textencoding.TextEncoder
std14Encoder *textencoding.SimpleEncoder
// std14Descriptor is used for Standard 14 fonts where no /FontDescriptor is specified in the font dict.
std14Descriptor *PdfFontDescriptor
@ -76,14 +75,26 @@ func (font *pdfFontSimple) Encoder() textencoding.TextEncoder {
// Standard 14 fonts have builtin encoders that we fall back to when no /Encoding is specified
// in the font dict.
if font.encoder == nil {
// Need to make the font.Encoder()==nil test work when font.std14Encoder==font.encoder==nil.
// See https://golang.org/doc/faq#nil_error
if font.std14Encoder == nil {
return nil
}
return font.std14Encoder
}
return font.encoder
}
// SetEncoder sets the encoding for the underlying font.
// XXX(peterwilliams97) Change function signature to SetEncoder(encoder *textencoding.SimpleEncoder).
func (font *pdfFontSimple) SetEncoder(encoder textencoding.TextEncoder) {
font.encoder = encoder
simple, ok := encoder.(*textencoding.SimpleEncoder)
if !ok {
// This can't happen.
common.Log.Error("pdfFontSimple.SetEncoder passedbad encoder type %T", encoder)
simple = nil
}
font.encoder = simple
}
// GetGlyphCharMetrics returns the character metrics for the specified glyph. A bool flag is
@ -123,30 +134,21 @@ func (font pdfFontSimple) GetGlyphCharMetrics(glyph string) (fonts.CharMetrics,
// GetCharMetrics returns the character metrics for the specified character code. A bool flag is
// returned to indicate whether or not the entry was found in the glyph to charcode mapping.
func (font pdfFontSimple) GetCharMetrics(code uint16) (fonts.CharMetrics, bool) {
metrics := fonts.CharMetrics{}
if int(code) < font.firstChar {
common.Log.Debug("Code lower than firstchar (%d < %d)", code, font.firstChar)
return metrics, false
if width, ok := font.charWidths[code]; ok {
common.Log.Debug("GetCharMetrics 1: code=%d width=%.1f font=%s", code, width, font)
return fonts.CharMetrics{Wx: width}, true
}
if int(code) > font.lastChar {
common.Log.Debug("ERROR: Code higher than lastchar (%d > %d) %s",
code, font.lastChar, font)
return metrics, false
}
index := int(code) - font.firstChar
if index >= len(font.charWidths) {
common.Log.Debug("ERROR: Code outside of widths range (%d > %d) code=%d [%d %d] font=%s",
index, len(font.charWidths), code, font.firstChar, font.lastChar, font.String())
return metrics, false
}
width := font.charWidths[index]
metrics.Wx = width
if font.encoder != nil {
if glyph, ok := font.encoder.CharcodeToGlyph(code); ok {
if metrics, ok := font.fontMetrics[glyph]; ok {
font.charWidths[code] = metrics.Wx
common.Log.Debug("GetCharMetrics 2: code=%d glyph=%q width=%.1f", code, glyph, metrics.Wx)
return metrics, true
}
}
}
common.Log.Debug("GetCharMetrics 3: code=%d", code)
return fonts.CharMetrics{}, false
}
// GetAverageCharWidth returns the average width of all the characters in `font`.
@ -170,7 +172,7 @@ func (font pdfFontSimple) GetAverageCharWidth() float64 {
// • The value of BaseFont is derived differently.
//
func newSimpleFontFromPdfObject(d *core.PdfObjectDictionary, base *fontCommon,
std14Encoder textencoding.TextEncoder) (*pdfFontSimple, error) {
std14Encoder *textencoding.SimpleEncoder) (*pdfFontSimple, error) {
font := pdfFontSimpleFromSkeleton(base)
font.std14Encoder = std14Encoder
@ -187,7 +189,7 @@ func newSimpleFontFromPdfObject(d *core.PdfObjectDictionary, base *fontCommon,
common.Log.Debug("ERROR: Invalid FirstChar type (%T)", obj)
return nil, core.ErrTypeError
}
font.firstChar = int(intVal)
firstChar := int(intVal)
obj = d.Get("LastChar")
if obj == nil {
@ -199,9 +201,9 @@ func newSimpleFontFromPdfObject(d *core.PdfObjectDictionary, base *fontCommon,
common.Log.Debug("ERROR: Invalid LastChar type (%T)", obj)
return nil, core.ErrTypeError
}
font.lastChar = int(intVal)
lastChar := int(intVal)
font.charWidths = []float64{}
font.charWidths = map[uint16]float64{}
obj = d.Get("Widths")
if obj != nil {
font.Widths = obj
@ -218,16 +220,15 @@ func newSimpleFontFromPdfObject(d *core.PdfObjectDictionary, base *fontCommon,
return nil, err
}
if len(widths) != (font.lastChar - font.firstChar + 1) {
if len(widths) != (lastChar - firstChar + 1) {
common.Log.Debug("ERROR: Invalid widths length != %d (%d)",
font.lastChar-font.firstChar+1, len(widths))
lastChar-firstChar+1, len(widths))
return nil, core.ErrRangeError
}
font.charWidths = widths
for i, w := range widths {
font.charWidths[uint16(firstChar+i)] = w
}
}
if font.lastChar > 0 && len(font.charWidths) == 0 {
common.Log.Debug("ERROR: No widths. font=%s", font)
}
font.Encoding = core.TraceToDirectObject(d.Get("Encoding"))
@ -250,9 +251,8 @@ func (font *pdfFontSimple) addEncoding() error {
return err
}
base := font.baseFields()
common.Log.Trace("addEncoding: BaseFont=%q Subtype=%q Encoding=%s (%T)", base.basefont,
base.subtype, font.Encoding, font.Encoding)
common.Log.Trace("addEncoding: BaseFont=%q Subtype=%q Encoding=%s (%T) differences=%d %+v",
base.basefont, base.subtype, font.Encoding, font.Encoding, len(differences), differences)
encoder, err = textencoding.NewSimpleTextEncoder(baseEncoder, differences)
if err != nil {
return err
@ -378,6 +378,7 @@ func NewPdfFontFromTTFFile(filePath string) (*PdfFont, error) {
}
truefont := &pdfFontSimple{
charWidths: map[uint16]float64{},
fontCommon: fontCommon{
subtype: "TrueType",
},
@ -387,8 +388,6 @@ func NewPdfFontFromTTFFile(filePath string) (*PdfFont, error) {
// then can derive
// TODO: Subsetting fonts.
truefont.encoder = textencoding.NewWinAnsiTextEncoder()
truefont.firstChar = minCode
truefont.lastChar = maxCode
truefont.basefont = ttf.PostScriptName
truefont.FirstChar = core.MakeInteger(minCode)
@ -424,12 +423,15 @@ func NewPdfFontFromTTFFile(filePath string) (*PdfFont, error) {
truefont.Widths = core.MakeIndirectObject(core.MakeArrayFromFloats(vals))
if len(vals) < (255 - 32 + 1) {
if len(vals) < maxCode-minCode+1 {
common.Log.Debug("ERROR: Invalid length of widths, %d < %d", len(vals), 255-32+1)
return nil, core.ErrRangeError
}
truefont.charWidths = vals[:255-32+1]
// truefont.charWidths = vals[:maxCode-minCode+1]
for i := uint16(minCode); i <= maxCode; i++ {
truefont.charWidths[i] = vals[i-minCode]
}
// Use WinAnsiEncoding by default.
truefont.Encoding = core.MakeName("WinAnsiEncoding")
@ -511,31 +513,38 @@ func loadStandard14Font(baseFont Standard14Font) (pdfFontSimple, bool) {
if !ok {
return pdfFontSimple{}, false
}
descriptor := builtinDescriptor(string(baseFont))
if descriptor == nil {
return pdfFontSimple{}, false
}
std.std14Descriptor = descriptor
se, ok := std.std14Encoder.(textencoding.SimpleEncoder)
if !ok {
common.Log.Debug("ERROR: Wrong encoder type: %T", std.std14Encoder)
}
codes := []int{}
for c := range se.CodeToGlyph {
codes = append(codes, int(c))
}
sort.Ints(codes)
std.firstChar = codes[0]
std.lastChar = codes[len(codes)-1]
std.charWidths = make([]float64, len(codes))
for i, code := range codes {
glyph := se.CodeToGlyph[uint16(code)]
std.charWidths[i] = std.fontMetrics[glyph].Wx
}
return std, true
}
// updateStandard14Font rebuilds font.charWidths from the encoder returned by font.Encoder().
// For every character code in that encoder's CodeToGlyph table, the width stored is the Wx of
// the corresponding glyph in font.fontMetrics. A glyph that has no entry in font.fontMetrics
// gets the zero value, i.e. a width of 0.
// If the encoder is not a *textencoding.SimpleEncoder, an error is logged and font.charWidths is
// left untouched.
func (font *pdfFontSimple) updateStandard14Font() {
	enc := font.Encoder()
	simple, isSimple := enc.(*textencoding.SimpleEncoder)
	if !isSimple {
		// This can't happen.
		common.Log.Error("Wrong encoder type: %T. font=%s.", enc, font)
		return
	}

	// Gather the character codes and visit them in ascending order.
	codes := make([]uint16, 0, len(simple.CodeToGlyph))
	for code := range simple.CodeToGlyph {
		codes = append(codes, code)
	}
	sort.Slice(codes, func(a, b int) bool { return codes[a] < codes[b] })

	widths := make(map[uint16]float64, len(codes))
	for _, code := range codes {
		glyph := simple.CodeToGlyph[code]
		widths[code] = font.fontMetrics[glyph].Wx
	}
	font.charWidths = widths
}
var standard14Fonts = map[Standard14Font]pdfFontSimple{
Courier: pdfFontSimple{
fontCommon: fontCommon{