Fixes for text extraction corpus testing.

- Correct matrix multiplication order in text.go
- Look up standard 14 font widths after applying custom encoding.
This commit is contained in:
Peter Williams 2018-11-18 17:21:30 +11:00
parent 851aa267b1
commit a9019a50a3
8 changed files with 185 additions and 119 deletions

BIN
pdf/extractor/testdata/000026.pdf vendored Normal file

Binary file not shown.

View File

@ -266,7 +266,7 @@ func (e *Extractor) ExtractXYText() (*TextList, int, int, error) {
err = processor.Process(e.resources) err = processor.Process(e.resources)
if err != nil { if err != nil {
common.Log.Error("ERROR: Processing: err=%v", err) common.Log.Debug("ERROR: Processing: err=%v", err)
} }
return textList, state.numChars, state.numMisses, err return textList, state.numChars, state.numMisses, err
} }
@ -406,7 +406,7 @@ func (to *textObject) setTextRise(y float64) {
// setWordSpacing "Tw" Set word spacing. // setWordSpacing "Tw" Set word spacing.
func (to *textObject) setWordSpacing(y float64) { func (to *textObject) setWordSpacing(y float64) {
// Not implemented yet to.State.Tw = y
} }
// setHorizScaling "Tz" Set horizontal scaling. // setHorizScaling "Tz" Set horizontal scaling.
@ -609,7 +609,10 @@ func (to *textObject) renderText(data []byte) error {
0, tfs, 0, tfs,
0, state.Trise) 0, state.Trise)
common.Log.Debug("%d codes=%+v runes=%q", len(charcodes), charcodes, runes)
for i, r := range runes { for i, r := range runes {
code := charcodes[i] code := charcodes[i]
// The location of the text on the page in device coordinates is given by trm, the text // The location of the text on the page in device coordinates is given by trm, the text
// rendering matrix. // rendering matrix.
@ -634,19 +637,38 @@ func (to *textObject) renderText(data []byte) error {
c := Point{X: m.Wx * glyphTextRatio, Y: m.Wy * glyphTextRatio} c := Point{X: m.Wx * glyphTextRatio, Y: m.Wy * glyphTextRatio}
// t is the displacement of the text cursor when the character is rendered. // t is the displacement of the text cursor when the character is rendered.
// float tx = displacementX * fontSize * horizontalScaling;
// w = 0
t0 := Point{X: (c.X*tfs + w) * th}
t := Point{X: (c.X*tfs + state.Tc + w) * th} t := Point{X: (c.X*tfs + state.Tc + w) * th}
// td is t in matrix form. // td is t in matrix form.
td0 := translationMatrix(t0)
td := translationMatrix(t) td := translationMatrix(t)
common.Log.Debug("%q stateMatrix=%s CTM=%s Tm=%s", r, stateMatrix, to.gs.CTM, to.Tm)
common.Log.Debug("tfs=%.3f th=%.3f Tc=%.3f w=%.3f (Tw=%.3f)", tfs, th, state.Tc, w, state.Tw)
common.Log.Debug("m=%s c=%+v t0=%+v td0=%s trm0=%s",
m, c, t0, td0, td0.Mult(to.Tm).Mult(to.gs.CTM))
common.Log.Debug("m=%s c=%+v t=%+v td=%s trm=%s",
m, c, t, td, td.Mult(to.Tm).Mult(to.gs.CTM))
nextTm := to.Tm.Mult(td) nextTm := to.Tm.Mult(td)
xyt := XYText{Text: string(r), // xyt := XYText{Text: string(r),
Point: translation(trm), // Point: translation(trm),
Orient: trm.Orientation(), // Orient: trm.Orientation(),
End: translation(to.Tm.Mult(td).Mult(to.gs.CTM)), // // Matrix nextTextRenderingMatrix = td.multiply(textMatrix).multiply(ctm); // text space -> device space
SpaceWidth: spaceWidth * trm.ScalingFactorX(), // End: translation(td0.Mult(to.Tm).Mult(to.gs.CTM)),
} // SpaceWidth: spaceWidth * trm.ScalingFactorX(),
// }
xyt := newXYText(
string(r),
translation(trm),
translation(td0.Mult(to.Tm).Mult(to.gs.CTM)),
trm.Orientation(),
spaceWidth*trm.ScalingFactorX())
common.Log.Debug("i=%d code=%d, xyt=%s", i, code, xyt)
to.Texts = append(to.Texts, xyt) to.Texts = append(to.Texts, xyt)
// update the text matrix by the displacement of the text location. // update the text matrix by the displacement of the text location.
@ -690,11 +712,26 @@ type XYText struct {
SpaceWidth float64 SpaceWidth float64
Font string Font string
FontSize float64 FontSize float64
counter int
}
var counter int
func newXYText(text string, point, end Point, orient contentstream.Orientation, spaceWidth float64) XYText {
counter++
return XYText{
Text: text,
Point: point,
End: end,
Orient: orient,
SpaceWidth: spaceWidth,
counter: counter,
}
} }
// String returns a string describing `t`. // String returns a string describing `t`.
func (t XYText) String() string { func (t XYText) String() string {
return fmt.Sprintf("%s,%s %.1f %q", return fmt.Sprintf("@@%d %s,%s %.1f %q", t.counter,
t.Point.String(), t.End.String(), t.End.X-t.X, truncate(t.Text, 100)) t.Point.String(), t.End.String(), t.End.X-t.X, truncate(t.Text, 100))
} }
@ -707,7 +744,7 @@ func (t XYText) Width() float64 {
default: default:
w = math.Abs(t.End.X - t.X) w = math.Abs(t.End.X - t.X)
} }
common.Log.Trace(" Width %q (%s %s) -> %.1f", t.Text, t.Point.String(), t.End.String(), w) common.Log.Debug(" Width %q (%s %s) -> %.1f", t.Text, t.Point.String(), t.End.String(), w)
return w return w
} }
@ -719,20 +756,20 @@ func (tl *TextList) Length() int {
return len(*tl) return len(*tl)
} }
// AppendText appends the location and contents of `text` to a text list. // // AppendText appends the location and contents of `text` to a text list.
func (tl *TextList) AppendText(gs contentstream.GraphicsState, p, e Point, text string, spaceWidth float64) { // func (tl *TextList) AppendText(gs contentstream.GraphicsState, p, e Point, text string, spaceWidth float64) {
t := XYText{ // t := XYText{
Point: p, // Point: p,
End: e, // End: e,
ColorStroking: gs.ColorStroking, // ColorStroking: gs.ColorStroking,
ColorNonStroking: gs.ColorNonStroking, // ColorNonStroking: gs.ColorNonStroking,
Orient: gs.PageOrientation(), // Orient: gs.PageOrientation(),
Text: text, // Text: text,
SpaceWidth: spaceWidth, // SpaceWidth: spaceWidth,
} // }
common.Log.Debug("AppendText: %s", t.String()) // common.Log.Debug("AppendText: %s", t.String())
*tl = append(*tl, t) // *tl = append(*tl, t)
} // }
// ToText returns the contents of `tl` as a single string. // ToText returns the contents of `tl` as a single string.
func (tl *TextList) ToText() string { func (tl *TextList) ToText() string {
@ -794,6 +831,7 @@ func (tl *TextList) toLines() []Line {
} }
portLines := portText.toLinesOrient() portLines := portText.toLinesOrient()
landLines := landText.toLinesOrient() landLines := landText.toLinesOrient()
common.Log.Debug("portText=%d landText=%d", len(portText), len(landText))
return append(portLines, landLines...) return append(portLines, landLines...)
} }
@ -816,6 +854,7 @@ func (tl *TextList) toLinesOrient() []Line {
lastEndX := 0.0 // (*tl)[i-1).End.X lastEndX := 0.0 // (*tl)[i-1).End.X
for _, t := range *tl { for _, t := range *tl {
// common.Log.Debug("%d --------------------------", i)
if t.Y < y { if t.Y < y {
if len(words) > 0 { if len(words) > 0 {
line := newLine(y, x, words) line := newLine(y, x, words)
@ -846,12 +885,16 @@ func (tl *TextList) toLinesOrient() []Line {
deltaCharWidth := averageCharWidth.ave * 0.3 deltaCharWidth := averageCharWidth.ave * 0.3
isSpace := false isSpace := false
if scanning && t.Text != " " {
nextWordX := lastEndX + min(deltaSpace, deltaCharWidth) nextWordX := lastEndX + min(deltaSpace, deltaCharWidth)
if scanning && t.Text != " " {
isSpace = nextWordX < t.X isSpace = nextWordX < t.X
common.Log.Trace("[%.1f, %.1f] lastEndX=%.1f nextWordX=%.1f",
t.Y, t.X, lastEndX, nextWordX)
} }
common.Log.Debug("t=%s", t)
common.Log.Debug("width=%.2f delta=%.2f deltaSpace=%.2g deltaCharWidth=%.2g",
t.Width(), min(deltaSpace, deltaCharWidth), deltaSpace, deltaCharWidth)
common.Log.Debug("%+q [%.1f, %.1f] lastEndX=%.2f nextWordX=%.2f (%.2f) isSpace=%t",
t.Text, t.X, t.Y, lastEndX, nextWordX, nextWordX-t.X, isSpace)
if isSpace { if isSpace {
words = append(words, " ") words = append(words, " ")
x = append(x, (lastEndX+t.X)*0.5) x = append(x, (lastEndX+t.X)*0.5)
@ -862,6 +905,7 @@ func (tl *TextList) toLinesOrient() []Line {
words = append(words, t.Text) words = append(words, t.Text)
x = append(x, t.X) x = append(x, t.X)
scanning = true scanning = true
common.Log.Debug("lastEndX=%.2f", lastEndX)
} }
if len(words) > 0 { if len(words) > 0 {
line := newLine(y, x, words) line := newLine(y, x, words)
@ -898,7 +942,7 @@ func (exp *ExponAve) update(x float64) float64 {
return exp.ave return exp.ave
} }
// printTexts is a debugging function. XXX Remove this. // printTexts is a debugging function. XXX(peterwilliams97) Remove this.
func (tl *TextList) printTexts(message string) { func (tl *TextList) printTexts(message string) {
return return
_, file, line, ok := runtime.Caller(1) _, file, line, ok := runtime.Caller(1)
@ -910,17 +954,17 @@ func (tl *TextList) printTexts(message string) {
} }
prefix := fmt.Sprintf("[%s:%d]", file, line) prefix := fmt.Sprintf("[%s:%d]", file, line)
common.Log.Error("=====================================") common.Log.Debug("=====================================")
common.Log.Error("printTexts %s %s", prefix, message) common.Log.Debug("printTexts %s %s", prefix, message)
common.Log.Error("%d texts", len(*tl)) common.Log.Debug("%d texts", len(*tl))
parts := []string{} parts := []string{}
for i, t := range *tl { for i, t := range *tl {
fmt.Printf("%5d: %s\n", i, t.String()) fmt.Printf("%5d: %s\n", i, t.String())
parts = append(parts, t.Text) parts = append(parts, t.Text)
} }
common.Log.Error("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~") common.Log.Debug("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
fmt.Printf("%s\n", strings.Join(parts, "")) fmt.Printf("%s\n", strings.Join(parts, ""))
common.Log.Error("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^") common.Log.Debug("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^")
} }
// newLine returns the Line representation of strings `words` with y coordinate `y` and x // newLine returns the Line representation of strings `words` with y coordinate `y` and x

View File

@ -17,7 +17,7 @@ import (
) )
func init() { func init() {
common.SetLogger(common.NewConsoleLogger(common.LogLevelDebug)) common.SetLogger(common.NewConsoleLogger(common.LogLevelError))
if flag.Lookup("test.v") != nil { if flag.Lookup("test.v") != nil {
isTesting = true isTesting = true
} }
@ -59,12 +59,20 @@ var extract2Tests = []struct {
filename string filename string
expectedPageText map[int][]string expectedPageText map[int][]string
}{ }{
{ {filename: "testdata/reader.pdf",
filename: "testdata/reader.pdf",
expectedPageText: map[int][]string{ expectedPageText: map[int][]string{
1: []string{"A Research UNIX Reader:", 1: []string{"A Research UNIX Reader:",
"Annotated Excerpts from the Programmer's Manual,", "Annotated Excerpts from the Programmers Manual,",
"1. Introduction",
"To keep the size of this report", "To keep the size of this report",
"last common ancestor of a radiative explosion",
},
},
},
{filename: "testdata/000026.pdf",
expectedPageText: map[int][]string{
1: []string{"Fresh Flower",
"Care & Handling",
}, },
}, },
}, },
@ -85,10 +93,9 @@ func testExtract2(t *testing.T, filename string, expectedPageText map[int][]stri
} }
func containsSentences(t *testing.T, expectedSentences []string, actualText string) bool { func containsSentences(t *testing.T, expectedSentences []string, actualText string) bool {
actualSentences := asSet(strings.Split(actualText, "\n"))
for _, e := range expectedSentences { for _, e := range expectedSentences {
if _, ok := actualSentences[e]; !ok { if !strings.Contains(actualText, e) {
t.Errorf("No match for %q", e) t.Errorf("No match for %+q", e)
return false return false
} }
} }
@ -104,14 +111,6 @@ func sortedKeys(m map[int][]string) []int {
return keys return keys
} }
func asSet(keys []string) map[string]bool {
set := map[string]bool{}
for _, k := range keys {
set[k] = true
}
return set
}
func extractPageTexts(t *testing.T, filename string) (int, map[int]string) { func extractPageTexts(t *testing.T, filename string) (int, map[int]string) {
f, err := os.Open(filename) f, err := os.Open(filename)
if err != nil { if err != nil {

View File

@ -6,7 +6,7 @@
package textencoding package textencoding
// NewSymbolEncoder returns a SimpleEncoder that implements SymbolEncoding. // NewSymbolEncoder returns a SimpleEncoder that implements SymbolEncoding.
func NewSymbolEncoder() SimpleEncoder { func NewSymbolEncoder() *SimpleEncoder {
enc, _ := NewSimpleTextEncoder("SymbolEncoding", nil) enc, _ := NewSimpleTextEncoder("SymbolEncoding", nil)
return *enc return enc
} }

View File

@ -6,7 +6,7 @@
package textencoding package textencoding
// NewWinAnsiTextEncoder returns a SimpleEncoder that implements WinAnsiEncoding. // NewWinAnsiTextEncoder returns a SimpleEncoder that implements WinAnsiEncoding.
func NewWinAnsiTextEncoder() SimpleEncoder { func NewWinAnsiTextEncoder() *SimpleEncoder {
enc, _ := NewSimpleTextEncoder("WinAnsiEncoding", nil) enc, _ := NewSimpleTextEncoder("WinAnsiEncoding", nil)
return *enc return enc
} }

View File

@ -6,7 +6,7 @@
package textencoding package textencoding
// NewZapfDingbatsEncoder returns a SimpleEncoder that implements ZapfDingbatsEncoding. // NewZapfDingbatsEncoder returns a SimpleEncoder that implements ZapfDingbatsEncoding.
func NewZapfDingbatsEncoder() SimpleEncoder { func NewZapfDingbatsEncoder() *SimpleEncoder {
enc, _ := NewSimpleTextEncoder("ZapfDingbatsEncoding", nil) enc, _ := NewSimpleTextEncoder("ZapfDingbatsEncoding", nil)
return *enc return enc
} }

View File

@ -263,7 +263,9 @@ func newPdfFontFromPdfObject(fontObj core.PdfObject, allowType0 bool) (*PdfFont,
font.context = type0font font.context = type0font
case "Type1", "Type3", "MMType1", "TrueType": case "Type1", "Type3", "MMType1", "TrueType":
var simplefont *pdfFontSimple var simplefont *pdfFontSimple
if std, ok := loadStandard14Font(Standard14Font(base.basefont)); ok && base.subtype == "Type1" { std, ok := loadStandard14Font(Standard14Font(base.basefont))
builtin := ok && base.subtype == "Type1"
if builtin {
font.context = &std font.context = &std
stdObj := core.TraceToDirectObject(std.ToPdfObject()) stdObj := core.TraceToDirectObject(std.ToPdfObject())
@ -283,8 +285,6 @@ func newPdfFontFromPdfObject(fontObj core.PdfObject, allowType0 bool) (*PdfFont,
return nil, err return nil, err
} }
simplefont.firstChar = std.firstChar
simplefont.lastChar = std.lastChar
simplefont.charWidths = std.charWidths simplefont.charWidths = std.charWidths
simplefont.fontMetrics = std.fontMetrics simplefont.fontMetrics = std.fontMetrics
} else { } else {
@ -298,6 +298,19 @@ func newPdfFontFromPdfObject(fontObj core.PdfObject, allowType0 bool) (*PdfFont,
if err != nil { if err != nil {
return nil, err return nil, err
} }
if builtin {
simplefont.updateStandard14Font()
}
if builtin && simplefont.encoder == nil && simplefont.std14Encoder == nil {
common.Log.Error("simplefont=%s", simplefont)
common.Log.Error("std=%s", std)
panic("Not possible")
}
if len(simplefont.charWidths) == 0 {
common.Log.Error("simplefont=%s", simplefont)
common.Log.Error("std=%s", std)
common.Log.Debug("ERROR: No widths. font=%s", simplefont)
}
font.context = simplefont font.context = simplefont
case "CIDFontType0": case "CIDFontType0":
cidfont, err := newPdfCIDFontType0FromPdfObject(d, base) cidfont, err := newPdfCIDFontType0FromPdfObject(d, base)
@ -415,6 +428,7 @@ func (font PdfFont) BytesToCharcodes(data []byte) []uint16 {
} }
// CharcodesToUnicode converts the character codes `charcodes` to a slice of unicode strings. // CharcodesToUnicode converts the character codes `charcodes` to a slice of unicode strings.
// XXX(peterwilliams97): Remove int returns.
func (font PdfFont) CharcodesToUnicode(charcodes []uint16) ([]string, int, int) { func (font PdfFont) CharcodesToUnicode(charcodes []uint16) ([]string, int, int) {
charstrings := make([]string, 0, len(charcodes)) charstrings := make([]string, 0, len(charcodes))
numMisses := 0 numMisses := 0
@ -426,7 +440,7 @@ func (font PdfFont) CharcodesToUnicode(charcodes []uint16) ([]string, int, int)
continue continue
} }
} }
// Fall back to encoding // Fall back to encoding.
encoder := font.Encoder() encoder := font.Encoder()
if encoder != nil { if encoder != nil {
r, ok := encoder.CharcodeToRune(code) r, ok := encoder.CharcodeToRune(code)

View File

@ -37,13 +37,12 @@ type pdfFontSimple struct {
container *core.PdfIndirectObject container *core.PdfIndirectObject
// These fields are specific to simple PDF fonts. // These fields are specific to simple PDF fonts.
firstChar int
lastChar int charWidths map[uint16]float64
charWidths []float64
// encoder is the encoder specified by the /Encoding entry in the font dict.
encoder textencoding.TextEncoder encoder *textencoding.SimpleEncoder
// std14Encoder is used for Standard 14 fonts where no /Encoding is specified in the font dict. // std14Encoder is used for Standard 14 fonts where no /Encoding is specified in the font dict.
std14Encoder textencoding.TextEncoder std14Encoder *textencoding.SimpleEncoder
// std14Descriptor is used for Standard 14 fonts where no /FontDescriptor is specified in the font dict. // std14Descriptor is used for Standard 14 fonts where no /FontDescriptor is specified in the font dict.
std14Descriptor *PdfFontDescriptor std14Descriptor *PdfFontDescriptor
@ -76,14 +75,26 @@ func (font *pdfFontSimple) Encoder() textencoding.TextEncoder {
// Standard 14 fonts have builtin encoders that we fall back to when no /Encoding is specified // Standard 14 fonts have builtin encoders that we fall back to when no /Encoding is specified
// in the font dict. // in the font dict.
if font.encoder == nil { if font.encoder == nil {
// Need to make font.Encoder()==nil test work for font.std14Encoder=font.encoder=nil
// See https://golang.org/doc/faq#nil_error
if font.std14Encoder == nil {
return nil
}
return font.std14Encoder return font.std14Encoder
} }
return font.encoder return font.encoder
} }
// SetEncoder sets the encoding for the underlying font. // SetEncoder sets the encoding for the underlying font.
// XXX(peterwilliams97) Change function signature to SetEncoder(encoder *textencoding.SimpleEncoder).
func (font *pdfFontSimple) SetEncoder(encoder textencoding.TextEncoder) { func (font *pdfFontSimple) SetEncoder(encoder textencoding.TextEncoder) {
font.encoder = encoder simple, ok := encoder.(*textencoding.SimpleEncoder)
if !ok {
// This can't happen.
common.Log.Error("pdfFontSimple.SetEncoder passedbad encoder type %T", encoder)
simple = nil
}
font.encoder = simple
} }
// GetGlyphCharMetrics returns the character metrics for the specified glyph. A bool flag is // GetGlyphCharMetrics returns the character metrics for the specified glyph. A bool flag is
@ -123,30 +134,21 @@ func (font pdfFontSimple) GetGlyphCharMetrics(glyph string) (fonts.CharMetrics,
// GetCharMetrics returns the character metrics for the specified character code. A bool flag is // GetCharMetrics returns the character metrics for the specified character code. A bool flag is
// returned to indicate whether or not the entry was found in the glyph to charcode mapping. // returned to indicate whether or not the entry was found in the glyph to charcode mapping.
func (font pdfFontSimple) GetCharMetrics(code uint16) (fonts.CharMetrics, bool) { func (font pdfFontSimple) GetCharMetrics(code uint16) (fonts.CharMetrics, bool) {
metrics := fonts.CharMetrics{} if width, ok := font.charWidths[code]; ok {
common.Log.Debug("GetCharMetrics 1: code=%d width=%.1f font=%s", code, width, font)
if int(code) < font.firstChar { return fonts.CharMetrics{Wx: width}, true
common.Log.Debug("Code lower than firstchar (%d < %d)", code, font.firstChar)
return metrics, false
} }
if font.encoder != nil {
if int(code) > font.lastChar { if glyph, ok := font.encoder.CharcodeToGlyph(code); ok {
common.Log.Debug("ERROR: Code higher than lastchar (%d > %d) %s", if metrics, ok := font.fontMetrics[glyph]; ok {
code, font.lastChar, font) font.charWidths[code] = metrics.Wx
return metrics, false common.Log.Debug("GetCharMetrics 2: code=%d glyph=%q width=%.1f", code, glyph, metrics.Wx)
}
index := int(code) - font.firstChar
if index >= len(font.charWidths) {
common.Log.Debug("ERROR: Code outside of widths range (%d > %d) code=%d [%d %d] font=%s",
index, len(font.charWidths), code, font.firstChar, font.lastChar, font.String())
return metrics, false
}
width := font.charWidths[index]
metrics.Wx = width
return metrics, true return metrics, true
}
}
}
common.Log.Debug("GetCharMetrics 3: code=%d", code)
return fonts.CharMetrics{}, false
} }
// GetAverageCharWidth returns the average width of all the characters in `font`. // GetAverageCharWidth returns the average width of all the characters in `font`.
@ -170,7 +172,7 @@ func (font pdfFontSimple) GetAverageCharWidth() float64 {
// • The value of BaseFont is derived differently. // • The value of BaseFont is derived differently.
// //
func newSimpleFontFromPdfObject(d *core.PdfObjectDictionary, base *fontCommon, func newSimpleFontFromPdfObject(d *core.PdfObjectDictionary, base *fontCommon,
std14Encoder textencoding.TextEncoder) (*pdfFontSimple, error) { std14Encoder *textencoding.SimpleEncoder) (*pdfFontSimple, error) {
font := pdfFontSimpleFromSkeleton(base) font := pdfFontSimpleFromSkeleton(base)
font.std14Encoder = std14Encoder font.std14Encoder = std14Encoder
@ -187,7 +189,7 @@ func newSimpleFontFromPdfObject(d *core.PdfObjectDictionary, base *fontCommon,
common.Log.Debug("ERROR: Invalid FirstChar type (%T)", obj) common.Log.Debug("ERROR: Invalid FirstChar type (%T)", obj)
return nil, core.ErrTypeError return nil, core.ErrTypeError
} }
font.firstChar = int(intVal) firstChar := int(intVal)
obj = d.Get("LastChar") obj = d.Get("LastChar")
if obj == nil { if obj == nil {
@ -199,9 +201,9 @@ func newSimpleFontFromPdfObject(d *core.PdfObjectDictionary, base *fontCommon,
common.Log.Debug("ERROR: Invalid LastChar type (%T)", obj) common.Log.Debug("ERROR: Invalid LastChar type (%T)", obj)
return nil, core.ErrTypeError return nil, core.ErrTypeError
} }
font.lastChar = int(intVal) lastChar := int(intVal)
font.charWidths = []float64{} font.charWidths = map[uint16]float64{}
obj = d.Get("Widths") obj = d.Get("Widths")
if obj != nil { if obj != nil {
font.Widths = obj font.Widths = obj
@ -218,16 +220,15 @@ func newSimpleFontFromPdfObject(d *core.PdfObjectDictionary, base *fontCommon,
return nil, err return nil, err
} }
if len(widths) != (font.lastChar - font.firstChar + 1) { if len(widths) != (lastChar - firstChar + 1) {
common.Log.Debug("ERROR: Invalid widths length != %d (%d)", common.Log.Debug("ERROR: Invalid widths length != %d (%d)",
font.lastChar-font.firstChar+1, len(widths)) lastChar-firstChar+1, len(widths))
return nil, core.ErrRangeError return nil, core.ErrRangeError
} }
font.charWidths = widths for i, w := range widths {
font.charWidths[uint16(firstChar+i)] = w
} }
} }
if font.lastChar > 0 && len(font.charWidths) == 0 {
common.Log.Debug("ERROR: No widths. font=%s", font)
} }
font.Encoding = core.TraceToDirectObject(d.Get("Encoding")) font.Encoding = core.TraceToDirectObject(d.Get("Encoding"))
@ -250,9 +251,8 @@ func (font *pdfFontSimple) addEncoding() error {
return err return err
} }
base := font.baseFields() base := font.baseFields()
common.Log.Trace("addEncoding: BaseFont=%q Subtype=%q Encoding=%s (%T)", base.basefont, common.Log.Trace("addEncoding: BaseFont=%q Subtype=%q Encoding=%s (%T) differences=%d %+v",
base.subtype, font.Encoding, font.Encoding) base.basefont, base.subtype, font.Encoding, font.Encoding, len(differences), differences)
encoder, err = textencoding.NewSimpleTextEncoder(baseEncoder, differences) encoder, err = textencoding.NewSimpleTextEncoder(baseEncoder, differences)
if err != nil { if err != nil {
return err return err
@ -378,6 +378,7 @@ func NewPdfFontFromTTFFile(filePath string) (*PdfFont, error) {
} }
truefont := &pdfFontSimple{ truefont := &pdfFontSimple{
charWidths: map[uint16]float64{},
fontCommon: fontCommon{ fontCommon: fontCommon{
subtype: "TrueType", subtype: "TrueType",
}, },
@ -387,8 +388,6 @@ func NewPdfFontFromTTFFile(filePath string) (*PdfFont, error) {
// then can derive // then can derive
// TODO: Subsetting fonts. // TODO: Subsetting fonts.
truefont.encoder = textencoding.NewWinAnsiTextEncoder() truefont.encoder = textencoding.NewWinAnsiTextEncoder()
truefont.firstChar = minCode
truefont.lastChar = maxCode
truefont.basefont = ttf.PostScriptName truefont.basefont = ttf.PostScriptName
truefont.FirstChar = core.MakeInteger(minCode) truefont.FirstChar = core.MakeInteger(minCode)
@ -424,12 +423,15 @@ func NewPdfFontFromTTFFile(filePath string) (*PdfFont, error) {
truefont.Widths = core.MakeIndirectObject(core.MakeArrayFromFloats(vals)) truefont.Widths = core.MakeIndirectObject(core.MakeArrayFromFloats(vals))
if len(vals) < (255 - 32 + 1) { if len(vals) < maxCode-minCode+1 {
common.Log.Debug("ERROR: Invalid length of widths, %d < %d", len(vals), 255-32+1) common.Log.Debug("ERROR: Invalid length of widths, %d < %d", len(vals), 255-32+1)
return nil, core.ErrRangeError return nil, core.ErrRangeError
} }
truefont.charWidths = vals[:255-32+1] // truefont.charWidths = vals[:maxCode-minCode+1]
for i := uint16(minCode); i <= maxCode; i++ {
truefont.charWidths[i] = vals[i-minCode]
}
// Use WinAnsiEncoding by default. // Use WinAnsiEncoding by default.
truefont.Encoding = core.MakeName("WinAnsiEncoding") truefont.Encoding = core.MakeName("WinAnsiEncoding")
@ -511,31 +513,38 @@ func loadStandard14Font(baseFont Standard14Font) (pdfFontSimple, bool) {
if !ok { if !ok {
return pdfFontSimple{}, false return pdfFontSimple{}, false
} }
descriptor := builtinDescriptor(string(baseFont)) descriptor := builtinDescriptor(string(baseFont))
if descriptor == nil { if descriptor == nil {
return pdfFontSimple{}, false return pdfFontSimple{}, false
} }
std.std14Descriptor = descriptor std.std14Descriptor = descriptor
se, ok := std.std14Encoder.(textencoding.SimpleEncoder)
if !ok {
common.Log.Debug("ERROR: Wrong encoder type: %T", std.std14Encoder)
}
codes := []int{}
for c := range se.CodeToGlyph {
codes = append(codes, int(c))
}
sort.Ints(codes)
std.firstChar = codes[0]
std.lastChar = codes[len(codes)-1]
std.charWidths = make([]float64, len(codes))
for i, code := range codes {
glyph := se.CodeToGlyph[uint16(code)]
std.charWidths[i] = std.fontMetrics[glyph].Wx
}
return std, true return std, true
} }
// updateStandard14Font rebuilds `font`.charWidths from `font`.fontMetrics using the encoder
// currently returned by font.Encoder(). Every character code in the encoder's CodeToGlyph table
// gets the Wx of its glyph; a glyph with no entry in fontMetrics ends up with a width of zero.
// If the encoder is not a *textencoding.SimpleEncoder an error is logged and charWidths is left
// unchanged.
func (font *pdfFontSimple) updateStandard14Font() {
	simple, isSimple := font.Encoder().(*textencoding.SimpleEncoder)
	if !isSimple {
		// This can't happen.
		common.Log.Error("Wrong encoder type: %T. font=%s.", font.Encoder(), font)
		return
	}

	// Gather every character code the encoder maps, then visit them in ascending order.
	ordered := make([]uint16, 0, len(simple.CodeToGlyph))
	for code := range simple.CodeToGlyph {
		ordered = append(ordered, code)
	}
	sort.Slice(ordered, func(a, b int) bool { return ordered[a] < ordered[b] })

	// Start from an empty table so that widths from any earlier table are discarded.
	widths := map[uint16]float64{}
	for _, code := range ordered {
		widths[code] = font.fontMetrics[simple.CodeToGlyph[code]].Wx
	}
	font.charWidths = widths
}
var standard14Fonts = map[Standard14Font]pdfFontSimple{ var standard14Fonts = map[Standard14Font]pdfFontSimple{
Courier: pdfFontSimple{ Courier: pdfFontSimple{
fontCommon: fontCommon{ fontCommon: fontCommon{