From b0c440dd0024a7fa273a4db34e9671027697c94d Mon Sep 17 00:00:00 2001 From: Peter Williams Date: Tue, 30 Oct 2018 21:55:30 +1100 Subject: [PATCH] Fixed text position tracking. --- pdf/extractor/point.go | 2 +- pdf/extractor/text.go | 66 +++++++++++++--------- pdf/model/TODO.md | 2 + pdf/model/font.go | 107 ++++++++++++++++++++++++++++++++---- pdf/model/font_composite.go | 20 +++++++ pdf/model/font_simple.go | 13 ++++- 6 files changed, 170 insertions(+), 40 deletions(-) diff --git a/pdf/extractor/point.go b/pdf/extractor/point.go index a973847f..fbcd9c55 100644 --- a/pdf/extractor/point.go +++ b/pdf/extractor/point.go @@ -49,5 +49,5 @@ func (p *Point) transformByMatrix(m contentstream.Matrix) { // String returns a string describing `p`. func (p *Point) String() string { - return fmt.Sprintf("(%.1f,%.1f)", p.X, p.Y) + return fmt.Sprintf("(%.2f,%.2f)", p.X, p.Y) } diff --git a/pdf/extractor/text.go b/pdf/extractor/text.go index 758755f4..e31f0533 100644 --- a/pdf/extractor/text.go +++ b/pdf/extractor/text.go @@ -9,6 +9,8 @@ import ( "errors" "fmt" "math" + "path/filepath" + "runtime" "sort" "strings" @@ -585,8 +587,10 @@ func newTextObject(e *Extractor, gs contentstream.GraphicsState, state *textStat func (to *textObject) renderText(data []byte) error { font := to.getCurrentFont() - text, numChars, numMisses := font.CharcodeBytesToUnicode(data) - runes := []rune(text) + charcodes := font.BytesToCharcodes(data) + + runes, numChars, numMisses := font.CharcodesToUnicode(charcodes) + to.State.numChars += numChars to.State.numMisses += numMisses @@ -598,7 +602,7 @@ func (to *textObject) renderText(data []byte) error { spaceMetrics, _ = model.DefaultFont().GetRuneCharMetrics(' ') } spaceWidth := spaceMetrics.Wx * glyphTextRatio - common.Log.Debug("spaceWidth=%.2f text=%q font=%s fontSize=%.1f", spaceWidth, text, + common.Log.Debug("spaceWidth=%.2f text=%q font=%s fontSize=%.1f", spaceWidth, runes, font, tfs) stateMatrix := contentstream.NewMatrix( @@ -606,7 +610,8 @@ func (to *textObject) renderText(data []byte) error { 0, tfs, 0, state.Trise) - for _, r := range runes { + for i, r := range runes { + code := charcodes[i] // The location of the text on the page in device coordinates is given by trm, the text // rendering matrix. trm := stateMatrix.Mult(to.Tm).Mult(to.gs.CTM) @@ -616,40 +621,33 @@ func (to *textObject) renderText(data []byte) error { // w is the unscaled movement at the end of a word. w := 0.0 - if r == ' ' { + if r == " " { w = state.Tw } - m, err := font.GetRuneCharMetrics(r) - if err != nil { - common.Log.Debug("ERROR: No metric for 0x%04x=%c %s", r, r, font) - return err + m, ok := font.GetCharMetrics(code) + if !ok { + common.Log.Debug("ERROR: No metric for code=%d r=0x%04x=%c %s", code, r, r, font) + return errors.New("no char metrics") } + // c is the character size in unscaled text units. c := Point{X: m.Wx * glyphTextRatio, Y: m.Wy * glyphTextRatio} - // cScaled is the character size - cScaled := Point{X: c.X * tfs * th} + // t is the displacement of the text cursor when the character is rendered. t := Point{X: (c.X*tfs + state.Tc + w) * th} - common.Log.Debug("t=%s cScaled=%s c=%s tfs=%.2f state.Tc=%.2f w=%.2f th=%.2f", - t.String(), cScaled.String(), c.String(), tfs, state.Tc, w, th) - - // td is t in matrix from + // td is t in matrix form. td := translationMatrix(t) - common.Log.Debug("displacement=%s t=%s td=%s m=%s", - c.String(), t.String(), td.String(), m.String()) nextTm := to.Tm.Mult(td) - common.Log.Debug(" next: td=%s %s->%s", td, to.Tm, nextTm) xyt := XYText{Text: string(r), Point: translation(trm), - End: translation(trm).Displace(cScaled), + End: translation(to.Tm.Mult(td).Mult(to.gs.CTM)), SpaceWidth: spaceWidth * trm.ScalingFactorX(), } to.Texts = append(to.Texts, xyt) - common.Log.Debug(" xyt=%s", xyt.String()) // update the text matrix by the displacement of the text location. to.Tm = nextTm @@ -738,6 +736,7 @@ func (tl *TextList) AppendText(gs contentstream.GraphicsState, p, e Point, text // ToText returns the contents of `tl` as a single string. func (tl *TextList) ToText() string { + tl.printTexts("ToText: before sorting") tl.SortPosition() lines := tl.toLines() @@ -770,7 +769,7 @@ type Line struct { // toLines return the text and positions in `tl` as a slice of Line. // NOTE: Caller must sort the text list top-to-bottom, left-to-write before calling this function. func (tl *TextList) toLines() []Line { - tl.printTexts() + tl.printTexts("toLines: before") if len(*tl) == 0 { return []Line{} } @@ -870,13 +869,28 @@ func (exp *ExponAve) update(x float64) float64 { } // printTexts is a debugging function. XXX Remove this. -func (tl *TextList) printTexts() { +func (tl *TextList) printTexts(message string) { return - common.Log.Error("=====================================") - common.Log.Error("%d texts", len(*tl)) - for i, t := range (*tl)[1:] { - fmt.Printf("%5d: %s\n", i, t.String()) + _, file, line, ok := runtime.Caller(1) + if !ok { + file = "???" + line = 0 + } else { + file = filepath.Base(file) } + prefix := fmt.Sprintf("[%s:%d]", file, line) + + common.Log.Error("=====================================") + common.Log.Error("printTexts %s %s", prefix, message) + common.Log.Error("%d texts", len(*tl)) + parts := []string{} + for i, t := range *tl { + fmt.Printf("%5d: %s\n", i, t.String()) + parts = append(parts, t.Text) + } + common.Log.Error("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~") + fmt.Printf("%s\n", strings.Join(parts, "")) + common.Log.Error("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^") } // newLine returns the Line representation of strings `words` with y coordinate `y` and x diff --git a/pdf/model/TODO.md b/pdf/model/TODO.md index 07b727b4..d479d6f7 100644 --- a/pdf/model/TODO.md +++ b/pdf/model/TODO.md @@ -3,3 +3,5 @@ Font Metrics 1 Leave char->unicode until end 2 Build metrics tables for charcodes +3 Remove double Font interface definition +4 Express CharcodeBytesToUnicode2 in terms of diff --git a/pdf/model/font.go b/pdf/model/font.go index 322fc808..6b406a69 100644 --- a/pdf/model/font.go +++ b/pdf/model/font.go @@ -18,13 +18,26 @@ import ( "github.com/unidoc/unidoc/pdf/model/textencoding" ) +// Font represents a font which is a series of glyphs. Character codes from PDF strings can be +// mapped to and from glyphs. Each glyph has metrics. +// XXX: FIXME (peterwilliams97) HACK to add GetCharMetrics() for fonts other than standard 14 +// Remove this hack. +type Font interface { + Encoder() textencoding.TextEncoder + SetEncoder(encoder textencoding.TextEncoder) + GetGlyphCharMetrics(glyph string) (fonts.CharMetrics, bool) + GetCharMetrics(code uint16) (fonts.CharMetrics, bool) + GetAverageCharWidth() float64 + ToPdfObject() core.PdfObject +} + // PdfFont represents an underlying font structure which can be of type: // - Type0 // - Type1 // - TrueType // etc. type PdfFont struct { - context fonts.Font // The underlying font: Type0, Type1, Truetype, etc.. + context Font // The underlying font: Type0, Type1, Truetype, etc.. } // String returns a string that describes `font`. @@ -303,12 +316,7 @@ func newPdfFontFromPdfObject(fontObj core.PdfObject, allowType0 bool) (*PdfFont, // conforming writers, instead of using a simple font, shall use a Type 0 font with an Identity-H // encoding and use the glyph indices as character codes, as described following Table 118. func (font PdfFont) CharcodeBytesToUnicode(data []byte) (string, int, int) { - _, out, numChars, numMisses := font.CharcodeBytesToUnicode2(data) - return out, numChars, numMisses -} - -func (font PdfFont) CharcodeBytesToUnicode2(data []byte) ([]uint16, string, int, int) { - common.Log.Trace("showText: data=[% 02x]=%#q", data, data) + common.Log.Trace("CharcodeBytesToUnicode: data=[% 02x]=%#q", data, data) charcodes := make([]uint16, 0, len(data)+len(data)%2) if font.baseFields().isCIDFont() { @@ -363,7 +371,74 @@ func (font PdfFont) CharcodeBytesToUnicode2(data []byte) ([]uint16, string, int, } out := strings.Join(charstrings, "") - return charcodes, out, len([]rune(out)), numMisses + return out, len([]rune(out)), numMisses +} + +// BytesToCharcodes converts the bytes in a PDF string to character codes. +func (font PdfFont) BytesToCharcodes(data []byte) []uint16 { + common.Log.Trace("BytesToCharcodes: data=[% 02x]=%#q", data, data) + charcodes := make([]uint16, 0, len(data)+len(data)%2) + if font.baseFields().isCIDFont() { + if len(data) == 1 { + data = []byte{0, data[0]} + } + if len(data)%2 != 0 { + common.Log.Debug("ERROR: Padding data=%+v to even length", data) + data = append(data, 0) + } + for i := 0; i < len(data); i += 2 { + b := uint16(data[i])<<8 | uint16(data[i+1]) + charcodes = append(charcodes, b) + } + } else { + for _, b := range data { + charcodes = append(charcodes, uint16(b)) + } + } + return charcodes +} + +// CharcodesToUnicode converts the character codes `charcodes` to a slice of unicode strings. +func (font PdfFont) CharcodesToUnicode(charcodes []uint16) ([]string, int, int) { + charstrings := make([]string, 0, len(charcodes)) + numMisses := 0 + for _, code := range charcodes { + if font.baseFields().toUnicodeCmap != nil { + r, ok := font.baseFields().toUnicodeCmap.CharcodeToUnicode(cmap.CharCode(code)) + if ok { + charstrings = append(charstrings, r) + continue + } + } + // Fall back to encoding + encoder := font.Encoder() + if encoder != nil { + r, ok := encoder.CharcodeToRune(code) + if ok { + charstrings = append(charstrings, textencoding.RuneToString(r)) + continue + } + } + common.Log.Debug("ERROR: No rune. code=0x%04x charcodes=[% 04x] CID=%t\n"+ + "\tfont=%s\n\tencoding=%s", + code, charcodes, font.baseFields().isCIDFont(), font, encoder) + numMisses++ + charstrings = append(charstrings, cmap.MissingCodeString) + + } + + if numMisses != 0 { + common.Log.Debug("ERROR: Couldn't convert to unicode. Using input.\n"+ + "\tnumChars=%d numMisses=%d\n"+ + "\tfont=%s", + len(charcodes), numMisses, font) + } + + if len(charcodes) != len(charstrings) { + panic(fmt.Errorf("charcodes=%d charstrings=%d", len(charcodes), len(charstrings))) + } + + return charstrings, len(charstrings), numMisses } // ToPdfObject converts the PdfFont object to its PDF representation. @@ -402,9 +477,21 @@ func (font PdfFont) GetGlyphCharMetrics(glyph string) (fonts.CharMetrics, bool) t := font.actualFont() if t == nil { common.Log.Debug("ERROR: GetGlyphCharMetrics Not implemented for font type=%#T", font.context) + return fonts.CharMetrics{GlyphName: glyph}, false + } + metrics, ok := t.GetGlyphCharMetrics(glyph) + return metrics, ok +} + +// GetCharMetrics returns the char metrics for character code `code`. +func (font PdfFont) GetCharMetrics(code uint16) (fonts.CharMetrics, bool) { + t := font.actualFont() + if t == nil { + common.Log.Debug("ERROR: GetCharMetrics Not implemented for font type=%#T", font.context) return fonts.CharMetrics{}, false } - return t.GetGlyphCharMetrics(glyph) + m, ok := t.GetCharMetrics(code) + return m, ok } // GetRuneCharMetrics returns the char metrics for rune `r`. @@ -438,7 +525,7 @@ func (font PdfFont) GetAverageCharWidth() float64 { } // actualFont returns the Font in font.context -func (font PdfFont) actualFont() fonts.Font { +func (font PdfFont) actualFont() Font { if font.context == nil { common.Log.Debug("ERROR: actualFont. context is nil. font=%s", font) } diff --git a/pdf/model/font_composite.go b/pdf/model/font_composite.go index e94854a2..fb0ef491 100644 --- a/pdf/model/font_composite.go +++ b/pdf/model/font_composite.go @@ -121,6 +121,15 @@ func (font pdfFontType0) GetGlyphCharMetrics(glyph string) (fonts.CharMetrics, b return font.DescendantFont.GetGlyphCharMetrics(glyph) } +// !@#$ stub +func (font pdfFontType0) GetCharMetrics(code uint16) (fonts.CharMetrics, bool) { + if font.DescendantFont == nil { + common.Log.Debug("ERROR: No descendant. font=%s", font) + return fonts.CharMetrics{}, false + } + return font.DescendantFont.GetCharMetrics(code) +} + // GetAverageCharWidth returns the average width of all the characters in `font`. func (font pdfFontType0) GetAverageCharWidth() float64 { if font.DescendantFont == nil { @@ -238,6 +247,11 @@ func (font pdfCIDFontType0) GetGlyphCharMetrics(glyph string) (fonts.CharMetrics return fonts.CharMetrics{}, true } +// !@#$ stub +func (font pdfCIDFontType0) GetCharMetrics(code uint16) (fonts.CharMetrics, bool) { + return fonts.CharMetrics{}, true +} + // GetAverageCharWidth returns the average width of all the characters in `font`. func (font pdfCIDFontType0) GetAverageCharWidth() float64 { return 0.0 @@ -347,6 +361,12 @@ func (font pdfCIDFontType2) GetGlyphCharMetrics(glyph string) (fonts.CharMetrics return metrics, true } +// !@#$ stub +func (font pdfCIDFontType2) GetCharMetrics(code uint16) (fonts.CharMetrics, bool) { + metrics := fonts.CharMetrics{} + return metrics, true +} + // GetAverageCharWidth returns the average width of all the characters in `font`. func (font pdfCIDFontType2) GetAverageCharWidth() float64 { if len(font.runeToWidthMap) == 0 { diff --git a/pdf/model/font_simple.go b/pdf/model/font_simple.go index dcfd2e5b..cf476b16 100644 --- a/pdf/model/font_simple.go +++ b/pdf/model/font_simple.go @@ -93,13 +93,20 @@ func (font pdfFontSimple) GetGlyphCharMetrics(glyph string) (fonts.CharMetrics, return metrics, ok } - metrics := fonts.CharMetrics{} - code, found := font.encoder.GlyphToCharcode(glyph) if !found { - return metrics, false + return fonts.CharMetrics{GlyphName: glyph}, false } + // !@#$ Shouldn't we fall back from GetCharMetrics to GetGlyphCharMetrics? + metrics, ok := font.GetCharMetrics(code) metrics.GlyphName = glyph + return metrics, ok +} + +// GetCharMetrics returns the character metrics for the specified character code. A bool flag is +// returned to indicate whether or not the entry was found in the glyph to charcode mapping. +func (font pdfFontSimple) GetCharMetrics(code uint16) (fonts.CharMetrics, bool) { + metrics := fonts.CharMetrics{} if int(code) < font.firstChar { common.Log.Debug("Code lower than firstchar (%d < %d)", code, font.firstChar)