From b0c440dd0024a7fa273a4db34e9671027697c94d Mon Sep 17 00:00:00 2001
From: Peter Williams <peter.williams@papercut.com>
Date: Tue, 30 Oct 2018 21:55:30 +1100
Subject: [PATCH] Fixed text position tracking.

---
 pdf/extractor/point.go      |   2 +-
 pdf/extractor/text.go       |  66 +++++++++++++---------
 pdf/model/TODO.md           |   2 +
 pdf/model/font.go           | 107 ++++++++++++++++++++++++++++++++----
 pdf/model/font_composite.go |  20 +++++++
 pdf/model/font_simple.go    |  13 ++++-
 6 files changed, 170 insertions(+), 40 deletions(-)

diff --git a/pdf/extractor/point.go b/pdf/extractor/point.go
index a973847f..fbcd9c55 100644
--- a/pdf/extractor/point.go
+++ b/pdf/extractor/point.go
@@ -49,5 +49,5 @@ func (p *Point) transformByMatrix(m contentstream.Matrix) {
 
 // String returns a string describing `p`.
 func (p *Point) String() string {
-	return fmt.Sprintf("(%.1f,%.1f)", p.X, p.Y)
+	return fmt.Sprintf("(%.2f,%.2f)", p.X, p.Y) // Two decimal places per coordinate.
 }
diff --git a/pdf/extractor/text.go b/pdf/extractor/text.go
index 758755f4..e31f0533 100644
--- a/pdf/extractor/text.go
+++ b/pdf/extractor/text.go
@@ -9,6 +9,8 @@ import (
 	"errors"
 	"fmt"
 	"math"
+	"path/filepath"
+	"runtime"
 	"sort"
 	"strings"
 
@@ -585,8 +587,10 @@ func newTextObject(e *Extractor, gs contentstream.GraphicsState, state *textStat
 func (to *textObject) renderText(data []byte) error {
 	font := to.getCurrentFont()
 
-	text, numChars, numMisses := font.CharcodeBytesToUnicode(data)
-	runes := []rune(text)
+	charcodes := font.BytesToCharcodes(data)
+
+	runes, numChars, numMisses := font.CharcodesToUnicode(charcodes)
+
 	to.State.numChars += numChars
 	to.State.numMisses += numMisses
 
@@ -598,7 +602,7 @@ func (to *textObject) renderText(data []byte) error {
 		spaceMetrics, _ = model.DefaultFont().GetRuneCharMetrics(' ')
 	}
 	spaceWidth := spaceMetrics.Wx * glyphTextRatio
-	common.Log.Debug("spaceWidth=%.2f text=%q font=%s fontSize=%.1f", spaceWidth, text,
+	common.Log.Debug("spaceWidth=%.2f text=%q font=%s fontSize=%.1f", spaceWidth, runes,
 		font, tfs)
 
 	stateMatrix := contentstream.NewMatrix(
@@ -606,7 +610,8 @@ func (to *textObject) renderText(data []byte) error {
 		0, tfs,
 		0, state.Trise)
 
-	for _, r := range runes {
+	for i, r := range runes {
+		code := charcodes[i]
 		// The location of the text on the page in device coordinates is given by trm, the text
 		// rendering matrix.
 		trm := stateMatrix.Mult(to.Tm).Mult(to.gs.CTM)
@@ -616,40 +621,33 @@ func (to *textObject) renderText(data []byte) error {
 
 		// w is the unscaled movement at the end of a word.
 		w := 0.0
-		if r == ' ' {
+		if r == " " {
 			w = state.Tw
 		}
 
-		m, err := font.GetRuneCharMetrics(r)
-		if err != nil {
-			common.Log.Debug("ERROR: No metric for 0x%04x=%c %s", r, r, font)
-			return err
+		m, ok := font.GetCharMetrics(code)
+		if !ok { // r is a string (not a rune) here, so it is logged with %q rather than %c.
+			common.Log.Debug("ERROR: No metric for code=%d r=%q %s", code, r, font)
+			return errors.New("no char metrics")
+		}
+
 		// c is the character size in unscaled text units.
 		c := Point{X: m.Wx * glyphTextRatio, Y: m.Wy * glyphTextRatio}
-		// cScaled is the character size
-		cScaled := Point{X: c.X * tfs * th}
+
 		// t is the displacement of the text cursor when the character is rendered.
 		t := Point{X: (c.X*tfs + state.Tc + w) * th}
 
-		common.Log.Debug("t=%s cScaled=%s c=%s tfs=%.2f state.Tc=%.2f w=%.2f th=%.2f",
-			t.String(), cScaled.String(), c.String(), tfs, state.Tc, w, th)
-
-		// td is t in matrix  from
+		// td is t in matrix form.
 		td := translationMatrix(t)
-		common.Log.Debug("displacement=%s t=%s td=%s m=%s",
-			c.String(), t.String(), td.String(), m.String())
 
 		nextTm := to.Tm.Mult(td)
-		common.Log.Debug("  next: td=%s %s->%s", td, to.Tm, nextTm)
 
 		xyt := XYText{Text: string(r),
 			Point:      translation(trm),
-			End:        translation(trm).Displace(cScaled),
+			End:        translation(to.Tm.Mult(td).Mult(to.gs.CTM)),
 			SpaceWidth: spaceWidth * trm.ScalingFactorX(),
 		}
 		to.Texts = append(to.Texts, xyt)
-		common.Log.Debug("  xyt=%s", xyt.String())
 
 		// update the text matrix by the displacement of the text location.
 		to.Tm = nextTm
@@ -738,6 +736,7 @@ func (tl *TextList) AppendText(gs contentstream.GraphicsState, p, e Point, text
 
 // ToText returns the contents of `tl` as a single string.
 func (tl *TextList) ToText() string {
+	tl.printTexts("ToText: before sorting")
 	tl.SortPosition()
 
 	lines := tl.toLines()
@@ -770,7 +769,7 @@ type Line struct {
 // toLines return the text and positions in `tl` as a slice of Line.
 // NOTE: Caller must sort the text list top-to-bottom, left-to-write before calling this function.
 func (tl *TextList) toLines() []Line {
-	tl.printTexts()
+	tl.printTexts("toLines: before")
 	if len(*tl) == 0 {
 		return []Line{}
 	}
@@ -870,13 +869,28 @@ func (exp *ExponAve) update(x float64) float64 {
 }
 
 // printTexts is a debugging function. XXX Remove this.
-func (tl *TextList) printTexts() {
+func (tl *TextList) printTexts(message string) {
 	return
-	common.Log.Error("=====================================")
-	common.Log.Error("%d texts", len(*tl))
-	for i, t := range (*tl)[1:] {
-		fmt.Printf("%5d: %s\n", i, t.String())
+	_, file, line, ok := runtime.Caller(1) // NOTE(review): unreachable, the return above exits first.
+	if !ok {
+		file = "???"
+		line = 0
+	} else {
+		file = filepath.Base(file)
 	}
+	prefix := fmt.Sprintf("[%s:%d]", file, line)
+	// Print each text with its index, then all the texts joined together.
+	common.Log.Error("=====================================")
+	common.Log.Error("printTexts %s %s", prefix, message)
+	common.Log.Error("%d texts", len(*tl))
+	parts := []string{}
+	for i, t := range *tl {
+		fmt.Printf("%5d: %s\n", i, t.String())
+		parts = append(parts, t.Text)
+	}
+	common.Log.Error("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
+	fmt.Printf("%s\n", strings.Join(parts, ""))
+	common.Log.Error("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^")
 }
 
 // newLine returns the Line representation of strings `words` with y coordinate `y` and x
diff --git a/pdf/model/TODO.md b/pdf/model/TODO.md
index 07b727b4..d479d6f7 100644
--- a/pdf/model/TODO.md
+++ b/pdf/model/TODO.md
@@ -3,3 +3,5 @@ Font Metrics
 
 1 Leave char->unicode until end
 2 Build metrics tables for charcodes
+3 Remove double Font interface definition
+4 Express CharcodeBytesToUnicode in terms of BytesToCharcodes and CharcodesToUnicode
diff --git a/pdf/model/font.go b/pdf/model/font.go
index 322fc808..6b406a69 100644
--- a/pdf/model/font.go
+++ b/pdf/model/font.go
@@ -18,13 +18,26 @@ import (
 	"github.com/unidoc/unidoc/pdf/model/textencoding"
 )
 
+// Font represents a font which is a series of glyphs. Character codes from PDF strings can be
+// mapped to and from glyphs. Each glyph has metrics.
+// XXX: FIXME (peterwilliams97) HACK to add GetCharMetrics() for fonts other than standard 14
+//      Remove this hack.
+type Font interface {
+	Encoder() textencoding.TextEncoder
+	SetEncoder(encoder textencoding.TextEncoder)
+	GetGlyphCharMetrics(glyph string) (fonts.CharMetrics, bool) // Metrics looked up by glyph name.
+	GetCharMetrics(code uint16) (fonts.CharMetrics, bool)       // Metrics looked up by character code.
+	GetAverageCharWidth() float64
+	ToPdfObject() core.PdfObject
+}
+
 // PdfFont represents an underlying font structure which can be of type:
 // - Type0
 // - Type1
 // - TrueType
 // etc.
 type PdfFont struct {
-	context fonts.Font // The underlying font: Type0, Type1, Truetype, etc..
+	context Font // The underlying font: Type0, Type1, TrueType, etc.
 }
 
 // String returns a string that describes `font`.
@@ -303,12 +316,7 @@ func newPdfFontFromPdfObject(fontObj core.PdfObject, allowType0 bool) (*PdfFont,
 //   conforming writers, instead of using a simple font, shall use a Type 0 font with an Identity-H
 //   encoding and use the glyph indices as character codes, as described following Table 118.
 func (font PdfFont) CharcodeBytesToUnicode(data []byte) (string, int, int) {
-	_, out, numChars, numMisses := font.CharcodeBytesToUnicode2(data)
-	return out, numChars, numMisses
-}
-
-func (font PdfFont) CharcodeBytesToUnicode2(data []byte) ([]uint16, string, int, int) {
-	common.Log.Trace("showText: data=[% 02x]=%#q", data, data)
+	common.Log.Trace("CharcodeBytesToUnicode: data=[% 02x]=%#q", data, data)
 
 	charcodes := make([]uint16, 0, len(data)+len(data)%2)
 	if font.baseFields().isCIDFont() {
@@ -363,7 +371,74 @@ func (font PdfFont) CharcodeBytesToUnicode2(data []byte) ([]uint16, string, int,
 	}
 
 	out := strings.Join(charstrings, "")
-	return charcodes, out, len([]rune(out)), numMisses
+	return out, len([]rune(out)), numMisses
+}
+
+// BytesToCharcodes converts the bytes in a PDF string to character codes: 2 bytes per code (high
+// byte first) for CID fonts, 1 byte per code otherwise. `data` itself is never modified.
+func (font PdfFont) BytesToCharcodes(data []byte) []uint16 {
+	common.Log.Trace("BytesToCharcodes: data=[% 02x]=%#q", data, data)
+	charcodes := make([]uint16, 0, len(data)+len(data)%2)
+	if font.baseFields().isCIDFont() {
+		if len(data) == 1 {
+			data = []byte{0, data[0]}
+		}
+		if len(data)%2 != 0 {
+			common.Log.Debug("ERROR: Padding data=%+v to even length", data)
+			data = append(data[:len(data):len(data)], 0) // Capped so the caller's array is untouched.
+		}
+		for i := 0; i < len(data); i += 2 {
+			charcodes = append(charcodes, uint16(data[i])<<8|uint16(data[i+1]))
+		}
+	} else {
+		for _, b := range data {
+			charcodes = append(charcodes, uint16(b))
+		}
+	}
+	return charcodes
+}
+
+// CharcodesToUnicode converts the character codes `charcodes` to a slice of unicode strings, one
+// per code. It also returns the number of strings and the number of codes that had no mapping.
+func (font PdfFont) CharcodesToUnicode(charcodes []uint16) ([]string, int, int) {
+	charstrings := make([]string, 0, len(charcodes))
+	numMisses := 0
+	for _, code := range charcodes {
+		if font.baseFields().toUnicodeCmap != nil { // Try the ToUnicode CMap first when there is one.
+			r, ok := font.baseFields().toUnicodeCmap.CharcodeToUnicode(cmap.CharCode(code))
+			if ok {
+				charstrings = append(charstrings, r)
+				continue
+			}
+		}
+		// Fall back to encoding
+		encoder := font.Encoder()
+		if encoder != nil {
+			r, ok := encoder.CharcodeToRune(code)
+			if ok {
+				charstrings = append(charstrings, textencoding.RuneToString(r))
+				continue
+			}
+		}
+		common.Log.Debug("ERROR: No rune. code=0x%04x charcodes=[% 04x] CID=%t\n"+
+			"\tfont=%s\n\tencoding=%s",
+			code, charcodes, font.baseFields().isCIDFont(), font, encoder)
+		numMisses++
+		charstrings = append(charstrings, cmap.MissingCodeString) // Placeholder keeps one entry per code.
+	}
+
+	if numMisses != 0 {
+		common.Log.Debug("ERROR: Couldn't convert to unicode. Using input.\n"+
+			"\tnumChars=%d numMisses=%d\n"+
+			"\tfont=%s",
+			len(charcodes), numMisses, font)
+	}
+	// Sanity check: exactly one string must have been produced per character code.
+	if len(charcodes) != len(charstrings) {
+		panic(fmt.Errorf("charcodes=%d charstrings=%d", len(charcodes), len(charstrings)))
+	}
+
+	return charstrings, len(charstrings), numMisses
 }
 
 // ToPdfObject converts the PdfFont object to its PDF representation.
@@ -402,9 +477,21 @@ func (font PdfFont) GetGlyphCharMetrics(glyph string) (fonts.CharMetrics, bool)
 	t := font.actualFont()
 	if t == nil {
 		common.Log.Debug("ERROR: GetGlyphCharMetrics Not implemented for font type=%#T", font.context)
+		return fonts.CharMetrics{GlyphName: glyph}, false
+	}
+	metrics, ok := t.GetGlyphCharMetrics(glyph)
+	return metrics, ok
+}
+
+// GetCharMetrics returns the char metrics for character code `code`, as reported by the actual font.
+func (font PdfFont) GetCharMetrics(code uint16) (fonts.CharMetrics, bool) {
+	t := font.actualFont()
+	if t == nil { // No underlying font to ask, so report "not found".
+		common.Log.Debug("ERROR: GetCharMetrics Not implemented for font type=%#T", font.context)
 		return fonts.CharMetrics{}, false
 	}
-	return t.GetGlyphCharMetrics(glyph)
+	m, ok := t.GetCharMetrics(code) // Delegate to the underlying font.
+	return m, ok
 }
 
 // GetRuneCharMetrics returns the char metrics for rune `r`.
@@ -438,7 +525,7 @@ func (font PdfFont) GetAverageCharWidth() float64 {
 }
 
 // actualFont returns the Font in font.context
-func (font PdfFont) actualFont() fonts.Font {
+func (font PdfFont) actualFont() Font {
 	if font.context == nil {
 		common.Log.Debug("ERROR: actualFont. context is nil. font=%s", font)
 	}
diff --git a/pdf/model/font_composite.go b/pdf/model/font_composite.go
index e94854a2..fb0ef491 100644
--- a/pdf/model/font_composite.go
+++ b/pdf/model/font_composite.go
@@ -121,6 +121,15 @@ func (font pdfFontType0) GetGlyphCharMetrics(glyph string) (fonts.CharMetrics, b
 	return font.DescendantFont.GetGlyphCharMetrics(glyph)
 }
 
+// GetCharMetrics returns the char metrics for character code `code` from the descendant font.
+func (font pdfFontType0) GetCharMetrics(code uint16) (fonts.CharMetrics, bool) {
+	if descendant := font.DescendantFont; descendant != nil {
+		return descendant.GetCharMetrics(code)
+	}
+	common.Log.Debug("ERROR: No descendant. font=%s", font)
+	return fonts.CharMetrics{}, false
+}
+
 // GetAverageCharWidth returns the average width of all the characters in `font`.
 func (font pdfFontType0) GetAverageCharWidth() float64 {
 	if font.DescendantFont == nil {
@@ -238,6 +247,11 @@ func (font pdfCIDFontType0) GetGlyphCharMetrics(glyph string) (fonts.CharMetrics
 	return fonts.CharMetrics{}, true
 }
 
+// GetCharMetrics is a stub. It ignores `code` and always returns empty metrics and true.
+func (font pdfCIDFontType0) GetCharMetrics(code uint16) (fonts.CharMetrics, bool) {
+	return fonts.CharMetrics{}, true
 }
+
 // GetAverageCharWidth returns the average width of all the characters in `font`.
 func (font pdfCIDFontType0) GetAverageCharWidth() float64 {
 	return 0.0
@@ -347,6 +361,12 @@ func (font pdfCIDFontType2) GetGlyphCharMetrics(glyph string) (fonts.CharMetrics
 	return metrics, true
 }
 
+// GetCharMetrics is a stub. It ignores `code` and always returns empty metrics and true.
+func (font pdfCIDFontType2) GetCharMetrics(code uint16) (fonts.CharMetrics, bool) {
+	metrics := fonts.CharMetrics{}
+	return metrics, true
+}
+
 // GetAverageCharWidth returns the average width of all the characters in `font`.
 func (font pdfCIDFontType2) GetAverageCharWidth() float64 {
 	if len(font.runeToWidthMap) == 0 {
diff --git a/pdf/model/font_simple.go b/pdf/model/font_simple.go
index dcfd2e5b..cf476b16 100644
--- a/pdf/model/font_simple.go
+++ b/pdf/model/font_simple.go
@@ -93,13 +93,20 @@ func (font pdfFontSimple) GetGlyphCharMetrics(glyph string) (fonts.CharMetrics,
 		return metrics, ok
 	}
 
-	metrics := fonts.CharMetrics{}
-
 	code, found := font.encoder.GlyphToCharcode(glyph)
 	if !found {
-		return metrics, false
+		return fonts.CharMetrics{GlyphName: glyph}, false
 	}
+	// !@#$ Shouldn't we fall back from GetCharMetrics to GetGlyphCharMetrics?
+	metrics, ok := font.GetCharMetrics(code)
 	metrics.GlyphName = glyph
+	return metrics, ok
+}
+
+// GetCharMetrics returns the character metrics for the specified character code. A bool flag is
+// returned to indicate whether or not metrics were found for that code.
+func (font pdfFontSimple) GetCharMetrics(code uint16) (fonts.CharMetrics, bool) {
+	metrics := fonts.CharMetrics{}
 
 	if int(code) < font.firstChar {
 		common.Log.Debug("Code lower than firstchar (%d < %d)", code, font.firstChar)