From f373881a48afd519e46bf20e1c8038f28aad7ecf Mon Sep 17 00:00:00 2001
From: Peter Williams <peter.williams@papercut.com>
Date: Tue, 27 Nov 2018 13:37:12 +1100
Subject: [PATCH 1/3] Removed some unused struct fields.

---
 pdf/extractor/text.go   | 119 +++++++++++++++++++++-------------------
 pdf/model/font.go       |  13 ++---
 pdf/model/fonts/font.go |   2 +-
 3 files changed, 69 insertions(+), 65 deletions(-)

diff --git a/pdf/extractor/text.go b/pdf/extractor/text.go
index f23a59a2..c6e99440 100644
--- a/pdf/extractor/text.go
+++ b/pdf/extractor/text.go
@@ -49,7 +49,7 @@ func (e *Extractor) ExtractXYText() (*TextList, int, int, error) {
 	cstreamParser := contentstream.NewContentStreamParser(e.contents)
 	operations, err := cstreamParser.Parse()
 	if err != nil {
-		common.Log.Debug("ExtractXYText: parse failed. err=%v", err)
+		common.Log.Debug("ERROR: ExtractXYText parse failed. err=%v", err)
 		return textList, state.numChars, state.numMisses, err
 	}
 
@@ -309,29 +309,27 @@ func (to *textObject) nextLine() {
 
 // setTextMatrix "Tm".
 // Set the text matrix, Tm, and the text line matrix, Tlm to the Matrix specified by the 6 numbers
-// in `f`  (page 250)
+// in `f` (page 250).
 func (to *textObject) setTextMatrix(f []float64) {
 	a, b, c, d, tx, ty := f[0], f[1], f[2], f[3], f[4], f[5]
 	to.Tm = contentstream.NewMatrix(a, b, c, d, tx, ty)
-	to.Tlm = contentstream.NewMatrix(a, b, c, d, tx, ty)
-	common.Log.Debug("setTextMatrix: Tm=%s", to.Tm)
+	to.Tlm = to.Tm
 }
 
-// showText "Tj" Show a text string.
+// showText "Tj". Show a text string.
 func (to *textObject) showText(charcodes []byte) error {
 	return to.renderText(charcodes)
 }
 
-// showTextAdjusted "TJ" Show text with adjustable spacing.
+// showTextAdjusted "TJ". Show text with adjustable spacing.
 func (to *textObject) showTextAdjusted(args *core.PdfObjectArray) error {
 	vertical := false
 	for _, o := range args.Elements() {
 		switch o.(type) {
 		case *core.PdfObjectFloat, *core.PdfObjectInteger:
-			// The following is supposed to be equivalent to the existing Unidoc implementation.
 			x, err := core.GetNumberAsFloat(o)
 			if err != nil {
-				common.Log.Debug("showTextAdjusted: Bad numerical arg. o=%s args=%+v", o, args)
+				common.Log.Debug("ERROR: showTextAdjusted. Bad numerical arg. o=%s args=%+v", o, args)
 				return err
 			}
 			dx, dy := -x*0.001*to.State.Tfs, 0.0
@@ -340,23 +338,23 @@ func (to *textObject) showTextAdjusted(args *core.PdfObjectArray) error {
 			}
 			td := translationMatrix(Point{X: dx, Y: dy})
 			to.Tm = td.Mult(to.Tm)
-			common.Log.Debug("showTextAdjusted: dx,dy=%3f,%.3f Tm=%s", dx, dy, to.Tm)
+			common.Log.Trace("showTextAdjusted: dx,dy=%3f,%.3f Tm=%s", dx, dy, to.Tm)
 		case *core.PdfObjectString:
 			charcodes, ok := core.GetStringBytes(o)
 			if !ok {
-				common.Log.Debug("showTextAdjusted: Bad string arg. o=%s args=%+v", o, args)
+				common.Log.Trace("showTextAdjusted: Bad string arg. o=%s args=%+v", o, args)
 				return core.ErrTypeError
 			}
 			to.renderText(charcodes)
 		default:
-			common.Log.Debug("showTextAdjusted. Unexpected type (%T) args=%+v", o, args)
+			common.Log.Debug("ERROR: showTextAdjusted. Unexpected type (%T) args=%+v", o, args)
 			return core.ErrTypeError
 		}
 	}
 	return nil
 }
 
-// setTextLeading "TL" Set text leading.
+// setTextLeading "TL". Set text leading.
 func (to *textObject) setTextLeading(y float64) {
 	if to == nil {
 		return
@@ -364,7 +362,7 @@ func (to *textObject) setTextLeading(y float64) {
 	to.State.Tl = y
 }
 
-// setCharSpacing "Tc" Set character spacing.
+// setCharSpacing "Tc". Set character spacing.
 func (to *textObject) setCharSpacing(x float64) {
 	if to == nil {
 		return
@@ -372,7 +370,7 @@ func (to *textObject) setCharSpacing(x float64) {
 	to.State.Tc = x
 }
 
-// setFont "Tf" Set font.
+// setFont "Tf". Set font.
 func (to *textObject) setFont(name string, size float64) error {
 	if to == nil {
 		return nil
@@ -395,7 +393,7 @@ func (to *textObject) setFont(name string, size float64) error {
 	return nil
 }
 
-// setTextRenderMode "Tr" Set text rendering mode.
+// setTextRenderMode "Tr". Set text rendering mode.
 func (to *textObject) setTextRenderMode(mode int) {
 	if to == nil {
 		return
@@ -403,7 +401,7 @@ func (to *textObject) setTextRenderMode(mode int) {
 	to.State.Tmode = RenderMode(mode)
 }
 
-// setTextRise "Ts" Set text rise.
+// setTextRise "Ts". Set text rise.
 func (to *textObject) setTextRise(y float64) {
 	if to == nil {
 		return
@@ -411,7 +409,7 @@ func (to *textObject) setTextRise(y float64) {
 	to.State.Trise = y
 }
 
-// setWordSpacing "Tw" Set word spacing.
+// setWordSpacing "Tw". Set word spacing.
 func (to *textObject) setWordSpacing(y float64) {
 	if to == nil {
 		return
@@ -419,7 +417,7 @@ func (to *textObject) setWordSpacing(y float64) {
 	to.State.Tw = y
 }
 
-// setHorizScaling "Tz" Set horizontal scaling.
+// setHorizScaling "Tz". Set horizontal scaling.
 func (to *textObject) setHorizScaling(y float64) {
 	if to == nil {
 		return
@@ -573,9 +571,6 @@ type textObject struct {
 	Tm        contentstream.Matrix // Text matrix. For the character pointer.
 	Tlm       contentstream.Matrix // Text line matrix. For the start of line pointer.
 	Texts     []XYText             // Text gets written here.
-
-	// These fields are used to implement existing UniDoc behaviour.
-	xPos, yPos float64
 }
 
 // newTextState returns a default textState.
@@ -625,8 +620,7 @@ func (to *textObject) renderText(data []byte) error {
 		0, tfs,
 		0, state.Trise)
 
-	common.Log.Debug("==========================================")
-	common.Log.Debug("%d codes=%+v runes=%q", len(charcodes), charcodes, runes)
+	common.Log.Trace("renderText: %d codes=%+v runes=%q", len(charcodes), charcodes, runes)
 
 	for i, r := range runes {
 
@@ -653,14 +647,13 @@ func (to *textObject) renderText(data []byte) error {
 		// c is the character size in unscaled text units.
 		c := Point{X: m.Wx * glyphTextRatio, Y: m.Wy * glyphTextRatio}
 
+		// t0 is the end of this character.
 		// t is the displacement of the text cursor when the character is rendered.
-		// float tx = displacementX * fontSize * horizontalScaling;
-		// w = 0
 		t0 := Point{X: (c.X*tfs + w) * th}
 		t := Point{X: (c.X*tfs + state.Tc + w) * th}
 
 		// td, td0 are t, t0 in matrix form.
-		// td0 is where this char ends. td is where the next char stats.
+		// td0 is where this character ends. td is where the next character starts.
 		td0 := translationMatrix(t0)
 		td := translationMatrix(t)
 
@@ -668,19 +661,16 @@ func (to *textObject) renderText(data []byte) error {
 		common.Log.Trace("tfs=%.3f th=%.3f Tc=%.3f w=%.3f (Tw=%.3f)", tfs, th, state.Tc, w, state.Tw)
 		common.Log.Trace("m=%s c=%+v t0=%+v td0=%s trm0=%s", m, c, t0, td0, td0.Mult(to.Tm).Mult(to.gs.CTM))
 
-		nextTm := td.Mult(to.Tm)
-		common.Log.Trace("nextTm=%s", nextTm)
-
-		xyt := newXYText(
+		xyt := to.newXYText(
 			string(r),
 			trm,
 			translation(td0.Mult(to.Tm).Mult(to.gs.CTM)),
 			spaceWidth*trm.ScalingFactorX())
-		common.Log.Trace("i=%d code=%d, xyt=%s", i, code, xyt)
+		common.Log.Trace("i=%d code=%d xyt=%s trm=%s", i, code, xyt, trm)
 		to.Texts = append(to.Texts, xyt)
 
 		// update the text matrix by the displacement of the text location.
-		to.Tm = nextTm
+		to.Tm = td.Mult(to.Tm)
 		common.Log.Trace("to.Tm=%s", to.Tm)
 	}
 
@@ -711,36 +701,36 @@ func (to *textObject) moveTo(tx, ty float64) {
 }
 
 // XYText represents text drawn on a page and its position in device coordinates.
+// All dimensions are in device coordinates.
 type XYText struct {
-	Trm              contentstream.Matrix
-	OrientedStart    Point          // Left of text in orientation where text is horizontal.
-	OrientedEnd      Point          // Right of text in orientation where text is horizontal.
-	ColorStroking    model.PdfColor // Colour that text is stroked with, if any.
-	ColorNonStroking model.PdfColor // Colour that text is filled with, if any.
-	Orient           int
-	Text             string
-	SpaceWidth       float64
-	Font             string
-	FontSize         float64
+	Text          string  // The text.
+	Orient        int     // The text orientation.
+	OrientedStart Point   // Left of text in orientation where text is horizontal.
+	OrientedEnd   Point   // Right of text in orientation where text is horizontal.
+	SpaceWidth    float64 // Best guess at the width of a space in the font the text was rendered with.
+	count         int64   // To help with reading debug logs.
 }
 
-func newXYText(text string, trm contentstream.Matrix, end Point, spaceWidth float64) XYText {
+// newXYText returns an XYText for text `text` rendered with text rendering matrix `trm` and end
+// of character device coordinates `end`. `spaceWidth` is our best guess at the width of a space in
+// the font the text is rendered in device coordinates.
+func (to *textObject) newXYText(text string, trm contentstream.Matrix, end Point, spaceWidth float64) XYText {
+	to.e.textCount++
 	theta := trm.Angle()
 	return XYText{
 		Text:          text,
-		Trm:           trm,
+		Orient:        theta,
 		OrientedStart: translation(trm).Rotate(theta),
 		OrientedEnd:   end.Rotate(theta),
-		Orient:        theta,
 		SpaceWidth:    spaceWidth,
+		count:         to.e.textCount,
 	}
 }
 
 // String returns a string describing `t`.
 func (t XYText) String() string {
-	return fmt.Sprintf("XYText{%s  %.1f |%d| [%.3f,%.3f] %q}", 
-		t.Trm.String(), t.Width(), t.Orient, t.OrientedStart.X, t.OrientedStart.Y,
-		truncate(t.Text, 100))
+	return fmt.Sprintf("XYText{@%03d [%.3f,%.3f] %.1f |%d| %q}",
+		t.count, t.OrientedStart.X, t.OrientedStart.Y, t.Width(), t.Orient, truncate(t.Text, 100))
 }
 
 // Width returns the width of `t`.Text in the text direction.
@@ -752,12 +742,12 @@ func (t XYText) Width() float64 {
 type TextList []XYText
 
 // Length returns the number of elements in `tl`.
-func (tl *TextList) Length() int {
-	return len(*tl)
+func (tl TextList) Length() int {
+	return len(tl)
 }
 
 // ToText returns the contents of `tl` as a single string.
-func (tl *TextList) ToText() string {
+func (tl TextList) ToText() string {
 	tl.printTexts("ToText: before sorting")
 	tl.SortPosition()
 
@@ -790,26 +780,31 @@ type Line struct {
 	Y     float64   // y position of line.
 	Dx    []float64 // x distance between successive words in line.
 	Text  string    // text in the line.
-	Words []string  // words in the line
+	Words []string  // words in the line.
 }
 
 // toLines returns the text and positions in `tl` as a slice of Line.
-// NOTE: Caller must sort the text list by top-to-bottom, left-to-write (for orientation adjusted so
+// NOTE: Caller must sort the text list top-to-bottom, left-to-right (for orientation adjusted so
 // that text is horizontal) before calling this function.
 func (tl TextList) toLines() []Line {
+	// We divide `tl` into slices which contain texts with the same orientation, extract the lines
+	// for each orientation then return the concatenation of these lines sorted by orientation.
 	tlOrient := map[int]TextList{}
 	for _, t := range tl {
 		tlOrient[t.Orient] = append(tlOrient[t.Orient], t)
 	}
 	lines := []Line{}
-	for _, o := range []int{0, 90, 180, 270} {
+	for _, o := range orientKeys(tlOrient) {
 		lines = append(lines, tlOrient[o].toLinesOrient()...)
 	}
 	return lines
 }
 
 // toLinesOrient returns the text and positions in `tl` as a slice of Line.
-// NOTE: Caller must sort the text list top-to-bottom, left-to-write before calling this function.
+// NOTE: This function only works on text lists where all text is the same orientation so it should
+// only be called from toLines.
+// Caller must sort the text list top-to-bottom, left-to-right (for orientation adjusted so
+// that text is horizontal) before calling this function.
 func (tl TextList) toLinesOrient() []Line {
 	tl.printTexts("toLines: before")
 	if len(tl) == 0 {
@@ -824,7 +819,7 @@ func (tl TextList) toLinesOrient() []Line {
 
 	averageCharWidth := ExponAve{}
 	wordSpacing := ExponAve{}
-	lastEndX := 0.0 // tl[i-1].End.X
+	lastEndX := 0.0 // lastEndX is tl[i-1].OrientedEnd.X
 
 	for _, t := range tl {
 		if t.OrientedStart.Y < y {
@@ -865,7 +860,7 @@ func (tl TextList) toLinesOrient() []Line {
 		common.Log.Trace("width=%.2f delta=%.2f deltaSpace=%.2g deltaCharWidth=%.2g",
 			t.Width(), min(deltaSpace, deltaCharWidth), deltaSpace, deltaCharWidth)
 		common.Log.Trace("%+q [%.1f, %.1f] lastEndX=%.2f nextWordX=%.2f (%.2f) isSpace=%t",
-			t.Text, t.OrientedStart.X, t.OrientedStart.Y, lastEndX, nextWordX, 
+			t.Text, t.OrientedStart.X, t.OrientedStart.Y, lastEndX, nextWordX,
 			nextWordX-t.OrientedStart.X, isSpace)
 
 		if isSpace {
@@ -890,6 +885,16 @@ func (tl TextList) toLinesOrient() []Line {
 	return lines
 }
 
+// orientKeys returns the keys of `tlOrient` as a sorted slice.
+func orientKeys(tlOrient map[int]TextList) []int {
+	keys := []int{}
+	for k := range tlOrient {
+		keys = append(keys, k)
+	}
+	sort.Ints(keys)
+	return keys
+}
+
 // min returns the lesser of `a` and `b`.
 func min(a, b float64) float64 {
 	if a < b {
diff --git a/pdf/model/font.go b/pdf/model/font.go
index 6c6e51d0..d8a745e7 100644
--- a/pdf/model/font.go
+++ b/pdf/model/font.go
@@ -27,7 +27,7 @@ type Font interface {
 	SetEncoder(encoder textencoding.TextEncoder)
 	GetGlyphCharMetrics(glyph string) (fonts.CharMetrics, bool)
 	GetCharMetrics(code uint16) (fonts.CharMetrics, bool)
-	GetAverageCharWidth() float64
+	GetAverageCharWidth() float64 // XXX(peterwilliams97) Not used. Remove.
 	ToPdfObject() core.PdfObject
 }
 
@@ -52,7 +52,7 @@ func (font PdfFont) GetFontDescriptor() (*PdfFontDescriptor, error) {
 	case *pdfCIDFontType2:
 		return t.fontDescriptor, nil
 	}
-	common.Log.Debug("ERROR: Cannot get font descriptor for font type %t (%s)", font, font)
+	common.Log.Debug("ERROR: Cannot get font descriptor for font type %T (%s)", font, font)
 	return nil, errors.New("font descriptor not found")
 }
 
@@ -63,7 +63,6 @@ func (font PdfFont) String() string {
 		enc = font.context.Encoder().String()
 	}
 	return fmt.Sprintf("FONT{%T %s %s}", font.context, font.baseFields().coreString(), enc)
-
 }
 
 // BaseFont returns the font's "BaseFont" field.
@@ -210,7 +209,7 @@ func NewStandard14FontWithEncoding(basefont Standard14Font, alphabet map[rune]in
 	return &PdfFont{context: &std}, encoder, nil
 }
 
-// GetAlphabet returns a map of the runes in `text`.
+// GetAlphabet returns a map of the runes in `text` and their frequencies.
 func GetAlphabet(text string) map[rune]int {
 	alphabet := map[rune]int{}
 	for _, r := range text {
@@ -473,7 +472,7 @@ func (font PdfFont) ToPdfObject() core.PdfObject {
 	if t := font.actualFont(); t != nil {
 		return t.ToPdfObject()
 	}
-	common.Log.Debug("ERROR: ToPdfObject Not implemented for font type=%#T. Returning null object",
+	common.Log.Debug("ERROR: ToPdfObject Not implemented for font type=%#T. Returning null object.",
 		font.context)
 	return core.MakeNull()
 }
@@ -576,7 +575,7 @@ func (font PdfFont) actualFont() Font {
 	case *pdfCIDFontType2:
 		return t
 	default:
-		common.Log.Debug("ERROR: actualFont. Unknown font type %t. font=%s", t, font)
+		common.Log.Debug("ERROR: actualFont. Unknown font type %T. font=%s", t, font)
 		return nil
 	}
 }
@@ -597,7 +596,7 @@ func (font PdfFont) baseFields() *fontCommon {
 	case *pdfCIDFontType2:
 		return t.baseFields()
 	default:
-		common.Log.Debug("ERROR: base. Unknown font type %t. font=%s", t, font.String())
+		common.Log.Debug("ERROR: base. Unknown font type %T. font=%s", t, font.String())
 		return nil
 	}
 }
diff --git a/pdf/model/fonts/font.go b/pdf/model/fonts/font.go
index 34755e7b..8b7b3cf5 100644
--- a/pdf/model/fonts/font.go
+++ b/pdf/model/fonts/font.go
@@ -18,7 +18,7 @@ type Font interface {
 	Encoder() textencoding.TextEncoder
 	SetEncoder(encoder textencoding.TextEncoder)
 	GetGlyphCharMetrics(glyph string) (CharMetrics, bool)
-	GetAverageCharWidth() float64
+	GetAverageCharWidth() float64 // XXX(peterwilliams97) Not used. Remove.
 	ToPdfObject() core.PdfObject
 }
 

From 36a1148962ceaad33c523e70b9192d6f4e733619 Mon Sep 17 00:00:00 2001
From: Peter Williams <peter.williams@papercut.com>
Date: Wed, 28 Nov 2018 18:06:03 +1100
Subject: [PATCH 2/3] Combine diacritics in text extraction.

---
 pdf/extractor/extractor.go |   3 +
 pdf/extractor/text.go      | 184 ++++++++++++++++++++++++++++++++++++-
 pdf/extractor/text_test.go |  16 +++-
 pdf/model/font_simple.go   |   4 +-
 4 files changed, 198 insertions(+), 9 deletions(-)

diff --git a/pdf/extractor/extractor.go b/pdf/extractor/extractor.go
index 12c2488e..dad88a7c 100644
--- a/pdf/extractor/extractor.go
+++ b/pdf/extractor/extractor.go
@@ -18,6 +18,9 @@ type Extractor struct {
 
 	// accessCount is used to set fontEntry.access to an incrementing number.
 	accessCount int64
+
+	// textCount is an incrementing number used to identify XYText objects.
+	textCount int64
 }
 
 // New returns an Extractor instance for extracting content from the input PDF page.
diff --git a/pdf/extractor/text.go b/pdf/extractor/text.go
index c6e99440..d6a47fc9 100644
--- a/pdf/extractor/text.go
+++ b/pdf/extractor/text.go
@@ -13,11 +13,13 @@ import (
 	"runtime"
 	"sort"
 	"strings"
+	"unicode"
 
 	"github.com/unidoc/unidoc/common"
 	"github.com/unidoc/unidoc/pdf/contentstream"
 	"github.com/unidoc/unidoc/pdf/core"
 	"github.com/unidoc/unidoc/pdf/model"
+	"golang.org/x/text/unicode/norm"
 )
 
 // ExtractText processes and extracts all text data in content streams and returns as a string.
@@ -601,6 +603,9 @@ func (to *textObject) renderText(data []byte) error {
 	charcodes := font.BytesToCharcodes(data)
 
 	runes, numChars, numMisses := font.CharcodesToUnicode(charcodes)
+	if numMisses > 0 {
+		common.Log.Debug("renderText: numChars=%d numMisses=%d", numChars, numMisses)
+	}
 
 	to.State.numChars += numChars
 	to.State.numMisses += numMisses
@@ -624,6 +629,11 @@ func (to *textObject) renderText(data []byte) error {
 
 	for i, r := range runes {
 
+		// XXX(peterwilliams97) Need to find and fix cases where this happens.
+		if r == "\x00" {
+			continue
+		}
+
 		code := charcodes[i]
 		// The location of the text on the page in device coordinates is given by trm, the text
 		// rendering matrix.
@@ -665,6 +675,7 @@ func (to *textObject) renderText(data []byte) error {
 			string(r),
 			trm,
 			translation(td0.Mult(to.Tm).Mult(to.gs.CTM)),
+			1.0*trm.ScalingFactorY(),
 			spaceWidth*trm.ScalingFactorX())
 		common.Log.Trace("i=%d code=%d xyt=%s trm=%s", i, code, xyt, trm)
 		to.Texts = append(to.Texts, xyt)
@@ -707,6 +718,7 @@ type XYText struct {
 	Orient        int     // The text orientation.
 	OrientedStart Point   // Left of text in orientation where text is horizontal.
 	OrientedEnd   Point   // Right of text in orientation where text is horizontal.
+	Height        float64 // Text height.
 	SpaceWidth    float64 // Best guess at the width of a space in the font the text was rendered with.
 	count         int64   // To help with reading debug logs.
 }
@@ -714,14 +726,22 @@ type XYText struct {
 // newXYText returns an XYText for text `text` rendered with text rendering matrix `trm` and end
 // of character device coordinates `end`. `spaceWidth` is our best guess at the width of a space in
 // the font the text is rendered in device coordinates.
-func (to *textObject) newXYText(text string, trm contentstream.Matrix, end Point, spaceWidth float64) XYText {
+func (to *textObject) newXYText(text string, trm contentstream.Matrix, end Point,
+	height, spaceWidth float64) XYText {
 	to.e.textCount++
 	theta := trm.Angle()
+	if theta%180 == 0 {
+		height = trm.ScalingFactorY()
+	} else {
+		height = trm.ScalingFactorX()
+	}
+
 	return XYText{
 		Text:          text,
 		Orient:        theta,
 		OrientedStart: translation(trm).Rotate(theta),
 		OrientedEnd:   end.Rotate(theta),
+		Height:        height,
 		SpaceWidth:    spaceWidth,
 		count:         to.e.textCount,
 	}
@@ -729,7 +749,7 @@ func (to *textObject) newXYText(text string, trm contentstream.Matrix, end Point
 
 // String returns a string describing `t`.
 func (t XYText) String() string {
-	return fmt.Sprintf("XYText{@%03d [%.3f,%.3f] %.1f |%d| %q}",
+	return fmt.Sprintf("XYText{@%03d [%.3f,%.3f] %.1f %d° %q}",
 		t.count, t.OrientedStart.X, t.OrientedStart.Y, t.Width(), t.Orient, truncate(t.Text, 100))
 }
 
@@ -746,9 +766,21 @@ func (tl TextList) Length() int {
 	return len(tl)
 }
 
+// height returns the max height of the elements in `tl`.
+func (tl TextList) height() float64 {
+	fontHeight := 0.0
+	for _, t := range tl {
+		if t.Height > fontHeight {
+			fontHeight = t.Height
+		}
+	}
+	return fontHeight
+}
+
 // ToText returns the contents of `tl` as a single string.
 func (tl TextList) ToText() string {
 	tl.printTexts("ToText: before sorting")
+
 	tl.SortPosition()
 
 	lines := tl.toLines()
@@ -763,12 +795,16 @@ func (tl TextList) ToText() string {
 // Sorting is by orientation then top to bottom, left to right when page is orientated so that text
 // is horizontal.
 func (tl *TextList) SortPosition() {
+	fontHeight := tl.height()
+	// We sort with a y tolerance to allow for subscripts, diacritics etc.
+	tol := min(fontHeight*0.2, 5.0)
+	common.Log.Trace("SortPosition: fontHeight=%.1f tol=%.1f", fontHeight, tol)
 	sort.SliceStable(*tl, func(i, j int) bool {
 		ti, tj := (*tl)[i], (*tl)[j]
 		if ti.Orient != tj.Orient {
 			return ti.Orient < tj.Orient
 		}
-		if ti.OrientedStart.Y != tj.OrientedStart.Y {
+		if math.Abs(ti.OrientedStart.Y-tj.OrientedStart.Y) > tol {
 			return ti.OrientedStart.Y > tj.OrientedStart.Y
 		}
 		return ti.OrientedStart.X < tj.OrientedStart.X
@@ -826,6 +862,7 @@ func (tl TextList) toLinesOrient() []Line {
 			if len(words) > 0 {
 				line := newLine(y, x, words)
 				if averageCharWidth.running {
+					line = combineDiacritics(line, averageCharWidth.ave)
 					line = removeDuplicates(line, averageCharWidth.ave)
 				}
 				lines = append(lines, line)
@@ -978,6 +1015,147 @@ func removeDuplicates(line Line, charWidth float64) Line {
 	return Line{Y: line.Y, Dx: dxList, Text: strings.Join(words, ""), Words: words}
 }
 
+// combineDiacritics returns `line` with diacritics close to characters combined with the characters.
+// `charWidth` is the average character width for the line.
+// We have to do this because PDF can render diacritics separately to the characters they attach to
+// in extracted text.
+func combineDiacritics(line Line, charWidth float64) Line {
+	if len(line.Dx) == 0 {
+		return line
+	}
+
+	tol := charWidth * 0.2
+	common.Log.Trace("combineDiacritics: charWidth=%.2f tol=%.2f", charWidth, tol)
+
+	words := []string{}
+	dxList := []float64{}
+	w := line.Words[0]
+	w, c := countDiacritic(w)
+	delta := 0.0
+	dx0 := 0.0
+	parts := []string{w}
+	numChars := c
+
+	for i := 0; i < len(line.Dx); i++ {
+		w = line.Words[i+1]
+		w, c := countDiacritic(w)
+		dx := line.Dx[i]
+		if numChars+c <= 1 && delta+dx <= tol {
+			if len(parts) == 0 {
+				dx0 = dx
+			} else {
+				delta += dx
+			}
+			parts = append(parts, w)
+			numChars += c
+		} else {
+			if len(parts) > 0 {
+				if len(words) > 0 {
+					dxList = append(dxList, dx0)
+				}
+				words = append(words, combine(parts))
+			}
+			parts = []string{w}
+			numChars = c
+			dx0 = dx
+			delta = 0.0
+		}
+	}
+	if len(parts) > 0 {
+		if len(words) > 0 {
+			dxList = append(dxList, dx0)
+		}
+		words = append(words, combine(parts))
+	}
+
+	if len(words) != len(dxList)+1 {
+		common.Log.Error("Inconsistent: \nwords=%d %q\ndxList=%d %.2f",
+			len(words), words, len(dxList), dxList)
+		return line
+	}
+	return Line{Y: line.Y, Dx: dxList, Text: strings.Join(words, ""), Words: words}
+}
+
+// combine combines any diacritics in `parts` with the single non-diacritic character in `parts`.
+func combine(parts []string) string {
+	if len(parts) == 1 {
+		// Must be a non-diacritic.
+		return parts[0]
+	}
+
+	// We need to put the non-diacritic before the diacritics for NFKC normalization to work.
+	diacritic := map[string]bool{}
+	for _, w := range parts {
+		r := []rune(w)[0]
+		diacritic[w] = unicode.Is(unicode.Mn, r) || unicode.Is(unicode.Sk, r)
+	}
+	sort.SliceStable(parts, func(i, j int) bool { return !diacritic[parts[i]] && diacritic[parts[j]] })
+
+	// Construct the NFKC-normalized concatenation of the diacritics and the non-diacritic.
+	for i, w := range parts {
+		parts[i] = strings.TrimSpace(norm.NFKC.String(w))
+	}
+	return strings.Join(parts, "")
+}
+
+// countDiacritic returns the combining diacritic version of `w` (usually itself) and the number of
+// non-diacritics in `w` (0 or 1).
+func countDiacritic(w string) (string, int) {
+	runes := []rune(w)
+	if len(runes) != 1 {
+		return w, 1
+	}
+	r := runes[0]
+	if w2, ok := diacritics[r]; ok {
+		w = w2
+	}
+	c := 1
+	if unicode.Is(unicode.Mn, r) || unicode.Is(unicode.Sk, r) {
+		c = 0
+	}
+	if w2, ok := diacritics[r]; ok {
+		c = 0
+		w = w2
+	}
+	return w, c
+}
+
+// diacritics is a map of diacritic characters that are not classified as unicode.Mn or unicode.Sk
+// and the corresponding unicode.Mn or unicode.Sk characters. This map was copied from PdfBox.
+var diacritics = map[rune]string{
+	0x0060: "\u0300",
+	0x02CB: "\u0300",
+	0x0027: "\u0301",
+	0x02B9: "\u0301",
+	0x02CA: "\u0301",
+	0x005e: "\u0302",
+	0x02C6: "\u0302",
+	0x007E: "\u0303",
+	0x02C9: "\u0304",
+	0x00B0: "\u030A",
+	0x02BA: "\u030B",
+	0x02C7: "\u030C",
+	0x02C8: "\u030D",
+	0x0022: "\u030E",
+	0x02BB: "\u0312",
+	0x02BC: "\u0313",
+	0x0486: "\u0313",
+	0x055A: "\u0313",
+	0x02BD: "\u0314",
+	0x0485: "\u0314",
+	0x0559: "\u0314",
+	0x02D4: "\u031D",
+	0x02D5: "\u031E",
+	0x02D6: "\u031F",
+	0x02D7: "\u0320",
+	0x02B2: "\u0321",
+	0x02CC: "\u0329",
+	0x02B7: "\u032B",
+	0x02CD: "\u0331",
+	0x005F: "\u0332",
+	0x204E: "\u0359",
+}
+
 // getCurrentFont returns the font on top of the font stack, or DefaultFont if the font stack is
 // empty.
 func (to *textObject) getCurrentFont() *model.PdfFont {
diff --git a/pdf/extractor/text_test.go b/pdf/extractor/text_test.go
index d9f84b46..c3a4fb7a 100644
--- a/pdf/extractor/text_test.go
+++ b/pdf/extractor/text_test.go
@@ -17,6 +17,7 @@ import (
 
 	"github.com/unidoc/unidoc/common"
 	"github.com/unidoc/unidoc/pdf/model"
+	"golang.org/x/text/unicode/norm"
 )
 
 // XXX(peterwilliams97) NOTE: We do a best effort at finding the PDF file because we don't keep PDF
@@ -187,8 +188,10 @@ var extract2Tests = []struct {
 	},
 	{filename: "Ito_Formula.pdf",
 		expectedPageText: map[int][]string{
-			// 1: []string{"In the Itô stochastic calculus"},
-			1: []string{"In standard, non-stochastic calculus, one computes a derivative"},
+			1: []string{
+				"In the Itô stochastic calculus",
+				"In standard, non-stochastic calculus, one computes a derivative"},
+			2: []string{"Financial Economics Itô’s Formula"},
 		},
 	},
 	{filename: "circ2.pdf",
@@ -206,6 +209,11 @@ var extract2Tests = []struct {
 			1: []string{"entropy of a system of n identical resonators in a stationary radiation field"},
 		},
 	},
+	{filename: "thanh.pdf",
+		expectedPageText: map[int][]string{
+			1: []string{"Hàn Thé̂ Thành"},
+		},
+	},
 }
 
 // testExtract2 tests the ExtractText2 text extractor on `filename` and compares the extracted
@@ -229,6 +237,7 @@ func testExtract2(t *testing.T, filename string, expectedPageText map[int][]stri
 		if !ok {
 			t.Fatalf("%q doesn't have page %d", filename, pageNum)
 		}
+		actualText = norm.NFKC.String(actualText)
 		if !containsSentences(t, expectedSentences, actualText) {
 			t.Fatalf("Text mismatch filename=%q page=%d", path, pageNum)
 		}
@@ -276,8 +285,9 @@ func extractPageTexts(t *testing.T, filename string) (int, map[int]string) {
 // containsSentences returns true if all strings `expectedSentences` are contained in `actualText`.
 func containsSentences(t *testing.T, expectedSentences []string, actualText string) bool {
 	for _, e := range expectedSentences {
+		e = norm.NFKC.String(e)
 		if !strings.Contains(actualText, e) {
-			t.Errorf("No match for %#q", e)
+			t.Errorf("No match for %q", e)
 			return false
 		}
 	}
diff --git a/pdf/model/font_simple.go b/pdf/model/font_simple.go
index 5c509714..8e570c1a 100644
--- a/pdf/model/font_simple.go
+++ b/pdf/model/font_simple.go
@@ -131,7 +131,6 @@ func (font pdfFontSimple) GetGlyphCharMetrics(glyph string) (fonts.CharMetrics,
 // returned to indicate whether or not the entry was found in the glyph to charcode mapping.
 func (font pdfFontSimple) GetCharMetrics(code uint16) (fonts.CharMetrics, bool) {
 	if width, ok := font.charWidths[code]; ok {
-		common.Log.Debug("GetCharMetrics 1: code=%d width=%.1f font=%s", code, width, font)
 		return fonts.CharMetrics{Wx: width}, true
 	}
 	if isBuiltin(Standard14Font(font.basefont)) {
@@ -142,12 +141,11 @@ func (font pdfFontSimple) GetCharMetrics(code uint16) (fonts.CharMetrics, bool)
 		if glyph, ok := font.encoder.CharcodeToGlyph(code); ok {
 			if metrics, ok := font.fontMetrics[glyph]; ok {
 				font.charWidths[code] = metrics.Wx
-				common.Log.Debug("GetCharMetrics 2: code=%d glyph=%q width=%.1f", code, glyph, metrics.Wx)
 				return metrics, true
 			}
 		}
 	}
-	common.Log.Debug("GetCharMetrics 3: code=%d font=%s", code, font)
+	common.Log.Debug("GetCharMetrics: No match for code=%d font=%s", code, font)
 	return fonts.CharMetrics{}, false
 }
 

From 6529b42a704857e3f4da52696d431eba26d16950 Mon Sep 17 00:00:00 2001
From: Peter Williams <peter.williams@papercut.com>
Date: Wed, 28 Nov 2018 18:22:42 +1100
Subject: [PATCH 3/3] Remove duplicate code.

---
 pdf/extractor/text.go | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/pdf/extractor/text.go b/pdf/extractor/text.go
index d6a47fc9..523bd1f3 100644
--- a/pdf/extractor/text.go
+++ b/pdf/extractor/text.go
@@ -1106,9 +1106,6 @@ func countDiacritic(w string) (string, int) {
 		return w, 1
 	}
 	r := runes[0]
-	if w2, ok := diacritics[r]; ok {
-		w = w2
-	}
 	c := 1
 	if unicode.Is(unicode.Mn, r) || unicode.Is(unicode.Sk, r) {
 		c = 0