Merge branch 'extract.text' of https://github.com/peterwilliams97/unidoc into v3-peterwilliams97-extract.text

2025-04-27 13:48:51 +08:00 · 2018-11-28 23:33:31 +00:00 · 2018-11-28 23:33:31 +00:00 · f04f83b271
commit f04f83b271
parent d29f9a6a34 6529b42a70
5 changed files with 268 additions and 70 deletions
--- a/pdf/extractor/extractor.go
+++ b/pdf/extractor/extractor.go
@ -18,6 +18,9 @@ type Extractor struct {

 	// accessCount is used to set fontEntry.access to an incrementing number.
 	accessCount int64
+
+	// textCount is an incrementing number used to identify XYText objects.
+	textCount int64
 }

 // New returns an Extractor instance for extracting content from the input PDF page.
--- a/pdf/extractor/text.go
+++ b/pdf/extractor/text.go
@ -13,11 +13,13 @@ import (
 	"runtime"
 	"sort"
 	"strings"
+	"unicode"

 	"github.com/unidoc/unidoc/common"
 	"github.com/unidoc/unidoc/pdf/contentstream"
 	"github.com/unidoc/unidoc/pdf/core"
 	"github.com/unidoc/unidoc/pdf/model"
+	"golang.org/x/text/unicode/norm"
 )

 // ExtractText processes and extracts all text data in content streams and returns as a string.
@ -49,7 +51,7 @@ func (e *Extractor) ExtractXYText() (*TextList, int, int, error) {
 	cstreamParser := contentstream.NewContentStreamParser(e.contents)
 	operations, err := cstreamParser.Parse()
 	if err != nil {
-		common.Log.Debug("ExtractXYText: parse failed. err=%v", err)
+		common.Log.Debug("ERROR: ExtractXYText parse failed. err=%v", err)
 		return textList, state.numChars, state.numMisses, err
 	}

@ -309,7 +311,7 @@ func (to *textObject) nextLine() {

 // setTextMatrix "Tm".
 // Set the text matrix, Tm, and the text line matrix, Tlm to the Matrix specified by the 6 numbers
-// in `f`  (page 250)
+// in `f` (page 250).
 func (to *textObject) setTextMatrix(f []float64) {
 	if len(f) != 6 {
 		common.Log.Debug("ERROR: len(f) != 6 (%d)", len(f))
@ -317,25 +319,23 @@ func (to *textObject) setTextMatrix(f []float64) {
 	}
 	a, b, c, d, tx, ty := f[0], f[1], f[2], f[3], f[4], f[5]
 	to.Tm = contentstream.NewMatrix(a, b, c, d, tx, ty)
-	to.Tlm = contentstream.NewMatrix(a, b, c, d, tx, ty)
-	common.Log.Debug("setTextMatrix: Tm=%s", to.Tm)
+	to.Tlm = to.Tm
 }

-// showText "Tj" Show a text string.
+// showText "Tj". Show a text string.
 func (to *textObject) showText(charcodes []byte) error {
 	return to.renderText(charcodes)
 }

-// showTextAdjusted "TJ" Show text with adjustable spacing.
+// showTextAdjusted "TJ". Show text with adjustable spacing.
 func (to *textObject) showTextAdjusted(args *core.PdfObjectArray) error {
 	vertical := false
 	for _, o := range args.Elements() {
 		switch o.(type) {
 		case *core.PdfObjectFloat, *core.PdfObjectInteger:
-			// The following is supposed to be equivalent to the existing Unidoc implementation.
 			x, err := core.GetNumberAsFloat(o)
 			if err != nil {
-				common.Log.Debug("showTextAdjusted: Bad numerical arg. o=%s args=%+v", o, args)
+				common.Log.Debug("ERROR: showTextAdjusted. Bad numerical arg. o=%s args=%+v", o, args)
 				return err
 			}
 			dx, dy := -x*0.001*to.State.Tfs, 0.0
@ -344,23 +344,23 @@ func (to *textObject) showTextAdjusted(args *core.PdfObjectArray) error {
 			}
 			td := translationMatrix(Point{X: dx, Y: dy})
 			to.Tm = td.Mult(to.Tm)
-			common.Log.Debug("showTextAdjusted: dx,dy=%3f,%.3f Tm=%s", dx, dy, to.Tm)
+			common.Log.Trace("showTextAdjusted: dx,dy=%3f,%.3f Tm=%s", dx, dy, to.Tm)
 		case *core.PdfObjectString:
 			charcodes, ok := core.GetStringBytes(o)
 			if !ok {
-				common.Log.Debug("showTextAdjusted: Bad string arg. o=%s args=%+v", o, args)
+				common.Log.Trace("showTextAdjusted: Bad string arg. o=%s args=%+v", o, args)
 				return core.ErrTypeError
 			}
 			to.renderText(charcodes)
 		default:
-			common.Log.Debug("showTextAdjusted. Unexpected type (%T) args=%+v", o, args)
+			common.Log.Debug("ERROR: showTextAdjusted. Unexpected type (%T) args=%+v", o, args)
 			return core.ErrTypeError
 		}
 	}
 	return nil
 }

-// setTextLeading "TL" Set text leading.
+// setTextLeading "TL". Set text leading.
 func (to *textObject) setTextLeading(y float64) {
 	if to == nil || to.State == nil {
 		return
@ -368,7 +368,7 @@ func (to *textObject) setTextLeading(y float64) {
 	to.State.Tl = y
 }

-// setCharSpacing "Tc" Set character spacing.
+// setCharSpacing "Tc". Set character spacing.
 func (to *textObject) setCharSpacing(x float64) {
 	if to == nil {
 		return
@ -376,7 +376,7 @@ func (to *textObject) setCharSpacing(x float64) {
 	to.State.Tc = x
 }

-// setFont "Tf" Set font.
+// setFont "Tf". Set font.
 func (to *textObject) setFont(name string, size float64) error {
 	if to == nil {
 		return nil
@ -399,7 +399,7 @@ func (to *textObject) setFont(name string, size float64) error {
 	return nil
 }

-// setTextRenderMode "Tr" Set text rendering mode.
+// setTextRenderMode "Tr". Set text rendering mode.
 func (to *textObject) setTextRenderMode(mode int) {
 	if to == nil {
 		return
@ -407,7 +407,7 @@ func (to *textObject) setTextRenderMode(mode int) {
 	to.State.Tmode = RenderMode(mode)
 }

-// setTextRise "Ts" Set text rise.
+// setTextRise "Ts". Set text rise.
 func (to *textObject) setTextRise(y float64) {
 	if to == nil {
 		return
@ -415,7 +415,7 @@ func (to *textObject) setTextRise(y float64) {
 	to.State.Trise = y
 }

-// setWordSpacing "Tw" Set word spacing.
+// setWordSpacing "Tw". Set word spacing.
 func (to *textObject) setWordSpacing(y float64) {
 	if to == nil {
 		return
@ -423,7 +423,7 @@ func (to *textObject) setWordSpacing(y float64) {
 	to.State.Tw = y
 }

-// setHorizScaling "Tz" Set horizontal scaling.
+// setHorizScaling "Tz". Set horizontal scaling.
 func (to *textObject) setHorizScaling(y float64) {
 	if to == nil {
 		return
@ -577,9 +577,6 @@ type textObject struct {
 	Tm        contentstream.Matrix // Text matrix. For the character pointer.
 	Tlm       contentstream.Matrix // Text line matrix. For the start of line pointer.
 	Texts     []XYText             // Text gets written here.
-
-	// These fields are used to implement existing UniDoc behaviour.
-	xPos, yPos float64
 }

 // newTextState returns a default textState.
@ -610,6 +607,9 @@ func (to *textObject) renderText(data []byte) error {
 	charcodes := font.BytesToCharcodes(data)

 	runes, numChars, numMisses := font.CharcodesToUnicode(charcodes)
+	if numMisses > 0 {
+		common.Log.Debug("renderText: numChars=%d numMisses=%d", numChars, numMisses)
+	}

 	to.State.numChars += numChars
 	to.State.numMisses += numMisses
@ -629,10 +629,14 @@ func (to *textObject) renderText(data []byte) error {
 		0, tfs,
 		0, state.Trise)

-	common.Log.Debug("==========================================")
-	common.Log.Debug("%d codes=%+v runes=%q", len(charcodes), charcodes, runes)
+	common.Log.Trace("renderText: %d codes=%+v runes=%q", len(charcodes), charcodes, runes)

 	for i, r := range runes {
+		// XXX(peterwilliams97) Need to find and fix cases where this happens.
+		if r == "\x00" {
+			continue
+		}
+
 		code := charcodes[i]
 		// The location of the text on the page in device coordinates is given by trm, the text
 		// rendering matrix.
@ -656,14 +660,13 @@ func (to *textObject) renderText(data []byte) error {
 		// c is the character size in unscaled text units.
 		c := Point{X: m.Wx * glyphTextRatio, Y: m.Wy * glyphTextRatio}

+		// t0 is the end of this character.
 		// t is the displacement of the text cursor when the character is rendered.
-		// float tx = displacementX * fontSize * horizontalScaling;
-		// w = 0
 		t0 := Point{X: (c.X*tfs + w) * th}
 		t := Point{X: (c.X*tfs + state.Tc + w) * th}

 		// td, td0 are t, t0 in matrix form.
-		// td0 is where this char ends. td is where the next char stats.
+		// td0 is where this character ends. td is where the next character starts.
 		td0 := translationMatrix(t0)
 		td := translationMatrix(t)

@ -671,19 +674,17 @@ func (to *textObject) renderText(data []byte) error {
 		common.Log.Trace("tfs=%.3f th=%.3f Tc=%.3f w=%.3f (Tw=%.3f)", tfs, th, state.Tc, w, state.Tw)
 		common.Log.Trace("m=%s c=%+v t0=%+v td0=%s trm0=%s", m, c, t0, td0, td0.Mult(to.Tm).Mult(to.gs.CTM))

-		nextTm := td.Mult(to.Tm)
-		common.Log.Trace("nextTm=%s", nextTm)
-
-		xyt := newXYText(
+		xyt := to.newXYText(
 			string(r),
 			trm,
 			translation(td0.Mult(to.Tm).Mult(to.gs.CTM)),
+			1.0*trm.ScalingFactorY(),
 			spaceWidth*trm.ScalingFactorX())
-		common.Log.Trace("i=%d code=%d, xyt=%s", i, code, xyt)
+		common.Log.Trace("i=%d code=%d xyt=%s trm=%s", i, code, xyt, trm)
 		to.Texts = append(to.Texts, xyt)

 		// update the text matrix by the displacement of the text location.
-		to.Tm = nextTm
+		to.Tm = td.Mult(to.Tm)
 		common.Log.Trace("to.Tm=%s", to.Tm)
 	}

@ -714,36 +715,45 @@ func (to *textObject) moveTo(tx, ty float64) {
 }

 // XYText represents text drawn on a page and its position in device coordinates.
+// All dimensions are in device coordinates.
 type XYText struct {
-	Trm              contentstream.Matrix
-	OrientedStart    Point          // Left of text in orientation where text is horizontal.
-	OrientedEnd      Point          // Right of text in orientation where text is horizontal.
-	ColorStroking    model.PdfColor // Colour that text is stroked with, if any.
-	ColorNonStroking model.PdfColor // Colour that text is filled with, if any.
-	Orient           int
-	Text             string
-	SpaceWidth       float64
-	Font             string
-	FontSize         float64
+	Text          string  // The text.
+	Orient        int     // The text orientation.
+	OrientedStart Point   // Left of text in orientation where text is horizontal.
+	OrientedEnd   Point   // Right of text in orientation where text is horizontal.
+	Height        float64 // Text height.
+	SpaceWidth    float64 // Best guess at the width of a space in the font the text was rendered with.
+	count         int64   // To help with reading debug logs.
 }

-func newXYText(text string, trm contentstream.Matrix, end Point, spaceWidth float64) XYText {
+// newXYText returns an XYText for text `text` rendered with text rendering matrix `trm` and end
+// of character device coordinates `end`. `spaceWidth` is our best guess at the width of a space
+// in the font the text is rendered with, in device coordinates. `height` is recomputed from `trm`.
+func (to *textObject) newXYText(text string, trm contentstream.Matrix, end Point,
+	height, spaceWidth float64) XYText {
+	to.e.textCount++
 	theta := trm.Angle()
+	if theta%180 == 0 {
+		height = trm.ScalingFactorY()
+	} else {
+		height = trm.ScalingFactorX()
+	}
+
 	return XYText{
 		Text:          text,
-		Trm:           trm,
+		Orient:        theta,
 		OrientedStart: translation(trm).Rotate(theta),
 		OrientedEnd:   end.Rotate(theta),
-		Orient:        theta,
+		Height:        height,
 		SpaceWidth:    spaceWidth,
+		count:         to.e.textCount,
 	}
 }

 // String returns a string describing `t`.
 func (t XYText) String() string {
-	return fmt.Sprintf("XYText{%s  %.1f |%d| [%.3f,%.3f] %q}",
-		t.Trm.String(), t.Width(), t.Orient, t.OrientedStart.X, t.OrientedStart.Y,
-		truncate(t.Text, 100))
+	return fmt.Sprintf("XYText{@%03d [%.3f,%.3f] %.1f %d° %q}",
+		t.count, t.OrientedStart.X, t.OrientedStart.Y, t.Width(), t.Orient, truncate(t.Text, 100))
 }

 // Width returns the width of `t`.Text in the text direction.
@ -755,13 +765,25 @@ func (t XYText) Width() float64 {
 type TextList []XYText

 // Length returns the number of elements in `tl`.
-func (tl *TextList) Length() int {
-	return len(*tl)
+func (tl TextList) Length() int {
+	return len(tl)
+}
+
+// height returns the max height of the elements in `tl`.
+func (tl TextList) height() float64 {
+	fontHeight := 0.0
+	for _, t := range tl {
+		if t.Height > fontHeight {
+			fontHeight = t.Height
+		}
+	}
+	return fontHeight
 }

 // ToText returns the contents of `tl` as a single string.
-func (tl *TextList) ToText() string {
+func (tl TextList) ToText() string {
 	tl.printTexts("ToText: before sorting")
+
 	tl.SortPosition()

 	lines := tl.toLines()
@ -776,12 +798,16 @@ func (tl *TextList) ToText() string {
 // Sorting is by orientation then top to bottom, left to right when page is orientated so that text
 // is horizontal.
 func (tl *TextList) SortPosition() {
+	fontHeight := tl.height()
+	// We sort with a y tolerance to allow for subscripts, diacritics etc.
+	tol := min(fontHeight*0.2, 5.0)
+	common.Log.Trace("SortPosition: fontHeight=%.1f tol=%.1f", fontHeight, tol)
 	sort.SliceStable(*tl, func(i, j int) bool {
 		ti, tj := (*tl)[i], (*tl)[j]
 		if ti.Orient != tj.Orient {
 			return ti.Orient < tj.Orient
 		}
-		if ti.OrientedStart.Y != tj.OrientedStart.Y {
+		if math.Abs(ti.OrientedStart.Y-tj.OrientedStart.Y) > tol {
 			return ti.OrientedStart.Y > tj.OrientedStart.Y
 		}
 		return ti.OrientedStart.X < tj.OrientedStart.X
@ -793,26 +819,31 @@ type Line struct {
 	Y     float64   // y position of line.
 	Dx    []float64 // x distance between successive words in line.
 	Text  string    // text in the line.
-	Words []string  // words in the line
+	Words []string  // words in the line.
 }

 // toLines returns the text and positions in `tl` as a slice of Line.
-// NOTE: Caller must sort the text list by top-to-bottom, left-to-write (for orientation adjusted so
+// NOTE: Caller must sort the text list top-to-bottom, left-to-right (for orientation adjusted so
 // that text is horizontal) before calling this function.
 func (tl TextList) toLines() []Line {
+	// We divide `tl` into slices which contain texts with the same orientation, extract the lines
+	// for each orientation then return the concatenation of these lines sorted by orientation.
 	tlOrient := make(map[int]TextList, len(tl))
 	for _, t := range tl {
 		tlOrient[t.Orient] = append(tlOrient[t.Orient], t)
 	}
-	lines := make([]Line, 0, 4)
-	for _, o := range []int{0, 90, 180, 270} {
+	var lines []Line
+	for _, o := range orientKeys(tlOrient) {
 		lines = append(lines, tlOrient[o].toLinesOrient()...)
 	}
 	return lines
 }

 // toLinesOrient returns the text and positions in `tl` as a slice of Line.
-// NOTE: Caller must sort the text list top-to-bottom, left-to-write before calling this function.
+// NOTE: This function only works on text lists where all text is the same orientation so it should
+// only be called from toLines.
+// Caller must sort the text list top-to-bottom, left-to-right (for orientation adjusted so
+// that text is horizontal) before calling this function.
 func (tl TextList) toLinesOrient() []Line {
 	tl.printTexts("toLines: before")
 	if len(tl) == 0 {
@ -827,13 +858,14 @@ func (tl TextList) toLinesOrient() []Line {

 	averageCharWidth := exponAve{}
 	wordSpacing := exponAve{}
-	lastEndX := 0.0 // tl[i-1].End.X
+	lastEndX := 0.0 // lastEndX is tl[i-1].OrientedEnd.X

 	for _, t := range tl {
 		if t.OrientedStart.Y < y {
 			if len(words) > 0 {
 				line := newLine(y, x, words)
 				if averageCharWidth.running {
+					line = combineDiacritics(line, averageCharWidth.ave)
 					line = removeDuplicates(line, averageCharWidth.ave)
 				}
 				lines = append(lines, line)
@ -893,6 +925,24 @@ func (tl TextList) toLinesOrient() []Line {
 	return lines
 }

+// orientKeys returns the keys of `tlOrient` as a sorted slice.
+func orientKeys(tlOrient map[int]TextList) []int {
+	keys := []int{}
+	for k := range tlOrient {
+		keys = append(keys, k)
+	}
+	sort.Ints(keys)
+	return keys
+}
+
+// min returns the lesser of `a` and `b`.
+func min(a, b float64) float64 {
+	if a < b {
+		return a
+	}
+	return b
+}
+
 // exponAve implements an exponential average.
 type exponAve struct {
 	ave     float64 // Current average value.
@ -974,6 +1024,144 @@ func removeDuplicates(line Line, charWidth float64) Line {
 	return Line{Y: line.Y, Dx: dxList, Text: strings.Join(words, ""), Words: words}
 }

+// combineDiacritics returns `line` with diacritics close to characters combined with the characters.
+// `charWidth` is the average character width for the line.
+// We have to do this because PDF can render diacritics separately to the characters they attach to
+// in extracted text.
+func combineDiacritics(line Line, charWidth float64) Line {
+	if len(line.Dx) == 0 {
+		return line
+	}
+
+	tol := charWidth * 0.2
+	common.Log.Trace("combineDiacritics: charWidth=%.2f tol=%.2f", charWidth, tol)
+
+	words := []string{}
+	dxList := []float64{}
+	w := line.Words[0]
+	w, c := countDiacritic(w)
+	delta := 0.0
+	dx0 := 0.0
+	parts := []string{w}
+	numChars := c
+
+	for i := 0; i < len(line.Dx); i++ {
+		w = line.Words[i+1]
+		w, c := countDiacritic(w)
+		dx := line.Dx[i]
+		if numChars+c <= 1 && delta+dx <= tol {
+			if len(parts) == 0 {
+				dx0 = dx
+			} else {
+				delta += dx
+			}
+			parts = append(parts, w)
+			numChars += c
+		} else {
+			if len(parts) > 0 {
+				if len(words) > 0 {
+					dxList = append(dxList, dx0)
+				}
+				words = append(words, combine(parts))
+			}
+			parts = []string{w}
+			numChars = c
+			dx0 = dx
+			delta = 0.0
+		}
+	}
+	if len(parts) > 0 {
+		if len(words) > 0 {
+			dxList = append(dxList, dx0)
+		}
+		words = append(words, combine(parts))
+	}
+
+	if len(words) != len(dxList)+1 {
+		common.Log.Error("Inconsistent: \nwords=%d %q\ndxList=%d %.2f",
+			len(words), words, len(dxList), dxList)
+		return line
+	}
+	return Line{Y: line.Y, Dx: dxList, Text: strings.Join(words, ""), Words: words}
+}
+
+// combine combines any diacritics in `parts` with the single non-diacritic character in `parts`.
+func combine(parts []string) string {
+	if len(parts) == 1 {
+		// Must be a non-diacritic.
+		return parts[0]
+	}
+
+	// We need to put the non-diacritic before the diacritics for NFKC normalization to work.
+	diacritic := map[string]bool{}
+	for _, w := range parts {
+		r := []rune(w)[0]
+		diacritic[w] = unicode.Is(unicode.Mn, r) || unicode.Is(unicode.Sk, r)
+	}
+	sort.SliceStable(parts, func(i, j int) bool { return !diacritic[parts[i]] && diacritic[parts[j]] })
+
+	// NFKC-normalize and trim each part individually, then concatenate the parts.
+	for i, w := range parts {
+		parts[i] = strings.TrimSpace(norm.NFKC.String(w))
+	}
+	return strings.Join(parts, "")
+}
+
+// countDiacritic returns the combining diacritic version of `w` (usually itself) and the number of
+// non-diacritics in `w` (0 or 1).
+func countDiacritic(w string) (string, int) {
+	runes := []rune(w)
+	if len(runes) != 1 {
+		return w, 1
+	}
+	r := runes[0]
+	c := 1
+	if unicode.Is(unicode.Mn, r) || unicode.Is(unicode.Sk, r) {
+		c = 0
+	}
+	if w2, ok := diacritics[r]; ok {
+		c = 0
+		w = w2
+	}
+	return w, c
+}
+
+// diacritics is a map of diacritic characters that are not classified as unicode.Mn or unicode.Sk
+// and the corresponding unicode.Mn or unicode.Sk characters. This map was copied from PdfBox.
+var diacritics = map[rune]string{
+	0x0060: "\u0300",
+	0x02CB: "\u0300",
+	0x0027: "\u0301",
+	0x02B9: "\u0301",
+	0x02CA: "\u0301",
+	0x005e: "\u0302",
+	0x02C6: "\u0302",
+	0x007E: "\u0303",
+	0x02C9: "\u0304",
+	0x00B0: "\u030A",
+	0x02BA: "\u030B",
+	0x02C7: "\u030C",
+	0x02C8: "\u030D",
+	0x0022: "\u030E",
+	0x02BB: "\u0312",
+	0x02BC: "\u0313",
+	0x0486: "\u0313",
+	0x055A: "\u0313",
+	0x02BD: "\u0314",
+	0x0485: "\u0314",
+	0x0559: "\u0314",
+	0x02D4: "\u031D",
+	0x02D5: "\u031E",
+	0x02D6: "\u031F",
+	0x02D7: "\u0320",
+	0x02B2: "\u0321",
+	0x02CC: "\u0329",
+	0x02B7: "\u032B",
+	0x02CD: "\u0331",
+	0x005F: "\u0332",
+	0x204E: "\u0359",
+}
+
 // getCurrentFont returns the font on top of the font stack, or DefaultFont if the font stack is
 // empty.
 func (to *textObject) getCurrentFont() *model.PdfFont {
--- a/pdf/extractor/text_test.go
+++ b/pdf/extractor/text_test.go
@ -16,6 +16,7 @@ import (

 	"github.com/unidoc/unidoc/common"
 	"github.com/unidoc/unidoc/pdf/model"
+	"golang.org/x/text/unicode/norm"
 )

 // NOTE: We do a best effort at finding the PDF file because we don't keep PDF test files in this repo so you
@ -196,8 +197,10 @@ var fileExtractionTests = []struct {
 	},
 	{filename: "Ito_Formula.pdf",
 		expectedPageText: map[int][]string{
-			// 1: []string{"In the Itô stochastic calculus"},
-			1: []string{"In standard, non-stochastic calculus, one computes a derivative"},
+			1: []string{
+				"In the Itô stochastic calculus",
+				"In standard, non-stochastic calculus, one computes a derivative"},
+			2: []string{"Financial Economics Itô’s Formula"},
 		},
 	},
 	{filename: "circ2.pdf",
@ -215,6 +218,11 @@ var fileExtractionTests = []struct {
 			1: []string{"entropy of a system of n identical resonators in a stationary radiation field"},
 		},
 	},
+	{filename: "thanh.pdf",
+		expectedPageText: map[int][]string{
+			1: []string{"Hàn Thé̂ Thành"},
+		},
+	},
 }

 // testExtractFile tests the ExtractTextWithStats text extractor on `filename` and compares the extracted
@ -244,6 +252,7 @@ func testExtractFile(t *testing.T, filename string, expectedPageText map[int][]s
 		if !ok {
 			t.Fatalf("%q doesn't have page %d", filename, pageNum)
 		}
+		actualText = norm.NFKC.String(actualText)
 		if !containsSentences(t, expectedSentences, actualText) {
 			t.Fatalf("Text mismatch filepath=%q page=%d", filepath, pageNum)
 		}
@ -291,8 +300,9 @@ func extractPageTexts(t *testing.T, filename string) (int, map[int]string) {
 // containsSentences returns true if all strings `expectedSentences` are contained in `actualText`.
 func containsSentences(t *testing.T, expectedSentences []string, actualText string) bool {
 	for _, e := range expectedSentences {
+		e = norm.NFKC.String(e)
 		if !strings.Contains(actualText, e) {
-			t.Errorf("No match for %#q", e)
+			t.Errorf("No match for %q", e)
 			return false
 		}
 	}
--- a/pdf/model/font.go
+++ b/pdf/model/font.go
@ -56,7 +56,7 @@ func (font PdfFont) GetFontDescriptor() (*PdfFontDescriptor, error) {
 	case *pdfCIDFontType2:
 		return t.fontDescriptor, nil
 	}
-	common.Log.Debug("ERROR: Cannot get font descriptor for font type %t (%s)", font, font)
+	common.Log.Debug("ERROR: Cannot get font descriptor for font type %T (%s)", font, font)
 	return nil, errors.New("font descriptor not found")
 }

@ -67,7 +67,6 @@ func (font PdfFont) String() string {
 		enc = font.context.Encoder().String()
 	}
 	return fmt.Sprintf("FONT{%T %s %s}", font.context, font.baseFields().coreString(), enc)
-
 }

 // BaseFont returns the font's "BaseFont" field.
@ -214,7 +213,7 @@ func NewStandard14FontWithEncoding(basefont Standard14Font, alphabet map[rune]in
 	return &PdfFont{context: &std}, encoder, nil
 }

-// GetAlphabet returns a map of the runes in `text`.
+// GetAlphabet returns a map of the runes in `text` and their frequencies.
 func GetAlphabet(text string) map[rune]int {
 	alphabet := map[rune]int{}
 	for _, r := range text {
@ -477,7 +476,7 @@ func (font PdfFont) ToPdfObject() core.PdfObject {
 	if t := font.actualFont(); t != nil {
 		return t.ToPdfObject()
 	}
-	common.Log.Debug("ERROR: ToPdfObject Not implemented for font type=%#T. Returning null object",
+	common.Log.Debug("ERROR: ToPdfObject Not implemented for font type=%#T. Returning null object.",
 		font.context)
 	return core.MakeNull()
 }
@ -560,7 +559,7 @@ func (font PdfFont) actualFont() fonts.Font {
 	case *pdfCIDFontType2:
 		return t
 	default:
-		common.Log.Debug("ERROR: actualFont. Unknown font type %t. font=%s", t, font)
+		common.Log.Debug("ERROR: actualFont. Unknown font type %T. font=%s", t, font)
 		return nil
 	}
 }
@ -581,7 +580,7 @@ func (font PdfFont) baseFields() *fontCommon {
 	case *pdfCIDFontType2:
 		return t.baseFields()
 	default:
-		common.Log.Debug("ERROR: base. Unknown font type %t. font=%s", t, font.String())
+		common.Log.Debug("ERROR: base. Unknown font type %T. font=%s", t, font.String())
 		return nil
 	}
 }
--- a/pdf/model/font_simple.go
+++ b/pdf/model/font_simple.go
@ -131,7 +131,6 @@ func (font pdfFontSimple) GetGlyphCharMetrics(glyph string) (fonts.CharMetrics,
 // returned to indicate whether or not the entry was found in the glyph to charcode mapping.
 func (font pdfFontSimple) GetCharMetrics(code uint16) (fonts.CharMetrics, bool) {
 	if width, ok := font.charWidths[code]; ok {
-		common.Log.Debug("GetCharMetrics 1: code=%d width=%.1f font=%s", code, width, font)
 		return fonts.CharMetrics{Wx: width}, true
 	}
 	if isBuiltin(Standard14Font(font.basefont)) {
@ -142,12 +141,11 @@ func (font pdfFontSimple) GetCharMetrics(code uint16) (fonts.CharMetrics, bool)
 		if glyph, ok := font.encoder.CharcodeToGlyph(code); ok {
 			if metrics, ok := font.fontMetrics[glyph]; ok {
 				font.charWidths[code] = metrics.Wx
-				common.Log.Debug("GetCharMetrics 2: code=%d glyph=%q width=%.1f", code, glyph, metrics.Wx)
 				return metrics, true
 			}
 		}
 	}
-	common.Log.Debug("GetCharMetrics 3: code=%d font=%s", code, font)
+	common.Log.Debug("GetCharMetrics: No match for code=%d font=%s", code, font)
 	return fonts.CharMetrics{}, false
 }