From 6fe0d20a86725114b2b67f01ffb09258ead15790 Mon Sep 17 00:00:00 2001
From: Peter Williams <peter.williams@papercut.com>
Date: Tue, 19 May 2020 11:46:51 +1000
Subject: [PATCH 1/4] Fixed filename:line in logging

---
 common/logging.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common/logging.go b/common/logging.go
index b7452bf6..b3e62348 100644
--- a/common/logging.go
+++ b/common/logging.go
@@ -221,7 +221,7 @@ func (l WriterLogger) logToWriter(f io.Writer, prefix string, format string, arg
 }
 
 func logToWriter(f io.Writer, prefix string, format string, args ...interface{}) {
-	_, file, line, ok := runtime.Caller(2)
+	_, file, line, ok := runtime.Caller(3)
 	if !ok {
 		file = "???"
 		line = 0

From 22680be0975c8f05471acd463d54a1fc1a144f06 Mon Sep 17 00:00:00 2001
From: Peter Williams <peter.williams@papercut.com>
Date: Tue, 19 May 2020 14:57:27 +1000
Subject: [PATCH 2/4] Got CMap working for multi-rune entries

---
 internal/cmap/cmap.go                     | 62 +++++++++++-------
 internal/cmap/cmap_parser.go              | 79 ++++++++++++++++++++---
 internal/cmap/cmap_test.go                | 13 ++--
 internal/textencoding/cmap.go             | 13 ++--
 internal/textencoding/glyphs_glyphlist.go |  8 ++-
 model/font.go                             |  4 +-
 6 files changed, 135 insertions(+), 44 deletions(-)

diff --git a/internal/cmap/cmap.go b/internal/cmap/cmap.go
index 1299faa5..7a7ea0b6 100644
--- a/internal/cmap/cmap.go
+++ b/internal/cmap/cmap.go
@@ -21,6 +21,9 @@ const (
 
 	// MissingCodeRune replaces runes that can't be decoded. '\ufffd' = �. Was '?'.
 	MissingCodeRune = '\ufffd' // �
+
+	// MissingCodeString replaces strings that can't be decoded.
+	MissingCodeString = string(MissingCodeRune)
 )
 
 // CharCode is a character code or Unicode
@@ -41,7 +44,7 @@ type charRange struct {
 type fbRange struct {
 	code0 CharCode
 	code1 CharCode
-	r0    rune
+	r0    rune // TODO (peterwilliams97): Change to string for compound codes.
 }
 
 // CIDSystemInfo contains information for identifying the character collection
@@ -106,8 +109,9 @@ type CMap struct {
 	cidToCode map[CharCode]CharCode // CID -> charcode
 
 	// Used by ctype 2 CMaps.
-	codeToUnicode map[CharCode]rune // CID -> Unicode
-	unicodeToCode map[rune]CharCode // Unicode -> CID
+	codeToUnicode map[CharCode]string // CID -> Unicode string
+	// XXXX(peterwilliams97): Should unicodeToCode be the inverse of codeToUnicode?
+	unicodeToCode map[rune]CharCode // Unicode rune -> CID
 
 	// cached contains the raw CMap data. It is used by the Bytes method in
 	// order to avoid generating the data for every call.
@@ -116,8 +120,13 @@ type CMap struct {
 	cached []byte
 }
 
-// NewToUnicodeCMap returns an identity CMap with codeToUnicode matching the `codeToUnicode` arg.
-func NewToUnicodeCMap(codeToUnicode map[CharCode]rune) *CMap {
+// NewToUnicodeCMap returns an identity CMap with codeToUnicode matching the `codeToRune` arg.
+func NewToUnicodeCMap(codeToRune map[CharCode]rune) *CMap {
+	codeToUnicode := make(map[CharCode]string, len(codeToRune))
+	for code, r := range codeToRune {
+		codeToUnicode[code] = string(r)
+	}
+
 	cmap := &CMap{
 		name:  "Adobe-Identity-UCS",
 		ctype: 2,
@@ -135,6 +144,7 @@ func NewToUnicodeCMap(codeToUnicode map[CharCode]rune) *CMap {
 	}
 
 	cmap.computeInverseMappings()
+
 	return cmap
 }
 
@@ -148,7 +158,7 @@ func newCMap(isSimple bool) *CMap {
 		nbits:         nbits,
 		codeToCID:     make(map[CharCode]CharCode),
 		cidToCode:     make(map[CharCode]CharCode),
-		codeToUnicode: make(map[CharCode]rune),
+		codeToUnicode: make(map[CharCode]string),
 		unicodeToCode: make(map[rune]CharCode),
 	}
 }
@@ -254,7 +264,8 @@ func (cmap *CMap) computeInverseMappings() {
 	}
 
 	// Generate Unicode -> CID map.
-	for cid, r := range cmap.codeToUnicode {
+	for cid, s := range cmap.codeToUnicode {
+		r := rune0(s)
 		if c, ok := cmap.unicodeToCode[r]; !ok || (ok && c > cid) {
 			cmap.unicodeToCode[r] = cid
 		}
@@ -277,19 +288,18 @@ func (cmap *CMap) CharcodeBytesToUnicode(data []byte) (string, int) {
 		return "", 0
 	}
 
-	var (
-		parts   []rune
-		missing []CharCode
-	)
-	for _, code := range charcodes {
+	parts := make([]string, len(charcodes))
+	var missing []CharCode
+	for i, code := range charcodes {
 		s, ok := cmap.codeToUnicode[code]
 		if !ok {
 			missing = append(missing, code)
-			s = MissingCodeRune
+			s = MissingCodeString
 		}
-		parts = append(parts, s)
+		parts[i] = s
 	}
-	unicode := string(parts)
+	unicode := strings.Join(parts, "")
+
 	if len(missing) > 0 {
 		common.Log.Debug("ERROR: CharcodeBytesToUnicode. Not in map.\n"+
 			"\tdata=[% 02x]=%#q\n"+
@@ -305,11 +315,11 @@ func (cmap *CMap) CharcodeBytesToUnicode(data []byte) (string, int) {
 // CharcodeToUnicode converts a single character code `code` to a unicode string.
 // If `code` is not in the unicode map, '�' is returned.
 // NOTE: CharcodeBytesToUnicode is typically more efficient.
-func (cmap *CMap) CharcodeToUnicode(code CharCode) (rune, bool) {
+func (cmap *CMap) CharcodeToUnicode(code CharCode) (string, bool) {
 	if s, ok := cmap.codeToUnicode[code]; ok {
 		return s, true
 	}
-	return MissingCodeRune, false
+	return MissingCodeString, false
 }
 
 // RuneToCID maps the specified rune to a character identifier. If the provided
@@ -453,7 +463,7 @@ func (cmap *CMap) toBfData() string {
 	}
 
 	// codes is a sorted list of the codeToUnicode keys.
-	var codes []CharCode
+	codes := make([]CharCode, 0, len(cmap.codeToUnicode))
 	for code := range cmap.codeToUnicode {
 		codes = append(codes, code)
 	}
@@ -470,9 +480,11 @@ func (cmap *CMap) toBfData() string {
 	// character codes have been mapped to code ranges.
 	var charRanges []charRange
 	currCharRange := charRange{codes[0], codes[0]}
-	prevRune := cmap.codeToUnicode[codes[0]]
+	prevRune := rune0(cmap.codeToUnicode[codes[0]])
+	// fmt.Printf("      code=0x%04x prevRune=%q=0x%04x\n", codes[0], prevRune, prevRune)
 	for _, c := range codes[1:] {
-		currRune := cmap.codeToUnicode[c]
+		currRune := rune0(cmap.codeToUnicode[c])
+		// fmt.Printf("%4d: code=0x%04x currRune=%q=0x%04x\n", i+1, c, currRune, currRune)
 		if c == currCharRange.code1+1 && currRune == prevRune+1 {
 			currCharRange.code1 = c
 		} else {
@@ -493,7 +505,7 @@ func (cmap *CMap) toBfData() string {
 			fbRanges = append(fbRanges, fbRange{
 				code0: cr.code0,
 				code1: cr.code1,
-				r0:    cmap.codeToUnicode[cr.code0],
+				r0:    rune0(cmap.codeToUnicode[cr.code0]),
 			})
 		}
 	}
@@ -508,7 +520,7 @@ func (cmap *CMap) toBfData() string {
 			lines = append(lines, fmt.Sprintf("%d beginbfchar", n))
 			for j := 0; j < n; j++ {
 				code := fbChars[i*maxBfEntries+j]
-				r := cmap.codeToUnicode[code]
+				r := rune0(cmap.codeToUnicode[code])
 				lines = append(lines, fmt.Sprintf("<%04x> <%04x>", code, r))
 			}
 			lines = append(lines, "endbfchar")
@@ -549,3 +561,9 @@ end
 end
 `
 )
+
+// rune0 is a convenience function that returns the first rune in `s`.
+// Caller must check that `s` is not empty.
+func rune0(s string) rune {
+	return ([]rune(s))[0]
+}
diff --git a/internal/cmap/cmap_parser.go b/internal/cmap/cmap_parser.go
index 9236d782..b5d69feb 100644
--- a/internal/cmap/cmap_parser.go
+++ b/internal/cmap/cmap_parser.go
@@ -141,7 +141,6 @@ func (cmap *CMap) parseName() error {
 // parseType parses a cmap type and adds it to `cmap`.
 // cmap names are defined like this: /CMapType 1 def
 func (cmap *CMap) parseType() error {
-
 	ctype := 0
 	done := false
 	for i := 0; i < 3 && !done; i++ {
@@ -171,7 +170,6 @@ func (cmap *CMap) parseType() error {
 // We don't need the version. We do this to eat up the version code in the cmap definition
 // to reduce unhandled parse object warnings.
 func (cmap *CMap) parseVersion() error {
-
 	version := ""
 	done := false
 	for i := 0; i < 3 && !done; i++ {
@@ -471,7 +469,7 @@ func (cmap *CMap) parseBfchar() error {
 			}
 			return err
 		}
-		var target rune
+		var target []rune
 		switch v := o.(type) {
 		case cmapOperand:
 			if v.Operand == endbfchar {
@@ -480,16 +478,20 @@ func (cmap *CMap) parseBfchar() error {
 			common.Log.Debug("ERROR: Unexpected operand. %#v", v)
 			return ErrBadCMap
 		case cmapHexString:
-			target = hexToRune(v)
+			target = hexToRunes(v)
 		case cmapName:
 			common.Log.Debug("ERROR: Unexpected name. %#v", v)
-			target = MissingCodeRune
+			target = []rune{MissingCodeRune}
 		default:
 			common.Log.Debug("ERROR: Unexpected type. %#v", o)
 			return ErrBadCMap
 		}
 
-		cmap.codeToUnicode[code] = target
+		if ligature, ok := StringToLigature[string(target)]; ok {
+			cmap.codeToUnicode[code] = string(ligature)
+		} else {
+			cmap.codeToUnicode[code] = string(target)
+		}
 	}
 
 	return nil
@@ -563,15 +565,17 @@ func (cmap *CMap) parseBfrange() error {
 				if !ok {
 					return errors.New("non-hex string in array")
 				}
-				r := hexToRune(hexs)
-				cmap.codeToUnicode[code] = r
+				r := hexToRunes(hexs)
+				cmap.codeToUnicode[code] = string(r)
 			}
 
 		case cmapHexString:
 			// <codeFrom> <codeTo> <dst>, maps [from,to] to [dst,dst+to-from].
+			// XXX(peterwilliams97): Do we need to do this with multi-rune entries? I guess we
+			// would increment the last rune?
 			r := hexToRune(v)
 			for code := srcCodeFrom; code <= srcCodeTo; code++ {
-				cmap.codeToUnicode[code] = r
+				cmap.codeToUnicode[code] = string(r)
 				r++
 			}
 		default:
@@ -582,3 +586,60 @@ func (cmap *CMap) parseBfrange() error {
 
 	return nil
 }
+
+// ligatureToString is a map from ligature runes to their constituent characters.
+// https://en.wikipedia.org/wiki/Typographic_ligature#Ligatures_in_Unicode_(Latin_alphabets)
+// FIXME(peterwilliams97): I copied this here from glyphs_glyphlist.go to avoid a circular
+// dependency. Where should it go?
+var ligatureToString = map[rune]string{
+	'Ꜳ':          "AA",
+	'ꜳ':          "aa",
+	'Ꜵ':          "aa",
+	'ꜵ':          "ao",
+	'Ꜷ':          "AU",
+	'ꜷ':          "au",
+	'Ꜽ':          "AY",
+	'ꜽ':          "ay",
+	'\U0001f670': "et",
+	'ﬀ':          "ff",
+	'ﬃ':          "ffi",
+	'ﬄ':          "ffl",
+	'ﬁ':          "fi",
+	'ﬂ':          "fl",
+	'Œ':          "OE",
+	'œ':          "oe",
+	'Ꝏ':          "OO",
+	'ꝏ':          "oo",
+	'ẞ':          "fs",
+	'ß':          "fz",
+	'ﬆ':          "st",
+	'ﬅ':          "ſt",
+	'Ꜩ':          "TZ",
+	'ꜩ':          "tz",
+	'ᵫ':          "ue",
+	'Ꝡ':          "VY",
+	'ꝡ':          "vy",
+	// Reverse of ligatureMap
+	0xe000: "ft",
+	0xe001: "fj",
+	0xe002: "fb",
+	0xe003: "fh",
+	0xe004: "fk",
+	0xe005: "tt",
+	0xe006: "tf",
+	0xe007: "ffj",
+	0xe008: "ffb",
+	0xe009: "ffh",
+	0xe00a: "ffk",
+	0xe00b: "T_h",
+}
+
+var StringToLigature = reverseLigatures(ligatureToString)
+
+func reverseLigatures(l2s map[rune]string) map[string]rune {
+	s2l := make(map[string]rune, len(l2s))
+	for l, s := range l2s {
+		s2l[s] = l
+	}
+	return s2l
+}
diff --git a/internal/cmap/cmap_test.go b/internal/cmap/cmap_test.go
index 5c8da78d..de26766e 100644
--- a/internal/cmap/cmap_test.go
+++ b/internal/cmap/cmap_test.go
@@ -104,14 +104,14 @@ func TestCMapParser1(t *testing.T) {
 	}
 
 	for k, expected := range expectedMappings {
-		if v, ok := cmap.CharcodeToUnicode(k); !ok || v != expected {
+		if v, ok := cmap.CharcodeToUnicode(k); !ok || v != string(expected) {
 			t.Errorf("incorrect mapping, expecting 0x%X ➞ 0x%X (%#v)", k, expected, v)
 			return
 		}
 	}
 
 	v, _ := cmap.CharcodeToUnicode(0x99)
-	if v != MissingCodeRune { //!= "notdef" {
+	if v != MissingCodeString { //!= "notdef" {
 		t.Errorf("Unmapped code, expected to map to undefined")
 		return
 	}
@@ -188,7 +188,7 @@ func TestCMapParser2(t *testing.T) {
 	}
 
 	for k, expected := range expectedMappings {
-		if v, ok := cmap.CharcodeToUnicode(k); !ok || v != expected {
+		if v, ok := cmap.CharcodeToUnicode(k); !ok || v != string(expected) {
 			t.Errorf("incorrect mapping, expecting 0x%X ➞ 0x%X (got 0x%X)", k, expected, v)
 			return
 		}
@@ -297,7 +297,7 @@ func TestCMapParser3(t *testing.T) {
 		0xd140: 0xa000,
 	}
 	for k, expected := range expectedMappings {
-		if v, ok := cmap.CharcodeToUnicode(k); !ok || v != expected {
+		if v, ok := cmap.CharcodeToUnicode(k); !ok || v != string(expected) {
 			t.Errorf("incorrect mapping: expecting 0x%02X ➞ 0x%02X (got 0x%02X)", k, expected, v)
 			return
 		}
@@ -407,7 +407,7 @@ func TestCMapParser4(t *testing.T) {
 	}
 
 	for k, expected := range expectedMappings {
-		if v, ok := cmap.CharcodeToUnicode(k); !ok || v != expected {
+		if v, ok := cmap.CharcodeToUnicode(k); !ok || v != string(expected) {
 			t.Errorf("incorrect mapping, expecting 0x%04X ➞ %+q (got %+q)", k, expected, v)
 			return
 		}
@@ -520,6 +520,7 @@ var (
 		0x017b: 'Ż',
 		0x017d: 'Ž',
 	}
+
 	codeToUnicode3 = map[CharCode]rune{ // 93 entries
 		0x0124: 'Ĥ',
 		0x0125: 'ĥ',
@@ -695,7 +696,7 @@ func checkCmapWriteRead(t *testing.T, codeToUnicode map[CharCode]rune) {
 		}
 		u0 := codeToUnicode[code]
 		u := cmap.codeToUnicode[code]
-		if u != u0 {
+		if u != string(u0) {
 			t.Errorf("Unicode mismatch: i=%d code0=0x%04x expected=%q test=%q", i, code, u0, u)
 			return
 		}
diff --git a/internal/textencoding/cmap.go b/internal/textencoding/cmap.go
index b0dfbedf..56b24c74 100644
--- a/internal/textencoding/cmap.go
+++ b/internal/textencoding/cmap.go
@@ -48,8 +48,8 @@ func (enc CMapEncoder) Decode(raw []byte) string {
 		if codes, ok := enc.codeToCID.BytesToCharcodes(raw); ok {
 			var buf bytes.Buffer
 			for _, code := range codes {
-				r, _ := enc.CharcodeToRune(CharCode(code))
-				buf.WriteRune(r)
+				s, _ := enc.charcodeToString(CharCode(code))
+				buf.WriteString(s)
 			}
 
 			return buf.String()
@@ -87,8 +87,13 @@ func (enc CMapEncoder) RuneToCharcode(r rune) (CharCode, bool) {
 // CharcodeToRune converts PDF character code `code` to a rune.
 // The bool return flag is true if there was a match, and false otherwise.
 func (enc CMapEncoder) CharcodeToRune(code CharCode) (rune, bool) {
+	s, ok := enc.charcodeToString(code)
+	return ([]rune(s))[0], ok
+}
+
+func (enc CMapEncoder) charcodeToString(code CharCode) (string, bool) {
 	if enc.cidToUnicode == nil {
-		return MissingCodeRune, false
+		return MissingCodeString, false
 	}
 
 	// Map charcode to CID. If charcode to CID CMap is nil, assume Identity encoding.
@@ -96,7 +101,7 @@ func (enc CMapEncoder) CharcodeToRune(code CharCode) (rune, bool) {
 	if enc.codeToCID != nil {
 		var ok bool
 		if cid, ok = enc.codeToCID.CharcodeToCID(cmap.CharCode(code)); !ok {
-			return MissingCodeRune, false
+			return MissingCodeString, false
 		}
 	}
 
diff --git a/internal/textencoding/glyphs_glyphlist.go b/internal/textencoding/glyphs_glyphlist.go
index e794bea8..7f8bf840 100644
--- a/internal/textencoding/glyphs_glyphlist.go
+++ b/internal/textencoding/glyphs_glyphlist.go
@@ -18,7 +18,13 @@ import (
 )
 
 // MissingCodeRune is the rune returned when there is no matching glyph. It was previously '?'.
-const MissingCodeRune = '\ufffd' // �
+const (
+	// MissingCodeRune replaces runes that can't be decoded.
+	MissingCodeRune = '\ufffd' // �
+
+	// MissingCodeString replaces strings that can't be decoded.
+	MissingCodeString = string(MissingCodeRune)
+)
 
 // GlyphToRune returns the rune corresponding to glyph `glyph` if there is one.
 // TODO: Can we return a string here? e.g. When we are extracting text, we want to get "ffi"
diff --git a/model/font.go b/model/font.go
index af688bf4..40a9d65e 100644
--- a/model/font.go
+++ b/model/font.go
@@ -428,8 +428,8 @@ func (font *PdfFont) CharcodesToUnicodeWithStats(charcodes []textencoding.CharCo
 	numMisses = 0
 	for _, code := range charcodes {
 		if fontBase.toUnicodeCmap != nil {
-			if r, ok := fontBase.toUnicodeCmap.CharcodeToUnicode(cmap.CharCode(code)); ok {
-				runes = append(runes, r)
+			if s, ok := fontBase.toUnicodeCmap.CharcodeToUnicode(cmap.CharCode(code)); ok {
+				runes = append(runes, []rune(s)...)
 				continue
 			}
 		}

From a9910e7e0619f14e09ce95272fb8f8ae1661ae4d Mon Sep 17 00:00:00 2001
From: Peter Williams <peter.williams@papercut.com>
Date: Wed, 20 May 2020 18:43:09 +1000
Subject: [PATCH 3/4] Treat CMap entries as strings instead of runes to handle
 multi-byte encodings.

---
 extractor/text.go            | 15 ++++---
 internal/cmap/cmap.go        |  6 ++-
 internal/cmap/cmap_parser.go | 79 ++++--------------------------------
 model/font.go                | 26 +++++++++---
 4 files changed, 40 insertions(+), 86 deletions(-)

diff --git a/extractor/text.go b/extractor/text.go
index a91eff75..9be289a9 100644
--- a/extractor/text.go
+++ b/extractor/text.go
@@ -698,7 +698,7 @@ func (to *textObject) reset() {
 func (to *textObject) renderText(data []byte) error {
 	font := to.getCurrentFont()
 	charcodes := font.BytesToCharcodes(data)
-	runes, numChars, numMisses := font.CharcodesToUnicodeWithStats(charcodes)
+	runeSlices, numChars, numMisses := font.CharcodesToRuneSlices(charcodes)
 	if numMisses > 0 {
 		common.Log.Debug("renderText: numChars=%d numMisses=%d", numChars, numMisses)
 	}
@@ -717,18 +717,17 @@ func (to *textObject) renderText(data []byte) error {
 		spaceMetrics, _ = model.DefaultFont().GetRuneMetrics(' ')
 	}
 	spaceWidth := spaceMetrics.Wx * glyphTextRatio
-	common.Log.Trace("spaceWidth=%.2f text=%q font=%s fontSize=%.1f", spaceWidth, runes, font, tfs)
+	//  common.Log.Trace("spaceWidth=%.2f text=%q font=%s fontSize=%.1f", spaceWidth, runeSlices, font, tfs)
 
 	stateMatrix := transform.NewMatrix(
 		tfs*th, 0,
 		0, tfs,
 		0, state.trise)
 
-	common.Log.Trace("renderText: %d codes=%+v runes=%q", len(charcodes), charcodes, runes)
+	// common.Log.Trace("renderText: %d codes=%+v runes=%q", len(charcodes), charcodes, runeSlices)
 
-	for i, r := range runes {
-		// TODO(peterwilliams97): Need to find and fix cases where this happens.
-		if r == '\x00' {
+	for i, r := range runeSlices {
+		if len(r) == 1 && r[0] == '\x00' {
 			continue
 		}
 
@@ -742,14 +741,14 @@ func (to *textObject) renderText(data []byte) error {
 
 		// w is the unscaled movement at the end of a word.
 		w := 0.0
-		if r == ' ' {
+		if string(r) == " " {
 			w = state.tw
 		}
 
 		m, ok := font.GetCharMetrics(code)
 		if !ok {
 			common.Log.Debug("ERROR: No metric for code=%d r=0x%04x=%+q %s", code, r, r, font)
-			return errors.New("no char metrics")
+			return fmt.Errorf("no char metrics: font=%s code=%d", font.String(), code)
 		}
 
 		// c is the character size in unscaled text units.
diff --git a/internal/cmap/cmap.go b/internal/cmap/cmap.go
index 7a7ea0b6..11b2c634 100644
--- a/internal/cmap/cmap.go
+++ b/internal/cmap/cmap.go
@@ -265,6 +265,10 @@ func (cmap *CMap) computeInverseMappings() {
 
 	// Generate Unicode -> CID map.
 	for cid, s := range cmap.codeToUnicode {
+		// The CMap entries can be empty, e.g. adobe_supplement_iso32000_1.pdf
+		if len(s) == 0 {
+			continue
+		}
 		r := rune0(s)
 		if c, ok := cmap.unicodeToCode[r]; !ok || (ok && c > cid) {
 			cmap.unicodeToCode[r] = cid
@@ -481,10 +485,8 @@ func (cmap *CMap) toBfData() string {
 	var charRanges []charRange
 	currCharRange := charRange{codes[0], codes[0]}
 	prevRune := rune0(cmap.codeToUnicode[codes[0]])
-	// fmt.Printf("      code=0x%04x prevRune=%q=0x%04x\n", codes[0], prevRune, prevRune)
 	for _, c := range codes[1:] {
 		currRune := rune0(cmap.codeToUnicode[c])
-		// fmt.Printf("%4d: code=0x%04x currRune=%q=0x%04x\n", i+1, c, currRune, currRune)
 		if c == currCharRange.code1+1 && currRune == prevRune+1 {
 			currCharRange.code1 = c
 		} else {
diff --git a/internal/cmap/cmap_parser.go b/internal/cmap/cmap_parser.go
index b5d69feb..a160f32c 100644
--- a/internal/cmap/cmap_parser.go
+++ b/internal/cmap/cmap_parser.go
@@ -105,7 +105,8 @@ func (cmap *CMap) parse() error {
 func (cmap *CMap) parseName() error {
 	name := ""
 	done := false
-	for i := 0; i < 10 && !done; i++ {
+	// /Users/peter/testdata/programming/pdf_text/columns/Berg.pdf
+	for i := 0; i < 20 && !done; i++ {
 		o, err := cmap.parseObject()
 		if err != nil {
 			return err
@@ -487,11 +488,7 @@ func (cmap *CMap) parseBfchar() error {
 			return ErrBadCMap
 		}
 
-		if ligature, ok := StringToLigature[string(target)]; ok {
-			cmap.codeToUnicode[code] = string(ligature)
-		} else {
-			cmap.codeToUnicode[code] = string(target)
-		}
+		cmap.codeToUnicode[code] = string(target)
 	}
 
 	return nil
@@ -565,18 +562,17 @@ func (cmap *CMap) parseBfrange() error {
 				if !ok {
 					return errors.New("non-hex string in array")
 				}
-				r := hexToRunes(hexs)
-				cmap.codeToUnicode[code] = string(r)
+				runes := hexToRunes(hexs)
+				cmap.codeToUnicode[code] = string(runes)
 			}
 
 		case cmapHexString:
 			// <codeFrom> <codeTo> <dst>, maps [from,to] to [dst,dst+to-from].
-			// XXX(peterwilliams97): Do we need to do this with multi-rune entries? I guess we
-			// would increment the last rune?
-			r := hexToRune(v)
+			runes := hexToRunes(v)
+			n := len(runes)
 			for code := srcCodeFrom; code <= srcCodeTo; code++ {
-				cmap.codeToUnicode[code] = string(r)
-				r++
+				cmap.codeToUnicode[code] = string(runes)
+				runes[n-1]++
 			}
 		default:
 			common.Log.Debug("ERROR: Unexpected type %T", o)
@@ -586,60 +582,3 @@ func (cmap *CMap) parseBfrange() error {
 
 	return nil
 }
-
-// ligatureToString is a map from ligature runes to their constituent characters.
-// https://en.wikipedia.org/wiki/Typographic_ligature#Ligatures_in_Unicode_(Latin_alphabets)
-// FIXME(peterwilliams97): I copied this here from glyphs_glyphlist.go to avoid a circular
-// dependency. Where should it go?
-var ligatureToString = map[rune]string{
-	'Ꜳ':          "AA",
-	'ꜳ':          "aa",
-	'Ꜵ':          "aa",
-	'ꜵ':          "ao",
-	'Ꜷ':          "AU",
-	'ꜷ':          "au",
-	'Ꜽ':          "AY",
-	'ꜽ':          "ay",
-	'\U0001f670': "et",
-	'ﬀ':          "ff",
-	'ﬃ':          "ffi",
-	'ﬄ':          "ffl",
-	'ﬁ':          "fi",
-	'ﬂ':          "fl",
-	'Œ':          "OE",
-	'œ':          "oe",
-	'Ꝏ':          "OO",
-	'ꝏ':          "oo",
-	'ẞ':          "fs",
-	'ß':          "fz",
-	'ﬆ':          "st",
-	'ﬅ':          "ſt",
-	'Ꜩ':          "TZ",
-	'ꜩ':          "tz",
-	'ᵫ':          "ue",
-	'Ꝡ':          "VY",
-	'ꝡ':          "vy",
-	// Reverse of ligatureMap
-	0xe000: "ft",
-	0xe001: "fj",
-	0xe002: "fb",
-	0xe003: "fh",
-	0xe004: "fk",
-	0xe005: "tt",
-	0xe006: "tf",
-	0xe007: "ffj",
-	0xe008: "ffb",
-	0xe009: "ffh",
-	0xe00a: "ffk",
-	0xe00b: "T_h",
-}
-
-var StringToLigature = reverseLigatures(ligatureToString)
-
-func reverseLigatures(l2s map[rune]string) map[string]rune {
-	s2l := make(map[string]rune, len(l2s))
-	for l, s := range l2s {
-		s2l[s] = l
-	}
-	return s2l
-}
diff --git a/model/font.go b/model/font.go
index 40a9d65e..79011e26 100644
--- a/model/font.go
+++ b/model/font.go
@@ -422,14 +422,28 @@ func (font *PdfFont) BytesToCharcodes(data []byte) []textencoding.CharCode {
 
 // CharcodesToUnicodeWithStats is identical to CharcodesToUnicode except returns more statistical
 // information about hits and misses from the reverse mapping process.
+// NOTE: The number of runes returned may be greater than the number of charcodes.
+// TODO(peterwilliams97): Deprecate?
 func (font *PdfFont) CharcodesToUnicodeWithStats(charcodes []textencoding.CharCode) (runelist []rune, numHits, numMisses int) {
+	runeSlices, numHits, numMisses := font.CharcodesToRuneSlices(charcodes)
+	var runes []rune
+	for _, r := range runeSlices {
+		runes = append(runes, r...)
+	}
+	return runes, numHits, numMisses
+}
+
+// CharcodesToRuneSlices returns the unicode strings corresponding to `charcodes` as rune slices.
+// The int returns are the number of rune slices returned and the number of unconverted codes.
+// NOTE: The number of rune slices returned is equal to the number of charcodes.
+func (font *PdfFont) CharcodesToRuneSlices(charcodes []textencoding.CharCode) ([][]rune, int, int) {
 	fontBase := font.baseFields()
-	runes := make([]rune, 0, len(charcodes))
-	numMisses = 0
+	runeSlices := make([][]rune, 0, len(charcodes))
+	numMisses := 0
 	for _, code := range charcodes {
 		if fontBase.toUnicodeCmap != nil {
 			if s, ok := fontBase.toUnicodeCmap.CharcodeToUnicode(cmap.CharCode(code)); ok {
-				runes = append(runes, []rune(s)...)
+				runeSlices = append(runeSlices, []rune(s))
 				continue
 			}
 		}
@@ -438,7 +452,7 @@ func (font *PdfFont) CharcodesToUnicodeWithStats(charcodes []textencoding.CharCo
 		encoder := font.Encoder()
 		if encoder != nil {
 			if r, ok := encoder.CharcodeToRune(code); ok {
-				runes = append(runes, r)
+				runeSlices = append(runeSlices, []rune{r})
 				continue
 			}
 		}
@@ -447,7 +461,7 @@ func (font *PdfFont) CharcodesToUnicodeWithStats(charcodes []textencoding.CharCo
 			"\tfont=%s\n\tencoding=%s",
 			code, charcodes, fontBase.isCIDFont(), font, encoder)
 		numMisses++
-		runes = append(runes, cmap.MissingCodeRune)
+		runeSlices = append(runeSlices, []rune{cmap.MissingCodeRune})
 	}
 
 	if numMisses != 0 {
@@ -457,7 +471,7 @@ func (font *PdfFont) CharcodesToUnicodeWithStats(charcodes []textencoding.CharCo
 			len(charcodes), numMisses, font)
 	}
 
-	return runes, len(runes), numMisses
+	return runeSlices, len(runeSlices), numMisses
 }
 
 // CharcodeBytesToUnicode converts PDF character codes `data` to a Go unicode string.

From 0c54cec2c5ac2c4c7d7f430befbffadb83d24f79 Mon Sep 17 00:00:00 2001
From: Peter Williams <peter.williams@papercut.com>
Date: Wed, 20 May 2020 19:07:22 +1000
Subject: [PATCH 4/4] Added a test for multibyte encoding.

---
 extractor/text_test.go | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/extractor/text_test.go b/extractor/text_test.go
index 92dfb976..cdfe47a9 100644
--- a/extractor/text_test.go
+++ b/extractor/text_test.go
@@ -316,6 +316,11 @@ var fileExtractionTests = []struct {
 				`The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD",`},
 		},
 	},
+	{filename: "Saudi.pdf",
+		pageTerms: map[int][]string{
+			10: []string{"الله"},
+		},
+	},
 	// TODO(peterwilliams97): Reinstate these 2 tests when diacritic combination is fixed.
 	// {filename: "Ito_Formula.pdf",
 	// 	pageTerms: map[int][]string{