Merge branch 'cmap' into columns

2025-05-14 19:29:50 +08:00 · 2020-05-24 20:45:31 +10:00 · 2020-05-24 20:45:31 +10:00 · e9c46fa3b9
commit e9c46fa3b9
parent 5efaa02e23 6103fb8ea3
9 changed files with 111 additions and 61 deletions
--- a/common/logging.go
+++ b/common/logging.go
@ -221,7 +221,7 @@ func (l WriterLogger) logToWriter(f io.Writer, prefix string, format string, arg
 }

 func logToWriter(f io.Writer, prefix string, format string, args ...interface{}) {
-	_, file, line, ok := runtime.Caller(2)
+	_, file, line, ok := runtime.Caller(3)
 	if !ok {
 		file = "???"
 		line = 0
--- a/extractor/text.go
+++ b/extractor/text.go
@ -698,7 +698,7 @@ func (to *textObject) reset() {
 func (to *textObject) renderText(data []byte) error {
 	font := to.getCurrentFont()
 	charcodes := font.BytesToCharcodes(data)
-	runes, numChars, numMisses := font.CharcodesToUnicodeWithStats(charcodes)
+	runeSlices, numChars, numMisses := font.CharcodesToRuneSlices(charcodes)
 	if numMisses > 0 {
 		common.Log.Debug("renderText: numChars=%d numMisses=%d", numChars, numMisses)
 	}
@ -717,18 +717,17 @@ func (to *textObject) renderText(data []byte) error {
 		spaceMetrics, _ = model.DefaultFont().GetRuneMetrics(' ')
 	}
 	spaceWidth := spaceMetrics.Wx * glyphTextRatio
-	common.Log.Trace("spaceWidth=%.2f text=%q font=%s fontSize=%.1f", spaceWidth, runes, font, tfs)
+	//  common.Log.Trace("spaceWidth=%.2f text=%q font=%s fontSize=%.1f", spaceWidth, runeSlices, font, tfs)

 	stateMatrix := transform.NewMatrix(
 		tfs*th, 0,
 		0, tfs,
 		0, state.trise)

-	common.Log.Trace("renderText: %d codes=%+v runes=%q", len(charcodes), charcodes, runes)
+	// common.Log.Trace("renderText: %d codes=%+v runes=%q", len(charcodes), charcodes, drunes)

-	for i, r := range runes {
-		// TODO(peterwilliams97): Need to find and fix cases where this happens.
-		if r == '\x00' {
+	for i, r := range runeSlices {
+		if len(r) == 1 && r[0] == '\x00' {
 			continue
 		}

@ -742,14 +741,14 @@ func (to *textObject) renderText(data []byte) error {

 		// w is the unscaled movement at the end of a word.
 		w := 0.0
-		if r == ' ' {
+		if string(r) == " " {
 			w = state.tw
 		}

 		m, ok := font.GetCharMetrics(code)
 		if !ok {
 			common.Log.Debug("ERROR: No metric for code=%d r=0x%04x=%+q %s", code, r, r, font)
-			return errors.New("no char metrics")
+			return fmt.Errorf("no char metrics: font=%s code=%d", font.String(), code)
 		}

 		// c is the character size in unscaled text units.
--- a/extractor/text_test.go
+++ b/extractor/text_test.go
@ -316,6 +316,11 @@ var fileExtractionTests = []struct {
 				`The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD",`},
 		},
 	},
+	{filename: "Saudi.pdf",
+		pageTerms: map[int][]string{
+			10: []string{"الله"},
+		},
+	},
 	// TODO(peterwilliams97): Reinstate these 2 tests when diacritic combination is fixed.
 	// {filename: "Ito_Formula.pdf",
 	// 	pageTerms: map[int][]string{
--- a/internal/cmap/cmap.go
+++ b/internal/cmap/cmap.go
@ -21,6 +21,9 @@ const (

 	// MissingCodeRune replaces runes that can't be decoded. '\ufffd' = <20>. Was '?'.
 	MissingCodeRune = '\ufffd' // <20>
+
+	// MissingCodeString replaces strings that can't be decoded.
+	MissingCodeString = string(MissingCodeRune)
 )

 // CharCode is a character code or Unicode
@ -41,7 +44,7 @@ type charRange struct {
 type fbRange struct {
 	code0 CharCode
 	code1 CharCode
-	r0    rune
+	r0    rune // TODO (peterwilliams97): Change to string for compound codes.
 }

 // CIDSystemInfo contains information for identifying the character collection
@ -106,8 +109,9 @@ type CMap struct {
 	cidToCode map[CharCode]CharCode // CID -> charcode

 	// Used by ctype 2 CMaps.
-	codeToUnicode map[CharCode]rune // CID -> Unicode
-	unicodeToCode map[rune]CharCode // Unicode -> CID
+	codeToUnicode map[CharCode]string // CID -> Unicode string
+	// XXXX(peterwilliams97): Should unicodeToCode be the inverse of codeToUnicode?
+	unicodeToCode map[rune]CharCode // Unicode rune -> CID

 	// cached contains the raw CMap data. It is used by the Bytes method in
 	// order to avoid generating the data for every call.
@ -116,8 +120,13 @@ type CMap struct {
 	cached []byte
 }

-// NewToUnicodeCMap returns an identity CMap with codeToUnicode matching the `codeToUnicode` arg.
-func NewToUnicodeCMap(codeToUnicode map[CharCode]rune) *CMap {
+// NewToUnicodeCMap returns an identity CMap with codeToUnicode matching the `codeToRune` arg.
+func NewToUnicodeCMap(codeToRune map[CharCode]rune) *CMap {
+	codeToUnicode := make(map[CharCode]string, len(codeToRune))
+	for code, r := range codeToRune {
+		codeToUnicode[code] = string(r)
+	}
+
 	cmap := &CMap{
 		name:  "Adobe-Identity-UCS",
 		ctype: 2,
@ -135,6 +144,7 @@ func NewToUnicodeCMap(codeToUnicode map[CharCode]rune) *CMap {
 	}

 	cmap.computeInverseMappings()
+
 	return cmap
 }

@ -148,7 +158,7 @@ func newCMap(isSimple bool) *CMap {
 		nbits:         nbits,
 		codeToCID:     make(map[CharCode]CharCode),
 		cidToCode:     make(map[CharCode]CharCode),
-		codeToUnicode: make(map[CharCode]rune),
+		codeToUnicode: make(map[CharCode]string),
 		unicodeToCode: make(map[rune]CharCode),
 	}
 }
@ -254,7 +264,12 @@ func (cmap *CMap) computeInverseMappings() {
 	}

 	// Generate Unicode -> CID map.
-	for cid, r := range cmap.codeToUnicode {
+	for cid, s := range cmap.codeToUnicode {
+		// The CMap entries can be empty e.g. dobe_supplement_iso32000_1.pdf
+		if len(s) == 0 {
+			continue
+		}
+		r := rune0(s)
 		if c, ok := cmap.unicodeToCode[r]; !ok || (ok && c > cid) {
 			cmap.unicodeToCode[r] = cid
 		}
@ -277,19 +292,18 @@ func (cmap *CMap) CharcodeBytesToUnicode(data []byte) (string, int) {
 		return "", 0
 	}

-	var (
-		parts   []rune
-		missing []CharCode
-	)
-	for _, code := range charcodes {
+	parts := make([]string, len(charcodes))
+	var missing []CharCode
+	for i, code := range charcodes {
 		s, ok := cmap.codeToUnicode[code]
 		if !ok {
 			missing = append(missing, code)
-			s = MissingCodeRune
+			s = MissingCodeString
 		}
-		parts = append(parts, s)
+		parts[i] = s
 	}
-	unicode := string(parts)
+	unicode := strings.Join(parts, "")
+
 	if len(missing) > 0 {
 		common.Log.Debug("ERROR: CharcodeBytesToUnicode. Not in map.\n"+
 			"\tdata=[% 02x]=%#q\n"+
@ -305,11 +319,11 @@ func (cmap *CMap) CharcodeBytesToUnicode(data []byte) (string, int) {
 // CharcodeToUnicode converts a single character code `code` to a unicode string.
 // If `code` is not in the unicode map, '<27>' is returned.
 // NOTE: CharcodeBytesToUnicode is typically more efficient.
-func (cmap *CMap) CharcodeToUnicode(code CharCode) (rune, bool) {
+func (cmap *CMap) CharcodeToUnicode(code CharCode) (string, bool) {
 	if s, ok := cmap.codeToUnicode[code]; ok {
 		return s, true
 	}
-	return MissingCodeRune, false
+	return MissingCodeString, false
 }

 // RuneToCID maps the specified rune to a character identifier. If the provided
@ -453,7 +467,7 @@ func (cmap *CMap) toBfData() string {
 	}

 	// codes is a sorted list of the codeToUnicode keys.
-	var codes []CharCode
+	codes := make([]CharCode, 0, len(cmap.codeToUnicode))
 	for code := range cmap.codeToUnicode {
 		codes = append(codes, code)
 	}
@ -470,9 +484,9 @@ func (cmap *CMap) toBfData() string {
 	// character codes have been mapped to code ranges.
 	var charRanges []charRange
 	currCharRange := charRange{codes[0], codes[0]}
-	prevRune := cmap.codeToUnicode[codes[0]]
+	prevRune := rune0(cmap.codeToUnicode[codes[0]])
 	for _, c := range codes[1:] {
-		currRune := cmap.codeToUnicode[c]
+		currRune := rune0(cmap.codeToUnicode[c])
 		if c == currCharRange.code1+1 && currRune == prevRune+1 {
 			currCharRange.code1 = c
 		} else {
@ -493,7 +507,7 @@ func (cmap *CMap) toBfData() string {
 			fbRanges = append(fbRanges, fbRange{
 				code0: cr.code0,
 				code1: cr.code1,
-				r0:    cmap.codeToUnicode[cr.code0],
+				r0:    rune0(cmap.codeToUnicode[cr.code0]),
 			})
 		}
 	}
@ -508,7 +522,7 @@ func (cmap *CMap) toBfData() string {
 			lines = append(lines, fmt.Sprintf("%d beginbfchar", n))
 			for j := 0; j < n; j++ {
 				code := fbChars[i*maxBfEntries+j]
-				r := cmap.codeToUnicode[code]
+				r := rune0(cmap.codeToUnicode[code])
 				lines = append(lines, fmt.Sprintf("<%04x> <%04x>", code, r))
 			}
 			lines = append(lines, "endbfchar")
@ -549,3 +563,9 @@ end
 end
 `
 )
+
+// rune0 is a convenience function that returns the first rune in `s`.
+// Caller must check that `s` is not empty.
+func rune0(s string) rune {
+	return ([]rune(s))[0]
+}
--- a/internal/cmap/cmap_parser.go
+++ b/internal/cmap/cmap_parser.go
@ -105,7 +105,8 @@ func (cmap *CMap) parse() error {
 func (cmap *CMap) parseName() error {
 	name := ""
 	done := false
-	for i := 0; i < 10 && !done; i++ {
+	// Limit raised from 10 to 20 for /Users/peter/testdata/programming/pdf_text/columns/Berg.pdf (presumably its CMap name appears later; TODO confirm).
+	for i := 0; i < 20 && !done; i++ {
 		o, err := cmap.parseObject()
 		if err != nil {
 			return err
@ -141,7 +142,6 @@ func (cmap *CMap) parseName() error {
 // parseType parses a cmap type and adds it to `cmap`.
 // cmap names are defined like this: /CMapType 1 def
 func (cmap *CMap) parseType() error {
-
 	ctype := 0
 	done := false
 	for i := 0; i < 3 && !done; i++ {
@ -171,7 +171,6 @@ func (cmap *CMap) parseType() error {
 // We don't need the version. We do this to eat up the version code in the cmap definition
 // to reduce unhandled parse object warnings.
 func (cmap *CMap) parseVersion() error {
-
 	version := ""
 	done := false
 	for i := 0; i < 3 && !done; i++ {
@ -471,7 +470,7 @@ func (cmap *CMap) parseBfchar() error {
 			}
 			return err
 		}
-		var target rune
+		var target []rune
 		switch v := o.(type) {
 		case cmapOperand:
 			if v.Operand == endbfchar {
@ -480,16 +479,16 @@ func (cmap *CMap) parseBfchar() error {
 			common.Log.Debug("ERROR: Unexpected operand. %#v", v)
 			return ErrBadCMap
 		case cmapHexString:
-			target = hexToRune(v)
+			target = hexToRunes(v)
 		case cmapName:
 			common.Log.Debug("ERROR: Unexpected name. %#v", v)
-			target = MissingCodeRune
+			target = []rune{MissingCodeRune}
 		default:
 			common.Log.Debug("ERROR: Unexpected type. %#v", o)
 			return ErrBadCMap
 		}

-		cmap.codeToUnicode[code] = target
+		cmap.codeToUnicode[code] = string(target)
 	}

 	return nil
@ -563,16 +562,17 @@ func (cmap *CMap) parseBfrange() error {
 				if !ok {
 					return errors.New("non-hex string in array")
 				}
-				r := hexToRune(hexs)
-				cmap.codeToUnicode[code] = r
+				runes := hexToRunes(hexs)
+				cmap.codeToUnicode[code] = string(runes)
 			}

 		case cmapHexString:
 			// <codeFrom> <codeTo> <dst>, maps [from,to] to [dst,dst+to-from].
-			r := hexToRune(v)
+			runes := hexToRunes(v)
+			n := len(runes)
 			for code := srcCodeFrom; code <= srcCodeTo; code++ {
-				cmap.codeToUnicode[code] = r
-				r++
+				cmap.codeToUnicode[code] = string(runes)
+				runes[n-1]++
 			}
 		default:
 			common.Log.Debug("ERROR: Unexpected type %T", o)
--- a/internal/cmap/cmap_test.go
+++ b/internal/cmap/cmap_test.go
@ -104,14 +104,14 @@ func TestCMapParser1(t *testing.T) {
 	}

 	for k, expected := range expectedMappings {
-		if v, ok := cmap.CharcodeToUnicode(k); !ok || v != expected {
+		if v, ok := cmap.CharcodeToUnicode(k); !ok || v != string(expected) {
 			t.Errorf("incorrect mapping, expecting 0x%X ➞ 0x%X (%#v)", k, expected, v)
 			return
 		}
 	}

 	v, _ := cmap.CharcodeToUnicode(0x99)
-	if v != MissingCodeRune { //!= "notdef" {
+	if v != MissingCodeString { //!= "notdef" {
 		t.Errorf("Unmapped code, expected to map to undefined")
 		return
 	}
@ -188,7 +188,7 @@ func TestCMapParser2(t *testing.T) {
 	}

 	for k, expected := range expectedMappings {
-		if v, ok := cmap.CharcodeToUnicode(k); !ok || v != expected {
+		if v, ok := cmap.CharcodeToUnicode(k); !ok || v != string(expected) {
 			t.Errorf("incorrect mapping, expecting 0x%X ➞ 0x%X (got 0x%X)", k, expected, v)
 			return
 		}
@ -297,7 +297,7 @@ func TestCMapParser3(t *testing.T) {
 		0xd140: 0xa000,
 	}
 	for k, expected := range expectedMappings {
-		if v, ok := cmap.CharcodeToUnicode(k); !ok || v != expected {
+		if v, ok := cmap.CharcodeToUnicode(k); !ok || v != string(expected) {
 			t.Errorf("incorrect mapping: expecting 0x%02X ➞ 0x%02X (got 0x%02X)", k, expected, v)
 			return
 		}
@ -407,7 +407,7 @@ func TestCMapParser4(t *testing.T) {
 	}

 	for k, expected := range expectedMappings {
-		if v, ok := cmap.CharcodeToUnicode(k); !ok || v != expected {
+		if v, ok := cmap.CharcodeToUnicode(k); !ok || v != string(expected) {
 			t.Errorf("incorrect mapping, expecting 0x%04X ➞ %+q (got %+q)", k, expected, v)
 			return
 		}
@ -520,6 +520,7 @@ var (
 		0x017b: 'Ż',
 		0x017d: 'Ž',
 	}
+
 	codeToUnicode3 = map[CharCode]rune{ // 93 entries
 		0x0124: 'Ĥ',
 		0x0125: 'ĥ',
@ -695,7 +696,7 @@ func checkCmapWriteRead(t *testing.T, codeToUnicode map[CharCode]rune) {
 		}
 		u0 := codeToUnicode[code]
 		u := cmap.codeToUnicode[code]
-		if u != u0 {
+		if u != string(u0) {
 			t.Errorf("Unicode mismatch: i=%d code0=0x%04x expected=%q test=%q", i, code, u0, u)
 			return
 		}
--- a/internal/textencoding/cmap.go
+++ b/internal/textencoding/cmap.go
@ -48,8 +48,8 @@ func (enc CMapEncoder) Decode(raw []byte) string {
 		if codes, ok := enc.codeToCID.BytesToCharcodes(raw); ok {
 			var buf bytes.Buffer
 			for _, code := range codes {
-				r, _ := enc.CharcodeToRune(CharCode(code))
-				buf.WriteRune(r)
+				s, _ := enc.charcodeToString(CharCode(code))
+				buf.WriteString(s)
 			}

 			return buf.String()
@ -87,8 +87,13 @@ func (enc CMapEncoder) RuneToCharcode(r rune) (CharCode, bool) {
 // CharcodeToRune converts PDF character code `code` to a rune.
 // The bool return flag is true if there was a match, and false otherwise.
 func (enc CMapEncoder) CharcodeToRune(code CharCode) (rune, bool) {
+	s, ok := enc.charcodeToString(code)
+	return ([]rune(s))[0], ok
+}
+
+func (enc CMapEncoder) charcodeToString(code CharCode) (string, bool) {
 	if enc.cidToUnicode == nil {
-		return MissingCodeRune, false
+		return MissingCodeString, false
 	}

 	// Map charcode to CID. If charcode to CID CMap is nil, assume Identity encoding.
@ -96,7 +101,7 @@ func (enc CMapEncoder) CharcodeToRune(code CharCode) (rune, bool) {
 	if enc.codeToCID != nil {
 		var ok bool
 		if cid, ok = enc.codeToCID.CharcodeToCID(cmap.CharCode(code)); !ok {
-			return MissingCodeRune, false
+			return MissingCodeString, false
 		}
 	}

--- a/internal/textencoding/glyphs_glyphlist.go
+++ b/internal/textencoding/glyphs_glyphlist.go
@ -18,7 +18,13 @@ import (
 )

 // MissingCodeRune is the rune returned when there is no matching glyph. It was previously '?'.
-const MissingCodeRune = '\ufffd' // <20>
+const (
+	// MissingCodeRune replaces runes that can't be decoded. .
+	MissingCodeRune = '\ufffd' // <20>
+
+	// MissingCodeString replaces strings that can't be decoded.
+	MissingCodeString = string(MissingCodeRune)
+)

 // GlyphToRune returns the rune corresponding to glyph `glyph` if there is one.
 // TODO: Can we return a string here? e.g. When we are extracting text, we want to get "ffi"
--- a/model/font.go
+++ b/model/font.go
@ -422,14 +422,28 @@ func (font *PdfFont) BytesToCharcodes(data []byte) []textencoding.CharCode {

 // CharcodesToUnicodeWithStats is identical to CharcodesToUnicode except returns more statistical
 // information about hits and misses from the reverse mapping process.
+// NOTE: The number of runes returned may be greater than the number of charcodes.
+// TODO(peterwilliams97): Deprecate?
 func (font *PdfFont) CharcodesToUnicodeWithStats(charcodes []textencoding.CharCode) (runelist []rune, numHits, numMisses int) {
+	runeSlices, numHits, numMisses := font.CharcodesToRuneSlices(charcodes)
+	var runes []rune
+	for _, r := range runeSlices {
+		runes = append(runes, r...)
+	}
+	return runes, numHits, numMisses
+}
+
+// CharcodesToRuneSlices returns the unicode strings corresponding to `charcodes` as rune slices.
+// The int returns are the number of rune slices returned and the number of unconverted codes.
+// NOTE: The number of rune slices returned is equal to the number of charcodes.
+func (font *PdfFont) CharcodesToRuneSlices(charcodes []textencoding.CharCode) ([][]rune, int, int) {
 	fontBase := font.baseFields()
-	runes := make([]rune, 0, len(charcodes))
-	numMisses = 0
+	runeSlices := make([][]rune, 0, len(charcodes))
+	numMisses := 0
 	for _, code := range charcodes {
 		if fontBase.toUnicodeCmap != nil {
-			if r, ok := fontBase.toUnicodeCmap.CharcodeToUnicode(cmap.CharCode(code)); ok {
-				runes = append(runes, r)
+			if s, ok := fontBase.toUnicodeCmap.CharcodeToUnicode(cmap.CharCode(code)); ok {
+				runeSlices = append(runeSlices, []rune(s))
 				continue
 			}
 		}
@ -438,7 +452,7 @@ func (font *PdfFont) CharcodesToUnicodeWithStats(charcodes []textencoding.CharCo
 		encoder := font.Encoder()
 		if encoder != nil {
 			if r, ok := encoder.CharcodeToRune(code); ok {
-				runes = append(runes, r)
+				runeSlices = append(runeSlices, []rune{r})
 				continue
 			}
 		}
@ -447,7 +461,7 @@ func (font *PdfFont) CharcodesToUnicodeWithStats(charcodes []textencoding.CharCo
 			"\tfont=%s\n\tencoding=%s",
 			code, charcodes, fontBase.isCIDFont(), font, encoder)
 		numMisses++
-		runes = append(runes, cmap.MissingCodeRune)
+		runeSlices = append(runeSlices, []rune{cmap.MissingCodeRune})
 	}

 	if numMisses != 0 {
@ -457,7 +471,7 @@ func (font *PdfFont) CharcodesToUnicodeWithStats(charcodes []textencoding.CharCo
 			len(charcodes), numMisses, font)
 	}

-	return runes, len(runes), numMisses
+	return runeSlices, len(runeSlices), numMisses
 }

 // CharcodeBytesToUnicode converts PDF character codes `data` to a Go unicode string.