From 6fe0d20a86725114b2b67f01ffb09258ead15790 Mon Sep 17 00:00:00 2001 From: Peter Williams Date: Tue, 19 May 2020 11:46:51 +1000 Subject: [PATCH 1/4] Fixed filename:page in logging --- common/logging.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/logging.go b/common/logging.go index b7452bf6..b3e62348 100644 --- a/common/logging.go +++ b/common/logging.go @@ -221,7 +221,7 @@ func (l WriterLogger) logToWriter(f io.Writer, prefix string, format string, arg } func logToWriter(f io.Writer, prefix string, format string, args ...interface{}) { - _, file, line, ok := runtime.Caller(2) + _, file, line, ok := runtime.Caller(3) if !ok { file = "???" line = 0 From 22680be0975c8f05471acd463d54a1fc1a144f06 Mon Sep 17 00:00:00 2001 From: Peter Williams Date: Tue, 19 May 2020 14:57:27 +1000 Subject: [PATCH 2/4] Got CMap working for multi-rune entries --- internal/cmap/cmap.go | 62 +++++++++++------- internal/cmap/cmap_parser.go | 79 ++++++++++++++++++++--- internal/cmap/cmap_test.go | 13 ++-- internal/textencoding/cmap.go | 13 ++-- internal/textencoding/glyphs_glyphlist.go | 8 ++- model/font.go | 4 +- 6 files changed, 135 insertions(+), 44 deletions(-) diff --git a/internal/cmap/cmap.go b/internal/cmap/cmap.go index 1299faa5..7a7ea0b6 100644 --- a/internal/cmap/cmap.go +++ b/internal/cmap/cmap.go @@ -21,6 +21,9 @@ const ( // MissingCodeRune replaces runes that can't be decoded. '\ufffd' = �. Was '?'. MissingCodeRune = '\ufffd' // � + + // MissingCodeString replaces strings that can't be decoded. + MissingCodeString = string(MissingCodeRune) ) // CharCode is a character code or Unicode @@ -41,7 +44,7 @@ type charRange struct { type fbRange struct { code0 CharCode code1 CharCode - r0 rune + r0 rune // TODO (peterwilliams97): Change to string for compound codes. } // CIDSystemInfo contains information for identifying the character collection @@ -106,8 +109,9 @@ type CMap struct { cidToCode map[CharCode]CharCode // CID -> charcode // Used by ctype 2 CMaps. 
- codeToUnicode map[CharCode]rune // CID -> Unicode - unicodeToCode map[rune]CharCode // Unicode -> CID + codeToUnicode map[CharCode]string // CID -> Unicode string + // XXXX(peterwilliams97): Should unicodeToCode be the inverse of codeToUnicode? + unicodeToCode map[rune]CharCode // Unicode rune -> CID // cached contains the raw CMap data. It is used by the Bytes method in // order to avoid generating the data for every call. @@ -116,8 +120,13 @@ type CMap struct { cached []byte } -// NewToUnicodeCMap returns an identity CMap with codeToUnicode matching the `codeToUnicode` arg. -func NewToUnicodeCMap(codeToUnicode map[CharCode]rune) *CMap { +// NewToUnicodeCMap returns an identity CMap with codeToUnicode matching the `codeToRune` arg. +func NewToUnicodeCMap(codeToRune map[CharCode]rune) *CMap { + codeToUnicode := make(map[CharCode]string, len(codeToRune)) + for code, r := range codeToRune { + codeToUnicode[code] = string(r) + } + cmap := &CMap{ name: "Adobe-Identity-UCS", ctype: 2, @@ -135,6 +144,7 @@ func NewToUnicodeCMap(codeToUnicode map[CharCode]rune) *CMap { } cmap.computeInverseMappings() + return cmap } @@ -148,7 +158,7 @@ func newCMap(isSimple bool) *CMap { nbits: nbits, codeToCID: make(map[CharCode]CharCode), cidToCode: make(map[CharCode]CharCode), - codeToUnicode: make(map[CharCode]rune), + codeToUnicode: make(map[CharCode]string), unicodeToCode: make(map[rune]CharCode), } } @@ -254,7 +264,8 @@ func (cmap *CMap) computeInverseMappings() { } // Generate Unicode -> CID map. 
- for cid, r := range cmap.codeToUnicode { + for cid, s := range cmap.codeToUnicode { + r := rune0(s) if c, ok := cmap.unicodeToCode[r]; !ok || (ok && c > cid) { cmap.unicodeToCode[r] = cid } @@ -277,19 +288,18 @@ func (cmap *CMap) CharcodeBytesToUnicode(data []byte) (string, int) { return "", 0 } - var ( - parts []rune - missing []CharCode - ) - for _, code := range charcodes { + parts := make([]string, len(charcodes)) + var missing []CharCode + for i, code := range charcodes { s, ok := cmap.codeToUnicode[code] if !ok { missing = append(missing, code) - s = MissingCodeRune + s = MissingCodeString } - parts = append(parts, s) + parts[i] = s } - unicode := string(parts) + unicode := strings.Join(parts, "") + if len(missing) > 0 { common.Log.Debug("ERROR: CharcodeBytesToUnicode. Not in map.\n"+ "\tdata=[% 02x]=%#q\n"+ @@ -305,11 +315,11 @@ func (cmap *CMap) CharcodeBytesToUnicode(data []byte) (string, int) { // CharcodeToUnicode converts a single character code `code` to a unicode string. // If `code` is not in the unicode map, '�' is returned. // NOTE: CharcodeBytesToUnicode is typically more efficient. -func (cmap *CMap) CharcodeToUnicode(code CharCode) (rune, bool) { +func (cmap *CMap) CharcodeToUnicode(code CharCode) (string, bool) { if s, ok := cmap.codeToUnicode[code]; ok { return s, true } - return MissingCodeRune, false + return MissingCodeString, false } // RuneToCID maps the specified rune to a character identifier. If the provided @@ -453,7 +463,7 @@ func (cmap *CMap) toBfData() string { } // codes is a sorted list of the codeToUnicode keys. - var codes []CharCode + codes := make([]CharCode, 0, len(cmap.codeToUnicode)) for code := range cmap.codeToUnicode { codes = append(codes, code) } @@ -470,9 +480,11 @@ func (cmap *CMap) toBfData() string { // character codes have been mapped to code ranges. 
var charRanges []charRange currCharRange := charRange{codes[0], codes[0]} - prevRune := cmap.codeToUnicode[codes[0]] + prevRune := rune0(cmap.codeToUnicode[codes[0]]) + // fmt.Printf(" code=0x%04x prevRune=%q=0x%04x\n", codes[0], prevRune, prevRune) for _, c := range codes[1:] { - currRune := cmap.codeToUnicode[c] + currRune := rune0(cmap.codeToUnicode[c]) + // fmt.Printf("%4d: code=0x%04x currRune=%q=0x%04x\n", i+1, c, currRune, currRune) if c == currCharRange.code1+1 && currRune == prevRune+1 { currCharRange.code1 = c } else { @@ -493,7 +505,7 @@ func (cmap *CMap) toBfData() string { fbRanges = append(fbRanges, fbRange{ code0: cr.code0, code1: cr.code1, - r0: cmap.codeToUnicode[cr.code0], + r0: rune0(cmap.codeToUnicode[cr.code0]), }) } } @@ -508,7 +520,7 @@ func (cmap *CMap) toBfData() string { lines = append(lines, fmt.Sprintf("%d beginbfchar", n)) for j := 0; j < n; j++ { code := fbChars[i*maxBfEntries+j] - r := cmap.codeToUnicode[code] + r := rune0(cmap.codeToUnicode[code]) lines = append(lines, fmt.Sprintf("<%04x> <%04x>", code, r)) } lines = append(lines, "endbfchar") @@ -549,3 +561,9 @@ end end ` ) + +// rune0 is a convenience function that returns the first rune in `s`. +// Caller must check that `s` is not empty. +func rune0(s string) rune { + return ([]rune(s))[0] +} diff --git a/internal/cmap/cmap_parser.go b/internal/cmap/cmap_parser.go index 9236d782..b5d69feb 100644 --- a/internal/cmap/cmap_parser.go +++ b/internal/cmap/cmap_parser.go @@ -141,7 +141,6 @@ func (cmap *CMap) parseName() error { // parseType parses a cmap type and adds it to `cmap`. // cmap names are defined like this: /CMapType 1 def func (cmap *CMap) parseType() error { - ctype := 0 done := false for i := 0; i < 3 && !done; i++ { @@ -171,7 +170,6 @@ func (cmap *CMap) parseType() error { // We don't need the version. We do this to eat up the version code in the cmap definition // to reduce unhandled parse object warnings. 
func (cmap *CMap) parseVersion() error { - version := "" done := false for i := 0; i < 3 && !done; i++ { @@ -471,7 +469,7 @@ func (cmap *CMap) parseBfchar() error { } return err } - var target rune + var target []rune switch v := o.(type) { case cmapOperand: if v.Operand == endbfchar { @@ -480,16 +478,20 @@ func (cmap *CMap) parseBfchar() error { common.Log.Debug("ERROR: Unexpected operand. %#v", v) return ErrBadCMap case cmapHexString: - target = hexToRune(v) + target = hexToRunes(v) case cmapName: common.Log.Debug("ERROR: Unexpected name. %#v", v) - target = MissingCodeRune + target = []rune{MissingCodeRune} default: common.Log.Debug("ERROR: Unexpected type. %#v", o) return ErrBadCMap } - cmap.codeToUnicode[code] = target + if ligature, ok := StringToLigature[string(target)]; ok { + cmap.codeToUnicode[code] = string(ligature) + } else { + cmap.codeToUnicode[code] = string(target) + } } return nil @@ -563,15 +565,17 @@ func (cmap *CMap) parseBfrange() error { if !ok { return errors.New("non-hex string in array") } - r := hexToRune(hexs) - cmap.codeToUnicode[code] = r + r := hexToRunes(hexs) + cmap.codeToUnicode[code] = string(r) } case cmapHexString: // , maps [from,to] to [dst,dst+to-from]. + // XXX(peterwilliams97): Do we need to do this with multi-rune entries? I guess we + // would increment the last rune? r := hexToRune(v) for code := srcCodeFrom; code <= srcCodeTo; code++ { - cmap.codeToUnicode[code] = r + cmap.codeToUnicode[code] = string(r) r++ } default: @@ -582,3 +586,60 @@ func (cmap *CMap) parseBfrange() error { return nil } + +// ligatureToString is a map from ligature runes to their constituent characters. +// https://en.wikipedia.org/wiki/Typographic_ligature#Ligatures_in_Unicode_(Latin_alphabets) +// FIXME(peterwilliams97): I copied this here from glyphs_glyphlist.go to avoid a circular +// dependency. Where should it go? 
+var ligatureToString = map[rune]string{ + 'Ꜳ': "AA", + 'ꜳ': "aa", + 'Ꜵ': "aa", + 'ꜵ': "ao", + 'Ꜷ': "AU", + 'ꜷ': "au", + 'Ꜽ': "AY", + 'ꜽ': "ay", + '\U0001f670': "et", + 'ff': "ff", + 'ffi': "ffi", + 'ffl': "ffl", + 'fi': "fi", + 'fl': "fl", + 'Œ': "OE", + 'œ': "oe", + 'Ꝏ': "OO", + 'ꝏ': "oo", + 'ẞ': "fs", + 'ß': "fz", + 'st': "st", + 'ſt': "ſt", + 'Ꜩ': "TZ", + 'ꜩ': "tz", + 'ᵫ': "ue", + 'Ꝡ': "VY", + 'ꝡ': "vy", + // Reverse of ligatureMap + 0xe000: "ft", + 0xe001: "fj", + 0xe002: "fb", + 0xe003: "fh", + 0xe004: "fk", + 0xe005: "tt", + 0xe006: "tf", + 0xe007: "ffj", + 0xe008: "ffb", + 0xe009: "ffh", + 0xe00a: "ffk", + 0xe00b: "T_h", +} + +var StringToLigature = reverseLigatures(ligatureToString) + +func reverseLigatures(l2s map[rune]string) map[string]rune { + s2l := make(map[string]rune, len(l2s)) + for l, s := range l2s { + s2l[s] = l + } + return s2l +} diff --git a/internal/cmap/cmap_test.go b/internal/cmap/cmap_test.go index 5c8da78d..de26766e 100644 --- a/internal/cmap/cmap_test.go +++ b/internal/cmap/cmap_test.go @@ -104,14 +104,14 @@ func TestCMapParser1(t *testing.T) { } for k, expected := range expectedMappings { - if v, ok := cmap.CharcodeToUnicode(k); !ok || v != expected { + if v, ok := cmap.CharcodeToUnicode(k); !ok || v != string(expected) { t.Errorf("incorrect mapping, expecting 0x%X ➞ 0x%X (%#v)", k, expected, v) return } } v, _ := cmap.CharcodeToUnicode(0x99) - if v != MissingCodeRune { //!= "notdef" { + if v != MissingCodeString { //!= "notdef" { t.Errorf("Unmapped code, expected to map to undefined") return } @@ -188,7 +188,7 @@ func TestCMapParser2(t *testing.T) { } for k, expected := range expectedMappings { - if v, ok := cmap.CharcodeToUnicode(k); !ok || v != expected { + if v, ok := cmap.CharcodeToUnicode(k); !ok || v != string(expected) { t.Errorf("incorrect mapping, expecting 0x%X ➞ 0x%X (got 0x%X)", k, expected, v) return } @@ -297,7 +297,7 @@ func TestCMapParser3(t *testing.T) { 0xd140: 0xa000, } for k, expected := range expectedMappings { - 
if v, ok := cmap.CharcodeToUnicode(k); !ok || v != expected { + if v, ok := cmap.CharcodeToUnicode(k); !ok || v != string(expected) { t.Errorf("incorrect mapping: expecting 0x%02X ➞ 0x%02X (got 0x%02X)", k, expected, v) return } @@ -407,7 +407,7 @@ func TestCMapParser4(t *testing.T) { } for k, expected := range expectedMappings { - if v, ok := cmap.CharcodeToUnicode(k); !ok || v != expected { + if v, ok := cmap.CharcodeToUnicode(k); !ok || v != string(expected) { t.Errorf("incorrect mapping, expecting 0x%04X ➞ %+q (got %+q)", k, expected, v) return } @@ -520,6 +520,7 @@ var ( 0x017b: 'Ż', 0x017d: 'Ž', } + codeToUnicode3 = map[CharCode]rune{ // 93 entries 0x0124: 'Ĥ', 0x0125: 'ĥ', @@ -695,7 +696,7 @@ func checkCmapWriteRead(t *testing.T, codeToUnicode map[CharCode]rune) { } u0 := codeToUnicode[code] u := cmap.codeToUnicode[code] - if u != u0 { + if u != string(u0) { t.Errorf("Unicode mismatch: i=%d code0=0x%04x expected=%q test=%q", i, code, u0, u) return } diff --git a/internal/textencoding/cmap.go b/internal/textencoding/cmap.go index b0dfbedf..56b24c74 100644 --- a/internal/textencoding/cmap.go +++ b/internal/textencoding/cmap.go @@ -48,8 +48,8 @@ func (enc CMapEncoder) Decode(raw []byte) string { if codes, ok := enc.codeToCID.BytesToCharcodes(raw); ok { var buf bytes.Buffer for _, code := range codes { - r, _ := enc.CharcodeToRune(CharCode(code)) - buf.WriteRune(r) + s, _ := enc.charcodeToString(CharCode(code)) + buf.WriteString(s) } return buf.String() @@ -87,8 +87,13 @@ func (enc CMapEncoder) RuneToCharcode(r rune) (CharCode, bool) { // CharcodeToRune converts PDF character code `code` to a rune. // The bool return flag is true if there was a match, and false otherwise. 
func (enc CMapEncoder) CharcodeToRune(code CharCode) (rune, bool) { + s, ok := enc.charcodeToString(code) + return ([]rune(s))[0], ok +} + +func (enc CMapEncoder) charcodeToString(code CharCode) (string, bool) { if enc.cidToUnicode == nil { - return MissingCodeRune, false + return MissingCodeString, false } // Map charcode to CID. If charcode to CID CMap is nil, assume Identity encoding. @@ -96,7 +101,7 @@ func (enc CMapEncoder) CharcodeToRune(code CharCode) (rune, bool) { if enc.codeToCID != nil { var ok bool if cid, ok = enc.codeToCID.CharcodeToCID(cmap.CharCode(code)); !ok { - return MissingCodeRune, false + return MissingCodeString, false } } diff --git a/internal/textencoding/glyphs_glyphlist.go b/internal/textencoding/glyphs_glyphlist.go index e794bea8..7f8bf840 100644 --- a/internal/textencoding/glyphs_glyphlist.go +++ b/internal/textencoding/glyphs_glyphlist.go @@ -18,7 +18,13 @@ import ( ) // MissingCodeRune is the rune returned when there is no matching glyph. It was previously '?'. -const MissingCodeRune = '\ufffd' // � +const ( + // MissingCodeRune replaces runes that can't be decoded. + MissingCodeRune = '\ufffd' // � + + // MissingCodeString replaces strings that can't be decoded. + MissingCodeString = string(MissingCodeRune) +) // GlyphToRune returns the rune corresponding to glyph `glyph` if there is one. // TODO: Can we return a string here? e.g. When we are extracting text, we want to get "ffi" diff --git a/model/font.go b/model/font.go index af688bf4..40a9d65e 100644 --- a/model/font.go +++ b/model/font.go @@ -428,8 +428,8 @@ func (font *PdfFont) CharcodesToUnicodeWithStats(charcodes []textencoding.CharCo numMisses = 0 for _, code := range charcodes { if fontBase.toUnicodeCmap != nil { - if r, ok := fontBase.toUnicodeCmap.CharcodeToUnicode(cmap.CharCode(code)); ok { - runes = append(runes, r) + if s, ok := fontBase.toUnicodeCmap.CharcodeToUnicode(cmap.CharCode(code)); ok { + runes = append(runes, []rune(s)...) 
continue } } From a9910e7e0619f14e09ce95272fb8f8ae1661ae4d Mon Sep 17 00:00:00 2001 From: Peter Williams Date: Wed, 20 May 2020 18:43:09 +1000 Subject: [PATCH 3/4] Treat CMap entries as strings instead of runes to handle multi-byte encodings. --- extractor/text.go | 15 ++++--- internal/cmap/cmap.go | 6 ++- internal/cmap/cmap_parser.go | 79 ++++-------------------------------- model/font.go | 26 +++++++++--- 4 files changed, 40 insertions(+), 86 deletions(-) diff --git a/extractor/text.go b/extractor/text.go index a91eff75..9be289a9 100644 --- a/extractor/text.go +++ b/extractor/text.go @@ -698,7 +698,7 @@ func (to *textObject) reset() { func (to *textObject) renderText(data []byte) error { font := to.getCurrentFont() charcodes := font.BytesToCharcodes(data) - runes, numChars, numMisses := font.CharcodesToUnicodeWithStats(charcodes) + runeSlices, numChars, numMisses := font.CharcodesToRuneSlices(charcodes) if numMisses > 0 { common.Log.Debug("renderText: numChars=%d numMisses=%d", numChars, numMisses) } @@ -717,18 +717,17 @@ func (to *textObject) renderText(data []byte) error { spaceMetrics, _ = model.DefaultFont().GetRuneMetrics(' ') } spaceWidth := spaceMetrics.Wx * glyphTextRatio - common.Log.Trace("spaceWidth=%.2f text=%q font=%s fontSize=%.1f", spaceWidth, runes, font, tfs) + // common.Log.Trace("spaceWidth=%.2f text=%q font=%s fontSize=%.1f", spaceWidth, runeSlices, font, tfs) stateMatrix := transform.NewMatrix( tfs*th, 0, 0, tfs, 0, state.trise) - common.Log.Trace("renderText: %d codes=%+v runes=%q", len(charcodes), charcodes, runes) + // common.Log.Trace("renderText: %d codes=%+v runes=%q", len(charcodes), charcodes, drunes) - for i, r := range runes { - // TODO(peterwilliams97): Need to find and fix cases where this happens. - if r == '\x00' { + for i, r := range runeSlices { + if len(r) == 1 && r[0] == '\x00' { continue } @@ -742,14 +741,14 @@ func (to *textObject) renderText(data []byte) error { // w is the unscaled movement at the end of a word. 
w := 0.0 - if r == ' ' { + if string(r) == " " { w = state.tw } m, ok := font.GetCharMetrics(code) if !ok { common.Log.Debug("ERROR: No metric for code=%d r=0x%04x=%+q %s", code, r, r, font) - return errors.New("no char metrics") + return fmt.Errorf("no char metrics: font=%s code=%d", font.String(), code) } // c is the character size in unscaled text units. diff --git a/internal/cmap/cmap.go b/internal/cmap/cmap.go index 7a7ea0b6..11b2c634 100644 --- a/internal/cmap/cmap.go +++ b/internal/cmap/cmap.go @@ -265,6 +265,10 @@ func (cmap *CMap) computeInverseMappings() { // Generate Unicode -> CID map. for cid, s := range cmap.codeToUnicode { + // The CMap entries can be empty e.g. dobe_supplement_iso32000_1.pdf + if len(s) == 0 { + continue + } r := rune0(s) if c, ok := cmap.unicodeToCode[r]; !ok || (ok && c > cid) { cmap.unicodeToCode[r] = cid @@ -481,10 +485,8 @@ func (cmap *CMap) toBfData() string { var charRanges []charRange currCharRange := charRange{codes[0], codes[0]} prevRune := rune0(cmap.codeToUnicode[codes[0]]) - // fmt.Printf(" code=0x%04x prevRune=%q=0x%04x\n", codes[0], prevRune, prevRune) for _, c := range codes[1:] { currRune := rune0(cmap.codeToUnicode[c]) - // fmt.Printf("%4d: code=0x%04x currRune=%q=0x%04x\n", i+1, c, currRune, currRune) if c == currCharRange.code1+1 && currRune == prevRune+1 { currCharRange.code1 = c } else { diff --git a/internal/cmap/cmap_parser.go b/internal/cmap/cmap_parser.go index b5d69feb..a160f32c 100644 --- a/internal/cmap/cmap_parser.go +++ b/internal/cmap/cmap_parser.go @@ -105,7 +105,8 @@ func (cmap *CMap) parse() error { func (cmap *CMap) parseName() error { name := "" done := false - for i := 0; i < 10 && !done; i++ { + // /Users/peter/testdata/programming/pdf_text/columns/Berg.pdf + for i := 0; i < 20 && !done; i++ { o, err := cmap.parseObject() if err != nil { return err @@ -487,11 +488,7 @@ func (cmap *CMap) parseBfchar() error { return ErrBadCMap } - if ligature, ok := StringToLigature[string(target)]; ok { - 
cmap.codeToUnicode[code] = string(ligature) - } else { - cmap.codeToUnicode[code] = string(target) - } + cmap.codeToUnicode[code] = string(target) } return nil @@ -565,18 +562,17 @@ func (cmap *CMap) parseBfrange() error { if !ok { return errors.New("non-hex string in array") } - r := hexToRunes(hexs) - cmap.codeToUnicode[code] = string(r) + runes := hexToRunes(hexs) + cmap.codeToUnicode[code] = string(runes) } case cmapHexString: // , maps [from,to] to [dst,dst+to-from]. - // XXX(peterwilliams97): Do we need to do this with multi-rune entries? I guess we - // would increment the last rune? - r := hexToRune(v) + runes := hexToRunes(v) + n := len(runes) for code := srcCodeFrom; code <= srcCodeTo; code++ { - cmap.codeToUnicode[code] = string(r) - r++ + cmap.codeToUnicode[code] = string(runes) + runes[n-1]++ } default: common.Log.Debug("ERROR: Unexpected type %T", o) @@ -586,60 +582,3 @@ func (cmap *CMap) parseBfrange() error { return nil } - -// ligatureToString is a map from ligature runes to their constituent characters. -// https://en.wikipedia.org/wiki/Typographic_ligature#Ligatures_in_Unicode_(Latin_alphabets) -// FIXME(peterwilliams97): I copied this here from glyphs_glyphlist.go to avoid a circular -// dependency. Where should it go? 
-var ligatureToString = map[rune]string{ - 'Ꜳ': "AA", - 'ꜳ': "aa", - 'Ꜵ': "aa", - 'ꜵ': "ao", - 'Ꜷ': "AU", - 'ꜷ': "au", - 'Ꜽ': "AY", - 'ꜽ': "ay", - '\U0001f670': "et", - 'ff': "ff", - 'ffi': "ffi", - 'ffl': "ffl", - 'fi': "fi", - 'fl': "fl", - 'Œ': "OE", - 'œ': "oe", - 'Ꝏ': "OO", - 'ꝏ': "oo", - 'ẞ': "fs", - 'ß': "fz", - 'st': "st", - 'ſt': "ſt", - 'Ꜩ': "TZ", - 'ꜩ': "tz", - 'ᵫ': "ue", - 'Ꝡ': "VY", - 'ꝡ': "vy", - // Reverse of ligatureMap - 0xe000: "ft", - 0xe001: "fj", - 0xe002: "fb", - 0xe003: "fh", - 0xe004: "fk", - 0xe005: "tt", - 0xe006: "tf", - 0xe007: "ffj", - 0xe008: "ffb", - 0xe009: "ffh", - 0xe00a: "ffk", - 0xe00b: "T_h", -} - -var StringToLigature = reverseLigatures(ligatureToString) - -func reverseLigatures(l2s map[rune]string) map[string]rune { - s2l := make(map[string]rune, len(l2s)) - for l, s := range l2s { - s2l[s] = l - } - return s2l -} diff --git a/model/font.go b/model/font.go index 40a9d65e..79011e26 100644 --- a/model/font.go +++ b/model/font.go @@ -422,14 +422,28 @@ func (font *PdfFont) BytesToCharcodes(data []byte) []textencoding.CharCode { // CharcodesToUnicodeWithStats is identical to CharcodesToUnicode except returns more statistical // information about hits and misses from the reverse mapping process. +// NOTE: The number of runes returned may be greater than the number of charcodes. +// TODO(peterwilliams97): Deprecate? func (font *PdfFont) CharcodesToUnicodeWithStats(charcodes []textencoding.CharCode) (runelist []rune, numHits, numMisses int) { + runeSlices, numHits, numMisses := font.CharcodesToRuneSlices(charcodes) + var runes []rune + for _, r := range runeSlices { + runes = append(runes, r...) + } + return runes, numHits, numMisses +} + +// CharcodesToRuneSlices returns the unicode strings corresponding to `charcodes` as rune slices. +// The int returns are the number of rune slices returned and the number of unconverted codes. 
+// NOTE: The number of rune slices returned is equal to the number of charcodes +func (font *PdfFont) CharcodesToRuneSlices(charcodes []textencoding.CharCode) ([][]rune, int, int) { fontBase := font.baseFields() - runes := make([]rune, 0, len(charcodes)) - numMisses = 0 + runeSlices := make([][]rune, 0, len(charcodes)) + numMisses := 0 for _, code := range charcodes { if fontBase.toUnicodeCmap != nil { if s, ok := fontBase.toUnicodeCmap.CharcodeToUnicode(cmap.CharCode(code)); ok { - runes = append(runes, []rune(s)...) + runeSlices = append(runeSlices, []rune(s)) continue } } @@ -438,7 +452,7 @@ func (font *PdfFont) CharcodesToUnicodeWithStats(charcodes []textencoding.CharCo encoder := font.Encoder() if encoder != nil { if r, ok := encoder.CharcodeToRune(code); ok { - runes = append(runes, r) + runeSlices = append(runeSlices, []rune{r}) continue } } @@ -447,7 +461,7 @@ func (font *PdfFont) CharcodesToUnicodeWithStats(charcodes []textencoding.CharCo "\tfont=%s\n\tencoding=%s", code, charcodes, fontBase.isCIDFont(), font, encoder) numMisses++ - runes = append(runes, cmap.MissingCodeRune) + runeSlices = append(runeSlices, []rune{cmap.MissingCodeRune}) } if numMisses != 0 { @@ -457,7 +471,7 @@ func (font *PdfFont) CharcodesToUnicodeWithStats(charcodes []textencoding.CharCo len(charcodes), numMisses, font) } - return runes, len(runes), numMisses + return runeSlices, len(runeSlices), numMisses } // CharcodeBytesToUnicode converts PDF character codes `data` to a Go unicode string. From 0c54cec2c5ac2c4c7d7f430befbffadb83d24f79 Mon Sep 17 00:00:00 2001 From: Peter Williams Date: Wed, 20 May 2020 19:07:22 +1000 Subject: [PATCH 4/4] Added a test for multibyte encoding. 
--- extractor/text_test.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/extractor/text_test.go b/extractor/text_test.go index 92dfb976..cdfe47a9 100644 --- a/extractor/text_test.go +++ b/extractor/text_test.go @@ -316,6 +316,11 @@ var fileExtractionTests = []struct { `The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD",`}, }, }, + {filename: "Saudi.pdf", + pageTerms: map[int][]string{ + 10: []string{"الله"}, + }, + }, // TODO(peterwilliams97): Reinstate these 2 tests when diacritic combination is fixed. // {filename: "Ito_Formula.pdf", // pageTerms: map[int][]string{