diff --git a/common/logging.go b/common/logging.go index b7452bf6..b3e62348 100644 --- a/common/logging.go +++ b/common/logging.go @@ -221,7 +221,7 @@ func (l WriterLogger) logToWriter(f io.Writer, prefix string, format string, arg } func logToWriter(f io.Writer, prefix string, format string, args ...interface{}) { - _, file, line, ok := runtime.Caller(2) + _, file, line, ok := runtime.Caller(3) if !ok { file = "???" line = 0 diff --git a/extractor/text.go b/extractor/text.go index a91eff75..9be289a9 100644 --- a/extractor/text.go +++ b/extractor/text.go @@ -698,7 +698,7 @@ func (to *textObject) reset() { func (to *textObject) renderText(data []byte) error { font := to.getCurrentFont() charcodes := font.BytesToCharcodes(data) - runes, numChars, numMisses := font.CharcodesToUnicodeWithStats(charcodes) + runeSlices, numChars, numMisses := font.CharcodesToRuneSlices(charcodes) if numMisses > 0 { common.Log.Debug("renderText: numChars=%d numMisses=%d", numChars, numMisses) } @@ -717,18 +717,17 @@ func (to *textObject) renderText(data []byte) error { spaceMetrics, _ = model.DefaultFont().GetRuneMetrics(' ') } spaceWidth := spaceMetrics.Wx * glyphTextRatio - common.Log.Trace("spaceWidth=%.2f text=%q font=%s fontSize=%.1f", spaceWidth, runes, font, tfs) + // common.Log.Trace("spaceWidth=%.2f text=%q font=%s fontSize=%.1f", spaceWidth, runeSlices, font, tfs) stateMatrix := transform.NewMatrix( tfs*th, 0, 0, tfs, 0, state.trise) - common.Log.Trace("renderText: %d codes=%+v runes=%q", len(charcodes), charcodes, runes) + // common.Log.Trace("renderText: %d codes=%+v runes=%q", len(charcodes), charcodes, runeSlices) - for i, r := range runes { - // TODO(peterwilliams97): Need to find and fix cases where this happens. - if r == '\x00' { + for i, r := range runeSlices { + if len(r) == 1 && r[0] == '\x00' { continue } @@ -742,14 +741,14 @@ func (to *textObject) renderText(data []byte) error { // w is the unscaled movement at the end of a word. 
w := 0.0 - if r == ' ' { + if string(r) == " " { w = state.tw } m, ok := font.GetCharMetrics(code) if !ok { common.Log.Debug("ERROR: No metric for code=%d r=0x%04x=%+q %s", code, r, r, font) - return errors.New("no char metrics") + return fmt.Errorf("no char metrics: font=%s code=%d", font.String(), code) } // c is the character size in unscaled text units. diff --git a/extractor/text_test.go b/extractor/text_test.go index 92dfb976..cdfe47a9 100644 --- a/extractor/text_test.go +++ b/extractor/text_test.go @@ -316,6 +316,11 @@ var fileExtractionTests = []struct { `The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD",`}, }, }, + {filename: "Saudi.pdf", + pageTerms: map[int][]string{ + 10: []string{"الله"}, + }, + }, // TODO(peterwilliams97): Reinstate these 2 tests when diacritic combination is fixed. // {filename: "Ito_Formula.pdf", // pageTerms: map[int][]string{ diff --git a/internal/cmap/cmap.go b/internal/cmap/cmap.go index 1299faa5..11b2c634 100644 --- a/internal/cmap/cmap.go +++ b/internal/cmap/cmap.go @@ -21,6 +21,9 @@ const ( // MissingCodeRune replaces runes that can't be decoded. '\ufffd' = �. Was '?'. MissingCodeRune = '\ufffd' // � + + // MissingCodeString replaces strings that can't be decoded. + MissingCodeString = string(MissingCodeRune) ) // CharCode is a character code or Unicode @@ -41,7 +44,7 @@ type charRange struct { type fbRange struct { code0 CharCode code1 CharCode - r0 rune + r0 rune // TODO (peterwilliams97): Change to string for compound codes. } // CIDSystemInfo contains information for identifying the character collection @@ -106,8 +109,9 @@ type CMap struct { cidToCode map[CharCode]CharCode // CID -> charcode // Used by ctype 2 CMaps. - codeToUnicode map[CharCode]rune // CID -> Unicode - unicodeToCode map[rune]CharCode // Unicode -> CID + codeToUnicode map[CharCode]string // CID -> Unicode string + // XXXX(peterwilliams97): Should unicodeToCode be the inverse of codeToUnicode? 
+ unicodeToCode map[rune]CharCode // Unicode rune -> CID // cached contains the raw CMap data. It is used by the Bytes method in // order to avoid generating the data for every call. @@ -116,8 +120,13 @@ type CMap struct { cached []byte } -// NewToUnicodeCMap returns an identity CMap with codeToUnicode matching the `codeToUnicode` arg. -func NewToUnicodeCMap(codeToUnicode map[CharCode]rune) *CMap { +// NewToUnicodeCMap returns an identity CMap with codeToUnicode matching the `codeToRune` arg. +func NewToUnicodeCMap(codeToRune map[CharCode]rune) *CMap { + codeToUnicode := make(map[CharCode]string, len(codeToRune)) + for code, r := range codeToRune { + codeToUnicode[code] = string(r) + } + cmap := &CMap{ name: "Adobe-Identity-UCS", ctype: 2, @@ -135,6 +144,7 @@ func NewToUnicodeCMap(codeToUnicode map[CharCode]rune) *CMap { } cmap.computeInverseMappings() + return cmap } @@ -148,7 +158,7 @@ func newCMap(isSimple bool) *CMap { nbits: nbits, codeToCID: make(map[CharCode]CharCode), cidToCode: make(map[CharCode]CharCode), - codeToUnicode: make(map[CharCode]rune), + codeToUnicode: make(map[CharCode]string), unicodeToCode: make(map[rune]CharCode), } } @@ -254,7 +264,12 @@ func (cmap *CMap) computeInverseMappings() { } // Generate Unicode -> CID map. - for cid, r := range cmap.codeToUnicode { + for cid, s := range cmap.codeToUnicode { + // The CMap entries can be empty e.g. 
dobe_supplement_iso32000_1.pdf + if len(s) == 0 { + continue + } + r := rune0(s) if c, ok := cmap.unicodeToCode[r]; !ok || (ok && c > cid) { cmap.unicodeToCode[r] = cid } @@ -277,19 +292,18 @@ func (cmap *CMap) CharcodeBytesToUnicode(data []byte) (string, int) { return "", 0 } - var ( - parts []rune - missing []CharCode - ) - for _, code := range charcodes { + parts := make([]string, len(charcodes)) + var missing []CharCode + for i, code := range charcodes { s, ok := cmap.codeToUnicode[code] if !ok { missing = append(missing, code) - s = MissingCodeRune + s = MissingCodeString } - parts = append(parts, s) + parts[i] = s } - unicode := string(parts) + unicode := strings.Join(parts, "") + if len(missing) > 0 { common.Log.Debug("ERROR: CharcodeBytesToUnicode. Not in map.\n"+ "\tdata=[% 02x]=%#q\n"+ @@ -305,11 +319,11 @@ func (cmap *CMap) CharcodeBytesToUnicode(data []byte) (string, int) { // CharcodeToUnicode converts a single character code `code` to a unicode string. // If `code` is not in the unicode map, '�' is returned. // NOTE: CharcodeBytesToUnicode is typically more efficient. -func (cmap *CMap) CharcodeToUnicode(code CharCode) (rune, bool) { +func (cmap *CMap) CharcodeToUnicode(code CharCode) (string, bool) { if s, ok := cmap.codeToUnicode[code]; ok { return s, true } - return MissingCodeRune, false + return MissingCodeString, false } // RuneToCID maps the specified rune to a character identifier. If the provided @@ -453,7 +467,7 @@ func (cmap *CMap) toBfData() string { } // codes is a sorted list of the codeToUnicode keys. - var codes []CharCode + codes := make([]CharCode, 0, len(cmap.codeToUnicode)) for code := range cmap.codeToUnicode { codes = append(codes, code) } @@ -470,9 +484,9 @@ func (cmap *CMap) toBfData() string { // character codes have been mapped to code ranges. 
var charRanges []charRange currCharRange := charRange{codes[0], codes[0]} - prevRune := cmap.codeToUnicode[codes[0]] + prevRune := rune0(cmap.codeToUnicode[codes[0]]) for _, c := range codes[1:] { - currRune := cmap.codeToUnicode[c] + currRune := rune0(cmap.codeToUnicode[c]) if c == currCharRange.code1+1 && currRune == prevRune+1 { currCharRange.code1 = c } else { @@ -493,7 +507,7 @@ func (cmap *CMap) toBfData() string { fbRanges = append(fbRanges, fbRange{ code0: cr.code0, code1: cr.code1, - r0: cmap.codeToUnicode[cr.code0], + r0: rune0(cmap.codeToUnicode[cr.code0]), }) } } @@ -508,7 +522,7 @@ func (cmap *CMap) toBfData() string { lines = append(lines, fmt.Sprintf("%d beginbfchar", n)) for j := 0; j < n; j++ { code := fbChars[i*maxBfEntries+j] - r := cmap.codeToUnicode[code] + r := rune0(cmap.codeToUnicode[code]) lines = append(lines, fmt.Sprintf("<%04x> <%04x>", code, r)) } lines = append(lines, "endbfchar") @@ -549,3 +563,9 @@ end end ` ) + +// rune0 is a convenience function that returns the first rune in `s`. +// Caller must check that `s` is not empty; indexing the first rune of an empty string panics. +func rune0(s string) rune { + return ([]rune(s))[0] +} diff --git a/internal/cmap/cmap_parser.go b/internal/cmap/cmap_parser.go index 9236d782..a160f32c 100644 --- a/internal/cmap/cmap_parser.go +++ b/internal/cmap/cmap_parser.go @@ -105,7 +105,8 @@ func (cmap *CMap) parse() error { func (cmap *CMap) parseName() error { name := "" done := false - for i := 0; i < 10 && !done; i++ { + // NOTE(review): object limit raised from 10 to 20, presumably for /Users/peter/testdata/programming/pdf_text/columns/Berg.pdf - confirm. + for i := 0; i < 20 && !done; i++ { o, err := cmap.parseObject() if err != nil { return err @@ -141,7 +142,6 @@ func (cmap *CMap) parseName() error { // parseType parses a cmap type and adds it to `cmap`. // cmap names are defined like this: /CMapType 1 def func (cmap *CMap) parseType() error { - ctype := 0 done := false for i := 0; i < 3 && !done; i++ { @@ -171,7 +171,6 @@ func (cmap *CMap) parseType() error { // We don't need the version. 
We do this to eat up the version code in the cmap definition // to reduce unhandled parse object warnings. func (cmap *CMap) parseVersion() error { - version := "" done := false for i := 0; i < 3 && !done; i++ { @@ -471,7 +470,7 @@ func (cmap *CMap) parseBfchar() error { } return err } - var target rune + var target []rune switch v := o.(type) { case cmapOperand: if v.Operand == endbfchar { @@ -480,16 +479,16 @@ func (cmap *CMap) parseBfchar() error { common.Log.Debug("ERROR: Unexpected operand. %#v", v) return ErrBadCMap case cmapHexString: - target = hexToRune(v) + target = hexToRunes(v) case cmapName: common.Log.Debug("ERROR: Unexpected name. %#v", v) - target = MissingCodeRune + target = []rune{MissingCodeRune} default: common.Log.Debug("ERROR: Unexpected type. %#v", o) return ErrBadCMap } - cmap.codeToUnicode[code] = target + cmap.codeToUnicode[code] = string(target) } return nil @@ -563,16 +562,17 @@ func (cmap *CMap) parseBfrange() error { if !ok { return errors.New("non-hex string in array") } - r := hexToRune(hexs) - cmap.codeToUnicode[code] = r + runes := hexToRunes(hexs) + cmap.codeToUnicode[code] = string(runes) } case cmapHexString: // <srcCodeFrom> <srcCodeTo> <dst>, maps [from,to] to [dst,dst+to-from]. 
- r := hexToRune(v) + runes := hexToRunes(v) + n := len(runes) for code := srcCodeFrom; code <= srcCodeTo; code++ { - cmap.codeToUnicode[code] = r - r++ + cmap.codeToUnicode[code] = string(runes) + runes[n-1]++ } default: common.Log.Debug("ERROR: Unexpected type %T", o) diff --git a/internal/cmap/cmap_test.go b/internal/cmap/cmap_test.go index 5c8da78d..de26766e 100644 --- a/internal/cmap/cmap_test.go +++ b/internal/cmap/cmap_test.go @@ -104,14 +104,14 @@ func TestCMapParser1(t *testing.T) { } for k, expected := range expectedMappings { - if v, ok := cmap.CharcodeToUnicode(k); !ok || v != expected { + if v, ok := cmap.CharcodeToUnicode(k); !ok || v != string(expected) { t.Errorf("incorrect mapping, expecting 0x%X ➞ 0x%X (%#v)", k, expected, v) return } } v, _ := cmap.CharcodeToUnicode(0x99) - if v != MissingCodeRune { //!= "notdef" { + if v != MissingCodeString { //!= "notdef" { t.Errorf("Unmapped code, expected to map to undefined") return } @@ -188,7 +188,7 @@ func TestCMapParser2(t *testing.T) { } for k, expected := range expectedMappings { - if v, ok := cmap.CharcodeToUnicode(k); !ok || v != expected { + if v, ok := cmap.CharcodeToUnicode(k); !ok || v != string(expected) { t.Errorf("incorrect mapping, expecting 0x%X ➞ 0x%X (got 0x%X)", k, expected, v) return } @@ -297,7 +297,7 @@ func TestCMapParser3(t *testing.T) { 0xd140: 0xa000, } for k, expected := range expectedMappings { - if v, ok := cmap.CharcodeToUnicode(k); !ok || v != expected { + if v, ok := cmap.CharcodeToUnicode(k); !ok || v != string(expected) { t.Errorf("incorrect mapping: expecting 0x%02X ➞ 0x%02X (got 0x%02X)", k, expected, v) return } @@ -407,7 +407,7 @@ func TestCMapParser4(t *testing.T) { } for k, expected := range expectedMappings { - if v, ok := cmap.CharcodeToUnicode(k); !ok || v != expected { + if v, ok := cmap.CharcodeToUnicode(k); !ok || v != string(expected) { t.Errorf("incorrect mapping, expecting 0x%04X ➞ %+q (got %+q)", k, expected, v) return } @@ -520,6 +520,7 @@ var ( 0x017b: 
'Ż', 0x017d: 'Ž', } + codeToUnicode3 = map[CharCode]rune{ // 93 entries 0x0124: 'Ĥ', 0x0125: 'ĥ', @@ -695,7 +696,7 @@ func checkCmapWriteRead(t *testing.T, codeToUnicode map[CharCode]rune) { } u0 := codeToUnicode[code] u := cmap.codeToUnicode[code] - if u != u0 { + if u != string(u0) { t.Errorf("Unicode mismatch: i=%d code0=0x%04x expected=%q test=%q", i, code, u0, u) return } diff --git a/internal/textencoding/cmap.go b/internal/textencoding/cmap.go index b0dfbedf..56b24c74 100644 --- a/internal/textencoding/cmap.go +++ b/internal/textencoding/cmap.go @@ -48,8 +48,8 @@ func (enc CMapEncoder) Decode(raw []byte) string { if codes, ok := enc.codeToCID.BytesToCharcodes(raw); ok { var buf bytes.Buffer for _, code := range codes { - r, _ := enc.CharcodeToRune(CharCode(code)) - buf.WriteRune(r) + s, _ := enc.charcodeToString(CharCode(code)) + buf.WriteString(s) } return buf.String() @@ -87,8 +87,13 @@ func (enc CMapEncoder) RuneToCharcode(r rune) (CharCode, bool) { // CharcodeToRune converts PDF character code `code` to a rune. // The bool return flag is true if there was a match, and false otherwise. func (enc CMapEncoder) CharcodeToRune(code CharCode) (rune, bool) { + s, ok := enc.charcodeToString(code) + return ([]rune(s))[0], ok +} + +func (enc CMapEncoder) charcodeToString(code CharCode) (string, bool) { if enc.cidToUnicode == nil { - return MissingCodeRune, false + return MissingCodeString, false } // Map charcode to CID. If charcode to CID CMap is nil, assume Identity encoding. 
@@ -96,7 +101,7 @@ func (enc CMapEncoder) CharcodeToRune(code CharCode) (rune, bool) { if enc.codeToCID != nil { var ok bool if cid, ok = enc.codeToCID.CharcodeToCID(cmap.CharCode(code)); !ok { - return MissingCodeRune, false + return MissingCodeString, false } } diff --git a/internal/textencoding/glyphs_glyphlist.go b/internal/textencoding/glyphs_glyphlist.go index e794bea8..7f8bf840 100644 --- a/internal/textencoding/glyphs_glyphlist.go +++ b/internal/textencoding/glyphs_glyphlist.go @@ -18,7 +18,13 @@ import ( ) // MissingCodeRune is the rune returned when there is no matching glyph. It was previously '?'. -const MissingCodeRune = '\ufffd' // � +const ( + // MissingCodeRune replaces runes that can't be decoded. + MissingCodeRune = '\ufffd' // � + + // MissingCodeString replaces strings that can't be decoded. + MissingCodeString = string(MissingCodeRune) +) // GlyphToRune returns the rune corresponding to glyph `glyph` if there is one. // TODO: Can we return a string here? e.g. When we are extracting text, we want to get "ffi" diff --git a/model/font.go b/model/font.go index af688bf4..79011e26 100644 --- a/model/font.go +++ b/model/font.go @@ -422,14 +422,28 @@ func (font *PdfFont) BytesToCharcodes(data []byte) []textencoding.CharCode { // CharcodesToUnicodeWithStats is identical to CharcodesToUnicode except returns more statistical // information about hits and misses from the reverse mapping process. +// NOTE: The number of runes returned may be greater than the number of charcodes. +// TODO(peterwilliams97): Deprecate? func (font *PdfFont) CharcodesToUnicodeWithStats(charcodes []textencoding.CharCode) (runelist []rune, numHits, numMisses int) { + runeSlices, numHits, numMisses := font.CharcodesToRuneSlices(charcodes) + var runes []rune + for _, r := range runeSlices { + runes = append(runes, r...) + } + return runes, numHits, numMisses +} + +// CharcodesToRuneSlices returns the unicode strings corresponding to `charcodes` as rune slices. 
+// The int returns are the number of rune slices returned and the number of unconverted codes. +// NOTE: The number of rune slices returned is equal to the number of charcodes. +func (font *PdfFont) CharcodesToRuneSlices(charcodes []textencoding.CharCode) ([][]rune, int, int) { fontBase := font.baseFields() - runes := make([]rune, 0, len(charcodes)) - numMisses = 0 + runeSlices := make([][]rune, 0, len(charcodes)) + numMisses := 0 for _, code := range charcodes { if fontBase.toUnicodeCmap != nil { - if r, ok := fontBase.toUnicodeCmap.CharcodeToUnicode(cmap.CharCode(code)); ok { - runes = append(runes, r) + if s, ok := fontBase.toUnicodeCmap.CharcodeToUnicode(cmap.CharCode(code)); ok { + runeSlices = append(runeSlices, []rune(s)) continue } } @@ -438,7 +452,7 @@ func (font *PdfFont) CharcodesToUnicodeWithStats(charcodes []textencoding.CharCo encoder := font.Encoder() if encoder != nil { if r, ok := encoder.CharcodeToRune(code); ok { - runes = append(runes, r) + runeSlices = append(runeSlices, []rune{r}) continue } } @@ -447,7 +461,7 @@ func (font *PdfFont) CharcodesToUnicodeWithStats(charcodes []textencoding.CharCo "\tfont=%s\n\tencoding=%s", code, charcodes, fontBase.isCIDFont(), font, encoder) numMisses++ - runes = append(runes, cmap.MissingCodeRune) + runeSlices = append(runeSlices, []rune{cmap.MissingCodeRune}) } if numMisses != 0 { @@ -457,7 +471,7 @@ func (font *PdfFont) CharcodesToUnicodeWithStats(charcodes []textencoding.CharCo len(charcodes), numMisses, font) } - return runes, len(runes), numMisses + return runeSlices, len(runeSlices), numMisses } // CharcodeBytesToUnicode converts PDF character codes `data` to a Go unicode string.