Merge branch 'cmap' into columns

This commit is contained in:
Peter Williams 2020-05-24 20:45:31 +10:00
commit e9c46fa3b9
9 changed files with 111 additions and 61 deletions

View File

@ -221,7 +221,7 @@ func (l WriterLogger) logToWriter(f io.Writer, prefix string, format string, arg
}
func logToWriter(f io.Writer, prefix string, format string, args ...interface{}) {
_, file, line, ok := runtime.Caller(2)
_, file, line, ok := runtime.Caller(3)
if !ok {
file = "???"
line = 0

View File

@ -698,7 +698,7 @@ func (to *textObject) reset() {
func (to *textObject) renderText(data []byte) error {
font := to.getCurrentFont()
charcodes := font.BytesToCharcodes(data)
runes, numChars, numMisses := font.CharcodesToUnicodeWithStats(charcodes)
runeSlices, numChars, numMisses := font.CharcodesToRuneSlices(charcodes)
if numMisses > 0 {
common.Log.Debug("renderText: numChars=%d numMisses=%d", numChars, numMisses)
}
@ -717,18 +717,17 @@ func (to *textObject) renderText(data []byte) error {
spaceMetrics, _ = model.DefaultFont().GetRuneMetrics(' ')
}
spaceWidth := spaceMetrics.Wx * glyphTextRatio
common.Log.Trace("spaceWidth=%.2f text=%q font=%s fontSize=%.1f", spaceWidth, runes, font, tfs)
// common.Log.Trace("spaceWidth=%.2f text=%q font=%s fontSize=%.1f", spaceWidth, runeSlices, font, tfs)
stateMatrix := transform.NewMatrix(
tfs*th, 0,
0, tfs,
0, state.trise)
common.Log.Trace("renderText: %d codes=%+v runes=%q", len(charcodes), charcodes, runes)
// common.Log.Trace("renderText: %d codes=%+v runes=%q", len(charcodes), charcodes, drunes)
for i, r := range runes {
// TODO(peterwilliams97): Need to find and fix cases where this happens.
if r == '\x00' {
for i, r := range runeSlices {
if len(r) == 1 && r[0] == '\x00' {
continue
}
@ -742,14 +741,14 @@ func (to *textObject) renderText(data []byte) error {
// w is the unscaled movement at the end of a word.
w := 0.0
if r == ' ' {
if string(r) == " " {
w = state.tw
}
m, ok := font.GetCharMetrics(code)
if !ok {
common.Log.Debug("ERROR: No metric for code=%d r=0x%04x=%+q %s", code, r, r, font)
return errors.New("no char metrics")
return fmt.Errorf("no char metrics: font=%s code=%d", font.String(), code)
}
// c is the character size in unscaled text units.

View File

@ -316,6 +316,11 @@ var fileExtractionTests = []struct {
`The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD",`},
},
},
{filename: "Saudi.pdf",
pageTerms: map[int][]string{
10: []string{"الله"},
},
},
// TODO(peterwilliams97): Reinstate these 2 tests when diacritic combination is fixed.
// {filename: "Ito_Formula.pdf",
// pageTerms: map[int][]string{

View File

@ -21,6 +21,9 @@ const (
// MissingCodeRune replaces runes that can't be decoded. '\ufffd' = <20>. Was '?'.
MissingCodeRune = '\ufffd' // <20>
// MissingCodeString replaces strings that can't be decoded.
MissingCodeString = string(MissingCodeRune)
)
// CharCode is a character code or Unicode
@ -41,7 +44,7 @@ type charRange struct {
type fbRange struct {
code0 CharCode
code1 CharCode
r0 rune
r0 rune // TODO (peterwilliams97): Change to string for compound codes.
}
// CIDSystemInfo contains information for identifying the character collection
@ -106,8 +109,9 @@ type CMap struct {
cidToCode map[CharCode]CharCode // CID -> charcode
// Used by ctype 2 CMaps.
codeToUnicode map[CharCode]rune // CID -> Unicode
unicodeToCode map[rune]CharCode // Unicode -> CID
codeToUnicode map[CharCode]string // CID -> Unicode string
// XXXX(peterwilliams97): Should unicodeToCode be the inverse of codeToUnicode?
unicodeToCode map[rune]CharCode // Unicode rune -> CID
// cached contains the raw CMap data. It is used by the Bytes method in
// order to avoid generating the data for every call.
@ -116,8 +120,13 @@ type CMap struct {
cached []byte
}
// NewToUnicodeCMap returns an identity CMap with codeToUnicode matching the `codeToUnicode` arg.
func NewToUnicodeCMap(codeToUnicode map[CharCode]rune) *CMap {
// NewToUnicodeCMap returns an identity CMap with codeToUnicode matching the `codeToRune` arg.
func NewToUnicodeCMap(codeToRune map[CharCode]rune) *CMap {
codeToUnicode := make(map[CharCode]string, len(codeToRune))
for code, r := range codeToRune {
codeToUnicode[code] = string(r)
}
cmap := &CMap{
name: "Adobe-Identity-UCS",
ctype: 2,
@ -135,6 +144,7 @@ func NewToUnicodeCMap(codeToUnicode map[CharCode]rune) *CMap {
}
cmap.computeInverseMappings()
return cmap
}
@ -148,7 +158,7 @@ func newCMap(isSimple bool) *CMap {
nbits: nbits,
codeToCID: make(map[CharCode]CharCode),
cidToCode: make(map[CharCode]CharCode),
codeToUnicode: make(map[CharCode]rune),
codeToUnicode: make(map[CharCode]string),
unicodeToCode: make(map[rune]CharCode),
}
}
@ -254,7 +264,12 @@ func (cmap *CMap) computeInverseMappings() {
}
// Generate Unicode -> CID map.
for cid, r := range cmap.codeToUnicode {
for cid, s := range cmap.codeToUnicode {
// The CMap entries can be empty e.g. dobe_supplement_iso32000_1.pdf
if len(s) == 0 {
continue
}
r := rune0(s)
if c, ok := cmap.unicodeToCode[r]; !ok || (ok && c > cid) {
cmap.unicodeToCode[r] = cid
}
@ -277,19 +292,18 @@ func (cmap *CMap) CharcodeBytesToUnicode(data []byte) (string, int) {
return "", 0
}
var (
parts []rune
missing []CharCode
)
for _, code := range charcodes {
parts := make([]string, len(charcodes))
var missing []CharCode
for i, code := range charcodes {
s, ok := cmap.codeToUnicode[code]
if !ok {
missing = append(missing, code)
s = MissingCodeRune
s = MissingCodeString
}
parts = append(parts, s)
parts[i] = s
}
unicode := string(parts)
unicode := strings.Join(parts, "")
if len(missing) > 0 {
common.Log.Debug("ERROR: CharcodeBytesToUnicode. Not in map.\n"+
"\tdata=[% 02x]=%#q\n"+
@ -305,11 +319,11 @@ func (cmap *CMap) CharcodeBytesToUnicode(data []byte) (string, int) {
// CharcodeToUnicode converts a single character code `code` to a unicode string.
// If `code` is not in the unicode map, '<27>' is returned.
// NOTE: CharcodeBytesToUnicode is typically more efficient.
func (cmap *CMap) CharcodeToUnicode(code CharCode) (rune, bool) {
func (cmap *CMap) CharcodeToUnicode(code CharCode) (string, bool) {
if s, ok := cmap.codeToUnicode[code]; ok {
return s, true
}
return MissingCodeRune, false
return MissingCodeString, false
}
// RuneToCID maps the specified rune to a character identifier. If the provided
@ -453,7 +467,7 @@ func (cmap *CMap) toBfData() string {
}
// codes is a sorted list of the codeToUnicode keys.
var codes []CharCode
codes := make([]CharCode, 0, len(cmap.codeToUnicode))
for code := range cmap.codeToUnicode {
codes = append(codes, code)
}
@ -470,9 +484,9 @@ func (cmap *CMap) toBfData() string {
// character codes have been mapped to code ranges.
var charRanges []charRange
currCharRange := charRange{codes[0], codes[0]}
prevRune := cmap.codeToUnicode[codes[0]]
prevRune := rune0(cmap.codeToUnicode[codes[0]])
for _, c := range codes[1:] {
currRune := cmap.codeToUnicode[c]
currRune := rune0(cmap.codeToUnicode[c])
if c == currCharRange.code1+1 && currRune == prevRune+1 {
currCharRange.code1 = c
} else {
@ -493,7 +507,7 @@ func (cmap *CMap) toBfData() string {
fbRanges = append(fbRanges, fbRange{
code0: cr.code0,
code1: cr.code1,
r0: cmap.codeToUnicode[cr.code0],
r0: rune0(cmap.codeToUnicode[cr.code0]),
})
}
}
@ -508,7 +522,7 @@ func (cmap *CMap) toBfData() string {
lines = append(lines, fmt.Sprintf("%d beginbfchar", n))
for j := 0; j < n; j++ {
code := fbChars[i*maxBfEntries+j]
r := cmap.codeToUnicode[code]
r := rune0(cmap.codeToUnicode[code])
lines = append(lines, fmt.Sprintf("<%04x> <%04x>", code, r))
}
lines = append(lines, "endbfchar")
@ -549,3 +563,9 @@ end
end
`
)
// rune0 is a convenience function that returns the first rune in `s`.
// Only the first UTF-8 sequence is decoded; the rest of `s` is not converted or copied.
// An invalid leading byte yields '\ufffd', as it does for a []rune conversion.
// If `s` is empty, '\ufffd' (the same value as MissingCodeRune) is returned instead of
// panicking, because CMap entries can legitimately be empty and not every caller checks.
func rune0(s string) rune {
	// Ranging over a string decodes one rune per iteration, so returning from the first
	// iteration gives the leading rune without allocating a []rune for the whole string.
	for _, r := range s {
		return r
	}
	// Empty string: there is no rune to return, so report the replacement character.
	return '\ufffd'
}

View File

@ -105,7 +105,8 @@ func (cmap *CMap) parse() error {
func (cmap *CMap) parseName() error {
name := ""
done := false
for i := 0; i < 10 && !done; i++ {
// /Users/peter/testdata/programming/pdf_text/columns/Berg.pdf
for i := 0; i < 20 && !done; i++ {
o, err := cmap.parseObject()
if err != nil {
return err
@ -141,7 +142,6 @@ func (cmap *CMap) parseName() error {
// parseType parses a cmap type and adds it to `cmap`.
// cmap names are defined like this: /CMapType 1 def
func (cmap *CMap) parseType() error {
ctype := 0
done := false
for i := 0; i < 3 && !done; i++ {
@ -171,7 +171,6 @@ func (cmap *CMap) parseType() error {
// We don't need the version. We do this to eat up the version code in the cmap definition
// to reduce unhandled parse object warnings.
func (cmap *CMap) parseVersion() error {
version := ""
done := false
for i := 0; i < 3 && !done; i++ {
@ -471,7 +470,7 @@ func (cmap *CMap) parseBfchar() error {
}
return err
}
var target rune
var target []rune
switch v := o.(type) {
case cmapOperand:
if v.Operand == endbfchar {
@ -480,16 +479,16 @@ func (cmap *CMap) parseBfchar() error {
common.Log.Debug("ERROR: Unexpected operand. %#v", v)
return ErrBadCMap
case cmapHexString:
target = hexToRune(v)
target = hexToRunes(v)
case cmapName:
common.Log.Debug("ERROR: Unexpected name. %#v", v)
target = MissingCodeRune
target = []rune{MissingCodeRune}
default:
common.Log.Debug("ERROR: Unexpected type. %#v", o)
return ErrBadCMap
}
cmap.codeToUnicode[code] = target
cmap.codeToUnicode[code] = string(target)
}
return nil
@ -563,16 +562,17 @@ func (cmap *CMap) parseBfrange() error {
if !ok {
return errors.New("non-hex string in array")
}
r := hexToRune(hexs)
cmap.codeToUnicode[code] = r
runes := hexToRunes(hexs)
cmap.codeToUnicode[code] = string(runes)
}
case cmapHexString:
// <codeFrom> <codeTo> <dst>, maps [from,to] to [dst,dst+to-from].
r := hexToRune(v)
runes := hexToRunes(v)
n := len(runes)
for code := srcCodeFrom; code <= srcCodeTo; code++ {
cmap.codeToUnicode[code] = r
r++
cmap.codeToUnicode[code] = string(runes)
runes[n-1]++
}
default:
common.Log.Debug("ERROR: Unexpected type %T", o)

View File

@ -104,14 +104,14 @@ func TestCMapParser1(t *testing.T) {
}
for k, expected := range expectedMappings {
if v, ok := cmap.CharcodeToUnicode(k); !ok || v != expected {
if v, ok := cmap.CharcodeToUnicode(k); !ok || v != string(expected) {
t.Errorf("incorrect mapping, expecting 0x%X ➞ 0x%X (%#v)", k, expected, v)
return
}
}
v, _ := cmap.CharcodeToUnicode(0x99)
if v != MissingCodeRune { //!= "notdef" {
if v != MissingCodeString { //!= "notdef" {
t.Errorf("Unmapped code, expected to map to undefined")
return
}
@ -188,7 +188,7 @@ func TestCMapParser2(t *testing.T) {
}
for k, expected := range expectedMappings {
if v, ok := cmap.CharcodeToUnicode(k); !ok || v != expected {
if v, ok := cmap.CharcodeToUnicode(k); !ok || v != string(expected) {
t.Errorf("incorrect mapping, expecting 0x%X ➞ 0x%X (got 0x%X)", k, expected, v)
return
}
@ -297,7 +297,7 @@ func TestCMapParser3(t *testing.T) {
0xd140: 0xa000,
}
for k, expected := range expectedMappings {
if v, ok := cmap.CharcodeToUnicode(k); !ok || v != expected {
if v, ok := cmap.CharcodeToUnicode(k); !ok || v != string(expected) {
t.Errorf("incorrect mapping: expecting 0x%02X ➞ 0x%02X (got 0x%02X)", k, expected, v)
return
}
@ -407,7 +407,7 @@ func TestCMapParser4(t *testing.T) {
}
for k, expected := range expectedMappings {
if v, ok := cmap.CharcodeToUnicode(k); !ok || v != expected {
if v, ok := cmap.CharcodeToUnicode(k); !ok || v != string(expected) {
t.Errorf("incorrect mapping, expecting 0x%04X ➞ %+q (got %+q)", k, expected, v)
return
}
@ -520,6 +520,7 @@ var (
0x017b: 'Ż',
0x017d: 'Ž',
}
codeToUnicode3 = map[CharCode]rune{ // 93 entries
0x0124: 'Ĥ',
0x0125: 'ĥ',
@ -695,7 +696,7 @@ func checkCmapWriteRead(t *testing.T, codeToUnicode map[CharCode]rune) {
}
u0 := codeToUnicode[code]
u := cmap.codeToUnicode[code]
if u != u0 {
if u != string(u0) {
t.Errorf("Unicode mismatch: i=%d code0=0x%04x expected=%q test=%q", i, code, u0, u)
return
}

View File

@ -48,8 +48,8 @@ func (enc CMapEncoder) Decode(raw []byte) string {
if codes, ok := enc.codeToCID.BytesToCharcodes(raw); ok {
var buf bytes.Buffer
for _, code := range codes {
r, _ := enc.CharcodeToRune(CharCode(code))
buf.WriteRune(r)
s, _ := enc.charcodeToString(CharCode(code))
buf.WriteString(s)
}
return buf.String()
@ -87,8 +87,13 @@ func (enc CMapEncoder) RuneToCharcode(r rune) (CharCode, bool) {
// CharcodeToRune converts PDF character code `code` to a rune.
// The bool return flag is true if there was a match, and false otherwise.
// When `code` maps to a multi-rune string, only the first rune is returned.
// When `code` maps to an empty string there is no rune to return, so '\ufffd' (the same
// value as MissingCodeRune) and false are returned rather than panicking on an index.
func (enc CMapEncoder) CharcodeToRune(code CharCode) (rune, bool) {
	s, ok := enc.charcodeToString(code)
	// Ranging over the string decodes just the leading rune without building a []rune.
	for _, r := range s {
		return r, ok
	}
	// Empty mapping: treat it as a miss.
	return '\ufffd', false
}
func (enc CMapEncoder) charcodeToString(code CharCode) (string, bool) {
if enc.cidToUnicode == nil {
return MissingCodeRune, false
return MissingCodeString, false
}
// Map charcode to CID. If charcode to CID CMap is nil, assume Identity encoding.
@ -96,7 +101,7 @@ func (enc CMapEncoder) CharcodeToRune(code CharCode) (rune, bool) {
if enc.codeToCID != nil {
var ok bool
if cid, ok = enc.codeToCID.CharcodeToCID(cmap.CharCode(code)); !ok {
return MissingCodeRune, false
return MissingCodeString, false
}
}

View File

@ -18,7 +18,13 @@ import (
)
// MissingCodeRune is the rune returned when there is no matching glyph. It was previously '?'.
const MissingCodeRune = '\ufffd' // <20>
const (
// MissingCodeRune replaces runes that can't be decoded.
MissingCodeRune = '\ufffd' // <20>
// MissingCodeString replaces strings that can't be decoded.
MissingCodeString = string(MissingCodeRune)
)
// GlyphToRune returns the rune corresponding to glyph `glyph` if there is one.
// TODO: Can we return a string here? e.g. When we are extracting text, we want to get "ffi"

View File

@ -422,14 +422,28 @@ func (font *PdfFont) BytesToCharcodes(data []byte) []textencoding.CharCode {
// CharcodesToUnicodeWithStats is identical to CharcodesToUnicode except that it also returns
// statistical information about hits and misses from the reverse mapping process.
// It delegates to CharcodesToRuneSlices and flattens the per-charcode rune slices into one
// slice; the two counts are passed through unchanged.
// NOTE: The number of runes returned may be greater than the number of charcodes.
// TODO(peterwilliams97): Deprecate?
func (font *PdfFont) CharcodesToUnicodeWithStats(charcodes []textencoding.CharCode) (runelist []rune, numHits, numMisses int) {
	perCode, hits, misses := font.CharcodesToRuneSlices(charcodes)
	// Concatenate in charcode order. runelist starts nil and stays nil if nothing is appended.
	for i := range perCode {
		runelist = append(runelist, perCode[i]...)
	}
	return runelist, hits, misses
}
// CharcodesToRuneSlices returns the unicode strings corresponding to `charcodes` as rune slices.
// The int returns are the number of rune slices returned and the number of unconverted codes.
// NOTE: The number of rune slices returned is equal to the number of charcodes.
func (font *PdfFont) CharcodesToRuneSlices(charcodes []textencoding.CharCode) ([][]rune, int, int) {
fontBase := font.baseFields()
runes := make([]rune, 0, len(charcodes))
numMisses = 0
runeSlices := make([][]rune, 0, len(charcodes))
numMisses := 0
for _, code := range charcodes {
if fontBase.toUnicodeCmap != nil {
if r, ok := fontBase.toUnicodeCmap.CharcodeToUnicode(cmap.CharCode(code)); ok {
runes = append(runes, r)
if s, ok := fontBase.toUnicodeCmap.CharcodeToUnicode(cmap.CharCode(code)); ok {
runeSlices = append(runeSlices, []rune(s))
continue
}
}
@ -438,7 +452,7 @@ func (font *PdfFont) CharcodesToUnicodeWithStats(charcodes []textencoding.CharCo
encoder := font.Encoder()
if encoder != nil {
if r, ok := encoder.CharcodeToRune(code); ok {
runes = append(runes, r)
runeSlices = append(runeSlices, []rune{r})
continue
}
}
@ -447,7 +461,7 @@ func (font *PdfFont) CharcodesToUnicodeWithStats(charcodes []textencoding.CharCo
"\tfont=%s\n\tencoding=%s",
code, charcodes, fontBase.isCIDFont(), font, encoder)
numMisses++
runes = append(runes, cmap.MissingCodeRune)
runeSlices = append(runeSlices, []rune{cmap.MissingCodeRune})
}
if numMisses != 0 {
@ -457,7 +471,7 @@ func (font *PdfFont) CharcodesToUnicodeWithStats(charcodes []textencoding.CharCo
len(charcodes), numMisses, font)
}
return runes, len(runes), numMisses
return runeSlices, len(runeSlices), numMisses
}
// CharcodeBytesToUnicode converts PDF character codes `data` to a Go unicode string.