unipdf/internal/cmap/parser.go
2020-01-06 11:05:42 -08:00

441 lines
8.8 KiB
Go

/*
* This file is subject to the terms and conditions defined in
* file 'LICENSE.md', which is part of this source code package.
*/
package cmap
import (
"bufio"
"bytes"
"encoding/hex"
"fmt"
"io"
"strconv"
"github.com/unidoc/unipdf/v3/common"
"github.com/unidoc/unipdf/v3/core"
"github.com/unidoc/unipdf/v3/internal/parseutils"
)
// cMapParser is a parser for CMap (character code to unicode mapping) data.
// All parse methods peek at and consume bytes from the same buffered reader.
type cMapParser struct {
	// reader is the buffered input the parse methods read from.
	reader *bufio.Reader
}
// newCMapParser returns a new CMap parser that reads from the given input
// bytes through a buffered reader.
func newCMapParser(content []byte) *cMapParser {
	src := bytes.NewBuffer(content)
	return &cMapParser{reader: bufio.NewReader(src)}
}
// parseObject skips leading whitespace (and any comments), then looks at the
// next two bytes to decide which kind of object follows and delegates to the
// matching parse method. Input that is not a name, string, array, dictionary,
// hex string or number is parsed as an operand.
//
// An error is returned when fewer than two bytes remain in the input or when
// the delegated parse method fails.
func (p *cMapParser) parseObject() (cmapObject, error) {
	p.skipSpaces()
	for {
		head, err := p.reader.Peek(2)
		if err != nil {
			return nil, err
		}
		first, second := head[0], head[1]

		switch {
		case first == '%':
			// Comment: drop it and the whitespace after it, then look again.
			p.parseComment()
			p.skipSpaces()
			continue
		case first == '/':
			return p.parseName()
		case first == '(':
			return p.parseString()
		case first == '[':
			return p.parseArray()
		case first == '<' && second == '<':
			return p.parseDict()
		case first == '<':
			return p.parseHexString()
		case core.IsDecimalDigit(first) || (first == '-' && core.IsDecimalDigit(second)):
			num, err := p.parseNumber()
			if err != nil {
				return nil, err
			}
			return num, nil
		default:
			// Anything else is treated as an operand.
			op, err := p.parseOperand()
			if err != nil {
				return nil, err
			}
			return op, nil
		}
	}
}
// skipSpaces consumes consecutive whitespace bytes from the reader.
// It returns the number of bytes skipped. If peeking at the next byte fails
// (for example at the end of the input), it returns 0 and that error.
func (p *cMapParser) skipSpaces() (int, error) {
	skipped := 0
	for {
		next, err := p.reader.Peek(1)
		if err != nil {
			return 0, err
		}
		if !core.IsWhiteSpace(next[0]) {
			return skipped, nil
		}
		p.reader.ReadByte()
		skipped++
	}
}
// parseComment reads a comment, which must start with '%' once leading
// whitespace has been skipped. The returned text includes the '%' and runs up
// to, but not including, the next '\r' or '\n'; the line break itself is left
// in the reader.
//
// ErrBadCMapComment is returned if the first byte is not '%'. On a read error
// the text collected so far is returned together with the error.
func (p *cMapParser) parseComment() (string, error) {
	var text bytes.Buffer

	if _, err := p.skipSpaces(); err != nil {
		return text.String(), err
	}

	for first := true; ; first = false {
		next, err := p.reader.Peek(1)
		if err != nil {
			common.Log.Debug("parseComment: err=%v", err)
			return text.String(), err
		}
		if first && next[0] != '%' {
			return text.String(), ErrBadCMapComment
		}
		if next[0] == '\r' || next[0] == '\n' {
			return text.String(), nil
		}
		c, _ := p.reader.ReadByte()
		text.WriteByte(c)
	}
}
// parseName parses a name, which must start with '/'. The name ends at
// whitespace, at one of the bytes '/', '[', '(', ']', '<', '>' or at the end
// of the input; the terminating byte is not consumed. A '#' followed by two
// hex digits is decoded into the byte it encodes.
//
// If the input is already exhausted an empty name is returned without error.
func (p *cMapParser) parseName() (cmapName, error) {
	name := ""

	// The very first byte has to be the '/' marker.
	lead, err := p.reader.Peek(1)
	if err == io.EOF {
		return cmapName{name}, nil // Can happen when loading from object stream.
	}
	if err != nil {
		return cmapName{name}, err
	}
	if lead[0] != '/' {
		common.Log.Debug("ERROR: Name starting with %s (% x)", lead, lead)
		return cmapName{name}, fmt.Errorf("invalid name: (%c)", lead[0])
	}
	p.reader.ReadByte()

	for {
		next, err := p.reader.Peek(1)
		if err == io.EOF {
			return cmapName{name}, nil // Can happen when loading from object stream.
		}
		if err != nil {
			return cmapName{name}, err
		}

		c := next[0]
		if core.IsWhiteSpace(c) {
			return cmapName{name}, nil
		}

		switch c {
		case '/', '[', '(', ']', '<', '>':
			// Looks like the start of the next statement.
			return cmapName{name}, nil
		case '#':
			// "#xx" escape: the two hex digits encode a single byte.
			escape, err := p.reader.Peek(3)
			if err != nil {
				return cmapName{name}, err
			}
			decoded, decErr := hex.DecodeString(string(escape[1:3]))
			p.reader.Discard(3)
			if decErr != nil {
				return cmapName{name}, decErr
			}
			name += string(decoded)
		default:
			b, _ := p.reader.ReadByte()
			// NOTE: converting a byte with string() yields the UTF-8 encoding
			// of that code point, so bytes >= 0x80 occupy two bytes in name.
			name += string(b)
		}
	}
}
// parseString parses a literal string, which starts with '(' and ends with the
// matching ')'. Unescaped parentheses may nest and are kept in the result.
//
// Backslash escapes are decoded: '\n', '\r', '\t', '\b', '\f', '\(', '\)',
// '\\' and octal codes of one to three digits ('\d', '\dd', '\ddd'). Any other
// escaped byte is dropped together with its backslash.
//
// On error the text decoded so far is returned together with the error; an
// unterminated string yields the reader's end-of-input error.
func (p *cMapParser) parseString() (cmapString, error) {
	p.reader.ReadByte() // Consume the opening '('.

	buf := bytes.Buffer{}
	depth := 1 // Nesting level of unescaped parentheses.
	for {
		bb, err := p.reader.Peek(1)
		if err != nil {
			return cmapString{buf.String()}, err
		}

		if bb[0] == '\\' { // Escape sequence.
			p.reader.ReadByte() // Skip the escape \ byte.
			b, err := p.reader.ReadByte()
			if err != nil {
				return cmapString{buf.String()}, err
			}

			// Octal '\ddd' number (base 8).
			if core.IsOctalDigit(b) {
				// Look ahead for up to two more digits. Fewer than two bytes
				// may remain in the input; that is not an error here, since a
				// short escape such as "\1" can legitimately sit right before
				// the end. Peek still returns the bytes that are available.
				next, err := p.reader.Peek(2)
				if err != nil && err != io.EOF {
					return cmapString{buf.String()}, err
				}

				numeric := []byte{b}
				for _, val := range next {
					if !core.IsOctalDigit(val) {
						break
					}
					numeric = append(numeric, val)
				}
				// The first digit was already consumed by ReadByte above.
				p.reader.Discard(len(numeric) - 1)

				common.Log.Trace("Numeric string \"%s\"", numeric)
				code, err := strconv.ParseUint(string(numeric), 8, 32)
				if err != nil {
					return cmapString{buf.String()}, err
				}
				// Values above 0377 are truncated to their low 8 bits.
				buf.WriteByte(byte(code))
				continue
			}

			switch b {
			case 'n':
				buf.WriteByte('\n')
			case 'r':
				buf.WriteByte('\r')
			case 't':
				buf.WriteByte('\t')
			case 'b':
				buf.WriteByte('\b')
			case 'f':
				buf.WriteByte('\f')
			case '(':
				buf.WriteByte('(')
			case ')':
				buf.WriteByte(')')
			case '\\':
				buf.WriteByte('\\')
			}
			continue
		}

		if bb[0] == '(' {
			depth++
		} else if bb[0] == ')' {
			depth--
			if depth == 0 {
				// Closing parenthesis of the string itself: consume and stop.
				p.reader.ReadByte()
				return cmapString{buf.String()}, nil
			}
		}

		b, _ := p.reader.ReadByte()
		buf.WriteByte(b)
	}
}
// parseHexString parses a PostScript hex string, which starts with '<' and
// ends with '>'. Whitespace and any other non-hex bytes between the brackets
// are ignored. An odd number of hex digits is padded with a trailing '0'.
//
// The result holds the decoded bytes and their count. If the input ends
// before the closing '>', an empty cmapHexString and the read error are
// returned.
func (p *cMapParser) parseHexString() (cmapHexString, error) {
	p.reader.ReadByte() // Consume the opening '<'.

	var digits bytes.Buffer
	for {
		p.skipSpaces()

		if _, err := p.reader.Peek(1); err != nil {
			return cmapHexString{}, err
		}

		c, _ := p.reader.ReadByte()
		if c == '>' {
			break
		}

		// Keep hex digits only; everything else is silently dropped.
		switch {
		case '0' <= c && c <= '9', 'a' <= c && c <= 'f', 'A' <= c && c <= 'F':
			digits.WriteByte(c)
		}
	}

	if digits.Len()%2 == 1 {
		common.Log.Debug("parseHexString: appending '0' to %#q", digits.String())
		digits.WriteByte('0')
	}

	// digits now holds an even number of valid hex characters.
	decoded, _ := hex.DecodeString(digits.String())
	return cmapHexString{numBytes: digits.Len() / 2, b: decoded}, nil
}
// parseArray parses an array, which starts with '[' and ends with ']'. Each
// element is parsed with parseObject, so an array can hold any kind of direct
// object, including nested arrays.
//
// On error the elements collected so far are returned together with the error.
func (p *cMapParser) parseArray() (cmapArray, error) {
	result := cmapArray{Array: []cmapObject{}}

	p.reader.ReadByte() // Consume the opening '['.
	for {
		p.skipSpaces()

		next, err := p.reader.Peek(1)
		if err != nil {
			return result, err
		}
		if next[0] == ']' {
			p.reader.ReadByte()
			return result, nil
		}

		elem, err := p.parseObject()
		if err != nil {
			return result, err
		}
		result.Array = append(result.Array, elem)
	}
}
// parseDict parses a dictionary object, which starts with '<<' and ends with
// '>>'. Each entry is a name key followed by a value parsed with parseObject;
// a "def" keyword directly after the value is skipped.
//
// ErrBadCMapDict is returned if the input does not start with '<<'. On any
// error the entries collected so far are returned together with the error.
func (p *cMapParser) parseDict() (cmapDict, error) {
	common.Log.Trace("Reading PDF Dict!")

	dict := makeDict()

	// Pass the '<<'.
	for i := 0; i < 2; i++ {
		if c, _ := p.reader.ReadByte(); c != '<' {
			return dict, ErrBadCMapDict
		}
	}

	for {
		p.skipSpaces()

		bb, err := p.reader.Peek(2)
		if err != nil {
			return dict, err
		}
		if bb[0] == '>' && bb[1] == '>' {
			p.reader.Discard(2)
			break
		}

		key, err := p.parseName()
		common.Log.Trace("Key: %s", key.Name)
		if err != nil {
			common.Log.Debug("ERROR: Returning name. err=%v", err)
			return dict, err
		}

		p.skipSpaces()

		val, err := p.parseObject()
		if err != nil {
			return dict, err
		}
		dict.Dict[key.Name] = val

		// Skip "def" which optionally follows key value dict definitions in CMaps.
		p.skipSpaces()
		// Fewer than three bytes may remain when the closing ">>" is the last
		// thing in the input. That is not an error: Peek still returns what is
		// available, and the next iteration either finds ">>" or reports the
		// end of input itself.
		bb, err = p.reader.Peek(3)
		if err != nil && err != io.EOF {
			return dict, err
		}
		if string(bb) == "def" {
			p.reader.Discard(3)
		}
	}

	return dict, nil
}
// parseNumber parses a number from the reader using parseutils.ParseNumber
// and wraps the result as a cmapFloat (for float64) or a cmapInt (for int64).
// Any other result type is reported as an error.
func (p *cMapParser) parseNumber() (cmapObject, error) {
	raw, err := parseutils.ParseNumber(p.reader)
	if err != nil {
		return nil, err
	}

	if f, ok := raw.(float64); ok {
		return cmapFloat{f}, nil
	}
	if i, ok := raw.(int64); ok {
		return cmapInt{i}, nil
	}
	return nil, fmt.Errorf("unhandled number type %T", raw)
}
// parseOperand parses an operand, which is a text command represented by a
// word. The word ends at the first delimiter or whitespace byte (which is not
// consumed) or at the end of the input.
//
// An error is returned if no bytes could be read into the word.
func (p *cMapParser) parseOperand() (cmapOperand, error) {
	var word bytes.Buffer

	for {
		next, err := p.reader.Peek(1)
		if err == io.EOF {
			break
		}
		if err != nil {
			return cmapOperand{}, err
		}
		if c := next[0]; core.IsDelimiter(c) || core.IsWhiteSpace(c) {
			break
		}

		c, _ := p.reader.ReadByte()
		word.WriteByte(c)
	}

	if word.Len() == 0 {
		return cmapOperand{}, fmt.Errorf("invalid operand (empty)")
	}
	return cmapOperand{Operand: word.String()}, nil
}