unipdf/pdf/internal/cmap/parser.go

470 lines
9.2 KiB
Go
Raw Normal View History

/*
* This file is subject to the terms and conditions defined in
* file 'LICENSE.md', which is part of this source code package.
*/
package cmap
import (
"bufio"
"bytes"
"encoding/hex"
"fmt"
"io"
"strconv"
"github.com/unidoc/unidoc/common"
. "github.com/unidoc/unidoc/pdf/core"
)
// cMapParser parses CMap character to unicode mapping files.
// It reads its input through a buffered reader, which lets the parse methods
// peek at upcoming bytes before deciding whether to consume them.
type cMapParser struct {
// reader is the buffered source that every parse method peeks at and reads from.
reader *bufio.Reader
}
// newCMapParser creates a new instance of the PDF CMap parser from input data.
// The content bytes are wrapped in a bytes.Buffer which in turn is read through
// a bufio.Reader so that the parse methods can peek ahead.
func newCMapParser(content []byte) *cMapParser {
	return &cMapParser{
		reader: bufio.NewReader(bytes.NewBuffer(content)),
	}
}
// parseObject detects the signature at the current file position and parses the corresponding object.
//
// Leading whitespace and '%' comments are skipped first. The next one or two
// bytes then select the parse method:
//
//	'/'                      -> parseName
//	'('                      -> parseString
//	'['                      -> parseArray
//	"<<"                     -> parseDict
//	'<'                      -> parseHexString
//	digit, or '-' then digit -> parseNumber
//	anything else            -> parseOperand
//
// When no input is left the error reported by the reader (io.EOF at the end of
// the data) is returned with a nil object. Number and operand failures also
// yield a nil object; the other branches return whatever their parse method
// returned together with its error.
func (p *cMapParser) parseObject() (cmapObject, error) {
	p.skipSpaces()
	for {
		bb, err := p.reader.Peek(2)
		if len(bb) == 0 {
			// Nothing left to parse; err explains why (normally io.EOF).
			return nil, err
		}
		if err != nil && err != io.EOF {
			return nil, err
		}
		// A short peek (one byte plus io.EOF) is not a failure: the last byte
		// of the input can still be a complete token such as a one digit
		// number. Dispatch on it and leave truncation errors to the callee.
		first := bb[0]
		var second byte // Stays 0 when only one byte is left.
		if len(bb) > 1 {
			second = bb[1]
		}

		switch {
		case first == '%':
			p.parseComment()
			p.skipSpaces()
			continue
		case first == '/':
			return p.parseName()
		case first == '(':
			return p.parseString()
		case first == '[':
			return p.parseArray()
		case first == '<' && second == '<':
			return p.parseDict()
		case first == '<':
			return p.parseHexString()
		case IsDecimalDigit(first) || (first == '-' && IsDecimalDigit(second)):
			number, err := p.parseNumber()
			if err != nil {
				return nil, err
			}
			return number, nil
		default:
			// Operand?
			operand, err := p.parseOperand()
			if err != nil {
				return nil, err
			}
			return operand, nil
		}
	}
}
// skipSpaces consumes consecutive whitespace bytes (as classified by
// IsWhiteSpace) starting at the current position and stops in front of the
// first non-whitespace byte.
//
// It returns the number of bytes skipped. If peeking fails, for instance
// because the input is exhausted, it returns 0 together with that error, even
// when some whitespace had already been consumed.
func (p *cMapParser) skipSpaces() (int, error) {
	skipped := 0
	for {
		next, err := p.reader.Peek(1)
		if err != nil {
			return 0, err
		}
		if !IsWhiteSpace(next[0]) {
			return skipped, nil
		}
		p.reader.ReadByte()
		skipped++
	}
}
// parseComment reads a comment line starting with '%'.
//
// Leading whitespace is skipped, then bytes are collected up to, but not
// including, the next '\r' or '\n'. The returned text includes the leading '%'.
// ErrBadCMapComment is returned when the first non-space byte is not '%'. If
// the reader fails before a line end is seen (including io.EOF), the text
// gathered so far is returned together with that error.
func (p *cMapParser) parseComment() (string, error) {
	var text bytes.Buffer

	if _, err := p.skipSpaces(); err != nil {
		return text.String(), err
	}

	for first := true; ; first = false {
		next, err := p.reader.Peek(1)
		if err != nil {
			common.Log.Debug("parseComment: err=%v", err)
			return text.String(), err
		}

		c := next[0]
		if first && c != '%' {
			return text.String(), ErrBadCMapComment
		}
		if c == '\r' || c == '\n' {
			// End of line reached; the line break itself is left unread.
			return text.String(), nil
		}

		p.reader.ReadByte()
		text.WriteByte(c)
	}
}
// parseName parses a name starting with '/'.
//
// The name runs until whitespace or one of the bytes '/', '[', '(', ']', '<',
// '>' is seen; that terminating byte is left unread. A '#' introduces a two
// digit hex escape ("#20") which is decoded to a single raw byte. Reaching the
// end of the input ends the name without an error.
//
// An error is returned when the first byte is not '/', when a '#' escape is
// truncated or not valid hex, or when the reader fails with anything other
// than io.EOF. In those cases the name collected so far is still returned.
func (p *cMapParser) parseName() (cmapName, error) {
	var buf bytes.Buffer
	started := false

	for {
		next, err := p.reader.Peek(1)
		if err == io.EOF {
			// Can happen when loading from object stream.
			return cmapName{buf.String()}, nil
		}
		if err != nil {
			return cmapName{buf.String()}, err
		}
		c := next[0]

		if !started {
			// Should always start with '/', otherwise not valid.
			if c != '/' {
				common.Log.Debug("ERROR Name starting with %s (% x)", next, next)
				return cmapName{buf.String()}, fmt.Errorf("Invalid name: (%c)", c)
			}
			started = true
			p.reader.ReadByte()
			continue
		}

		if IsWhiteSpace(c) {
			return cmapName{buf.String()}, nil
		}

		switch c {
		case '/', '[', '(', ']', '<', '>':
			// Looks like start of next statement.
			return cmapName{buf.String()}, nil
		case '#':
			escape, err := p.reader.Peek(3)
			if err != nil {
				return cmapName{buf.String()}, err
			}
			// Copy the two hex digits out before advancing the reader.
			digits := string(escape[1:3])
			p.reader.Discard(3)
			decoded, err := hex.DecodeString(digits)
			if err != nil {
				return cmapName{buf.String()}, err
			}
			buf.Write(decoded)
		default:
			p.reader.ReadByte()
			// NOTE(review): the byte is appended as a rune, so values >= 0x80
			// are stored UTF-8 encoded, unlike '#' escapes which are stored raw.
			buf.WriteRune(rune(c))
		}
	}
}
// parseString parses a literal string. A string starts with '(' and ends with ')'.
//
// The opening byte is consumed without being checked. Unescaped parentheses may
// nest; the string ends at the ')' that balances the opening '('. Backslash
// escapes are handled as follows:
//
//   - \n \r \t \b \f \( \) \\ produce the corresponding single byte;
//   - \d, \dd and \ddd (octal digits) produce the byte with that value, keeping
//     only the low 8 bits;
//   - any other escaped byte is dropped together with the backslash.
//
// If the input ends or the reader fails before the closing ')', the text
// decoded so far is returned together with the error.
func (p *cMapParser) parseString() (cmapString, error) {
	p.reader.ReadByte() // Consume the opening '('.

	buf := bytes.Buffer{}
	count := 1 // Nesting depth of unescaped parentheses.
	for {
		bb, err := p.reader.Peek(1)
		if err != nil {
			return cmapString{buf.String()}, err
		}

		if bb[0] == '\\' { // Escape sequence.
			p.reader.ReadByte() // Skip the escape \ byte.
			b, err := p.reader.ReadByte()
			if err != nil {
				return cmapString{buf.String()}, err
			}

			// Octal '\ddd' number (base 8).
			if IsOctalDigit(b) {
				// Up to two more octal digits may follow. Fewer than two bytes
				// may be left in the input (e.g. "\7)" at the very end), which
				// Peek reports as io.EOF alongside the bytes it does have; that
				// is not an error here. A missing ')' is still detected by the
				// Peek at the top of the loop.
				rest, err := p.reader.Peek(2)
				if err != nil && err != io.EOF {
					return cmapString{buf.String()}, err
				}

				numeric := []byte{b}
				for _, val := range rest {
					if !IsOctalDigit(val) {
						break
					}
					numeric = append(numeric, val)
				}
				// The first digit was already read; discard only the extras.
				p.reader.Discard(len(numeric) - 1)

				common.Log.Trace("Numeric string \"%s\"", numeric)
				code, err := strconv.ParseUint(string(numeric), 8, 32)
				if err != nil {
					return cmapString{buf.String()}, err
				}
				buf.WriteByte(byte(code))
				continue
			}

			switch b {
			case 'n':
				buf.WriteByte('\n')
			case 'r':
				buf.WriteByte('\r')
			case 't':
				buf.WriteByte('\t')
			case 'b':
				buf.WriteByte('\b')
			case 'f':
				buf.WriteByte('\f')
			case '(':
				buf.WriteByte('(')
			case ')':
				buf.WriteByte(')')
			case '\\':
				buf.WriteByte('\\')
			}
			continue
		}

		if bb[0] == '(' {
			count++
		} else if bb[0] == ')' {
			count--
			if count == 0 {
				// Balancing ')' found: consume it and stop.
				p.reader.ReadByte()
				break
			}
		}

		// Ordinary byte (including nested parentheses): copy it through.
		b, _ := p.reader.ReadByte()
		buf.WriteByte(b)
	}

	return cmapString{buf.String()}, nil
}
// parseHexString parses a PostScript hex string.
// Hex strings start with '<' ends with '>'.
//
// The opening byte is consumed without being checked. Whitespace between the
// delimiters is skipped and any byte that is not a hex digit is silently
// ignored. When the number of digits is odd a trailing '0' is appended before
// decoding. The result carries the decoded bytes and their count; the bytes
// are not converted to characters here.
//
// If the reader fails before '>' is seen an empty cmapHexString is returned
// together with the error.
func (p *cMapParser) parseHexString() (cmapHexString, error) {
	p.reader.ReadByte() // Consume the opening '<'.

	var digits bytes.Buffer
	for {
		p.skipSpaces()

		next, err := p.reader.Peek(1)
		if err != nil {
			return cmapHexString{}, err
		}
		c := next[0]
		p.reader.ReadByte()

		if c == '>' {
			break
		}

		isHex := ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F')
		if isHex {
			digits.WriteByte(c)
		}
	}

	if digits.Len()%2 == 1 {
		common.Log.Debug("parseHexString: appending '0' to %#q", digits.String())
		digits.WriteByte('0')
	}

	// Decoding cannot fail: the buffer holds an even number of hex digits only.
	decoded, _ := hex.DecodeString(digits.String())
	return cmapHexString{numBytes: digits.Len() / 2, b: decoded}, nil
}
// parseArray parses an array. It starts with '[' and ends with ']' and can
// contain any kinds of direct objects, each of which is read with parseObject.
//
// The opening byte is consumed without being checked. The returned array is
// never nil, even when it has no elements. If the reader or an element parse
// fails, the elements collected so far are returned together with the error.
func (p *cMapParser) parseArray() (cmapArray, error) {
	result := cmapArray{Array: []cmapObject{}}

	p.reader.ReadByte() // Consume the opening '['.

	for {
		p.skipSpaces()

		next, err := p.reader.Peek(1)
		if err != nil {
			return result, err
		}
		if next[0] == ']' {
			p.reader.ReadByte()
			return result, nil
		}

		elem, err := p.parseObject()
		if err != nil {
			return result, err
		}
		result.Array = append(result.Array, elem)
	}
}
// parseDict reads and parses a PDF dictionary object enclosed with '<<' and '>>'.
//
// Each entry is a name key (parseName) followed by a value (parseObject). The
// keyword "def", which optionally follows key value definitions in CMaps, is
// skipped when present. A later entry with the same key overwrites the earlier
// one.
//
// ErrBadCMapDict is returned when the input does not start with "<<". If the
// reader, a key or a value fails, the entries collected so far are returned
// together with the error.
func (p *cMapParser) parseDict() (cmapDict, error) {
	common.Log.Trace("Reading PDF Dict!")

	dict := makeDict()

	// Pass the '<<'. A failed read leaves c at 0, which fails the check too.
	c, _ := p.reader.ReadByte()
	if c != '<' {
		return dict, ErrBadCMapDict
	}
	c, _ = p.reader.ReadByte()
	if c != '<' {
		return dict, ErrBadCMapDict
	}

	for {
		p.skipSpaces()

		bb, err := p.reader.Peek(2)
		if err != nil {
			return dict, err
		}
		if (bb[0] == '>') && (bb[1] == '>') {
			p.reader.ReadByte()
			p.reader.ReadByte()
			break
		}

		key, err := p.parseName()
		common.Log.Trace("Key: %s", key.Name)
		if err != nil {
			common.Log.Debug("ERROR: Returning name. err=%v", err)
			return dict, err
		}

		p.skipSpaces()

		val, err := p.parseObject()
		if err != nil {
			return dict, err
		}
		dict.Dict[key.Name] = val

		// Skip "def" which optionally follows key value dict definitions in CMaps.
		p.skipSpaces()
		bb, err = p.reader.Peek(3)
		// Fewer than three bytes may remain (e.g. only the closing ">>" at the
		// very end of the input). Peek reports that as io.EOF, which must not
		// abort the dictionary; the Peek(2) at the top of the loop decides
		// whether what is left is a valid terminator.
		if err != nil && err != io.EOF {
			return dict, err
		}
		if string(bb) == "def" {
			p.reader.Discard(3)
		}
	}

	return dict, nil
}
// parseNumber parses an integer or floating point number at the current position.
//
// Bytes are consumed while they can belong to the number: an optional leading
// sign, decimal digits, '.', and 'e' optionally followed by a sign. The first
// byte that does not fit, or the end of the input, ends the number and is left
// unread. A sign that appears anywhere other than at the start or directly
// after 'e' acts as a delimiter, so "1-2" yields 1 and leaves "-2" unread.
//
// The result is a cmapFloat when a '.' or 'e' was seen, otherwise a cmapInt.
// A reader failure other than io.EOF returns a nil object. When the collected
// text cannot be converted, the strconv error is returned together with the
// object built from the value strconv reported.
func (p *cMapParser) parseNumber() (cmapObject, error) {
	isFloat := false
	allowSigns := true
	var numStr bytes.Buffer

scan:
	for {
		bb, err := p.reader.Peek(1)
		if err == io.EOF {
			break
		}
		if err != nil {
			return nil, err
		}
		c := bb[0]

		switch {
		case allowSigns && (c == '-' || c == '+'):
			// Only allowed in beginning, and after e (exponential).
			allowSigns = false
		case IsDecimalDigit(c):
			// Once a digit is seen a following sign belongs to the next token.
			allowSigns = false
		case c == '.':
			isFloat = true
			allowSigns = false
		case c == 'e':
			// Exponential number format; the exponent may carry its own sign.
			isFloat = true
			allowSigns = true
		default:
			break scan
		}

		p.reader.ReadByte()
		numStr.WriteByte(c)
	}

	if isFloat {
		fVal, err := strconv.ParseFloat(numStr.String(), 64)
		return cmapFloat{fVal}, err
	}
	intVal, err := strconv.ParseInt(numStr.String(), 10, 64)
	return cmapInt{intVal}, err
}
// parseOperand parses an operand, a text command represented by a word.
//
// Bytes are collected until a delimiter (IsDelimiter), whitespace
// (IsWhiteSpace) or the end of the input is reached; the terminating byte is
// left unread. An error is returned when no bytes could be collected, or when
// the reader fails with anything other than io.EOF.
func (p *cMapParser) parseOperand() (cmapOperand, error) {
	var word bytes.Buffer

	for {
		next, err := p.reader.Peek(1)
		if err == io.EOF {
			break
		}
		if err != nil {
			return cmapOperand{}, err
		}

		c := next[0]
		if IsDelimiter(c) || IsWhiteSpace(c) {
			break
		}

		p.reader.ReadByte()
		word.WriteByte(c)
	}

	if word.Len() == 0 {
		return cmapOperand{}, fmt.Errorf("Invalid operand (empty)")
	}
	return cmapOperand{Operand: word.String()}, nil
}