unipdf/contentstream/parser.go

534 lines
13 KiB
Go
Raw Normal View History

2016-08-22 08:46:18 +00:00
/*
* This file is subject to the terms and conditions defined in
* file 'LICENSE.md', which is part of this source code package.
*/
package contentstream
2016-08-22 08:46:18 +00:00
import (
"bufio"
"bytes"
"encoding/hex"
"errors"
"fmt"
"io"
"strconv"
"github.com/unidoc/unipdf/v3/common"
"github.com/unidoc/unipdf/v3/core"
2016-08-22 08:46:18 +00:00
)
// ContentStreamParser parses the operations found in a PDF content stream.
// Create one with NewContentStreamParser.
type ContentStreamParser struct {
	// reader is the buffered source of content stream bytes; the parsing
	// methods rely on its Peek support for lookahead.
	reader *bufio.Reader
}
// NewContentStreamParser creates a new instance of the content stream parser from an input content
// stream string.
2016-08-22 08:46:18 +00:00
func NewContentStreamParser(contentStr string) *ContentStreamParser {
// Each command has parameters and an operand (command).
parser := ContentStreamParser{}
buffer := bytes.NewBufferString(contentStr + "\n") // Add newline at end to get last operand without EOF error.
2016-08-22 08:46:18 +00:00
parser.reader = bufio.NewReader(buffer)
return &parser
}
// Parse parses all commands in content stream, returning a list of operation data.
2018-10-15 10:13:50 +00:00
func (csp *ContentStreamParser) Parse() (*ContentStreamOperations, error) {
operations := ContentStreamOperations{}
2016-08-22 08:46:18 +00:00
for {
operation := ContentStreamOperation{}
for {
obj, isOperand, err := csp.parseObject()
2016-08-22 08:46:18 +00:00
if err != nil {
if err == io.EOF {
2017-08-04 22:50:28 +00:00
// End of data. Successful exit point.
return &operations, nil
2016-08-22 08:46:18 +00:00
}
return &operations, err
2016-08-22 08:46:18 +00:00
}
if isOperand {
operation.Operand, _ = core.GetStringVal(obj)
2016-08-22 08:46:18 +00:00
operations = append(operations, &operation)
break
} else {
operation.Params = append(operation.Params, obj)
}
}
if operation.Operand == "BI" {
// Parse an inline image, reads everything between the "BI" and "EI".
// The image is stored as the parameter.
2018-10-15 10:13:50 +00:00
im, err := csp.ParseInlineImage()
if err != nil {
return &operations, err
}
operation.Params = append(operation.Params, im)
}
2016-08-22 08:46:18 +00:00
}
}
// Skip over any spaces. Returns the number of spaces skipped and
// an error if any.
2018-10-15 10:13:50 +00:00
func (csp *ContentStreamParser) skipSpaces() (int, error) {
2016-08-22 08:46:18 +00:00
cnt := 0
for {
2018-10-15 10:13:50 +00:00
bb, err := csp.reader.Peek(1)
2016-08-22 08:46:18 +00:00
if err != nil {
return 0, err
}
if core.IsWhiteSpace(bb[0]) {
2018-10-15 10:13:50 +00:00
csp.reader.ReadByte()
2016-08-22 08:46:18 +00:00
cnt++
} else {
break
}
}
return cnt, nil
}
2017-01-12 20:08:55 +00:00
// Skip over comments and spaces. Can handle multi-line comments.
2018-10-15 10:13:50 +00:00
func (csp *ContentStreamParser) skipComments() error {
if _, err := csp.skipSpaces(); err != nil {
2017-01-12 20:08:55 +00:00
return err
}
isFirst := true
for {
2018-10-15 10:13:50 +00:00
bb, err := csp.reader.Peek(1)
2017-01-12 20:08:55 +00:00
if err != nil {
common.Log.Debug("Error %s", err.Error())
return err
}
if isFirst && bb[0] != '%' {
// Not a comment clearly.
return nil
}
isFirst = false
2017-01-12 20:08:55 +00:00
if (bb[0] != '\r') && (bb[0] != '\n') {
2018-10-15 10:13:50 +00:00
csp.reader.ReadByte()
2017-01-12 20:08:55 +00:00
} else {
break
}
}
// Call recursively to handle multiline comments.
2018-10-15 10:13:50 +00:00
return csp.skipComments()
2017-01-12 20:08:55 +00:00
}
2016-08-22 08:46:18 +00:00
// Parse a name starting with '/'.
2018-10-15 10:13:50 +00:00
func (csp *ContentStreamParser) parseName() (core.PdfObjectName, error) {
2016-08-22 08:46:18 +00:00
name := ""
nameStarted := false
for {
2018-10-15 10:13:50 +00:00
bb, err := csp.reader.Peek(1)
2016-08-22 08:46:18 +00:00
if err == io.EOF {
break // Can happen when loading from object stream.
}
if err != nil {
return core.PdfObjectName(name), err
2016-08-22 08:46:18 +00:00
}
if !nameStarted {
// Should always start with '/', otherwise not valid.
if bb[0] == '/' {
nameStarted = true
2018-10-15 10:13:50 +00:00
csp.reader.ReadByte()
2016-08-22 08:46:18 +00:00
} else {
common.Log.Error("Name starting with %s (% x)", bb, bb)
2018-12-08 19:16:52 +02:00
return core.PdfObjectName(name), fmt.Errorf("invalid name: (%c)", bb[0])
2016-08-22 08:46:18 +00:00
}
} else {
if core.IsWhiteSpace(bb[0]) {
2016-08-22 08:46:18 +00:00
break
} else if (bb[0] == '/') || (bb[0] == '[') || (bb[0] == '(') || (bb[0] == ']') || (bb[0] == '<') || (bb[0] == '>') {
break // Looks like start of next statement.
} else if bb[0] == '#' {
2018-10-15 10:13:50 +00:00
hexcode, err := csp.reader.Peek(3)
2016-08-22 08:46:18 +00:00
if err != nil {
return core.PdfObjectName(name), err
2016-08-22 08:46:18 +00:00
}
2018-10-15 10:13:50 +00:00
csp.reader.Discard(3)
2016-08-22 08:46:18 +00:00
code, err := hex.DecodeString(string(hexcode[1:3]))
if err != nil {
return core.PdfObjectName(name), err
2016-08-22 08:46:18 +00:00
}
name += string(code)
} else {
2018-10-15 10:13:50 +00:00
b, _ := csp.reader.ReadByte()
2016-08-22 08:46:18 +00:00
name += string(b)
}
}
}
return core.PdfObjectName(name), nil
2016-08-22 08:46:18 +00:00
}
// Numeric objects.
// Section 7.3.3.
// Integer or Float.
//
// An integer shall be written as one or more decimal digits optionally
// preceded by a sign. The value shall be interpreted as a signed
// decimal integer and shall be converted to an integer object.
//
// A real value shall be written as one or more decimal digits with an
// optional sign and a leading, trailing, or embedded PERIOD (2Eh)
// (decimal point). The value shall be interpreted as a real number
// and shall be converted to a real object.
//
// Regarding exponential numbers: 7.3.3 Numeric Objects:
// A conforming writer shall not use the PostScript syntax for numbers
// with non-decimal radices (such as 16#FFFE) or in exponential format
// (such as 6.02E23).
2017-08-04 08:26:36 +00:00
// Nonetheless, we sometimes get numbers with exponential format, so
2016-08-22 08:46:18 +00:00
// we will support it in the reader (no confusion with other types, so
// no compromise).
2018-10-15 10:13:50 +00:00
func (csp *ContentStreamParser) parseNumber() (core.PdfObject, error) {
2020-01-06 11:11:26 -08:00
return core.ParseNumber(csp.reader)
2016-08-22 08:46:18 +00:00
}
// A string starts with '(' and ends with ')'.
2018-10-15 10:13:50 +00:00
func (csp *ContentStreamParser) parseString() (*core.PdfObjectString, error) {
csp.reader.ReadByte()
2016-08-22 08:46:18 +00:00
var bytes []byte
2016-08-22 08:46:18 +00:00
count := 1
for {
2018-10-15 10:13:50 +00:00
bb, err := csp.reader.Peek(1)
2016-08-22 08:46:18 +00:00
if err != nil {
return core.MakeString(string(bytes)), err
2016-08-22 08:46:18 +00:00
}
if bb[0] == '\\' { // Escape sequence.
2018-10-15 10:13:50 +00:00
csp.reader.ReadByte() // Skip the escape \ byte.
b, err := csp.reader.ReadByte()
2016-08-22 08:46:18 +00:00
if err != nil {
return core.MakeString(string(bytes)), err
2016-08-22 08:46:18 +00:00
}
// Octal '\ddd' number (base 8).
if core.IsOctalDigit(b) {
2018-10-15 10:13:50 +00:00
bb, err := csp.reader.Peek(2)
2016-08-22 08:46:18 +00:00
if err != nil {
return core.MakeString(string(bytes)), err
2016-08-22 08:46:18 +00:00
}
var numeric []byte
2016-08-22 08:46:18 +00:00
numeric = append(numeric, b)
for _, val := range bb {
if core.IsOctalDigit(val) {
2016-08-22 08:46:18 +00:00
numeric = append(numeric, val)
} else {
break
}
}
2018-10-15 10:13:50 +00:00
csp.reader.Discard(len(numeric) - 1)
2016-08-22 08:46:18 +00:00
common.Log.Trace("Numeric string \"%s\"", numeric)
2016-08-22 08:46:18 +00:00
code, err := strconv.ParseUint(string(numeric), 8, 32)
if err != nil {
return core.MakeString(string(bytes)), err
2016-08-22 08:46:18 +00:00
}
bytes = append(bytes, byte(code))
continue
}
switch b {
case 'n':
bytes = append(bytes, '\n')
case 'r':
bytes = append(bytes, '\r')
case 't':
bytes = append(bytes, '\t')
case 'b':
bytes = append(bytes, '\b')
case 'f':
bytes = append(bytes, '\f')
case '(':
bytes = append(bytes, '(')
case ')':
bytes = append(bytes, ')')
case '\\':
bytes = append(bytes, '\\')
}
continue
} else if bb[0] == '(' {
count++
} else if bb[0] == ')' {
count--
if count == 0 {
2018-10-15 10:13:50 +00:00
csp.reader.ReadByte()
2016-08-22 08:46:18 +00:00
break
}
}
2018-10-15 10:13:50 +00:00
b, _ := csp.reader.ReadByte()
2016-08-22 08:46:18 +00:00
bytes = append(bytes, b)
}
return core.MakeString(string(bytes)), nil
2016-08-22 08:46:18 +00:00
}
2016-12-04 00:19:24 +00:00
// Starts with '<' ends with '>'.
2018-10-15 10:13:50 +00:00
func (csp *ContentStreamParser) parseHexString() (*core.PdfObjectString, error) {
csp.reader.ReadByte()
2016-12-04 00:19:24 +00:00
hextable := []byte("0123456789abcdefABCDEF")
var tmp []byte
2016-12-04 00:19:24 +00:00
for {
2018-10-15 10:13:50 +00:00
csp.skipSpaces()
2016-12-04 00:19:24 +00:00
2018-10-15 10:13:50 +00:00
bb, err := csp.reader.Peek(1)
2016-12-04 00:19:24 +00:00
if err != nil {
return core.MakeString(""), err
2016-12-04 00:19:24 +00:00
}
if bb[0] == '>' {
2018-10-15 10:13:50 +00:00
csp.reader.ReadByte()
2016-12-04 00:19:24 +00:00
break
}
2018-10-15 10:13:50 +00:00
b, _ := csp.reader.ReadByte()
2016-12-04 00:19:24 +00:00
if bytes.IndexByte(hextable, b) >= 0 {
tmp = append(tmp, b)
}
}
if len(tmp)%2 == 1 {
tmp = append(tmp, '0')
}
buf, _ := hex.DecodeString(string(tmp))
return core.MakeHexString(string(buf)), nil
2016-12-04 00:19:24 +00:00
}
2016-08-22 08:46:18 +00:00
// Starts with '[' ends with ']'. Can contain any kinds of direct objects.
2018-10-15 10:13:50 +00:00
func (csp *ContentStreamParser) parseArray() (*core.PdfObjectArray, error) {
arr := core.MakeArray()
2016-08-22 08:46:18 +00:00
2018-10-15 10:13:50 +00:00
csp.reader.ReadByte()
2016-08-22 08:46:18 +00:00
for {
2018-10-15 10:13:50 +00:00
csp.skipSpaces()
2016-08-22 08:46:18 +00:00
2018-10-15 10:13:50 +00:00
bb, err := csp.reader.Peek(1)
2016-08-22 08:46:18 +00:00
if err != nil {
return arr, err
}
if bb[0] == ']' {
2018-10-15 10:13:50 +00:00
csp.reader.ReadByte()
2016-08-22 08:46:18 +00:00
break
}
obj, _, err := csp.parseObject()
2016-08-22 08:46:18 +00:00
if err != nil {
return arr, err
}
arr.Append(obj)
2016-08-22 08:46:18 +00:00
}
return arr, nil
}
// Parse bool object.
2018-10-15 10:13:50 +00:00
func (csp *ContentStreamParser) parseBool() (core.PdfObjectBool, error) {
bb, err := csp.reader.Peek(4)
2016-08-22 08:46:18 +00:00
if err != nil {
return core.PdfObjectBool(false), err
2016-08-22 08:46:18 +00:00
}
if (len(bb) >= 4) && (string(bb[:4]) == "true") {
2018-10-15 10:13:50 +00:00
csp.reader.Discard(4)
return core.PdfObjectBool(true), nil
2016-08-22 08:46:18 +00:00
}
2018-10-15 10:13:50 +00:00
bb, err = csp.reader.Peek(5)
2016-08-22 08:46:18 +00:00
if err != nil {
return core.PdfObjectBool(false), err
2016-08-22 08:46:18 +00:00
}
if (len(bb) >= 5) && (string(bb[:5]) == "false") {
2018-10-15 10:13:50 +00:00
csp.reader.Discard(5)
return core.PdfObjectBool(false), nil
2016-08-22 08:46:18 +00:00
}
2018-12-08 19:16:52 +02:00
return core.PdfObjectBool(false), errors.New("unexpected boolean string")
2016-08-22 08:46:18 +00:00
}
// Parse null object.
2018-10-15 10:13:50 +00:00
func (csp *ContentStreamParser) parseNull() (core.PdfObjectNull, error) {
_, err := csp.reader.Discard(4)
return core.PdfObjectNull{}, err
2016-08-22 08:46:18 +00:00
}
2018-10-15 10:13:50 +00:00
func (csp *ContentStreamParser) parseDict() (*core.PdfObjectDictionary, error) {
common.Log.Trace("Reading content stream dict!")
2016-12-04 00:19:24 +00:00
dict := core.MakeDict()
2016-12-04 00:19:24 +00:00
// Pass the '<<'
2018-10-15 10:13:50 +00:00
c, _ := csp.reader.ReadByte()
2016-12-04 00:19:24 +00:00
if c != '<' {
2018-12-08 19:16:52 +02:00
return nil, errors.New("invalid dict")
2016-12-04 00:19:24 +00:00
}
2018-10-15 10:13:50 +00:00
c, _ = csp.reader.ReadByte()
2016-12-04 00:19:24 +00:00
if c != '<' {
2018-12-08 19:16:52 +02:00
return nil, errors.New("invalid dict")
2016-12-04 00:19:24 +00:00
}
for {
2018-10-15 10:13:50 +00:00
csp.skipSpaces()
2016-12-04 00:19:24 +00:00
2018-10-15 10:13:50 +00:00
bb, err := csp.reader.Peek(2)
2016-12-04 00:19:24 +00:00
if err != nil {
return nil, err
}
common.Log.Trace("Dict peek: %s (% x)!", string(bb), string(bb))
2016-12-04 00:19:24 +00:00
if (bb[0] == '>') && (bb[1] == '>') {
common.Log.Trace("EOF dictionary")
2018-10-15 10:13:50 +00:00
csp.reader.ReadByte()
csp.reader.ReadByte()
2016-12-04 00:19:24 +00:00
break
}
common.Log.Trace("Parse the name!")
2016-12-04 00:19:24 +00:00
2018-10-15 10:13:50 +00:00
keyName, err := csp.parseName()
common.Log.Trace("Key: %s", keyName)
2016-12-04 00:19:24 +00:00
if err != nil {
common.Log.Debug("ERROR Returning name err %s", err)
return nil, err
}
if len(keyName) > 4 && keyName[len(keyName)-4:] == "null" {
// Some writers have a bug where the null is appended without
// space. For example "\Boundsnull"
newKey := keyName[0 : len(keyName)-4]
common.Log.Trace("Taking care of null bug (%s)", keyName)
common.Log.Trace("New key \"%s\" = null", newKey)
2018-10-15 10:13:50 +00:00
csp.skipSpaces()
bb, _ := csp.reader.Peek(1)
2016-12-04 00:19:24 +00:00
if bb[0] == '/' {
dict.Set(newKey, core.MakeNull())
2016-12-04 00:19:24 +00:00
continue
}
}
2018-10-15 10:13:50 +00:00
csp.skipSpaces()
2016-12-04 00:19:24 +00:00
val, _, err := csp.parseObject()
2016-12-04 00:19:24 +00:00
if err != nil {
return nil, err
}
dict.Set(keyName, val)
2016-12-04 00:19:24 +00:00
common.Log.Trace("dict[%s] = %s", keyName, val.String())
2016-12-04 00:19:24 +00:00
}
return dict, nil
2016-12-04 00:19:24 +00:00
}
2016-08-22 08:46:18 +00:00
// An operand is a text command represented by a word.
2018-10-15 10:13:50 +00:00
func (csp *ContentStreamParser) parseOperand() (*core.PdfObjectString, error) {
var bytes []byte
2016-08-22 08:46:18 +00:00
for {
2018-10-15 10:13:50 +00:00
bb, err := csp.reader.Peek(1)
2016-08-22 08:46:18 +00:00
if err != nil {
return core.MakeString(string(bytes)), err
2016-08-22 08:46:18 +00:00
}
if core.IsDelimiter(bb[0]) {
2016-08-22 08:46:18 +00:00
break
}
if core.IsWhiteSpace(bb[0]) {
2016-08-22 08:46:18 +00:00
break
}
2018-10-15 10:13:50 +00:00
b, _ := csp.reader.ReadByte()
2016-08-22 08:46:18 +00:00
bytes = append(bytes, b)
}
return core.MakeString(string(bytes)), nil
2016-08-22 08:46:18 +00:00
}
// Parse a generic object. Returns the object, an error code, and a bool
// value indicating whether the object is an operand. An operand
// is contained in a pdf string object.
func (csp *ContentStreamParser) parseObject() (obj core.PdfObject, isop bool, err error) {
2016-08-22 08:46:18 +00:00
// Determine the kind of object.
// parse it!
// make a list of operands, then once operand arrives put into a package.
2018-10-15 10:13:50 +00:00
csp.skipSpaces()
2016-08-22 08:46:18 +00:00
for {
2018-10-15 10:13:50 +00:00
bb, err := csp.reader.Peek(2)
2016-08-22 08:46:18 +00:00
if err != nil {
return nil, false, err
2016-08-22 08:46:18 +00:00
}
common.Log.Trace("Peek string: %s", string(bb))
2016-08-22 08:46:18 +00:00
// Determine type.
2017-01-12 20:08:55 +00:00
if bb[0] == '%' {
2018-10-15 10:13:50 +00:00
csp.skipComments()
2017-01-12 20:08:55 +00:00
continue
} else if bb[0] == '/' {
2018-10-15 10:13:50 +00:00
name, err := csp.parseName()
common.Log.Trace("->Name: '%s'", name)
return &name, false, err
2016-08-22 08:46:18 +00:00
} else if bb[0] == '(' {
common.Log.Trace("->String!")
2018-10-15 10:13:50 +00:00
str, err := csp.parseString()
return str, false, err
2016-12-04 00:19:24 +00:00
} else if bb[0] == '<' && bb[1] != '<' {
common.Log.Trace("->Hex String!")
2018-10-15 10:13:50 +00:00
str, err := csp.parseHexString()
return str, false, err
2016-08-22 08:46:18 +00:00
} else if bb[0] == '[' {
common.Log.Trace("->Array!")
2018-10-15 10:13:50 +00:00
arr, err := csp.parseArray()
return arr, false, err
} else if core.IsFloatDigit(bb[0]) || (bb[0] == '-' && core.IsFloatDigit(bb[1])) {
common.Log.Trace("->Number!")
2018-10-15 10:13:50 +00:00
number, err := csp.parseNumber()
return number, false, err
2016-12-04 00:19:24 +00:00
} else if bb[0] == '<' && bb[1] == '<' {
2018-10-15 10:13:50 +00:00
dict, err := csp.parseDict()
return dict, false, err
2016-08-22 08:46:18 +00:00
} else {
// Otherwise, can be: keyword such as "null", "false", "true" or an operand...
common.Log.Trace("->Operand or bool?")
2016-08-22 08:46:18 +00:00
// Let's peek farther to find out.
2018-10-15 10:13:50 +00:00
bb, _ = csp.reader.Peek(5)
2016-08-22 08:46:18 +00:00
peekStr := string(bb)
common.Log.Trace("cont Peek str: %s", peekStr)
2016-08-22 08:46:18 +00:00
if (len(peekStr) > 3) && (peekStr[:4] == "null") {
2018-10-15 10:13:50 +00:00
null, err := csp.parseNull()
return &null, false, err
2016-08-22 08:46:18 +00:00
} else if (len(peekStr) > 4) && (peekStr[:5] == "false") {
2018-10-15 10:13:50 +00:00
b, err := csp.parseBool()
return &b, false, err
2016-08-22 08:46:18 +00:00
} else if (len(peekStr) > 3) && (peekStr[:4] == "true") {
2018-10-15 10:13:50 +00:00
b, err := csp.parseBool()
return &b, false, err
2016-08-22 08:46:18 +00:00
}
2018-10-15 10:13:50 +00:00
operand, err := csp.parseOperand()
if err != nil {
return operand, false, err
}
if len(operand.String()) < 1 {
return operand, false, ErrInvalidOperand
}
return operand, true, nil
2016-08-22 08:46:18 +00:00
}
}
}