unipdf/pdf/contentstream/parser.go

/*
 * This file is subject to the terms and conditions defined in
 * file 'LICENSE.md', which is part of this source code package.
 */

package contentstream

import (
	"bufio"
	"bytes"
	"encoding/hex"
	"errors"
	"fmt"
	"io"
	"strconv"

	"github.com/unidoc/unidoc/common"
	. "github.com/unidoc/unidoc/pdf/core"
)

// ContentStreamParser reads PDF content stream data through a buffered
// reader and turns it into objects and operations.
type ContentStreamParser struct {
	reader *bufio.Reader
}

// NewContentStreamParser creates a parser that reads from the content
// stream data given in contentStr.
func NewContentStreamParser(contentStr string) *ContentStreamParser {
	// A newline is appended so that the final operand is always followed by
	// whitespace and can be read without running into an EOF error.
	source := bytes.NewBufferString(contentStr + "\n")

	return &ContentStreamParser{reader: bufio.NewReader(source)}
}

// Parse parses all commands in the content stream and returns them as a
// list of operations, in the order in which they appear.
//
// Each operation is built from zero or more parameter objects followed by
// an operand.  Reaching the end of the input ends parsing normally; any
// parameters read at that point that have not yet been followed by an
// operand are discarded.  Any other error aborts parsing and is returned
// together with a nil operation list.
func (this *ContentStreamParser) Parse() (ContentStreamOperations, error) {
	operations := ContentStreamOperations{}

	for {
		operation := ContentStreamOperation{}

		// Collect parameters until the operand that closes the operation.
		for {
			obj, err, isOperand := this.parseObject()
			if err != nil {
				if err == io.EOF {
					// End of input is the only successful way out of the
					// loop, so the summary log has to be emitted here.
					common.Log.Debug("Operation list: %v\n", operations)
					return operations, nil
				}
				return nil, err
			}
			if isOperand {
				operation.Operand = string(*obj.(*PdfObjectString))
				// The pointer is stored, so parameters appended below (the
				// inline image) are still visible through the list.
				operations = append(operations, &operation)
				break
			}
			operation.Params = append(operation.Params, obj)
		}

		if operation.Operand == "BI" {
			// Parse an inline image, reads everything between the "BI" and "EI".
			// The image is stored as the parameter.
			im, err := this.ParseInlineImage()
			if err != nil {
				return nil, err
			}
			operation.Params = append(operation.Params, im)
		}
	}
}

// skipSpaces consumes consecutive white space bytes from the reader.
// It returns the number of bytes skipped.  If peeking at the next byte
// fails, the error is returned together with a count of 0, even when some
// white space had already been consumed.
func (this *ContentStreamParser) skipSpaces() (int, error) {
	skipped := 0
	for {
		next, err := this.reader.Peek(1)
		if err != nil {
			return 0, err
		}
		if !IsWhiteSpace(next[0]) {
			return skipped, nil
		}
		this.reader.ReadByte()
		skipped++
	}
}

// skipComments skips white space and any number of consecutive '%'
// comments.  Each comment runs up to (but not including) the next carriage
// return or line feed; the line break itself is consumed as white space on
// the following pass.  It returns nil once the next byte is neither white
// space nor the start of a comment, and an error if the reader fails (for
// example on reaching the end of the input).
func (this *ContentStreamParser) skipComments() error {
	for {
		if _, err := this.skipSpaces(); err != nil {
			return err
		}

		next, err := this.reader.Peek(1)
		if err != nil {
			common.Log.Debug("Error %s", err.Error())
			return err
		}
		if next[0] != '%' {
			// Not a comment clearly.
			return nil
		}

		// Consume the comment up to the end of the line.
		for next[0] != '\r' && next[0] != '\n' {
			this.reader.ReadByte()

			next, err = this.reader.Peek(1)
			if err != nil {
				common.Log.Debug("Error %s", err.Error())
				return err
			}
		}
		// Go around again: the next line may hold another comment.
	}
}

// parseName parses a name object starting with '/'.
//
// The name ends at white space, at one of the bytes '/', '[', ']', '(',
// '<', '>' or at the end of the input.  A '#' followed by two hexadecimal
// digits is decoded into the single byte it encodes.  The returned name
// does not include the leading '/'.
//
// The name is accumulated as raw bytes.  Appending string(b) for a byte b
// would instead append the UTF-8 encoding of the code point b, turning
// every byte >= 0x80 into two bytes and disagreeing with the '#xx' path.
//
// An error is returned if the first byte is not '/', if a '#' escape is
// truncated or not valid hexadecimal, or if the reader fails with anything
// other than io.EOF.
func (this *ContentStreamParser) parseName() (PdfObjectName, error) {
	name := []byte{}
	nameStarted := false
	for {
		bb, err := this.reader.Peek(1)
		if err == io.EOF {
			break // Can happen when loading from object stream.
		}
		if err != nil {
			return PdfObjectName(name), err
		}

		if !nameStarted {
			// Should always start with '/', otherwise not valid.
			if bb[0] != '/' {
				common.Log.Error("Name starting with %s (% x)", bb, bb)
				return PdfObjectName(name), fmt.Errorf("Invalid name: (%c)", bb[0])
			}
			nameStarted = true
			this.reader.ReadByte()
			continue
		}

		if IsWhiteSpace(bb[0]) {
			break
		}
		if (bb[0] == '/') || (bb[0] == '[') || (bb[0] == '(') || (bb[0] == ']') || (bb[0] == '<') || (bb[0] == '>') {
			break // Looks like start of next statement.
		}

		if bb[0] == '#' {
			hexcode, err := this.reader.Peek(3)
			if err != nil {
				return PdfObjectName(name), err
			}
			// Copy the digits first: the peeked slice is only valid until
			// the next read operation, and Discard is one.
			digits := string(hexcode[1:3])
			this.reader.Discard(3)

			code, err := hex.DecodeString(digits)
			if err != nil {
				return PdfObjectName(name), err
			}
			name = append(name, code...)
			continue
		}

		b, _ := this.reader.ReadByte()
		name = append(name, b)
	}
	return PdfObjectName(name), nil
}

// parseNumber parses a numeric object (section 7.3.3) and returns it as an
// integer or a float object.
//
// An integer shall be written as one or more decimal digits optionally
// preceded by a sign. The value shall be interpreted as a signed
// decimal integer and shall be converted to an integer object.
//
// A real value shall be written as one or more decimal digits with an
// optional sign and a leading, trailing, or embedded PERIOD (2Eh)
// (decimal point). The value shall be interpreted as a real number
// and shall be converted to a real object.
//
// Regarding exponential numbers: 7.3.3 Numeric Objects:
// A conforming writer shall not use the PostScript syntax for numbers
// with non-decimal radices (such as 16#FFFE) or in exponential format
// (such as 6.02E23).
// Nonetheless, we sometimes get numbers with exponential format, so
// we will support it in the reader (no confusion with other types, so
// no compromise).  Both 'e' and 'E' are accepted as the exponent marker.
//
// A sign is only part of the number at its very beginning or directly
// after the exponent marker; anywhere else it ends the number, so that
// "1-2" is read as 1 followed by -2.
//
// The token is a float if it contains a period or an exponent marker and
// an integer otherwise.  The conversion error from strconv, if any, is
// returned alongside the (then meaningless) object.
func (this *ContentStreamParser) parseNumber() (PdfObject, error) {
	isFloat := false
	allowSigns := true
	numStr := []byte{}
	for {
		common.Log.Trace("Parsing number \"%s\"", numStr)
		bb, err := this.reader.Peek(1)
		if err == io.EOF {
			// GH: EOF handling.  Handle EOF like end of line.  Can happen with
			// encoded object streams that the object is at the end.
			// In other cases, we will get the EOF error elsewhere at any rate.
			break // Handle like EOF
		}
		if err != nil {
			common.Log.Error("ERROR %s", err)
			return nil, err
		}

		c := bb[0]
		if allowSigns && (c == '-' || c == '+') {
			// Only appear in the beginning, otherwise serves as a delimiter.
			allowSigns = false // Only allowed in beginning, and after e (exponential).
		} else if IsDecimalDigit(c) {
			allowSigns = false // A sign after a digit starts the next token.
		} else if c == '.' {
			isFloat = true
			allowSigns = false
		} else if c == 'e' || c == 'E' {
			// Exponential number format; the exponent may carry its own sign.
			isFloat = true
			allowSigns = true
		} else {
			break
		}

		// The byte belongs to the number: consume it.
		this.reader.ReadByte()
		numStr = append(numStr, c)
	}

	if isFloat {
		fVal, err := strconv.ParseFloat(string(numStr), 64)
		o := PdfObjectFloat(fVal)
		return &o, err
	}

	intVal, err := strconv.ParseInt(string(numStr), 10, 64)
	o := PdfObjectInteger(intVal)
	return &o, err
}

// parseString parses a literal string, which starts with '(' and ends with
// the matching ')'.  Unescaped parentheses nested inside the string are
// kept as part of the content.
//
// Backslash escapes are decoded: \n, \r, \t, \b, \f, \(, \) and \\ yield
// the corresponding byte, and a backslash followed by one to three octal
// digits yields the byte with that value.  A backslash followed by any
// other byte is dropped together with that byte.
//
// If the reader fails before the closing ')' is found, the content read so
// far is returned with the error.
func (this *ContentStreamParser) parseString() (PdfObjectString, error) {
	this.reader.ReadByte() // Opening '('.

	content := []byte{}
	depth := 1 // Nesting level of unescaped parentheses.
	for {
		next, err := this.reader.Peek(1)
		if err != nil {
			return PdfObjectString(content), err
		}

		switch next[0] {
		case '\\': // Escape sequence.
			this.reader.ReadByte() // Skip the escape \ byte.
			escaped, err := this.reader.ReadByte()
			if err != nil {
				return PdfObjectString(content), err
			}

			// Octal '\ddd' number (base 8).
			if IsOctalDigit(escaped) {
				lookahead, err := this.reader.Peek(2)
				if err != nil {
					return PdfObjectString(content), err
				}

				digits := []byte{escaped}
				for _, c := range lookahead {
					if !IsOctalDigit(c) {
						break
					}
					digits = append(digits, c)
				}
				// The first digit has already been read.
				this.reader.Discard(len(digits) - 1)

				common.Log.Trace("Numeric string \"%s\"", digits)
				code, err := strconv.ParseUint(string(digits), 8, 32)
				if err != nil {
					return PdfObjectString(content), err
				}
				content = append(content, byte(code))
				continue
			}

			var decoded byte
			recognized := true
			switch escaped {
			case 'n':
				decoded = '\n'
			case 'r':
				decoded = '\r'
			case 't':
				decoded = '\t'
			case 'b':
				decoded = '\b'
			case 'f':
				decoded = '\f'
			case '(', ')', '\\':
				decoded = escaped
			default:
				recognized = false
			}
			if recognized {
				content = append(content, decoded)
			}
			continue

		case '(':
			depth++

		case ')':
			depth--
			if depth == 0 {
				// Closing parenthesis of the string itself.
				this.reader.ReadByte()
				return PdfObjectString(content), nil
			}
		}

		literal, _ := this.reader.ReadByte()
		content = append(content, literal)
	}
}

// parseHexString parses a hexadecimal string, which starts with '<' and
// ends with '>'.  White space between digits is skipped and bytes that are
// not hexadecimal digits are ignored.  If the number of digits is odd, a
// trailing '0' is assumed.  The decoded bytes are returned as a string
// object; if the reader fails before '>' is found, an empty string is
// returned with the error.
func (this *ContentStreamParser) parseHexString() (PdfObjectString, error) {
	this.reader.ReadByte() // Opening '<'.

	hexDigits := []byte("0123456789abcdefABCDEF")

	digits := []byte{}
	for {
		this.skipSpaces()

		if _, err := this.reader.Peek(1); err != nil {
			return PdfObjectString(""), err
		}

		// The peek succeeded, so a byte is available either way.
		c, _ := this.reader.ReadByte()
		if c == '>' {
			break
		}
		if bytes.IndexByte(hexDigits, c) >= 0 {
			digits = append(digits, c)
		}
	}

	if len(digits)%2 != 0 {
		digits = append(digits, '0')
	}

	// Only validated hex digits were collected, in an even number.
	decoded, _ := hex.DecodeString(string(digits))
	return PdfObjectString(decoded), nil
}

// parseArray parses an array, which starts with '[' and ends with ']' and
// can contain any kinds of direct objects.  On error, the elements parsed
// so far are returned together with the error.
func (this *ContentStreamParser) parseArray() (PdfObjectArray, error) {
	elements := make(PdfObjectArray, 0)

	this.reader.ReadByte() // Opening '['.

	for {
		this.skipSpaces()

		next, err := this.reader.Peek(1)
		if err != nil {
			return elements, err
		}
		if next[0] == ']' {
			this.reader.ReadByte()
			return elements, nil
		}

		element, err, _ := this.parseObject()
		if err != nil {
			return elements, err
		}
		elements = append(elements, element)
	}
}

// parseBool parses a boolean object.  The input must start with the
// keyword "true" or "false", which is consumed.  If neither keyword
// follows, or the reader cannot supply enough bytes to compare against,
// false is returned together with an error and nothing is consumed.
func (this *ContentStreamParser) parseBool() (PdfObjectBool, error) {
	keywords := []struct {
		word  string
		value bool
	}{
		{"true", true},
		{"false", false},
	}

	for _, kw := range keywords {
		peeked, err := this.reader.Peek(len(kw.word))
		if err != nil {
			return PdfObjectBool(false), err
		}
		if string(peeked) == kw.word {
			this.reader.Discard(len(kw.word))
			return PdfObjectBool(kw.value), nil
		}
	}

	return PdfObjectBool(false), errors.New("Unexpected boolean string")
}

// parseNull consumes the next four bytes from the reader and returns a
// null object.  The bytes are not inspected here; the caller is expected
// to have checked that they spell "null".
func (this *ContentStreamParser) parseNull() (PdfObjectNull, error) {
	var null PdfObjectNull
	if _, err := this.reader.Discard(4); err != nil {
		return null, err
	}
	return null, nil
}

// parseDict parses a dictionary, which starts with '<<' and ends with
// '>>' and holds a sequence of name keys, each followed by a value object.
//
// A key ending in "null" that is directly followed by another name is
// treated as the key without that suffix mapped to a null object, to cope
// with writers that emit the null value without a separating space.
//
// A nil dictionary and an error are returned if the input does not start
// with '<<', if a key or value cannot be parsed, or if the input ends
// before the closing '>>'.
func (this *ContentStreamParser) parseDict() (*PdfObjectDictionary, error) {
	common.Log.Trace("Reading content stream dict!")

	dict := make(PdfObjectDictionary)

	// Pass the '<<'
	c, _ := this.reader.ReadByte()
	if c != '<' {
		return nil, errors.New("Invalid dict")
	}
	c, _ = this.reader.ReadByte()
	if c != '<' {
		return nil, errors.New("Invalid dict")
	}

	for {
		this.skipSpaces()

		bb, err := this.reader.Peek(2)
		if err != nil {
			return nil, err
		}

		common.Log.Trace("Dict peek: %s (% x)!", string(bb), string(bb))
		if (bb[0] == '>') && (bb[1] == '>') {
			common.Log.Trace("EOF dictionary")
			this.reader.ReadByte()
			this.reader.ReadByte()
			break
		}
		common.Log.Trace("Parse the name!")

		keyName, err := this.parseName()
		common.Log.Trace("Key: %s", keyName)
		if err != nil {
			common.Log.Debug("ERROR Returning name err %s", err)
			return nil, err
		}

		if len(keyName) > 4 && keyName[len(keyName)-4:] == "null" {
			// Some writers have a bug where the null is appended without
			// space.  For example "\Boundsnull"
			newKey := keyName[0 : len(keyName)-4]
			common.Log.Trace("Taking care of null bug (%s)", keyName)
			common.Log.Trace("New key \"%s\" = null", newKey)
			this.skipSpaces()
			// The input may end right after the key; report that as an
			// error instead of indexing into an empty peek result.
			bb, err := this.reader.Peek(1)
			if err != nil {
				return nil, err
			}
			if bb[0] == '/' {
				var nullObj PdfObjectNull
				dict[newKey] = &nullObj
				continue
			}
		}

		this.skipSpaces()

		val, err, _ := this.parseObject()
		if err != nil {
			return nil, err
		}
		dict[keyName] = val

		common.Log.Trace("dict[%s] = %s", keyName, val.String())
	}

	return &dict, nil
}

// parseOperand reads an operand, a text command represented by a word.
// Bytes are collected until a delimiter or white space byte is next in the
// input; that byte is left unread.  If the reader fails, the bytes read so
// far are returned with the error.
func (this *ContentStreamParser) parseOperand() (PdfObjectString, error) {
	word := []byte{}
	for {
		next, err := this.reader.Peek(1)
		if err != nil {
			return PdfObjectString(word), err
		}
		if IsDelimiter(next[0]) || IsWhiteSpace(next[0]) {
			return PdfObjectString(word), nil
		}

		c, _ := this.reader.ReadByte()
		word = append(word, c)
	}
}

// parseObject parses the next object of any kind.  It returns the object,
// an error, and a bool telling whether the object is an operand.  An
// operand is returned as a pdf string object holding the operand's text.
//
// The kind of object is decided from the first two bytes after leading
// white space: comments are skipped, and names, strings, hex strings,
// arrays, numbers and dictionaries are handed to their own parsers.
// Anything else is a null, a boolean, or an operand.
func (this *ContentStreamParser) parseObject() (PdfObject, error, bool) {
	this.skipSpaces()
	for {
		bb, err := this.reader.Peek(2)
		if err != nil {
			return nil, err, false
		}

		common.Log.Trace("Peek string: %s", string(bb))
		first, second := bb[0], bb[1]

		// Determine type.
		switch {
		case first == '%':
			this.skipComments()
			continue
		case first == '/':
			name, err := this.parseName()
			common.Log.Trace("->Name: '%s'", name)
			return &name, err, false
		case first == '(':
			common.Log.Trace("->String!")
			str, err := this.parseString()
			return &str, err, false
		case first == '<' && second != '<':
			common.Log.Trace("->Hex String!")
			str, err := this.parseHexString()
			return &str, err, false
		case first == '[':
			common.Log.Trace("->Array!")
			arr, err := this.parseArray()
			return &arr, err, false
		case IsFloatDigit(first) || (first == '-' && IsFloatDigit(second)):
			common.Log.Trace("->Number!")
			number, err := this.parseNumber()
			return number, err, false
		case first == '<' && second == '<':
			dict, err := this.parseDict()
			return dict, err, false
		}

		common.Log.Trace("->Operand or bool?")
		// Let's peek farther to find out.  A short read is fine here: the
		// prefix checks below account for the number of bytes received.
		bb, _ = this.reader.Peek(5)
		peekStr := string(bb)
		common.Log.Trace("Peek str: %s", peekStr)

		startsWith := func(word string) bool {
			return len(peekStr) >= len(word) && peekStr[:len(word)] == word
		}

		if startsWith("null") {
			null, err := this.parseNull()
			return &null, err, false
		}
		if startsWith("false") || startsWith("true") {
			b, err := this.parseBool()
			return &b, err, false
		}

		operand, err := this.parseOperand()
		return &operand, err, true
	}
}
text extraction 2016-08-22 08:46:18 +00:00			`/*`
			`* This file is subject to the terms and conditions defined in`
			`* file 'LICENSE.md', which is part of this source code package.`
			`*/`

Refactored contentstream into a separate package. 2017-03-01 16:02:53 +00:00			`package contentstream`
text extraction 2016-08-22 08:46:18 +00:00
			`import (`
			`"bufio"`
			`"bytes"`
			`"encoding/hex"`
			`"errors"`
			`"fmt"`
			`"io"`
			`"strconv"`

			`"github.com/unidoc/unidoc/common"`
Refactoring into core and model. (in process) As the package was growing large, there is a need to clean up. Testing to refactor into a core (basic pdf parsing and primitive objects) and a higher level model package. 2016-09-08 17:53:45 +00:00			`. "github.com/unidoc/unidoc/pdf/core"`
text extraction 2016-08-22 08:46:18 +00:00			`)`

			`// Content stream parser.`
			`type ContentStreamParser struct {`
			`reader *bufio.Reader`
			`}`

Support for parsing content streams and extracting text 2016-08-23 01:26:45 +00:00			`// Create a new instance of the content stream parser from an input content`
			`// stream string.`
text extraction 2016-08-22 08:46:18 +00:00			`func NewContentStreamParser(contentStr string) *ContentStreamParser {`
			`// Each command has parameters and an operand (command).`
			`parser := ContentStreamParser{}`

Initial implementation of a simple graphics state handler / content stream processor. Multiple changes and fixes in colorspace handling. And more. 2017-03-14 13:04:51 +00:00			`buffer := bytes.NewBufferString(contentStr + "\n") // Add newline at end to get last operand without EOF error.`
text extraction 2016-08-22 08:46:18 +00:00			`parser.reader = bufio.NewReader(buffer)`

			`return &parser`
			`}`

Support for parsing content streams and extracting text 2016-08-23 01:26:45 +00:00			`// Parses all commands in content stream, returning a list of operation data.`
Initial implementation of a simple graphics state handler / content stream processor. Multiple changes and fixes in colorspace handling. And more. 2017-03-14 13:04:51 +00:00			`func (this *ContentStreamParser) Parse() (ContentStreamOperations, error) {`
			`operations := ContentStreamOperations{}`
text extraction 2016-08-22 08:46:18 +00:00
			`for {`
			`operation := ContentStreamOperation{}`

			`for {`
			`obj, err, isOperand := this.parseObject()`
			`if err != nil {`
			`if err == io.EOF {`
			`return operations, nil`
			`}`
			`return nil, err`
			`}`
			`if isOperand {`
			`operation.Operand = string(obj.(PdfObjectString))`
			`operations = append(operations, &operation)`
			`break`
			`} else {`
			`operation.Params = append(operation.Params, obj)`
			`}`
			`}`
Support for LZW stream decoding. Support for content streams with inline images. 2017-02-12 21:37:07 +00:00
			`if operation.Operand == "BI" {`
			`// Parse an inline image, reads everything between the "BI" and "EI".`
			`// The image is stored as the parameter.`
			`im, err := this.ParseInlineImage()`
			`if err != nil {`
			`return nil, err`
			`}`
			`operation.Params = append(operation.Params, im)`
			`}`
text extraction 2016-08-22 08:46:18 +00:00			`}`
Support for parsing content streams and extracting text 2016-08-23 01:26:45 +00:00
			`common.Log.Debug("Operation list: %v\n", operations)`
text extraction 2016-08-22 08:46:18 +00:00			`return operations, nil`
			`}`

			`// Skip over any spaces. Returns the number of spaces skipped and`
			`// an error if any.`
			`func (this *ContentStreamParser) skipSpaces() (int, error) {`
			`cnt := 0`
			`for {`
			`bb, err := this.reader.Peek(1)`
			`if err != nil {`
			`return 0, err`
			`}`
Refactoring into core and model. (in process) As the package was growing large, there is a need to clean up. Testing to refactor into a core (basic pdf parsing and primitive objects) and a higher level model package. 2016-09-08 17:53:45 +00:00			`if IsWhiteSpace(bb[0]) {`
text extraction 2016-08-22 08:46:18 +00:00			`this.reader.ReadByte()`
			`cnt++`
			`} else {`
			`break`
			`}`
			`}`

			`return cnt, nil`
			`}`

Support comments in content streams 2017-01-12 20:08:55 +00:00			`// Skip over comments and spaces. Can handle multi-line comments.`
			`func (this *ContentStreamParser) skipComments() error {`
			`if _, err := this.skipSpaces(); err != nil {`
			`return err`
			`}`

			`isFirst := true`
			`for {`
			`bb, err := this.reader.Peek(1)`
			`if err != nil {`
			`common.Log.Debug("Error %s", err.Error())`
			`return err`
			`}`
			`if isFirst && bb[0] != '%' {`
			`// Not a comment clearly.`
			`return nil`
			`} else {`
			`isFirst = false`
			`}`
			`if (bb[0] != '\r') && (bb[0] != '\n') {`
			`this.reader.ReadByte()`
			`} else {`
			`break`
			`}`
			`}`

			`// Call recursively to handle multiline comments.`
			`return this.skipComments()`
			`}`

text extraction 2016-08-22 08:46:18 +00:00			`// Parse a name starting with '/'.`
			`func (this *ContentStreamParser) parseName() (PdfObjectName, error) {`
			`name := ""`
			`nameStarted := false`
			`for {`
			`bb, err := this.reader.Peek(1)`
			`if err == io.EOF {`
			`break // Can happen when loading from object stream.`
			`}`
			`if err != nil {`
			`return PdfObjectName(name), err`
			`}`

			`if !nameStarted {`
			`// Should always start with '/', otherwise not valid.`
			`if bb[0] == '/' {`
			`nameStarted = true`
			`this.reader.ReadByte()`
			`} else {`
			`common.Log.Error("Name starting with %s (% x)", bb, bb)`
			`return PdfObjectName(name), fmt.Errorf("Invalid name: (%c)", bb[0])`
			`}`
			`} else {`
Refactoring into core and model. (in process) As the package was growing large, there is a need to clean up. Testing to refactor into a core (basic pdf parsing and primitive objects) and a higher level model package. 2016-09-08 17:53:45 +00:00			`if IsWhiteSpace(bb[0]) {`
text extraction 2016-08-22 08:46:18 +00:00			`break`
			`} else if (bb[0] == '/') \|\| (bb[0] == '[') \|\| (bb[0] == '(') \|\| (bb[0] == ']') \|\| (bb[0] == '<') \|\| (bb[0] == '>') {`
			`break // Looks like start of next statement.`
			`} else if bb[0] == '#' {`
			`hexcode, err := this.reader.Peek(3)`
			`if err != nil {`
			`return PdfObjectName(name), err`
			`}`
			`this.reader.Discard(3)`

			`code, err := hex.DecodeString(string(hexcode[1:3]))`
			`if err != nil {`
			`return PdfObjectName(name), err`
			`}`
			`name += string(code)`
			`} else {`
			`b, _ := this.reader.ReadByte()`
			`name += string(b)`
			`}`
			`}`
			`}`
			`return PdfObjectName(name), nil`
			`}`

			`// Numeric objects.`
			`// Section 7.3.3.`
			`// Integer or Float.`
			`//`
			`// An integer shall be written as one or more decimal digits optionally`
			`// preceded by a sign. The value shall be interpreted as a signed`
			`// decimal integer and shall be converted to an integer object.`
			`//`
			`// A real value shall be written as one or more decimal digits with an`
			`// optional sign and a leading, trailing, or embedded PERIOD (2Eh)`
			`// (decimal point). The value shall be interpreted as a real number`
			`// and shall be converted to a real object.`
			`//`
			`// Regarding exponential numbers: 7.3.3 Numeric Objects:`
			`// A conforming writer shall not use the PostScript syntax for numbers`
			`// with non-decimal radices (such as 16#FFFE) or in exponential format`
			`// (such as 6.02E23).`
			`// Nontheless, we sometimes get numbers with exponential format, so`
			`// we will support it in the reader (no confusion with other types, so`
			`// no compromise).`
			`func (this *ContentStreamParser) parseNumber() (PdfObject, error) {`
			`isFloat := false`
			`allowSigns := true`
			`numStr := ""`
			`for {`
Refactored contentstream into a separate package. 2017-03-01 16:02:53 +00:00			`common.Log.Trace("Parsing number \"%s\"", numStr)`
text extraction 2016-08-22 08:46:18 +00:00			`bb, err := this.reader.Peek(1)`
			`if err == io.EOF {`
			`// GH: EOF handling. Handle EOF like end of line. Can happen with`
			`// encoded object streams that the object is at the end.`
			`// In other cases, we will get the EOF error elsewhere at any rate.`
			`break // Handle like EOF`
			`}`
			`if err != nil {`
			`common.Log.Error("ERROR %s", err)`
			`return nil, err`
			`}`
			`if allowSigns && (bb[0] == '-' \|\| bb[0] == '+') {`
			`// Only appear in the beginning, otherwise serves as a delimiter.`
			`b, _ := this.reader.ReadByte()`
			`numStr += string(b)`
			`allowSigns = false // Only allowed in beginning, and after e (exponential).`
Refactoring into core and model. (in process) As the package was growing large, there is a need to clean up. Testing to refactor into a core (basic pdf parsing and primitive objects) and a higher level model package. 2016-09-08 17:53:45 +00:00			`} else if IsDecimalDigit(bb[0]) {`
text extraction 2016-08-22 08:46:18 +00:00			`b, _ := this.reader.ReadByte()`
			`numStr += string(b)`
			`} else if bb[0] == '.' {`
			`b, _ := this.reader.ReadByte()`
			`numStr += string(b)`
			`isFloat = true`
			`} else if bb[0] == 'e' {`
			`// Exponential number format.`
			`b, _ := this.reader.ReadByte()`
			`numStr += string(b)`
			`isFloat = true`
			`allowSigns = true`
			`} else {`
			`break`
			`}`
			`}`

			`if isFloat {`
			`fVal, err := strconv.ParseFloat(numStr, 64)`
			`o := PdfObjectFloat(fVal)`
			`return &o, err`
			`} else {`
			`intVal, err := strconv.ParseInt(numStr, 10, 64)`
			`o := PdfObjectInteger(intVal)`
			`return &o, err`
			`}`
			`}`

			`// A string starts with '(' and ends with ')'.`
			`func (this *ContentStreamParser) parseString() (PdfObjectString, error) {`
			`this.reader.ReadByte()`

			`bytes := []byte{}`
			`count := 1`
			`for {`
			`bb, err := this.reader.Peek(1)`
			`if err != nil {`
			`return PdfObjectString(bytes), err`
			`}`

			`if bb[0] == '\\' { // Escape sequence.`
			`this.reader.ReadByte() // Skip the escape \ byte.`
			`b, err := this.reader.ReadByte()`
			`if err != nil {`
			`return PdfObjectString(bytes), err`
			`}`

			`// Octal '\ddd' number (base 8).`
Refactoring into core and model. (in process) As the package was growing large, there is a need to clean up. Testing to refactor into a core (basic pdf parsing and primitive objects) and a higher level model package. 2016-09-08 17:53:45 +00:00			`if IsOctalDigit(b) {`
text extraction 2016-08-22 08:46:18 +00:00			`bb, err := this.reader.Peek(2)`
			`if err != nil {`
			`return PdfObjectString(bytes), err`
			`}`

			`numeric := []byte{}`
			`numeric = append(numeric, b)`
			`for _, val := range bb {`
Refactoring into core and model. (in process) As the package was growing large, there is a need to clean up. Testing to refactor into a core (basic pdf parsing and primitive objects) and a higher level model package. 2016-09-08 17:53:45 +00:00			`if IsOctalDigit(val) {`
text extraction 2016-08-22 08:46:18 +00:00			`numeric = append(numeric, val)`
			`} else {`
			`break`
			`}`
			`}`
			`this.reader.Discard(len(numeric) - 1)`

Refactored contentstream into a separate package. 2017-03-01 16:02:53 +00:00			`common.Log.Trace("Numeric string \"%s\"", numeric)`
text extraction 2016-08-22 08:46:18 +00:00			`code, err := strconv.ParseUint(string(numeric), 8, 32)`
			`if err != nil {`
			`return PdfObjectString(bytes), err`
			`}`
			`bytes = append(bytes, byte(code))`
			`continue`
			`}`

			`switch b {`
			`case 'n':`
			`bytes = append(bytes, '\n')`
			`case 'r':`
			`bytes = append(bytes, '\r')`
			`case 't':`
			`bytes = append(bytes, '\t')`
			`case 'b':`
			`bytes = append(bytes, '\b')`
			`case 'f':`
			`bytes = append(bytes, '\f')`
			`case '(':`
			`bytes = append(bytes, '(')`
			`case ')':`
			`bytes = append(bytes, ')')`
			`case '\\':`
			`bytes = append(bytes, '\\')`
			`}`

			`continue`
			`} else if bb[0] == '(' {`
			`count++`
			`} else if bb[0] == ')' {`
			`count--`
			`if count == 0 {`
			`this.reader.ReadByte()`
			`break`
			`}`
			`}`

			`b, _ := this.reader.ReadByte()`
			`bytes = append(bytes, b)`
			`}`

			`return PdfObjectString(bytes), nil`
			`}`

Improved content stream parsing 2016-12-04 00:19:24 +00:00			`// Starts with '<' ends with '>'.`
			`func (this *ContentStreamParser) parseHexString() (PdfObjectString, error) {`
			`this.reader.ReadByte()`

			`hextable := []byte("0123456789abcdefABCDEF")`

			`tmp := []byte{}`
			`for {`
			`this.skipSpaces()`

			`bb, err := this.reader.Peek(1)`
			`if err != nil {`
			`return PdfObjectString(""), err`
			`}`

			`if bb[0] == '>' {`
			`this.reader.ReadByte()`
			`break`
			`}`

			`b, _ := this.reader.ReadByte()`
			`if bytes.IndexByte(hextable, b) >= 0 {`
			`tmp = append(tmp, b)`
			`}`
			`}`

			`if len(tmp)%2 == 1 {`
			`tmp = append(tmp, '0')`
			`}`

			`buf, _ := hex.DecodeString(string(tmp))`
			`return PdfObjectString(buf), nil`
			`}`

text extraction 2016-08-22 08:46:18 +00:00			`// Starts with '[' ends with ']'. Can contain any kinds of direct objects.`
			`func (this *ContentStreamParser) parseArray() (PdfObjectArray, error) {`
			`arr := make(PdfObjectArray, 0)`

			`this.reader.ReadByte()`

			`for {`
			`this.skipSpaces()`

			`bb, err := this.reader.Peek(1)`
			`if err != nil {`
			`return arr, err`
			`}`

			`if bb[0] == ']' {`
			`this.reader.ReadByte()`
			`break`
			`}`

			`obj, err, _ := this.parseObject()`
			`if err != nil {`
			`return arr, err`
			`}`
			`arr = append(arr, obj)`
			`}`

			`return arr, nil`
			`}`

			`// Parse bool object.`
			`func (this *ContentStreamParser) parseBool() (PdfObjectBool, error) {`
			`bb, err := this.reader.Peek(4)`
			`if err != nil {`
			`return PdfObjectBool(false), err`
			`}`
			`if (len(bb) >= 4) && (string(bb[:4]) == "true") {`
			`this.reader.Discard(4)`
			`return PdfObjectBool(true), nil`
			`}`

			`bb, err = this.reader.Peek(5)`
			`if err != nil {`
			`return PdfObjectBool(false), err`
			`}`
			`if (len(bb) >= 5) && (string(bb[:5]) == "false") {`
			`this.reader.Discard(5)`
			`return PdfObjectBool(false), nil`
			`}`

			`return PdfObjectBool(false), errors.New("Unexpected boolean string")`
			`}`

			`// Parse null object.`
			`func (this *ContentStreamParser) parseNull() (PdfObjectNull, error) {`
			`_, err := this.reader.Discard(4)`
			`return PdfObjectNull{}, err`
			`}`

Improved content stream parsing 2016-12-04 00:19:24 +00:00			`func (this ContentStreamParser) parseDict() (PdfObjectDictionary, error) {`
Refactored contentstream into a separate package. 2017-03-01 16:02:53 +00:00			`common.Log.Trace("Reading content stream dict!")`
Improved content stream parsing 2016-12-04 00:19:24 +00:00
			`dict := make(PdfObjectDictionary)`

			`// Pass the '<<'`
			`c, _ := this.reader.ReadByte()`
			`if c != '<' {`
			`return nil, errors.New("Invalid dict")`
			`}`
			`c, _ = this.reader.ReadByte()`
			`if c != '<' {`
			`return nil, errors.New("Invalid dict")`
			`}`

			`for {`
			`this.skipSpaces()`

			`bb, err := this.reader.Peek(2)`
			`if err != nil {`
			`return nil, err`
			`}`

Refactored contentstream into a separate package. 2017-03-01 16:02:53 +00:00			`common.Log.Trace("Dict peek: %s (% x)!", string(bb), string(bb))`
Improved content stream parsing 2016-12-04 00:19:24 +00:00			`if (bb[0] == '>') && (bb[1] == '>') {`
Refactored contentstream into a separate package. 2017-03-01 16:02:53 +00:00			`common.Log.Trace("EOF dictionary")`
Improved content stream parsing 2016-12-04 00:19:24 +00:00			`this.reader.ReadByte()`
			`this.reader.ReadByte()`
			`break`
			`}`
Refactored contentstream into a separate package. 2017-03-01 16:02:53 +00:00			`common.Log.Trace("Parse the name!")`
Improved content stream parsing 2016-12-04 00:19:24 +00:00
			`keyName, err := this.parseName()`
Refactored contentstream into a separate package. 2017-03-01 16:02:53 +00:00			`common.Log.Trace("Key: %s", keyName)`
Improved content stream parsing 2016-12-04 00:19:24 +00:00			`if err != nil {`
			`common.Log.Debug("ERROR Returning name err %s", err)`
			`return nil, err`
			`}`

			`if len(keyName) > 4 && keyName[len(keyName)-4:] == "null" {`
			`// Some writers have a bug where the null is appended without`
			`// space. For example "\Boundsnull"`
			`newKey := keyName[0 : len(keyName)-4]`
Refactored contentstream into a separate package. 2017-03-01 16:02:53 +00:00			`common.Log.Trace("Taking care of null bug (%s)", keyName)`
			`common.Log.Trace("New key \"%s\" = null", newKey)`
Improved content stream parsing 2016-12-04 00:19:24 +00:00			`this.skipSpaces()`
			`bb, _ := this.reader.Peek(1)`
			`if bb[0] == '/' {`
			`var nullObj PdfObjectNull`
			`dict[newKey] = &nullObj`
			`continue`
			`}`
			`}`

			`this.skipSpaces()`

			`val, err, _ := this.parseObject()`
			`if err != nil {`
			`return nil, err`
			`}`
			`dict[keyName] = val`

Refactored contentstream into a separate package. 2017-03-01 16:02:53 +00:00			`common.Log.Trace("dict[%s] = %s", keyName, val.String())`
Improved content stream parsing 2016-12-04 00:19:24 +00:00			`}`

			`return &dict, nil`
			`}`

text extraction 2016-08-22 08:46:18 +00:00			`// An operand is a text command represented by a word.`
			`func (this *ContentStreamParser) parseOperand() (PdfObjectString, error) {`
			`bytes := []byte{}`
			`for {`
			`bb, err := this.reader.Peek(1)`
			`if err != nil {`
			`return PdfObjectString(bytes), err`
			`}`
Refactoring into core and model. (in process) As the package was growing large, there is a need to clean up. Testing to refactor into a core (basic pdf parsing and primitive objects) and a higher level model package. 2016-09-08 17:53:45 +00:00			`if IsDelimiter(bb[0]) {`
text extraction 2016-08-22 08:46:18 +00:00			`break`
			`}`
Refactoring into core and model. (in process) As the package was growing large, there is a need to clean up. Testing to refactor into a core (basic pdf parsing and primitive objects) and a higher level model package. 2016-09-08 17:53:45 +00:00			`if IsWhiteSpace(bb[0]) {`
text extraction 2016-08-22 08:46:18 +00:00			`break`
			`}`

			`b, _ := this.reader.ReadByte()`
			`bytes = append(bytes, b)`
			`}`

			`return PdfObjectString(bytes), nil`
			`}`

Support for parsing content streams and extracting text 2016-08-23 01:26:45 +00:00			`// Parse a generic object. Returns the object, an error code, and a bool`
			`// value indicating whether the object is an operand. An operand`
			`// is contained in a pdf string object.`
text extraction 2016-08-22 08:46:18 +00:00			`func (this *ContentStreamParser) parseObject() (PdfObject, error, bool) {`
			`// Determine the kind of object.`
			`// parse it!`
			`// make a list of operands, then once operand arrives put into a package.`

			`this.skipSpaces()`
			`for {`
			`bb, err := this.reader.Peek(2)`
			`if err != nil {`
			`return nil, err, false`
			`}`

Refactored contentstream into a separate package. 2017-03-01 16:02:53 +00:00			`common.Log.Trace("Peek string: %s", string(bb))`
text extraction 2016-08-22 08:46:18 +00:00			`// Determine type.`
Support comments in content streams 2017-01-12 20:08:55 +00:00			`if bb[0] == '%' {`
			`this.skipComments()`
			`continue`
			`} else if bb[0] == '/' {`
text extraction 2016-08-22 08:46:18 +00:00			`name, err := this.parseName()`
Refactored contentstream into a separate package. 2017-03-01 16:02:53 +00:00			`common.Log.Trace("->Name: '%s'", name)`
text extraction 2016-08-22 08:46:18 +00:00			`return &name, err, false`
			`} else if bb[0] == '(' {`
Refactored contentstream into a separate package. 2017-03-01 16:02:53 +00:00			`common.Log.Trace("->String!")`
text extraction 2016-08-22 08:46:18 +00:00			`str, err := this.parseString()`
			`return &str, err, false`
Improved content stream parsing 2016-12-04 00:19:24 +00:00			`} else if bb[0] == '<' && bb[1] != '<' {`
Refactored contentstream into a separate package. 2017-03-01 16:02:53 +00:00			`common.Log.Trace("->Hex String!")`
Improved content stream parsing 2016-12-04 00:19:24 +00:00			`str, err := this.parseHexString()`
			`return &str, err, false`
text extraction 2016-08-22 08:46:18 +00:00			`} else if bb[0] == '[' {`
Refactored contentstream into a separate package. 2017-03-01 16:02:53 +00:00			`common.Log.Trace("->Array!")`
text extraction 2016-08-22 08:46:18 +00:00			`arr, err := this.parseArray()`
			`return &arr, err, false`
Parse floating point numbers correctly in content streams e.g. .1 .1 .2 RG 2017-03-29 17:31:46 +11:00			`} else if IsFloatDigit(bb[0]) \|\| (bb[0] == '-' && IsFloatDigit(bb[1])) {`
Refactored contentstream into a separate package. 2017-03-01 16:02:53 +00:00			`common.Log.Trace("->Number!")`
text extraction 2016-08-22 08:46:18 +00:00			`number, err := this.parseNumber()`
			`return number, err, false`
Improved content stream parsing 2016-12-04 00:19:24 +00:00			`} else if bb[0] == '<' && bb[1] == '<' {`
			`dict, err := this.parseDict()`
			`return dict, err, false`
text extraction 2016-08-22 08:46:18 +00:00			`} else {`
Refactored contentstream into a separate package. 2017-03-01 16:02:53 +00:00			`common.Log.Trace("->Operand or bool?")`
text extraction 2016-08-22 08:46:18 +00:00			`// Let's peek farther to find out.`
			`bb, _ = this.reader.Peek(5)`
			`peekStr := string(bb)`
Refactored contentstream into a separate package. 2017-03-01 16:02:53 +00:00			`common.Log.Trace("Peek str: %s", peekStr)`
text extraction 2016-08-22 08:46:18 +00:00
			`if (len(peekStr) > 3) && (peekStr[:4] == "null") {`
			`null, err := this.parseNull()`
			`return &null, err, false`
			`} else if (len(peekStr) > 4) && (peekStr[:5] == "false") {`
			`b, err := this.parseBool()`
			`return &b, err, false`
			`} else if (len(peekStr) > 3) && (peekStr[:4] == "true") {`
			`b, err := this.parseBool()`
			`return &b, err, false`
			`}`

			`operand, err := this.parseOperand()`
			`return &operand, err, true`
			`}`
			`}`
			`}`