Merge pull request #148 from quetz/master

Parser crude optimizations.
This commit is contained in:
Gunnsteinn Hall 2018-03-20 11:47:34 +00:00 committed by GitHub
commit e80a3dab58
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 162 additions and 98 deletions

View File

@ -74,14 +74,14 @@ func (parser *PdfParser) GetTrailer() *PdfObjectDictionary {
func (parser *PdfParser) skipSpaces() (int, error) {
cnt := 0
for {
bb, err := parser.reader.Peek(1)
b, err := parser.reader.ReadByte()
if err != nil {
return 0, err
}
if IsWhiteSpace(bb[0]) {
parser.reader.ReadByte()
if IsWhiteSpace(b) {
cnt++
} else {
parser.reader.UnreadByte()
break
}
}
@ -121,11 +121,11 @@ func (parser *PdfParser) skipComments() error {
// Read a comment starting with '%'.
func (parser *PdfParser) readComment() (string, error) {
commentText := ""
var r bytes.Buffer
_, err := parser.skipSpaces()
if err != nil {
return commentText, err
return r.String(), err
}
isFirst := true
@ -133,21 +133,21 @@ func (parser *PdfParser) readComment() (string, error) {
bb, err := parser.reader.Peek(1)
if err != nil {
common.Log.Debug("Error %s", err.Error())
return commentText, err
return r.String(), err
}
if isFirst && bb[0] != '%' {
return commentText, errors.New("Comment should start with %")
return r.String(), errors.New("Comment should start with %")
} else {
isFirst = false
}
if (bb[0] != '\r') && (bb[0] != '\n') {
b, _ := parser.reader.ReadByte()
commentText += string(b)
r.WriteByte(b)
} else {
break
}
}
return commentText, nil
return r.String(), nil
}
// Read a single line of text from current position.
@ -171,7 +171,7 @@ func (parser *PdfParser) readTextLine() (string, error) {
// Parse a name starting with '/'.
func (parser *PdfParser) parseName() (PdfObjectName, error) {
name := ""
var r bytes.Buffer
nameStarted := false
for {
bb, err := parser.reader.Peek(1)
@ -179,7 +179,7 @@ func (parser *PdfParser) parseName() (PdfObjectName, error) {
break // Can happen when loading from object stream.
}
if err != nil {
return PdfObjectName(name), err
return PdfObjectName(r.String()), err
}
if !nameStarted {
@ -192,7 +192,7 @@ func (parser *PdfParser) parseName() (PdfObjectName, error) {
parser.skipSpaces()
} else {
common.Log.Debug("ERROR Name starting with %s (% x)", bb, bb)
return PdfObjectName(name), fmt.Errorf("Invalid name: (%c)", bb[0])
return PdfObjectName(r.String()), fmt.Errorf("Invalid name: (%c)", bb[0])
}
} else {
if IsWhiteSpace(bb[0]) {
@ -202,22 +202,22 @@ func (parser *PdfParser) parseName() (PdfObjectName, error) {
} else if bb[0] == '#' {
hexcode, err := parser.reader.Peek(3)
if err != nil {
return PdfObjectName(name), err
return PdfObjectName(r.String()), err
}
parser.reader.Discard(3)
code, err := hex.DecodeString(string(hexcode[1:3]))
if err != nil {
return PdfObjectName(name), err
return PdfObjectName(r.String()), err
}
name += string(code)
r.Write(code)
} else {
b, _ := parser.reader.ReadByte()
name += string(b)
r.WriteByte(b)
}
}
}
return PdfObjectName(name), nil
return PdfObjectName(r.String()), nil
}
// Numeric objects.
@ -243,9 +243,9 @@ func (parser *PdfParser) parseName() (PdfObjectName, error) {
func (parser *PdfParser) parseNumber() (PdfObject, error) {
isFloat := false
allowSigns := true
numStr := ""
var r bytes.Buffer
for {
common.Log.Trace("Parsing number \"%s\"", numStr)
common.Log.Trace("Parsing number \"%s\"", r.String())
bb, err := parser.reader.Peek(1)
if err == io.EOF {
// GH: EOF handling. Handle EOF like end of line. Can happen with
@ -260,19 +260,19 @@ func (parser *PdfParser) parseNumber() (PdfObject, error) {
if allowSigns && (bb[0] == '-' || bb[0] == '+') {
// Only appear in the beginning, otherwise serves as a delimiter.
b, _ := parser.reader.ReadByte()
numStr += string(b)
r.WriteByte(b)
allowSigns = false // Only allowed in beginning, and after e (exponential).
} else if IsDecimalDigit(bb[0]) {
b, _ := parser.reader.ReadByte()
numStr += string(b)
r.WriteByte(b)
} else if bb[0] == '.' {
b, _ := parser.reader.ReadByte()
numStr += string(b)
r.WriteByte(b)
isFloat = true
} else if bb[0] == 'e' {
// Exponential number format.
b, _ := parser.reader.ReadByte()
numStr += string(b)
r.WriteByte(b)
isFloat = true
allowSigns = true
} else {
@ -281,11 +281,11 @@ func (parser *PdfParser) parseNumber() (PdfObject, error) {
}
if isFloat {
fVal, err := strconv.ParseFloat(numStr, 64)
fVal, err := strconv.ParseFloat(r.String(), 64)
o := PdfObjectFloat(fVal)
return &o, err
} else {
intVal, err := strconv.ParseInt(numStr, 10, 64)
intVal, err := strconv.ParseInt(r.String(), 10, 64)
o := PdfObjectInteger(intVal)
return &o, err
}
@ -295,26 +295,26 @@ func (parser *PdfParser) parseNumber() (PdfObject, error) {
func (parser *PdfParser) parseString() (PdfObjectString, error) {
parser.reader.ReadByte()
bytes := []byte{}
var r bytes.Buffer
count := 1
for {
bb, err := parser.reader.Peek(1)
if err != nil {
return PdfObjectString(bytes), err
return PdfObjectString(r.String()), err
}
if bb[0] == '\\' { // Escape sequence.
parser.reader.ReadByte() // Skip the escape \ byte.
b, err := parser.reader.ReadByte()
if err != nil {
return PdfObjectString(bytes), err
return PdfObjectString(r.String()), err
}
// Octal '\ddd' number (base 8).
if IsOctalDigit(b) {
bb, err := parser.reader.Peek(2)
if err != nil {
return PdfObjectString(bytes), err
return PdfObjectString(r.String()), err
}
numeric := []byte{}
@ -331,29 +331,29 @@ func (parser *PdfParser) parseString() (PdfObjectString, error) {
common.Log.Trace("Numeric string \"%s\"", numeric)
code, err := strconv.ParseUint(string(numeric), 8, 32)
if err != nil {
return PdfObjectString(bytes), err
return PdfObjectString(r.String()), err
}
bytes = append(bytes, byte(code))
r.WriteByte(byte(code))
continue
}
switch b {
case 'n':
bytes = append(bytes, '\n')
r.WriteRune('\n')
case 'r':
bytes = append(bytes, '\r')
r.WriteRune('\r')
case 't':
bytes = append(bytes, '\t')
r.WriteRune('\t')
case 'b':
bytes = append(bytes, '\b')
r.WriteRune('\b')
case 'f':
bytes = append(bytes, '\f')
r.WriteRune('\f')
case '(':
bytes = append(bytes, '(')
r.WriteRune('(')
case ')':
bytes = append(bytes, ')')
r.WriteRune(')')
case '\\':
bytes = append(bytes, '\\')
r.WriteRune('\\')
}
continue
@ -368,10 +368,10 @@ func (parser *PdfParser) parseString() (PdfObjectString, error) {
}
b, _ := parser.reader.ReadByte()
bytes = append(bytes, b)
r.WriteByte(b)
}
return PdfObjectString(bytes), nil
return PdfObjectString(r.String()), nil
}
// Starts with '<' ends with '>'.
@ -379,12 +379,8 @@ func (parser *PdfParser) parseString() (PdfObjectString, error) {
func (parser *PdfParser) parseHexString() (PdfObjectString, error) {
parser.reader.ReadByte()
hextable := []byte("0123456789abcdefABCDEF")
tmp := []byte{}
var r bytes.Buffer
for {
parser.skipSpaces()
bb, err := parser.reader.Peek(1)
if err != nil {
return PdfObjectString(""), err
@ -396,16 +392,16 @@ func (parser *PdfParser) parseHexString() (PdfObjectString, error) {
}
b, _ := parser.reader.ReadByte()
if bytes.IndexByte(hextable, b) >= 0 {
tmp = append(tmp, b)
if !IsWhiteSpace(b) {
r.WriteByte(b)
}
}
if len(tmp)%2 == 1 {
tmp = append(tmp, '0')
if r.Len()%2 == 1 {
r.WriteRune('0')
}
buf, _ := hex.DecodeString(string(tmp))
buf, _ := hex.DecodeString(r.String())
return PdfObjectString(buf), nil
}

View File

@ -8,6 +8,7 @@ package core
import (
"bufio"
"bytes"
"encoding/hex"
//"fmt"
"io"
//"os"
@ -27,27 +28,54 @@ func makeReaderForText(txt string) (*bytes.Reader, *bufio.Reader, int64) {
return bufReader, bufferedReader, int64(len(txt))
}
// makeParserForText returns a PdfParser whose reader fields and file size are
// populated from makeReaderForText(txt).
func makeParserForText(txt string) *PdfParser {
	parser := &PdfParser{}
	parser.rs, parser.reader, parser.fileSize = makeReaderForText(txt)
	return parser
}
// BenchmarkSkipSpaces times skipSpaces on a short run of leading whitespace
// followed by non-whitespace text.
func BenchmarkSkipSpaces(b *testing.B) {
	parser := makeParserForText(" \t\t \tABC")
	for n := 0; n < b.N; n++ {
		// Check the error so a broken reader fails the benchmark instead of
		// silently timing the error path.
		if _, err := parser.skipSpaces(); err != nil {
			b.Fatalf("Unable to skip spaces, error: %s", err)
		}
		// Reposition to offset 0 (presumably rewinding the reader) so that
		// every iteration sees the same input.
		parser.SetFileOffset(0)
	}
}
// namePairs maps raw name tokens as they appear in input (keys) to the name
// text the parser is expected to return (values). BenchmarkNameParsing
// iterates over it and compares each parseName result against the value.
var namePairs = map[string]string{
// Plain names: the leading '/' is dropped and the remainder kept verbatim.
"/Name1": "Name1",
"/ASomewhatLongerName": "ASomewhatLongerName",
"/A;Name_With-Various***Characters?": "A;Name_With-Various***Characters?",
"/1.2": "1.2",
"/$$": "$$",
"/@pattern": "@pattern",
"/.notdef": ".notdef",
// "#xx" is a two-digit hexadecimal escape that decodes to a single byte.
"/Lime#20Green": "Lime Green",
"/paired#28#29parentheses": "paired()parentheses",
"/The_Key_of_F#23_Minor": "The_Key_of_F#_Minor",
"/A#42": "AB",
// A bare '/' (optionally followed by whitespace) is expected to give the empty name.
"/": "",
"/ ": "",
// Several hex escapes interleaved with ordinary characters.
"/#3CBC88#3E#3CC5ED#3E#3CD544#3E#3CC694#3E": "<BC88><C5ED><D544><C694>",
}
// BenchmarkNameParsing parses every raw name in namePairs on each iteration
// and reports an error when parsing fails (other than with io.EOF) or when the
// parsed name differs from the expected value.
func BenchmarkNameParsing(b *testing.B) {
	for i := 0; i < b.N; i++ {
		for raw, want := range namePairs {
			got, err := makeParserForText(raw).parseName()
			// io.EOF is tolerated: the input text ends right after the name.
			if err != nil && err != io.EOF {
				b.Errorf("Unable to parse name string, error: %s", err)
			}
			if string(got) != want {
				b.Errorf("Mismatch %s != %s", got, want)
			}
		}
	}
}
func TestNameParsing(t *testing.T) {
namePairs := map[string]string{}
namePairs["/Name1"] = "Name1"
namePairs["/ASomewhatLongerName"] = "ASomewhatLongerName"
namePairs["/A;Name_With-Various***Characters?"] = "A;Name_With-Various***Characters?"
namePairs["/1.2"] = "1.2"
namePairs["/$$"] = "$$"
namePairs["/@pattern"] = "@pattern"
namePairs["/.notdef"] = ".notdef"
namePairs["/Lime#20Green"] = "Lime Green"
namePairs["/paired#28#29parentheses"] = "paired()parentheses"
namePairs["/The_Key_of_F#23_Minor"] = "The_Key_of_F#_Minor"
namePairs["/A#42"] = "AB"
namePairs["/"] = ""
namePairs["/ "] = ""
namePairs["/#3CBC88#3E#3CC5ED#3E#3CD544#3E#3CC694#3E"] = "<BC88><C5ED><D544><C694>"
for str, name := range namePairs {
parser := PdfParser{}
parser.rs, parser.reader, parser.fileSize = makeReaderForText(str)
parser := makeParserForText(str)
o, err := parser.parseName()
if err != nil && err != io.EOF {
t.Errorf("Unable to parse name string, error: %s", err)
@ -58,8 +86,7 @@ func TestNameParsing(t *testing.T) {
}
// Should fail (require starting with '/')
parser := PdfParser{}
parser.rs, parser.reader, parser.fileSize = makeReaderForText(" /Name")
parser := makeParserForText(" /Name")
_, err := parser.parseName()
if err == nil || err == io.EOF {
t.Errorf("Should be invalid name")
@ -71,33 +98,42 @@ type testStringEntry struct {
expected string
}
func TestStringParsing(t *testing.T) {
testEntries := []testStringEntry{
{"(This is a string)", "This is a string"},
{"(Strings may contain\n newlines and such)", "Strings may contain\n newlines and such"},
{"(Strings may contain balanced parenthesis () and\nspecial characters (*!&}^% and so on).)",
"Strings may contain balanced parenthesis () and\nspecial characters (*!&}^% and so on)."},
{"(These \\\ntwo strings \\\nare the same.)", "These two strings are the same."},
{"(These two strings are the same.)", "These two strings are the same."},
{"(\\\\)", "\\"},
{"(This string has an end-of-line at the end of it.\n)",
"This string has an end-of-line at the end of it.\n"},
{"(So does this one.\\n)", "So does this one.\n"},
{"(\\0053)", "\0053"},
{"(\\053)", "\053"},
{"(\\53)", "\053"},
{"(\\053)", "+"},
{"(\\53\\101)", "+A"},
func BenchmarkStringParsing(b *testing.B) {
entry := "(Strings may contain balanced parenthesis () and\nspecial characters (*!&}^% and so on).)"
parser := makeParserForText(entry)
for n := 0; n < b.N; n++ {
_, err := parser.parseString()
if err != nil && err != io.EOF {
b.Errorf("Unable to parse string, error: %s", err)
}
parser.SetFileOffset(0)
}
for _, entry := range testEntries {
parser := PdfParser{}
parser.rs, parser.reader, parser.fileSize = makeReaderForText(entry.raw)
}
// stringPairs maps raw parenthesised string tokens (keys) to the content the
// parser is expected to return (values). TestStringParsing iterates over it
// and compares each parseString result against the value.
var stringPairs = map[string]string{
// Plain content, embedded newlines and balanced nested parentheses are kept as-is.
"(This is a string)": "This is a string",
"(Strings may contain\n newlines and such)": "Strings may contain\n newlines and such",
"(Strings may contain balanced parenthesis () and\nspecial characters (*!&}^% and so on).)": "Strings may contain balanced parenthesis () and\nspecial characters (*!&}^% and so on).",
// A backslash immediately followed by a newline is a line continuation: both are dropped.
"(These \\\ntwo strings \\\nare the same.)": "These two strings are the same.",
"(These two strings are the same.)": "These two strings are the same.",
// Escaped backslash and the "\n" escape sequence.
"(\\\\)": "\\",
"(This string has an end-of-line at the end of it.\n)": "This string has an end-of-line at the end of it.\n",
"(So does this one.\\n)": "So does this one.\n",
// Octal escapes: at most three digits are consumed, so "\0053" is byte 005
// followed by the character '3'; shorter forms such as "\53" are also accepted.
"(\\0053)": "\0053",
"(\\53)": "\053",
"(\\053)": "+",
"(\\53\\101)": "+A",
}
func TestStringParsing(t *testing.T) {
for raw, expected := range stringPairs {
parser := makeParserForText(raw)
o, err := parser.parseString()
if err != nil && err != io.EOF {
t.Errorf("Unable to parse string, error: %s", err)
}
if string(o) != entry.expected {
t.Errorf("String Mismatch %s: \"%s\" != \"%s\"", entry.raw, o, entry.expected)
if string(o) != expected {
t.Errorf("String Mismatch %s: \"%s\" != \"%s\"", raw, o, expected)
}
}
}
@ -106,8 +142,7 @@ func TestReadTextLine(t *testing.T) {
// Reading a text line and then rewinding should be idempotent, that is:
// if we rewind len(str) bytes after reading string str, we should arrive back at the beginning of str.
rawText := "abc\xb0cde"
parser := PdfParser{}
parser.rs, parser.reader, parser.fileSize = makeReaderForText(rawText)
parser := makeParserForText(rawText)
s, err := parser.readTextLine()
if err != nil && err != io.EOF {
t.Errorf("Unable to parse string, error: %s", err)
@ -172,6 +207,21 @@ func TestBoolParsing(t *testing.T) {
}
}
func BenchmarkNumbericParsing(b *testing.B) {
txt1 := "[34.5 -3.62 1 +123.6 4. -.002 0.0]"
parser := PdfParser{}
parser.rs, parser.reader, parser.fileSize = makeReaderForText(txt1)
for n := 0; n < b.N; n++ {
_, err := parser.parseArray()
if err != nil {
b.Errorf("Error parsing array")
return
}
parser.SetFileOffset(0)
}
}
func TestNumericParsing1(t *testing.T) {
// 7.3.3
txt1 := "[34.5 -3.62 1 +123.6 4. -.002 0.0]"
@ -285,6 +335,25 @@ func TestNumericParsing3(t *testing.T) {
}
}
// BenchmarkHexStringParsing times parseHexString on a "<...>" hex string that
// encodes a fixed sequence of byte values, and checks that the decoded result
// matches the reference bytes.
func BenchmarkHexStringParsing(b *testing.B) {
	var ref bytes.Buffer
	// NOTE(review): the bound covers 0x00..0xfe; 0xff itself is not included.
	// Confirm whether that is intentional.
	for i := 0; i < 0xff; i++ {
		ref.WriteByte(byte(i))
	}

	// Materialise the expected string once: bytes.Buffer.String copies the
	// buffer, so calling it inside the timed loop would skew the time and
	// allocation figures with work unrelated to parsing.
	want := ref.String()
	parser := makeParserForText("<" + hex.EncodeToString(ref.Bytes()) + ">")

	for n := 0; n < b.N; n++ {
		hs, err := parser.parseHexString()
		if err != nil {
			b.Errorf("Error parsing hex string: %s", err.Error())
			return
		}
		if string(hs) != want {
			b.Errorf("Reference and parsed hex strings mismatch")
		}
		// Reposition to offset 0 (presumably rewinding the reader) so that
		// every iteration parses the same text.
		parser.SetFileOffset(0)
	}
}
func TestHexStringParsing(t *testing.T) {
// 7.3.4.3
}

View File

@ -12,9 +12,8 @@ func IsWhiteSpace(ch byte) bool {
// spaceCharacters := string([]byte{0x00, 0x09, 0x0A, 0x0C, 0x0D, 0x20})
if (ch == 0x00) || (ch == 0x09) || (ch == 0x0A) || (ch == 0x0C) || (ch == 0x0D) || (ch == 0x20) {
return true
} else {
return false
}
return false
}
// IsFloatDigit checks if a character can be a part of a float number string.