Unify and optimize number parsing

This commit is contained in:
Samuel Stauffer 2019-12-18 12:30:54 -08:00
parent d0f9c139ad
commit e85397b57a
6 changed files with 160 additions and 254 deletions

View File

@ -16,6 +16,7 @@ import (
"github.com/unidoc/unipdf/v3/common"
"github.com/unidoc/unipdf/v3/core"
"github.com/unidoc/unipdf/v3/internal/parseutils"
)
// ContentStreamParser represents a content stream parser for parsing content streams in PDFs.
@ -191,67 +192,19 @@ func (csp *ContentStreamParser) parseName() (core.PdfObjectName, error) {
// we will support it in the reader (no confusion with other types, so
// no compromise).
func (csp *ContentStreamParser) parseNumber() (core.PdfObject, error) {
isFloat := false
allowSigns := true
numStr := ""
for {
common.Log.Trace("Parsing number \"%s\"", numStr)
bb, err := csp.reader.Peek(1)
if err == io.EOF {
// GH: EOF handling. Handle EOF like end of line. Can happen with
// encoded object streams that the object is at the end.
// In other cases, we will get the EOF error elsewhere at any rate.
break // Handle like EOF
}
if err != nil {
common.Log.Error("ERROR %s", err)
return nil, err
}
if allowSigns && (bb[0] == '-' || bb[0] == '+') {
// Only appear in the beginning, otherwise serves as a delimiter.
b, _ := csp.reader.ReadByte()
numStr += string(b)
allowSigns = false // Only allowed in beginning, and after e (exponential).
} else if core.IsDecimalDigit(bb[0]) {
b, _ := csp.reader.ReadByte()
numStr += string(b)
} else if bb[0] == '.' {
b, _ := csp.reader.ReadByte()
numStr += string(b)
isFloat = true
} else if bb[0] == 'e' {
// Exponential number format.
b, _ := csp.reader.ReadByte()
numStr += string(b)
isFloat = true
allowSigns = true
} else {
break
}
num, err := parseutils.ParseNumber(csp.reader)
if err != nil {
return nil, err
}
var o core.PdfObject
if isFloat {
fVal, err := strconv.ParseFloat(numStr, 64)
if err != nil {
common.Log.Debug("Error parsing number %q err=%v. Using 0.0. Output may be incorrect", numStr, err)
fVal = 0.0
}
objFloat := core.PdfObjectFloat(fVal)
o = &objFloat
} else {
intVal, err := strconv.ParseInt(numStr, 10, 64)
if err != nil {
common.Log.Debug("Error parsing integer %q err=%v. Using 0. Output may be incorrect", numStr, err)
intVal = 0
}
objInt := core.PdfObjectInteger(intVal)
o = &objInt
switch num := num.(type) {
case float64:
o := core.PdfObjectFloat(num)
return &o, nil
case int64:
o := core.PdfObjectInteger(num)
return &o, nil
}
return o, nil
return nil, fmt.Errorf("unhandled number type %T", num)
}
// A string starts with '(' and ends with ')'.

View File

@ -20,6 +20,7 @@ import (
"github.com/unidoc/unipdf/v3/common"
"github.com/unidoc/unipdf/v3/core/security"
"github.com/unidoc/unipdf/v3/internal/parseutils"
)
// Regular Expressions for parsing and identifying object signatures.
@ -286,69 +287,19 @@ func (parser *PdfParser) parseName() (PdfObjectName, error) {
// we will support it in the reader (no confusion with other types, so
// no compromise).
func (parser *PdfParser) parseNumber() (PdfObject, error) {
isFloat := false
allowSigns := true
var r bytes.Buffer
for {
common.Log.Trace("Parsing number \"%s\"", r.String())
bb, err := parser.reader.Peek(1)
if err == io.EOF {
// GH: EOF handling. Handle EOF like end of line. Can happen with
// encoded object streams that the object is at the end.
// In other cases, we will get the EOF error elsewhere at any rate.
break // Handle like EOF
}
if err != nil {
common.Log.Debug("ERROR %s", err)
return nil, err
}
if allowSigns && (bb[0] == '-' || bb[0] == '+') {
// Only appear in the beginning, otherwise serves as a delimiter.
b, _ := parser.reader.ReadByte()
r.WriteByte(b)
allowSigns = false // Only allowed in beginning, and after e (exponential).
} else if IsDecimalDigit(bb[0]) {
b, _ := parser.reader.ReadByte()
r.WriteByte(b)
} else if bb[0] == '.' {
b, _ := parser.reader.ReadByte()
r.WriteByte(b)
isFloat = true
} else if bb[0] == 'e' || bb[0] == 'E' {
// Exponential number format.
b, _ := parser.reader.ReadByte()
r.WriteByte(b)
isFloat = true
allowSigns = true
} else {
break
}
num, err := parseutils.ParseNumber(parser.reader)
if err != nil {
return nil, err
}
var o PdfObject
if isFloat {
fVal, err := strconv.ParseFloat(r.String(), 64)
if err != nil {
common.Log.Debug("Error parsing number %v err=%v. Using 0.0. Output may be incorrect", r.String(), err)
fVal = 0.0
err = nil
}
objFloat := PdfObjectFloat(fVal)
o = &objFloat
} else {
intVal, err := strconv.ParseInt(r.String(), 10, 64)
if err != nil {
common.Log.Debug("Error parsing number %v err=%v. Using 0. Output may be incorrect", r.String(), err)
intVal = 0
err = nil
}
objInt := PdfObjectInteger(intVal)
o = &objInt
switch num := num.(type) {
case float64:
o := PdfObjectFloat(num)
return &o, nil
case int64:
o := PdfObjectInteger(num)
return &o, nil
}
return o, nil
return nil, fmt.Errorf("unhandled number type %T", num)
}
// A string starts with '(' and ends with ')'.

View File

@ -19,6 +19,7 @@ import (
"github.com/unidoc/unipdf/v3/common"
"github.com/unidoc/unipdf/v3/core"
"github.com/unidoc/unipdf/v3/internal/parseutils"
)
// Regular Expressions for parsing and identifying object signatures.
@ -212,54 +213,19 @@ func (parser *fdfParser) parseName() (core.PdfObjectName, error) {
// we will support it in the reader (no confusion with other types, so
// no compromise).
func (parser *fdfParser) parseNumber() (core.PdfObject, error) {
isFloat := false
allowSigns := true
var r bytes.Buffer
for {
common.Log.Trace("Parsing number \"%s\"", r.String())
bb, err := parser.reader.Peek(1)
if err == io.EOF {
// GH: EOF handling. Handle EOF like end of line. Can happen with
// encoded object streams that the object is at the end.
// In other cases, we will get the EOF error elsewhere at any rate.
break // Handle like EOF
}
if err != nil {
common.Log.Debug("ERROR %s", err)
return nil, err
}
if allowSigns && (bb[0] == '-' || bb[0] == '+') {
// Only appear in the beginning, otherwise serves as a delimiter.
b, _ := parser.reader.ReadByte()
r.WriteByte(b)
allowSigns = false // Only allowed in beginning, and after e (exponential).
} else if core.IsDecimalDigit(bb[0]) {
b, _ := parser.reader.ReadByte()
r.WriteByte(b)
} else if bb[0] == '.' {
b, _ := parser.reader.ReadByte()
r.WriteByte(b)
isFloat = true
} else if bb[0] == 'e' {
// Exponential number format.
b, _ := parser.reader.ReadByte()
r.WriteByte(b)
isFloat = true
allowSigns = true
} else {
break
}
num, err := parseutils.ParseNumber(parser.reader)
if err != nil {
return nil, err
}
if isFloat {
fVal, err := strconv.ParseFloat(r.String(), 64)
o := core.PdfObjectFloat(fVal)
return &o, err
} else {
intVal, err := strconv.ParseInt(r.String(), 10, 64)
o := core.PdfObjectInteger(intVal)
return &o, err
switch num := num.(type) {
case float64:
o := core.PdfObjectFloat(num)
return &o, nil
case int64:
o := core.PdfObjectInteger(num)
return &o, nil
}
return nil, fmt.Errorf("unhandled number type %T", num)
}
// A string starts with '(' and ends with ')'.

View File

@ -15,6 +15,7 @@ import (
"github.com/unidoc/unipdf/v3/common"
"github.com/unidoc/unipdf/v3/core"
"github.com/unidoc/unipdf/v3/internal/parseutils"
)
// cMapParser parses CMap character to unicode mapping files.
@ -392,49 +393,17 @@ func (p *cMapParser) parseDict() (cmapDict, error) {
// parseNumber parses a PDF number.
func (p *cMapParser) parseNumber() (cmapObject, error) {
isFloat := false
allowSigns := true
numStr := bytes.Buffer{}
for {
bb, err := p.reader.Peek(1)
if err == io.EOF {
break
}
if err != nil {
return nil, err
}
if allowSigns && (bb[0] == '-' || bb[0] == '+') {
// Only appear in the beginning, otherwise serves as a delimiter.
b, _ := p.reader.ReadByte()
numStr.WriteByte(b)
allowSigns = false // Only allowed in beginning, and after e (exponential).
} else if core.IsDecimalDigit(bb[0]) {
b, _ := p.reader.ReadByte()
numStr.WriteByte(b)
} else if bb[0] == '.' {
b, _ := p.reader.ReadByte()
numStr.WriteByte(b)
isFloat = true
} else if bb[0] == 'e' {
// Exponential number format.
b, _ := p.reader.ReadByte()
numStr.WriteByte(b)
isFloat = true
allowSigns = true
} else {
break
}
num, err := parseutils.ParseNumber(p.reader)
if err != nil {
return nil, err
}
if isFloat {
fVal, err := strconv.ParseFloat(numStr.String(), 64)
o := cmapFloat{fVal}
return o, err
switch num := num.(type) {
case float64:
return cmapFloat{num}, nil
case int64:
return cmapInt{num}, nil
}
intVal, err := strconv.ParseInt(numStr.String(), 10, 64)
o := cmapInt{intVal}
return o, err
return nil, fmt.Errorf("unhandled number type %T", num)
}
// parseOperand parses an operand, which is a text command represented by a word.

View File

@ -0,0 +1,103 @@
package parseutils
import (
"bufio"
"bytes"
"io"
"strconv"
"github.com/unidoc/unipdf/v3/common"
)
// ParseNumber parses a numeric object from a buffered stream.
// Section 7.3.3.
// Integer or Float.
//
// An integer shall be written as one or more decimal digits optionally
// preceded by a sign. The value shall be interpreted as a signed
// decimal integer and shall be converted to an integer object.
//
// A real value shall be written as one or more decimal digits with an
// optional sign and a leading, trailing, or embedded PERIOD (2Eh)
// (decimal point). The value shall be interpreted as a real number
// and shall be converted to a real object.
//
// Regarding exponential numbers: 7.3.3 Numeric Objects:
// A conforming writer shall not use the PostScript syntax for numbers
// with non-decimal radices (such as 16#FFFE) or in exponential format
// (such as 6.02E23).
// Nonetheless, we sometimes get numbers with exponential format, so
// we will support it in the reader (no confusion with other types, so
// no compromise).
//
// Bytes are consumed from bufr for as long as they can belong to a number:
// decimal digits, '.', 'e'/'E' and a '+' or '-' sign. The first byte that
// does not fit is left unread. Hitting io.EOF ends the number rather than
// being reported; any other read error is returned with a nil value.
//
// The result is a float64 when the token contained a '.' or an exponent
// marker, and an int64 otherwise. A token that strconv cannot convert is
// logged at debug level and replaced by a zero of the matching type, so a
// non-nil error always originates from the reader.
func ParseNumber(bufr *bufio.Reader) (interface{}, error) {
	isFloat := false
	allowSigns := true
	var r bytes.Buffer

scan:
	for {
		if common.Log.IsLogLevel(common.LogLevelTrace) {
			common.Log.Trace("Parsing number \"%s\"", r.String())
		}
		bb, err := bufr.Peek(1)
		if err == io.EOF {
			// GH: EOF handling. Handle EOF like end of line. Can happen with
			// encoded object streams that the object is at the end.
			// In other cases, we will get the EOF error elsewhere at any rate.
			break
		}
		if err != nil {
			common.Log.Debug("ERROR %s", err)
			return nil, err
		}

		// Copy the byte out before reading: bb aliases the reader's buffer.
		c := bb[0]
		switch {
		case allowSigns && (c == '-' || c == '+'):
			// A sign is accepted while allowSigns is set, i.e. until the first
			// sign has been consumed and again after an exponent marker.
			// Digits do not clear the flag, so the first sign following
			// digits (as in "12-3") is still consumed into the token.
			allowSigns = false
		case IsDecimalDigit(c):
			// Plain digit: no state change.
		case c == '.':
			isFloat = true
		case c == 'e' || c == 'E':
			// Exponential number format; the exponent may carry its own sign.
			isFloat = true
			allowSigns = true
		default:
			// Not part of a number: leave the byte in the reader.
			break scan
		}

		// Peek(1) succeeded, so this read is served from the buffer.
		if _, err := bufr.ReadByte(); err != nil {
			return nil, err
		}
		r.WriteByte(c)
	}

	token := r.String()
	if isFloat {
		fVal, err := strconv.ParseFloat(token, 64)
		if err != nil {
			// Best effort: keep parsing the document with a zero value.
			common.Log.Debug("Error parsing number %v err=%v. Using 0.0. Output may be incorrect", token, err)
			fVal = 0.0
		}
		return fVal, nil
	}

	intVal, err := strconv.ParseInt(token, 10, 64)
	if err != nil {
		// Best effort: keep parsing the document with a zero value.
		common.Log.Debug("Error parsing number %v err=%v. Using 0. Output may be incorrect", token, err)
		intVal = 0
	}
	return intVal, nil
}
// IsDecimalDigit reports whether c is an ASCII decimal digit ('0' through '9').
func IsDecimalDigit(c byte) bool {
	return c >= '0' && c <= '9'
}

View File

@ -9,11 +9,12 @@ import (
"bufio"
"bytes"
"errors"
"fmt"
"io"
"strconv"
"github.com/unidoc/unipdf/v3/common"
pdfcore "github.com/unidoc/unipdf/v3/core"
"github.com/unidoc/unipdf/v3/internal/parseutils"
)
// PSParser is a basic Postscript parser.
@ -145,54 +146,17 @@ func (p *PSParser) skipSpaces() (int, error) {
// Numeric objects.
// Integer or Real numbers.
func (p *PSParser) parseNumber() (PSObject, error) {
isFloat := false
allowSigns := true
numStr := ""
for {
common.Log.Trace("Parsing number \"%s\"", numStr)
bb, err := p.reader.Peek(1)
if err == io.EOF {
// GH: EOF handling. Handle EOF like end of line. Can happen with
// encoded object streams that the object is at the end.
// In other cases, we will get the EOF error elsewhere at any rate.
break // Handle like EOF
}
if err != nil {
common.Log.Debug("PS ERROR: %s", err)
return nil, err
}
if allowSigns && (bb[0] == '-' || bb[0] == '+') {
// Only appear in the beginning, otherwise serves as a delimiter.
b, _ := p.reader.ReadByte()
numStr += string(b)
allowSigns = false // Only allowed in beginning, and after e (exponential).
} else if pdfcore.IsDecimalDigit(bb[0]) {
b, _ := p.reader.ReadByte()
numStr += string(b)
} else if bb[0] == '.' {
b, _ := p.reader.ReadByte()
numStr += string(b)
isFloat = true
} else if bb[0] == 'e' {
// Exponential number format.
// TODO: Is this supported in PS?
b, _ := p.reader.ReadByte()
numStr += string(b)
isFloat = true
allowSigns = true
} else {
break
}
num, err := parseutils.ParseNumber(p.reader)
if err != nil {
return nil, err
}
if isFloat {
fVal, err := strconv.ParseFloat(numStr, 64)
o := MakeReal(fVal)
return o, err
switch num := num.(type) {
case float64:
return MakeReal(num), nil
case int64:
return MakeInteger(int(num)), nil
}
intVal, err := strconv.ParseInt(numStr, 10, 64)
o := MakeInteger(int(intVal))
return o, err
return nil, fmt.Errorf("unhandled number type %T", num)
}
// Parse bool object.