diff --git a/contentstream/parser.go b/contentstream/parser.go index 17e9d772..7695ac16 100644 --- a/contentstream/parser.go +++ b/contentstream/parser.go @@ -16,7 +16,6 @@ import ( "github.com/unidoc/unipdf/v3/common" "github.com/unidoc/unipdf/v3/core" - "github.com/unidoc/unipdf/v3/internal/parseutils" ) // ContentStreamParser represents a content stream parser for parsing content streams in PDFs. @@ -192,19 +191,7 @@ func (csp *ContentStreamParser) parseName() (core.PdfObjectName, error) { // we will support it in the reader (no confusion with other types, so // no compromise). func (csp *ContentStreamParser) parseNumber() (core.PdfObject, error) { - num, err := parseutils.ParseNumber(csp.reader) - if err != nil { - return nil, err - } - switch num := num.(type) { - case float64: - o := core.PdfObjectFloat(num) - return &o, nil - case int64: - o := core.PdfObjectInteger(num) - return &o, nil - } - return nil, fmt.Errorf("unhandled number type %T", num) + return core.ParseNumber(csp.reader) } // A string starts with '(' and ends with ')'. diff --git a/core/parser.go b/core/parser.go index fdcf2b3e..e20325a7 100755 --- a/core/parser.go +++ b/core/parser.go @@ -20,7 +20,6 @@ import ( "github.com/unidoc/unipdf/v3/common" "github.com/unidoc/unipdf/v3/core/security" - "github.com/unidoc/unipdf/v3/internal/parseutils" ) // Regular Expressions for parsing and identifying object signatures. @@ -287,19 +286,7 @@ func (parser *PdfParser) parseName() (PdfObjectName, error) { // we will support it in the reader (no confusion with other types, so // no compromise). 
func (parser *PdfParser) parseNumber() (PdfObject, error) { - num, err := parseutils.ParseNumber(parser.reader) - if err != nil { - return nil, err - } - switch num := num.(type) { - case float64: - o := PdfObjectFloat(num) - return &o, nil - case int64: - o := PdfObjectInteger(num) - return &o, nil - } - return nil, fmt.Errorf("unhandled number type %T", num) + return ParseNumber(parser.reader) } // A string starts with '(' and ends with ')'. diff --git a/core/utils.go b/core/utils.go index 0b2b9e80..1c4150f2 100644 --- a/core/utils.go +++ b/core/utils.go @@ -6,10 +6,14 @@ package core import ( + "bufio" + "bytes" "errors" "fmt" + "io" "reflect" "sort" + "strconv" "github.com/unidoc/unipdf/v3/common" ) @@ -374,3 +378,91 @@ func flattenObject(obj PdfObject, depth int) PdfObject { } return obj } + +// ParseNumber parses a numeric object from a buffered stream. +// Section 7.3.3. +// Integer or Float. +// +// An integer shall be written as one or more decimal digits optionally +// preceded by a sign. The value shall be interpreted as a signed +// decimal integer and shall be converted to an integer object. +// +// A real value shall be written as one or more decimal digits with an +// optional sign and a leading, trailing, or embedded PERIOD (2Eh) +// (decimal point). The value shall be interpreted as a real number +// and shall be converted to a real object. +// +// Regarding exponential numbers: 7.3.3 Numeric Objects: +// A conforming writer shall not use the PostScript syntax for numbers +// with non-decimal radices (such as 16#FFFE) or in exponential format +// (such as 6.02E23). +// Nonetheless, we sometimes get numbers with exponential format, so +// we will support it in the reader (no confusion with other types, so +// no compromise). 
+func ParseNumber(buf *bufio.Reader) (PdfObject, error) { + isFloat := false + allowSigns := true + var r bytes.Buffer + for { + if common.Log.IsLogLevel(common.LogLevelTrace) { + common.Log.Trace("Parsing number \"%s\"", r.String()) + } + bb, err := buf.Peek(1) + if err == io.EOF { + // GH: EOF handling. Handle EOF like end of line. Can happen with + // encoded object streams that the object is at the end. + // In other cases, we will get the EOF error elsewhere at any rate. + break // Handle like EOF + } + if err != nil { + common.Log.Debug("ERROR %s", err) + return nil, err + } + if allowSigns && (bb[0] == '-' || bb[0] == '+') { + // Only appear in the beginning, otherwise serves as a delimiter. + b, _ := buf.ReadByte() + r.WriteByte(b) + allowSigns = false // Only allowed in beginning, and after e (exponential). + } else if IsDecimalDigit(bb[0]) { + b, _ := buf.ReadByte() + r.WriteByte(b) + } else if bb[0] == '.' { + b, _ := buf.ReadByte() + r.WriteByte(b) + isFloat = true + } else if bb[0] == 'e' || bb[0] == 'E' { + // Exponential number format. + b, _ := buf.ReadByte() + r.WriteByte(b) + isFloat = true + allowSigns = true + } else { + break + } + } + + var o PdfObject + if isFloat { + fVal, err := strconv.ParseFloat(r.String(), 64) + if err != nil { + common.Log.Debug("Error parsing number %v err=%v. Using 0.0. Output may be incorrect", r.String(), err) + fVal = 0.0 + err = nil + } + + objFloat := PdfObjectFloat(fVal) + o = &objFloat + } else { + intVal, err := strconv.ParseInt(r.String(), 10, 64) + if err != nil { + common.Log.Debug("Error parsing number %v err=%v. Using 0. 
Output may be incorrect", r.String(), err) + intVal = 0 + err = nil + } + + objInt := PdfObjectInteger(intVal) + o = &objInt + } + + return o, nil +} diff --git a/fdf/parser.go b/fdf/parser.go index 4f27cd02..3d42ecad 100644 --- a/fdf/parser.go +++ b/fdf/parser.go @@ -19,7 +19,6 @@ import ( "github.com/unidoc/unipdf/v3/common" "github.com/unidoc/unipdf/v3/core" - "github.com/unidoc/unipdf/v3/internal/parseutils" ) // Regular Expressions for parsing and identifying object signatures. @@ -213,19 +212,7 @@ func (parser *fdfParser) parseName() (core.PdfObjectName, error) { // we will support it in the reader (no confusion with other types, so // no compromise). func (parser *fdfParser) parseNumber() (core.PdfObject, error) { - num, err := parseutils.ParseNumber(parser.reader) - if err != nil { - return nil, err - } - switch num := num.(type) { - case float64: - o := core.PdfObjectFloat(num) - return &o, nil - case int64: - o := core.PdfObjectInteger(num) - return &o, nil - } - return nil, fmt.Errorf("unhandled number type %T", num) + return core.ParseNumber(parser.reader) } // A string starts with '(' and ends with ')'. diff --git a/go.mod b/go.mod index 3e19193e..72f1cadc 100644 --- a/go.mod +++ b/go.mod @@ -12,3 +12,5 @@ require ( golang.org/x/text v0.3.2 golang.org/x/tools v0.0.0-20190606174628-0139d5756a7d // indirect ) + +go 1.13 diff --git a/internal/cmap/parser.go b/internal/cmap/parser.go index a43f4b13..71fee686 100644 --- a/internal/cmap/parser.go +++ b/internal/cmap/parser.go @@ -15,7 +15,6 @@ import ( "github.com/unidoc/unipdf/v3/common" "github.com/unidoc/unipdf/v3/core" - "github.com/unidoc/unipdf/v3/internal/parseutils" ) // cMapParser parses CMap character to unicode mapping files. @@ -393,17 +392,19 @@ func (p *cMapParser) parseDict() (cmapDict, error) { // parseDict parseNumber a PDF number. 
func (p *cMapParser) parseNumber() (cmapObject, error) { - num, err := parseutils.ParseNumber(p.reader) + o, err := core.ParseNumber(p.reader) if err != nil { return nil, err } - switch num := num.(type) { - case float64: - return cmapFloat{num}, nil - case int64: - return cmapInt{num}, nil + + switch o := o.(type) { + case *core.PdfObjectFloat: + return cmapFloat{float64(*o)}, nil + case *core.PdfObjectInteger: + return cmapInt{int64(*o)}, nil } - return nil, fmt.Errorf("unhandled number type %T", num) + + return nil, fmt.Errorf("unhandled number type %T", o) } // parseOperand parses an operand, which is a text command represented by a word. diff --git a/internal/parseutils/number.go b/internal/parseutils/number.go deleted file mode 100644 index e7a8300e..00000000 --- a/internal/parseutils/number.go +++ /dev/null @@ -1,103 +0,0 @@ -package parseutils - -import ( - "bufio" - "bytes" - "io" - "strconv" - - "github.com/unidoc/unipdf/v3/common" -) - -// ParseNumber parses a numeric objects from a buffered stream. -// Section 7.3.3. -// Integer or Float. -// -// An integer shall be written as one or more decimal digits optionally -// preceded by a sign. The value shall be interpreted as a signed -// decimal integer and shall be converted to an integer object. -// -// A real value shall be written as one or more decimal digits with an -// optional sign and a leading, trailing, or embedded PERIOD (2Eh) -// (decimal point). The value shall be interpreted as a real number -// and shall be converted to a real object. -// -// Regarding exponential numbers: 7.3.3 Numeric Objects: -// A conforming writer shall not use the PostScript syntax for numbers -// with non-decimal radices (such as 16#FFFE) or in exponential format -// (such as 6.02E23). -// Nonetheless, we sometimes get numbers with exponential format, so -// we will support it in the reader (no confusion with other types, so -// no compromise). 
-func ParseNumber(bufr *bufio.Reader) (interface{}, error) { - isFloat := false - allowSigns := true - var r bytes.Buffer - for { - if common.Log.IsLogLevel(common.LogLevelTrace) { - common.Log.Trace("Parsing number \"%s\"", r.String()) - } - bb, err := bufr.Peek(1) - if err == io.EOF { - // GH: EOF handling. Handle EOF like end of line. Can happen with - // encoded object streams that the object is at the end. - // In other cases, we will get the EOF error elsewhere at any rate. - break // Handle like EOF - } - if err != nil { - common.Log.Debug("ERROR %s", err) - return nil, err - } - if allowSigns && (bb[0] == '-' || bb[0] == '+') { - // Only appear in the beginning, otherwise serves as a delimiter. - b, _ := bufr.ReadByte() - r.WriteByte(b) - allowSigns = false // Only allowed in beginning, and after e (exponential). - } else if IsDecimalDigit(bb[0]) { - b, _ := bufr.ReadByte() - r.WriteByte(b) - } else if bb[0] == '.' { - b, _ := bufr.ReadByte() - r.WriteByte(b) - isFloat = true - } else if bb[0] == 'e' || bb[0] == 'E' { - // Exponential number format. - b, _ := bufr.ReadByte() - r.WriteByte(b) - isFloat = true - allowSigns = true - } else { - break - } - } - - var o interface{} - if isFloat { - fVal, err := strconv.ParseFloat(r.String(), 64) - if err != nil { - common.Log.Debug("Error parsing number %v err=%v. Using 0.0. Output may be incorrect", r.String(), err) - fVal = 0.0 - err = nil - } - o = fVal - } else { - intVal, err := strconv.ParseInt(r.String(), 10, 64) - if err != nil { - common.Log.Debug("Error parsing number %v err=%v. Using 0. Output may be incorrect", r.String(), err) - intVal = 0 - err = nil - } - o = intVal - } - - return o, nil -} - -// IsDecimalDigit checks if the character is a part of a decimal number string. 
-func IsDecimalDigit(c byte) bool { - if c >= '0' && c <= '9' { - return true - } - - return false -} diff --git a/ps/parser.go b/ps/parser.go index 020cd260..8fee5fac 100644 --- a/ps/parser.go +++ b/ps/parser.go @@ -14,7 +14,6 @@ import ( "github.com/unidoc/unipdf/v3/common" pdfcore "github.com/unidoc/unipdf/v3/core" - "github.com/unidoc/unipdf/v3/internal/parseutils" ) // PSParser is a basic Postscript parser. @@ -146,17 +145,19 @@ func (p *PSParser) skipSpaces() (int, error) { // Numeric objects. // Integer or Real numbers. func (p *PSParser) parseNumber() (PSObject, error) { - num, err := parseutils.ParseNumber(p.reader) + o, err := pdfcore.ParseNumber(p.reader) if err != nil { return nil, err } - switch num := num.(type) { - case float64: - return MakeReal(num), nil - case int64: - return MakeInteger(int(num)), nil + + switch o := o.(type) { + case *pdfcore.PdfObjectFloat: + return MakeReal(float64(*o)), nil + case *pdfcore.PdfObjectInteger: + return MakeInteger(int(*o)), nil } - return nil, fmt.Errorf("unhandled number type %T", num) + + return nil, fmt.Errorf("unhandled number type %T", o) } // Parse bool object.