diff --git a/contentstream/parser.go b/contentstream/parser.go index 56cc5bd8..17e9d772 100644 --- a/contentstream/parser.go +++ b/contentstream/parser.go @@ -16,6 +16,7 @@ import ( "github.com/unidoc/unipdf/v3/common" "github.com/unidoc/unipdf/v3/core" + "github.com/unidoc/unipdf/v3/internal/parseutils" ) // ContentStreamParser represents a content stream parser for parsing content streams in PDFs. @@ -191,67 +192,19 @@ func (csp *ContentStreamParser) parseName() (core.PdfObjectName, error) { // we will support it in the reader (no confusion with other types, so // no compromise). func (csp *ContentStreamParser) parseNumber() (core.PdfObject, error) { - isFloat := false - allowSigns := true - numStr := "" - for { - common.Log.Trace("Parsing number \"%s\"", numStr) - bb, err := csp.reader.Peek(1) - if err == io.EOF { - // GH: EOF handling. Handle EOF like end of line. Can happen with - // encoded object streams that the object is at the end. - // In other cases, we will get the EOF error elsewhere at any rate. - break // Handle like EOF - } - if err != nil { - common.Log.Error("ERROR %s", err) - return nil, err - } - if allowSigns && (bb[0] == '-' || bb[0] == '+') { - // Only appear in the beginning, otherwise serves as a delimiter. - b, _ := csp.reader.ReadByte() - numStr += string(b) - allowSigns = false // Only allowed in beginning, and after e (exponential). - } else if core.IsDecimalDigit(bb[0]) { - b, _ := csp.reader.ReadByte() - numStr += string(b) - } else if bb[0] == '.' { - b, _ := csp.reader.ReadByte() - numStr += string(b) - isFloat = true - } else if bb[0] == 'e' { - // Exponential number format. - b, _ := csp.reader.ReadByte() - numStr += string(b) - isFloat = true - allowSigns = true - } else { - break - } + num, err := parseutils.ParseNumber(csp.reader) + if err != nil { + return nil, err } - - var o core.PdfObject - if isFloat { - fVal, err := strconv.ParseFloat(numStr, 64) - if err != nil { - common.Log.Debug("Error parsing number %q err=%v. 
Using 0.0. Output may be incorrect", numStr, err) - fVal = 0.0 - } - - objFloat := core.PdfObjectFloat(fVal) - o = &objFloat - } else { - intVal, err := strconv.ParseInt(numStr, 10, 64) - if err != nil { - common.Log.Debug("Error parsing integer %q err=%v. Using 0. Output may be incorrect", numStr, err) - intVal = 0 - } - - objInt := core.PdfObjectInteger(intVal) - o = &objInt + switch num := num.(type) { + case float64: + o := core.PdfObjectFloat(num) + return &o, nil + case int64: + o := core.PdfObjectInteger(num) + return &o, nil } - - return o, nil + return nil, fmt.Errorf("unhandled number type %T", num) } // A string starts with '(' and ends with ')'. diff --git a/core/parser.go b/core/parser.go index 4d52e699..fdcf2b3e 100755 --- a/core/parser.go +++ b/core/parser.go @@ -20,6 +20,7 @@ import ( "github.com/unidoc/unipdf/v3/common" "github.com/unidoc/unipdf/v3/core/security" + "github.com/unidoc/unipdf/v3/internal/parseutils" ) // Regular Expressions for parsing and identifying object signatures. @@ -286,69 +287,19 @@ func (parser *PdfParser) parseName() (PdfObjectName, error) { // we will support it in the reader (no confusion with other types, so // no compromise). func (parser *PdfParser) parseNumber() (PdfObject, error) { - isFloat := false - allowSigns := true - var r bytes.Buffer - for { - common.Log.Trace("Parsing number \"%s\"", r.String()) - bb, err := parser.reader.Peek(1) - if err == io.EOF { - // GH: EOF handling. Handle EOF like end of line. Can happen with - // encoded object streams that the object is at the end. - // In other cases, we will get the EOF error elsewhere at any rate. - break // Handle like EOF - } - if err != nil { - common.Log.Debug("ERROR %s", err) - return nil, err - } - if allowSigns && (bb[0] == '-' || bb[0] == '+') { - // Only appear in the beginning, otherwise serves as a delimiter. - b, _ := parser.reader.ReadByte() - r.WriteByte(b) - allowSigns = false // Only allowed in beginning, and after e (exponential). 
- } else if IsDecimalDigit(bb[0]) { - b, _ := parser.reader.ReadByte() - r.WriteByte(b) - } else if bb[0] == '.' { - b, _ := parser.reader.ReadByte() - r.WriteByte(b) - isFloat = true - } else if bb[0] == 'e' || bb[0] == 'E' { - // Exponential number format. - b, _ := parser.reader.ReadByte() - r.WriteByte(b) - isFloat = true - allowSigns = true - } else { - break - } + num, err := parseutils.ParseNumber(parser.reader) + if err != nil { + return nil, err } - - var o PdfObject - if isFloat { - fVal, err := strconv.ParseFloat(r.String(), 64) - if err != nil { - common.Log.Debug("Error parsing number %v err=%v. Using 0.0. Output may be incorrect", r.String(), err) - fVal = 0.0 - err = nil - } - - objFloat := PdfObjectFloat(fVal) - o = &objFloat - } else { - intVal, err := strconv.ParseInt(r.String(), 10, 64) - if err != nil { - common.Log.Debug("Error parsing number %v err=%v. Using 0. Output may be incorrect", r.String(), err) - intVal = 0 - err = nil - } - - objInt := PdfObjectInteger(intVal) - o = &objInt + switch num := num.(type) { + case float64: + o := PdfObjectFloat(num) + return &o, nil + case int64: + o := PdfObjectInteger(num) + return &o, nil } - - return o, nil + return nil, fmt.Errorf("unhandled number type %T", num) } // A string starts with '(' and ends with ')'. diff --git a/fdf/parser.go b/fdf/parser.go index f4368e32..4f27cd02 100644 --- a/fdf/parser.go +++ b/fdf/parser.go @@ -19,6 +19,7 @@ import ( "github.com/unidoc/unipdf/v3/common" "github.com/unidoc/unipdf/v3/core" + "github.com/unidoc/unipdf/v3/internal/parseutils" ) // Regular Expressions for parsing and identifying object signatures. @@ -212,54 +213,19 @@ func (parser *fdfParser) parseName() (core.PdfObjectName, error) { // we will support it in the reader (no confusion with other types, so // no compromise). 
func (parser *fdfParser) parseNumber() (core.PdfObject, error) { - isFloat := false - allowSigns := true - var r bytes.Buffer - for { - common.Log.Trace("Parsing number \"%s\"", r.String()) - bb, err := parser.reader.Peek(1) - if err == io.EOF { - // GH: EOF handling. Handle EOF like end of line. Can happen with - // encoded object streams that the object is at the end. - // In other cases, we will get the EOF error elsewhere at any rate. - break // Handle like EOF - } - if err != nil { - common.Log.Debug("ERROR %s", err) - return nil, err - } - if allowSigns && (bb[0] == '-' || bb[0] == '+') { - // Only appear in the beginning, otherwise serves as a delimiter. - b, _ := parser.reader.ReadByte() - r.WriteByte(b) - allowSigns = false // Only allowed in beginning, and after e (exponential). - } else if core.IsDecimalDigit(bb[0]) { - b, _ := parser.reader.ReadByte() - r.WriteByte(b) - } else if bb[0] == '.' { - b, _ := parser.reader.ReadByte() - r.WriteByte(b) - isFloat = true - } else if bb[0] == 'e' { - // Exponential number format. - b, _ := parser.reader.ReadByte() - r.WriteByte(b) - isFloat = true - allowSigns = true - } else { - break - } + num, err := parseutils.ParseNumber(parser.reader) + if err != nil { + return nil, err } - - if isFloat { - fVal, err := strconv.ParseFloat(r.String(), 64) - o := core.PdfObjectFloat(fVal) - return &o, err - } else { - intVal, err := strconv.ParseInt(r.String(), 10, 64) - o := core.PdfObjectInteger(intVal) - return &o, err + switch num := num.(type) { + case float64: + o := core.PdfObjectFloat(num) + return &o, nil + case int64: + o := core.PdfObjectInteger(num) + return &o, nil } + return nil, fmt.Errorf("unhandled number type %T", num) } // A string starts with '(' and ends with ')'. 
diff --git a/internal/cmap/parser.go b/internal/cmap/parser.go index 92c93e07..a43f4b13 100644 --- a/internal/cmap/parser.go +++ b/internal/cmap/parser.go @@ -15,6 +15,7 @@ import ( "github.com/unidoc/unipdf/v3/common" "github.com/unidoc/unipdf/v3/core" + "github.com/unidoc/unipdf/v3/internal/parseutils" ) // cMapParser parses CMap character to unicode mapping files. @@ -392,49 +393,17 @@ func (p *cMapParser) parseDict() (cmapDict, error) { // parseDict parseNumber a PDF number. func (p *cMapParser) parseNumber() (cmapObject, error) { - isFloat := false - allowSigns := true - - numStr := bytes.Buffer{} - for { - bb, err := p.reader.Peek(1) - if err == io.EOF { - break - } - if err != nil { - return nil, err - } - if allowSigns && (bb[0] == '-' || bb[0] == '+') { - // Only appear in the beginning, otherwise serves as a delimiter. - b, _ := p.reader.ReadByte() - numStr.WriteByte(b) - allowSigns = false // Only allowed in beginning, and after e (exponential). - } else if core.IsDecimalDigit(bb[0]) { - b, _ := p.reader.ReadByte() - numStr.WriteByte(b) - } else if bb[0] == '.' { - b, _ := p.reader.ReadByte() - numStr.WriteByte(b) - isFloat = true - } else if bb[0] == 'e' { - // Exponential number format. - b, _ := p.reader.ReadByte() - numStr.WriteByte(b) - isFloat = true - allowSigns = true - } else { - break - } + num, err := parseutils.ParseNumber(p.reader) + if err != nil { + return nil, err } - - if isFloat { - fVal, err := strconv.ParseFloat(numStr.String(), 64) - o := cmapFloat{fVal} - return o, err + switch num := num.(type) { + case float64: + return cmapFloat{num}, nil + case int64: + return cmapInt{num}, nil } - intVal, err := strconv.ParseInt(numStr.String(), 10, 64) - o := cmapInt{intVal} - return o, err + return nil, fmt.Errorf("unhandled number type %T", num) } // parseOperand parses an operand, which is a text command represented by a word. 
diff --git a/internal/parseutils/number.go b/internal/parseutils/number.go new file mode 100644 index 00000000..e7a8300e --- /dev/null +++ b/internal/parseutils/number.go @@ -0,0 +1,103 @@ +package parseutils + +import ( + "bufio" + "bytes" + "io" + "strconv" + + "github.com/unidoc/unipdf/v3/common" +) + +// ParseNumber parses a numeric object from a buffered stream. +// Section 7.3.3. +// Integer or Float. +// +// An integer shall be written as one or more decimal digits optionally +// preceded by a sign. The value shall be interpreted as a signed +// decimal integer and shall be converted to an integer object. +// +// A real value shall be written as one or more decimal digits with an +// optional sign and a leading, trailing, or embedded PERIOD (2Eh) +// (decimal point). The value shall be interpreted as a real number +// and shall be converted to a real object. +// +// Regarding exponential numbers: 7.3.3 Numeric Objects: +// A conforming writer shall not use the PostScript syntax for numbers +// with non-decimal radices (such as 16#FFFE) or in exponential format +// (such as 6.02E23). +// Nonetheless, we sometimes get numbers with exponential format, so +// we will support it in the reader (no confusion with other types, so +// no compromise). +func ParseNumber(bufr *bufio.Reader) (interface{}, error) { + isFloat := false + allowSigns := true + var r bytes.Buffer + for { + if common.Log.IsLogLevel(common.LogLevelTrace) { + common.Log.Trace("Parsing number \"%s\"", r.String()) + } + bb, err := bufr.Peek(1) + if err == io.EOF { + // GH: EOF handling. Handle EOF like end of line. Can happen with + // encoded object streams that the object is at the end. + // In other cases, we will get the EOF error elsewhere at any rate. + break // Handle like EOF + } + if err != nil { + common.Log.Debug("ERROR %s", err) + return nil, err + } + if allowSigns && (bb[0] == '-' || bb[0] == '+') { + // Only appear in the beginning, otherwise serves as a delimiter.
+ b, _ := bufr.ReadByte() + r.WriteByte(b) + allowSigns = false // Only allowed in beginning, and after e (exponential). + } else if IsDecimalDigit(bb[0]) { + b, _ := bufr.ReadByte() + r.WriteByte(b) + } else if bb[0] == '.' { + b, _ := bufr.ReadByte() + r.WriteByte(b) + isFloat = true + } else if bb[0] == 'e' || bb[0] == 'E' { + // Exponential number format. + b, _ := bufr.ReadByte() + r.WriteByte(b) + isFloat = true + allowSigns = true + } else { + break + } + } + + var o interface{} + if isFloat { + fVal, err := strconv.ParseFloat(r.String(), 64) + if err != nil { + common.Log.Debug("Error parsing number %v err=%v. Using 0.0. Output may be incorrect", r.String(), err) + fVal = 0.0 + err = nil + } + o = fVal + } else { + intVal, err := strconv.ParseInt(r.String(), 10, 64) + if err != nil { + common.Log.Debug("Error parsing number %v err=%v. Using 0. Output may be incorrect", r.String(), err) + intVal = 0 + err = nil + } + o = intVal + } + + return o, nil +} + +// IsDecimalDigit checks if the character is a part of a decimal number string. +func IsDecimalDigit(c byte) bool { + if c >= '0' && c <= '9' { + return true + } + + return false +} diff --git a/ps/parser.go b/ps/parser.go index cfd4c4ef..020cd260 100644 --- a/ps/parser.go +++ b/ps/parser.go @@ -9,11 +9,12 @@ import ( "bufio" "bytes" "errors" + "fmt" "io" - "strconv" "github.com/unidoc/unipdf/v3/common" pdfcore "github.com/unidoc/unipdf/v3/core" + "github.com/unidoc/unipdf/v3/internal/parseutils" ) // PSParser is a basic Postscript parser. @@ -145,54 +146,17 @@ func (p *PSParser) skipSpaces() (int, error) { // Numeric objects. // Integer or Real numbers. func (p *PSParser) parseNumber() (PSObject, error) { - isFloat := false - allowSigns := true - numStr := "" - for { - common.Log.Trace("Parsing number \"%s\"", numStr) - bb, err := p.reader.Peek(1) - if err == io.EOF { - // GH: EOF handling. Handle EOF like end of line. Can happen with - // encoded object streams that the object is at the end. 
- // In other cases, we will get the EOF error elsewhere at any rate. - break // Handle like EOF - } - if err != nil { - common.Log.Debug("PS ERROR: %s", err) - return nil, err - } - if allowSigns && (bb[0] == '-' || bb[0] == '+') { - // Only appear in the beginning, otherwise serves as a delimiter. - b, _ := p.reader.ReadByte() - numStr += string(b) - allowSigns = false // Only allowed in beginning, and after e (exponential). - } else if pdfcore.IsDecimalDigit(bb[0]) { - b, _ := p.reader.ReadByte() - numStr += string(b) - } else if bb[0] == '.' { - b, _ := p.reader.ReadByte() - numStr += string(b) - isFloat = true - } else if bb[0] == 'e' { - // Exponential number format. - // TODO: Is this supported in PS? - b, _ := p.reader.ReadByte() - numStr += string(b) - isFloat = true - allowSigns = true - } else { - break - } + num, err := parseutils.ParseNumber(p.reader) + if err != nil { + return nil, err } - - if isFloat { - fVal, err := strconv.ParseFloat(numStr, 64) - o := MakeReal(fVal) - return o, err + switch num := num.(type) { + case float64: + return MakeReal(num), nil + case int64: + return MakeInteger(int(num)), nil } - intVal, err := strconv.ParseInt(numStr, 10, 64) - o := MakeInteger(int(intVal)) - return o, err + return nil, fmt.Errorf("unhandled number type %T", num) } // Parse bool object.