Unify and optimize number parsing

2025-04-27 13:48:51 +08:00 · 2019-12-18 12:30:54 -08:00 · 2019-12-18 12:30:54 -08:00 · e85397b57a
commit e85397b57a
parent d0f9c139ad
6 changed files with 160 additions and 254 deletions
--- a/contentstream/parser.go
+++ b/contentstream/parser.go
@ -16,6 +16,7 @@ import (

 	"github.com/unidoc/unipdf/v3/common"
 	"github.com/unidoc/unipdf/v3/core"
+	"github.com/unidoc/unipdf/v3/internal/parseutils"
 )

 // ContentStreamParser represents a content stream parser for parsing content streams in PDFs.
@ -191,67 +192,19 @@ func (csp *ContentStreamParser) parseName() (core.PdfObjectName, error) {
 // we will support it in the reader (no confusion with other types, so
 // no compromise).
 func (csp *ContentStreamParser) parseNumber() (core.PdfObject, error) {
-	isFloat := false
-	allowSigns := true
-	numStr := ""
-	for {
-		common.Log.Trace("Parsing number \"%s\"", numStr)
-		bb, err := csp.reader.Peek(1)
-		if err == io.EOF {
-			// GH: EOF handling.  Handle EOF like end of line.  Can happen with
-			// encoded object streams that the object is at the end.
-			// In other cases, we will get the EOF error elsewhere at any rate.
-			break // Handle like EOF
-		}
-		if err != nil {
-			common.Log.Error("ERROR %s", err)
-			return nil, err
-		}
-		if allowSigns && (bb[0] == '-' || bb[0] == '+') {
-			// Only appear in the beginning, otherwise serves as a delimiter.
-			b, _ := csp.reader.ReadByte()
-			numStr += string(b)
-			allowSigns = false // Only allowed in beginning, and after e (exponential).
-		} else if core.IsDecimalDigit(bb[0]) {
-			b, _ := csp.reader.ReadByte()
-			numStr += string(b)
-		} else if bb[0] == '.' {
-			b, _ := csp.reader.ReadByte()
-			numStr += string(b)
-			isFloat = true
-		} else if bb[0] == 'e' {
-			// Exponential number format.
-			b, _ := csp.reader.ReadByte()
-			numStr += string(b)
-			isFloat = true
-			allowSigns = true
-		} else {
-			break
-		}
+	num, err := parseutils.ParseNumber(csp.reader)
+	if err != nil {
+		return nil, err
 	}
-
-	var o core.PdfObject
-	if isFloat {
-		fVal, err := strconv.ParseFloat(numStr, 64)
-		if err != nil {
-			common.Log.Debug("Error parsing number %q err=%v. Using 0.0. Output may be incorrect", numStr, err)
-			fVal = 0.0
-		}
-
-		objFloat := core.PdfObjectFloat(fVal)
-		o = &objFloat
-	} else {
-		intVal, err := strconv.ParseInt(numStr, 10, 64)
-		if err != nil {
-			common.Log.Debug("Error parsing integer %q err=%v. Using 0. Output may be incorrect", numStr, err)
-			intVal = 0
-		}
-
-		objInt := core.PdfObjectInteger(intVal)
-		o = &objInt
+	switch num := num.(type) {
+	case float64:
+		o := core.PdfObjectFloat(num)
+		return &o, nil
+	case int64:
+		o := core.PdfObjectInteger(num)
+		return &o, nil
 	}
-
-	return o, nil
+	return nil, fmt.Errorf("unhandled number type %T", num)
 }

 // A string starts with '(' and ends with ')'.
--- a/core/parser.go
+++ b/core/parser.go
@ -20,6 +20,7 @@ import (

 	"github.com/unidoc/unipdf/v3/common"
 	"github.com/unidoc/unipdf/v3/core/security"
+	"github.com/unidoc/unipdf/v3/internal/parseutils"
 )

 // Regular Expressions for parsing and identifying object signatures.
@ -286,69 +287,19 @@ func (parser *PdfParser) parseName() (PdfObjectName, error) {
 // we will support it in the reader (no confusion with other types, so
 // no compromise).
 func (parser *PdfParser) parseNumber() (PdfObject, error) {
-	isFloat := false
-	allowSigns := true
-	var r bytes.Buffer
-	for {
-		common.Log.Trace("Parsing number \"%s\"", r.String())
-		bb, err := parser.reader.Peek(1)
-		if err == io.EOF {
-			// GH: EOF handling.  Handle EOF like end of line.  Can happen with
-			// encoded object streams that the object is at the end.
-			// In other cases, we will get the EOF error elsewhere at any rate.
-			break // Handle like EOF
-		}
-		if err != nil {
-			common.Log.Debug("ERROR %s", err)
-			return nil, err
-		}
-		if allowSigns && (bb[0] == '-' || bb[0] == '+') {
-			// Only appear in the beginning, otherwise serves as a delimiter.
-			b, _ := parser.reader.ReadByte()
-			r.WriteByte(b)
-			allowSigns = false // Only allowed in beginning, and after e (exponential).
-		} else if IsDecimalDigit(bb[0]) {
-			b, _ := parser.reader.ReadByte()
-			r.WriteByte(b)
-		} else if bb[0] == '.' {
-			b, _ := parser.reader.ReadByte()
-			r.WriteByte(b)
-			isFloat = true
-		} else if bb[0] == 'e' || bb[0] == 'E' {
-			// Exponential number format.
-			b, _ := parser.reader.ReadByte()
-			r.WriteByte(b)
-			isFloat = true
-			allowSigns = true
-		} else {
-			break
-		}
+	num, err := parseutils.ParseNumber(parser.reader)
+	if err != nil {
+		return nil, err
 	}
-
-	var o PdfObject
-	if isFloat {
-		fVal, err := strconv.ParseFloat(r.String(), 64)
-		if err != nil {
-			common.Log.Debug("Error parsing number %v err=%v. Using 0.0. Output may be incorrect", r.String(), err)
-			fVal = 0.0
-			err = nil
-		}
-
-		objFloat := PdfObjectFloat(fVal)
-		o = &objFloat
-	} else {
-		intVal, err := strconv.ParseInt(r.String(), 10, 64)
-		if err != nil {
-			common.Log.Debug("Error parsing number %v err=%v. Using 0. Output may be incorrect", r.String(), err)
-			intVal = 0
-			err = nil
-		}
-
-		objInt := PdfObjectInteger(intVal)
-		o = &objInt
+	switch num := num.(type) {
+	case float64:
+		o := PdfObjectFloat(num)
+		return &o, nil
+	case int64:
+		o := PdfObjectInteger(num)
+		return &o, nil
 	}
-
-	return o, nil
+	return nil, fmt.Errorf("unhandled number type %T", num)
 }

 // A string starts with '(' and ends with ')'.
--- a/fdf/parser.go
+++ b/fdf/parser.go
@ -19,6 +19,7 @@ import (

 	"github.com/unidoc/unipdf/v3/common"
 	"github.com/unidoc/unipdf/v3/core"
+	"github.com/unidoc/unipdf/v3/internal/parseutils"
 )

 // Regular Expressions for parsing and identifying object signatures.
@ -212,54 +213,19 @@ func (parser *fdfParser) parseName() (core.PdfObjectName, error) {
 // we will support it in the reader (no confusion with other types, so
 // no compromise).
 func (parser *fdfParser) parseNumber() (core.PdfObject, error) {
-	isFloat := false
-	allowSigns := true
-	var r bytes.Buffer
-	for {
-		common.Log.Trace("Parsing number \"%s\"", r.String())
-		bb, err := parser.reader.Peek(1)
-		if err == io.EOF {
-			// GH: EOF handling.  Handle EOF like end of line.  Can happen with
-			// encoded object streams that the object is at the end.
-			// In other cases, we will get the EOF error elsewhere at any rate.
-			break // Handle like EOF
-		}
-		if err != nil {
-			common.Log.Debug("ERROR %s", err)
-			return nil, err
-		}
-		if allowSigns && (bb[0] == '-' || bb[0] == '+') {
-			// Only appear in the beginning, otherwise serves as a delimiter.
-			b, _ := parser.reader.ReadByte()
-			r.WriteByte(b)
-			allowSigns = false // Only allowed in beginning, and after e (exponential).
-		} else if core.IsDecimalDigit(bb[0]) {
-			b, _ := parser.reader.ReadByte()
-			r.WriteByte(b)
-		} else if bb[0] == '.' {
-			b, _ := parser.reader.ReadByte()
-			r.WriteByte(b)
-			isFloat = true
-		} else if bb[0] == 'e' {
-			// Exponential number format.
-			b, _ := parser.reader.ReadByte()
-			r.WriteByte(b)
-			isFloat = true
-			allowSigns = true
-		} else {
-			break
-		}
+	num, err := parseutils.ParseNumber(parser.reader)
+	if err != nil {
+		return nil, err
 	}
-
-	if isFloat {
-		fVal, err := strconv.ParseFloat(r.String(), 64)
-		o := core.PdfObjectFloat(fVal)
-		return &o, err
-	} else {
-		intVal, err := strconv.ParseInt(r.String(), 10, 64)
-		o := core.PdfObjectInteger(intVal)
-		return &o, err
+	switch num := num.(type) {
+	case float64:
+		o := core.PdfObjectFloat(num)
+		return &o, nil
+	case int64:
+		o := core.PdfObjectInteger(num)
+		return &o, nil
 	}
+	return nil, fmt.Errorf("unhandled number type %T", num)
 }

 // A string starts with '(' and ends with ')'.
--- a/internal/cmap/parser.go
+++ b/internal/cmap/parser.go
@ -15,6 +15,7 @@ import (

 	"github.com/unidoc/unipdf/v3/common"
 	"github.com/unidoc/unipdf/v3/core"
+	"github.com/unidoc/unipdf/v3/internal/parseutils"
 )

 // cMapParser parses CMap character to unicode mapping files.
@ -392,49 +393,17 @@ func (p *cMapParser) parseDict() (cmapDict, error) {

 // parseDict parseNumber a PDF number.
 func (p *cMapParser) parseNumber() (cmapObject, error) {
-	isFloat := false
-	allowSigns := true
-
-	numStr := bytes.Buffer{}
-	for {
-		bb, err := p.reader.Peek(1)
-		if err == io.EOF {
-			break
-		}
-		if err != nil {
-			return nil, err
-		}
-		if allowSigns && (bb[0] == '-' || bb[0] == '+') {
-			// Only appear in the beginning, otherwise serves as a delimiter.
-			b, _ := p.reader.ReadByte()
-			numStr.WriteByte(b)
-			allowSigns = false // Only allowed in beginning, and after e (exponential).
-		} else if core.IsDecimalDigit(bb[0]) {
-			b, _ := p.reader.ReadByte()
-			numStr.WriteByte(b)
-		} else if bb[0] == '.' {
-			b, _ := p.reader.ReadByte()
-			numStr.WriteByte(b)
-			isFloat = true
-		} else if bb[0] == 'e' {
-			// Exponential number format.
-			b, _ := p.reader.ReadByte()
-			numStr.WriteByte(b)
-			isFloat = true
-			allowSigns = true
-		} else {
-			break
-		}
+	num, err := parseutils.ParseNumber(p.reader)
+	if err != nil {
+		return nil, err
 	}
-
-	if isFloat {
-		fVal, err := strconv.ParseFloat(numStr.String(), 64)
-		o := cmapFloat{fVal}
-		return o, err
+	switch num := num.(type) {
+	case float64:
+		return cmapFloat{num}, nil
+	case int64:
+		return cmapInt{num}, nil
 	}
-	intVal, err := strconv.ParseInt(numStr.String(), 10, 64)
-	o := cmapInt{intVal}
-	return o, err
+	return nil, fmt.Errorf("unhandled number type %T", num)
 }

 // parseOperand parses an operand, which is a text command represented by a word.
--- a/internal/parseutils/number.go
+++ b/internal/parseutils/number.go
@ -0,0 +1,103 @@
+package parseutils
+
+import (
+	"bufio"
+	"bytes"
+	"io"
+	"strconv"
+
+	"github.com/unidoc/unipdf/v3/common"
+)
+
+// ParseNumber parses a numeric object from a buffered stream.
+// Section 7.3.3.
+// Integer or Float.
+//
+// An integer shall be written as one or more decimal digits optionally
+// preceded by a sign. The value shall be interpreted as a signed
+// decimal integer and shall be converted to an integer object.
+//
+// A real value shall be written as one or more decimal digits with an
+// optional sign and a leading, trailing, or embedded PERIOD (2Eh)
+// (decimal point). The value shall be interpreted as a real number
+// and shall be converted to a real object.
+//
+// Regarding exponential numbers: 7.3.3 Numeric Objects:
+// A conforming writer shall not use the PostScript syntax for numbers
+// with non-decimal radices (such as 16#FFFE) or in exponential format
+// (such as 6.02E23).
+// Nonetheless, we sometimes get numbers with exponential format, so
+// we will support it in the reader (no confusion with other types, so
+// no compromise).
+func ParseNumber(bufr *bufio.Reader) (interface{}, error) {
+	isFloat := false
+	allowSigns := true
+	var r bytes.Buffer
+	for {
+		if common.Log.IsLogLevel(common.LogLevelTrace) {
+			common.Log.Trace("Parsing number \"%s\"", r.String())
+		}
+		bb, err := bufr.Peek(1)
+		if err == io.EOF {
+			// GH: EOF handling.  Handle EOF like end of line.  Can happen with
+			// encoded object streams that the object is at the end.
+			// In other cases, we will get the EOF error elsewhere at any rate.
+			break // Handle like EOF
+		}
+		if err != nil {
+			common.Log.Debug("ERROR %s", err)
+			return nil, err
+		}
+		if allowSigns && (bb[0] == '-' || bb[0] == '+') {
+			// Only appear in the beginning, otherwise serves as a delimiter.
+			b, _ := bufr.ReadByte()
+			r.WriteByte(b)
+			allowSigns = false // Only allowed in beginning, and after e (exponential).
+		} else if IsDecimalDigit(bb[0]) {
+			b, _ := bufr.ReadByte()
+			r.WriteByte(b)
+		} else if bb[0] == '.' {
+			b, _ := bufr.ReadByte()
+			r.WriteByte(b)
+			isFloat = true
+		} else if bb[0] == 'e' || bb[0] == 'E' {
+			// Exponential number format.
+			b, _ := bufr.ReadByte()
+			r.WriteByte(b)
+			isFloat = true
+			allowSigns = true
+		} else {
+			break
+		}
+	}
+
+	var o interface{}
+	if isFloat {
+		fVal, err := strconv.ParseFloat(r.String(), 64)
+		if err != nil {
+			common.Log.Debug("Error parsing number %v err=%v. Using 0.0. Output may be incorrect", r.String(), err)
+			fVal = 0.0
+			err = nil
+		}
+		o = fVal
+	} else {
+		intVal, err := strconv.ParseInt(r.String(), 10, 64)
+		if err != nil {
+			common.Log.Debug("Error parsing number %v err=%v. Using 0. Output may be incorrect", r.String(), err)
+			intVal = 0
+			err = nil
+		}
+		o = intVal
+	}
+
+	return o, nil
+}
+
+// IsDecimalDigit checks if the character is a part of a decimal number string.
+func IsDecimalDigit(c byte) bool {
+	if c >= '0' && c <= '9' {
+		return true
+	}
+
+	return false
+}
--- a/ps/parser.go
+++ b/ps/parser.go
@ -9,11 +9,12 @@ import (
 	"bufio"
 	"bytes"
 	"errors"
+	"fmt"
 	"io"
-	"strconv"

 	"github.com/unidoc/unipdf/v3/common"
 	pdfcore "github.com/unidoc/unipdf/v3/core"
+	"github.com/unidoc/unipdf/v3/internal/parseutils"
 )

 // PSParser is a basic Postscript parser.
@ -145,54 +146,17 @@ func (p *PSParser) skipSpaces() (int, error) {
 // Numeric objects.
 // Integer or Real numbers.
 func (p *PSParser) parseNumber() (PSObject, error) {
-	isFloat := false
-	allowSigns := true
-	numStr := ""
-	for {
-		common.Log.Trace("Parsing number \"%s\"", numStr)
-		bb, err := p.reader.Peek(1)
-		if err == io.EOF {
-			// GH: EOF handling.  Handle EOF like end of line.  Can happen with
-			// encoded object streams that the object is at the end.
-			// In other cases, we will get the EOF error elsewhere at any rate.
-			break // Handle like EOF
-		}
-		if err != nil {
-			common.Log.Debug("PS ERROR: %s", err)
-			return nil, err
-		}
-		if allowSigns && (bb[0] == '-' || bb[0] == '+') {
-			// Only appear in the beginning, otherwise serves as a delimiter.
-			b, _ := p.reader.ReadByte()
-			numStr += string(b)
-			allowSigns = false // Only allowed in beginning, and after e (exponential).
-		} else if pdfcore.IsDecimalDigit(bb[0]) {
-			b, _ := p.reader.ReadByte()
-			numStr += string(b)
-		} else if bb[0] == '.' {
-			b, _ := p.reader.ReadByte()
-			numStr += string(b)
-			isFloat = true
-		} else if bb[0] == 'e' {
-			// Exponential number format.
-			// TODO: Is this supported in PS?
-			b, _ := p.reader.ReadByte()
-			numStr += string(b)
-			isFloat = true
-			allowSigns = true
-		} else {
-			break
-		}
+	num, err := parseutils.ParseNumber(p.reader)
+	if err != nil {
+		return nil, err
 	}
-
-	if isFloat {
-		fVal, err := strconv.ParseFloat(numStr, 64)
-		o := MakeReal(fVal)
-		return o, err
+	switch num := num.(type) {
+	case float64:
+		return MakeReal(num), nil
+	case int64:
+		return MakeInteger(int(num)), nil
 	}
-	intVal, err := strconv.ParseInt(numStr, 10, 64)
-	o := MakeInteger(int(intVal))
-	return o, err
+	return nil, fmt.Errorf("unhandled number type %T", num)
 }

 // Parse bool object.