More flexibility in parsing dates etc to support more PDFs with the new outlines handling

PdfDate parsing is more flexible: the UTC offset is now optional.
Count, which should be an integer, is now also accepted when stored as a float.
This commit is contained in:
Gunnsteinn Hall 2016-08-19 11:34:55 +00:00
parent 1593b0ebd4
commit eb13f3b8be
3 changed files with 84 additions and 9 deletions

View File

@ -70,17 +70,20 @@ func newPdfOutlineFromDict(dict *PdfObjectDictionary) (*PdfOutline, error) {
typeVal, ok := obj.(*PdfObjectName)
if ok {
if *typeVal != "Outlines" {
return nil, fmt.Errorf("Type != Outlines (%s)", *typeVal)
common.Log.Error("Type != Outlines (%s)", *typeVal)
// Should be "Outlines" if there, but some files have other types
// Log as an error but do not quit.
// Might be a good idea to log this kind of deviation from the standard separately.
}
}
}
if obj, hasCount := (*dict)["Count"]; hasCount {
countVal, ok := obj.(*PdfObjectInteger)
if !ok {
return nil, fmt.Errorf("Count not an integer (%T)", obj)
// This should always be an integer, but in a few cases has been a float.
count, err := getNumberAsInt64(obj)
if err != nil {
return nil, err
}
count := int64(*countVal)
outline.Count = &count
}

View File

@ -17,6 +17,8 @@ import (
"fmt"
"regexp"
"strconv"
"github.com/unidoc/unidoc/common"
)
type PdfRectangle struct {
@ -38,6 +40,21 @@ func getNumberAsFloat(obj PdfObject) (float64, error) {
return 0, errors.New("Not a number")
}
// getNumberAsInt64 extracts an int64 from a PDF object that is expected to
// hold an integer. Some producers store such values in floating point form,
// so a *PdfObjectFloat is accepted as well and converted with a plain int64
// cast; a debug message is logged when that happens. Any other object type
// yields an error.
func getNumberAsInt64(obj PdfObject) (int64, error) {
	switch num := obj.(type) {
	case *PdfObjectInteger:
		return int64(*num), nil
	case *PdfObjectFloat:
		// Tolerated deviation: the value should have been written as an integer.
		common.Log.Debug("Number expected as integer was stored as float (type casting used)")
		return int64(*num), nil
	}
	return 0, errors.New("Not a number")
}
func getNumberAsFloatOrNull(obj PdfObject) (*float64, error) {
if fObj, ok := obj.(*PdfObjectFloat); ok {
num := float64(*fObj)
@ -112,7 +129,7 @@ type PdfDate struct {
utOffsetMins int64 // mm (00-59)
}
var reDate = regexp.MustCompile(`\s*D\s*:\s*(\d{4})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})([+-Z])(\d{2})'(\d{2})?`)
// reDate matches a PDF date string. Capture groups 1-6 are the year, month,
// day, hour, minute and second; groups 7-9 are the optional UT offset sign
// ('+', '-' or 'Z'), offset hours and offset minutes. Optional groups that
// are absent from the input are captured as empty strings.
//
// The sign class is written as [-+Z] with the hyphen first so that it is a
// literal '-'. Written as [+-Z] it is a range from '+' to 'Z', which also
// accepts digits, ':' and uppercase letters as the "sign".
var reDate = regexp.MustCompile(`\s*D\s*:\s*(\d{4})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})([-+Z])?(\d{2})?'?(\d{2})?`)
// Make a new PdfDate object from a PDF date string (see 7.9.4 Dates).
// format: "D: YYYYMMDDHHmmSSOHH'mm"
@ -134,9 +151,22 @@ func NewPdfDate(dateStr string) (PdfDate, error) {
d.hour, _ = strconv.ParseInt(matches[0][4], 10, 32)
d.minute, _ = strconv.ParseInt(matches[0][5], 10, 32)
d.second, _ = strconv.ParseInt(matches[0][6], 10, 32)
d.utOffsetSign = matches[0][7][0]
d.utOffsetHours, _ = strconv.ParseInt(matches[0][8], 10, 32)
d.utOffsetMins, _ = strconv.ParseInt(matches[0][9], 10, 32)
// Some poor implementations do not include the offset.
if len(matches[0][7]) > 0 {
d.utOffsetSign = matches[0][7][0]
} else {
d.utOffsetSign = '+'
}
if len(matches[0][8]) > 0 {
d.utOffsetHours, _ = strconv.ParseInt(matches[0][8], 10, 32)
} else {
d.utOffsetHours = 0
}
if len(matches[0][9]) > 0 {
d.utOffsetMins, _ = strconv.ParseInt(matches[0][9], 10, 32)
} else {
d.utOffsetMins = 0
}
return d, nil
}

View File

@ -125,6 +125,48 @@ func TestDateParse(t *testing.T) {
t.Errorf("Invalid offset minutes")
return
}
// Case 5: Missing some more parameters.
// Seems that many implementations consider some stuff optional...
// Not following the standard, but we need to handle it.
// D:20050823042205
str = "D:20050823042205"
date, err = NewPdfDate(str)
if err != nil {
t.Errorf("Fail: %s", err)
return
}
if date.year != 2005 {
t.Errorf("Year != 2005")
return
}
if date.month != 8 {
t.Errorf("month != 8")
return
}
if date.day != 23 {
t.Errorf("Day != 23")
return
}
if date.hour != 04 {
t.Errorf("Hour != 11 (%d)", date.hour)
return
}
if date.minute != 22 {
t.Errorf("Minute != 29 (%d)", date.minute)
}
if date.second != 05 {
t.Errorf("Second != 37 (%d)", date.second)
return
}
if date.utOffsetHours != 0 {
t.Errorf("Invalid offset hours")
return
}
if date.utOffsetMins != 0 {
t.Errorf("Invalid offset minutes")
return
}
}
// Test parsing and building the date.