diff --git a/pdf/outlines.go b/pdf/outlines.go index 347a7d26..542f80ea 100644 --- a/pdf/outlines.go +++ b/pdf/outlines.go @@ -70,17 +70,20 @@ func newPdfOutlineFromDict(dict *PdfObjectDictionary) (*PdfOutline, error) { typeVal, ok := obj.(*PdfObjectName) if ok { if *typeVal != "Outlines" { - return nil, fmt.Errorf("Type != Outlines (%s)", *typeVal) + common.Log.Error("Type != Outlines (%s)", *typeVal) + // Should be "Outlines" if there, but some files have other types + // Log as an error but do not quit. + // Might be a good idea to log this kind of deviation from the standard separately. } } } if obj, hasCount := (*dict)["Count"]; hasCount { - countVal, ok := obj.(*PdfObjectInteger) - if !ok { - return nil, fmt.Errorf("Count not an integer (%T)", obj) + // This should always be an integer, but in a few cases has been a float. + count, err := getNumberAsInt64(obj) + if err != nil { + return nil, err } - count := int64(*countVal) outline.Count = &count } diff --git a/pdf/page.go b/pdf/page.go index 699e3918..12ca9d8c 100644 --- a/pdf/page.go +++ b/pdf/page.go @@ -17,6 +17,8 @@ import ( "fmt" "regexp" "strconv" + + "github.com/unidoc/unidoc/common" ) type PdfRectangle struct { @@ -38,6 +40,21 @@ func getNumberAsFloat(obj PdfObject) (float64, error) { return 0, errors.New("Not a number") } +// Cases where expecting an integer, but some implementations actually +// store the number in a floating point format. +func getNumberAsInt64(obj PdfObject) (int64, error) { + if iObj, ok := obj.(*PdfObjectInteger); ok { + return int64(*iObj), nil + } + + if fObj, ok := obj.(*PdfObjectFloat); ok { + common.Log.Debug("Number expected as integer was stored as float (type casting used)") + return int64(*fObj), nil + } + + return 0, errors.New("Not a number") +} + func getNumberAsFloatOrNull(obj PdfObject) (*float64, error) { if fObj, ok := obj.(*PdfObjectFloat); ok { num := float64(*fObj) @@ -112,7 +129,7 @@ type PdfDate struct { utOffsetMins int64 // mm (00-59) } -var reDate = regexp.MustCompile(`\s*D\s*:\s*(\d{4})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})([+-Z])(\d{2})'(\d{2})?`) +var reDate = regexp.MustCompile(`\s*D\s*:\s*(\d{4})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})([+-Z])?(\d{2})?'?(\d{2})?`) // Make a new PdfDate object from a PDF date string (see 7.9.4 Dates). // format: "D: YYYYMMDDHHmmSSOHH'mm" @@ -134,9 +151,22 @@ func NewPdfDate(dateStr string) (PdfDate, error) { d.hour, _ = strconv.ParseInt(matches[0][4], 10, 32) d.minute, _ = strconv.ParseInt(matches[0][5], 10, 32) d.second, _ = strconv.ParseInt(matches[0][6], 10, 32) - d.utOffsetSign = matches[0][7][0] - d.utOffsetHours, _ = strconv.ParseInt(matches[0][8], 10, 32) - d.utOffsetMins, _ = strconv.ParseInt(matches[0][9], 10, 32) + // Some poor implementations do not include the offset. + if len(matches[0][7]) > 0 { + d.utOffsetSign = matches[0][7][0] + } else { + d.utOffsetSign = '+' + } + if len(matches[0][8]) > 0 { + d.utOffsetHours, _ = strconv.ParseInt(matches[0][8], 10, 32) + } else { + d.utOffsetHours = 0 + } + if len(matches[0][9]) > 0 { + d.utOffsetMins, _ = strconv.ParseInt(matches[0][9], 10, 32) + } else { + d.utOffsetMins = 0 + } return d, nil } diff --git a/pdf/page_test.go b/pdf/page_test.go index e8189dbb..1e9d25c7 100644 --- a/pdf/page_test.go +++ b/pdf/page_test.go @@ -125,6 +125,48 @@ func TestDateParse(t *testing.T) { t.Errorf("Invalid offset minutes") return } + + // Case 5: Missing some more parameters. + // Seems that many implementations consider some stuff optional... + // Not following the standard, but we need to handle it. + // D:20050823042205 + str = "D:20050823042205" + date, err = NewPdfDate(str) + if err != nil { + t.Errorf("Fail: %s", err) + return + } + if date.year != 2005 { + t.Errorf("Year != 2005") + return + } + if date.month != 8 { + t.Errorf("month != 8") + return + } + if date.day != 23 { + t.Errorf("Day != 23") + return + } + if date.hour != 04 { + t.Errorf("Hour != 11 (%d)", date.hour) + return + } + if date.minute != 22 { + t.Errorf("Minute != 29 (%d)", date.minute) + } + if date.second != 05 { + t.Errorf("Second != 37 (%d)", date.second) + return + } + if date.utOffsetHours != 0 { + t.Errorf("Invalid offset hours") + return + } + if date.utOffsetMins != 0 { + t.Errorf("Invalid offset minutes") + return + } } // Test parsing and building the date.