diff --git a/pdf/core/const.go b/pdf/core/const.go index a1eadec1..dcb6c6f7 100644 --- a/pdf/core/const.go +++ b/pdf/core/const.go @@ -8,5 +8,8 @@ package core import "errors" var ( + // ErrUnsupportedEncodingParameters error indicates that encoding/decoding was attempted with unsupported + // encoding parameters. + // For example when trying to encode with an unsupported Predictor (flate). ErrUnsupportedEncodingParameters = errors.New("Unsupported encoding parameters") ) diff --git a/pdf/core/crossrefs.go b/pdf/core/crossrefs.go index 3e844477..70f15b31 100644 --- a/pdf/core/crossrefs.go +++ b/pdf/core/crossrefs.go @@ -15,8 +15,13 @@ import ( "github.com/unidoc/unidoc/common" ) +// TODO (v3): Create a new type xrefType which can be an integer and can be used for improved type checking. +// TODO (v3): Unexport these constants and rename with camelCase. const ( - XREF_TABLE_ENTRY = iota + // XREF_TABLE_ENTRY indicates a normal xref table entry. + XREF_TABLE_ENTRY = iota + + // XREF_OBJECT_STREAM indicates an xref entry in an xref object stream. XREF_OBJECT_STREAM = iota ) @@ -67,7 +72,7 @@ func (this *PdfParser) lookupObjectViaOS(sobjNumber int, objNum int) (PdfObject, } if this.crypter != nil && !this.crypter.isDecrypted(so) { - return nil, errors.New("Need to decrypt the stream !") + return nil, errors.New("Need to decrypt the stream") } sod := so.PdfObjectDictionary diff --git a/pdf/core/doc.go b/pdf/core/doc.go new file mode 100644 index 00000000..09a1e1b1 --- /dev/null +++ b/pdf/core/doc.go @@ -0,0 +1,9 @@ +/* + * This file is subject to the terms and conditions defined in + * file 'LICENSE.md', which is part of this source code package. + */ + +// Package core defines and implements the primitive PDF object types in golang, and provides functionality +// for parsing those from a PDF file stream. This includes I/O handling, cross references, repairs, encryption, +// encoding and other core capabilities. 
+package core diff --git a/pdf/core/io.go b/pdf/core/io.go index e0a5bce4..f00e9b2a 100644 --- a/pdf/core/io.go +++ b/pdf/core/io.go @@ -13,6 +13,8 @@ import ( "github.com/unidoc/unidoc/common" ) +// ReadAtLeast reads at least n bytes into slice p. +// Returns the number of bytes read (should always be == n), and an error on failure. func (this *PdfParser) ReadAtLeast(p []byte, n int) (int, error) { remaining := n start := 0 diff --git a/pdf/core/parser.go b/pdf/core/parser.go index 0f2c317b..dc1b029f 100644 --- a/pdf/core/parser.go +++ b/pdf/core/parser.go @@ -3,9 +3,6 @@ * file 'LICENSE.md', which is part of this source code package. */ -// The core package provides fundamental functionality for handling PDFs, including definitions of the core PDF objects -// (primitives), parsing a PDF file as a series of primitives, io, cross references, repairs, encryption, encoding and -// other core capabilities. package core import ( @@ -35,6 +32,7 @@ var reIndirectObject = regexp.MustCompile(`(\d+)\s+(\d+)\s+obj`) var reXrefSubsection = regexp.MustCompile(`(\d+)\s+(\d+)\s*$`) var reXrefEntry = regexp.MustCompile(`(\d+)\s+(\d+)\s+([nf])\s*$`) +// PdfParser parses a PDF file and provides access to the object structure of the PDF. type PdfParser struct { majorVersion int minorVersion int @@ -45,7 +43,7 @@ type PdfParser struct { xrefs XrefTable objstms ObjectStreams trailer *PdfObjectDictionary - ObjCache ObjectCache + ObjCache ObjectCache // TODO: Unexport (v3). crypter *PdfCrypt repairsAttempted bool // Avoid multiple attempts for repair. @@ -56,14 +54,18 @@ type PdfParser struct { streamLengthReferenceLookupInProgress map[int64]bool } +// GetCrypter returns the PdfCrypt instance which has information about the PDFs encryption. func (this *PdfParser) GetCrypter() *PdfCrypt { return this.crypter } +// IsAuthenticated returns true if the PDF has already been authenticated for accessing. 
func (this *PdfParser) IsAuthenticated() bool { return this.crypter.Authenticated } +// GetTrailer returns the PDFs trailer dictionary. The trailer dictionary is typically the starting point for a PDF, +// referencing other key objects that are important in the document structure. func (this *PdfParser) GetTrailer() *PdfObjectDictionary { return this.trailer } @@ -573,6 +575,7 @@ func (this *PdfParser) parseObject() (PdfObject, error) { } // Reads and parses a PDF dictionary object enclosed with '<<' and '>>' +// TODO: Unexport (v3). func (this *PdfParser) ParseDict() (*PdfObjectDictionary, error) { common.Log.Trace("Reading PDF Dict!") @@ -777,7 +780,7 @@ func (this *PdfParser) parseXrefTable() (*PdfObjectDictionary, error) { if txt == "%%EOF" { common.Log.Debug("ERROR: end of file - trailer not found - error!") - return nil, errors.New("End of file - trailer not found!") + return nil, errors.New("End of file - trailer not found") } common.Log.Trace("xref more : %s", txt) @@ -806,7 +809,7 @@ func (this *PdfParser) parseXrefStream(xstm *PdfObjectInteger) (*PdfObjectDictio xs, ok := xrefObj.(*PdfObjectStream) if !ok { common.Log.Debug("ERROR: XRefStm pointing to non-stream object!") - return nil, errors.New("XRefStm pointing to a non-stream object!") + return nil, errors.New("XRefStm pointing to a non-stream object") } trailerDict := xs.PdfObjectDictionary @@ -932,7 +935,6 @@ func (this *PdfParser) parseXrefStream(xstm *PdfObjectInteger) (*PdfObjectDictio var tmp int64 = 0 for i := 0; i < len(v); i++ { tmp += int64(v[i]) * (1 << uint(8*(len(v)-i-1))) - } return tmp } @@ -1118,7 +1120,7 @@ func (this *PdfParser) seekToEOFMarker(fSize int64) error { // 3. Check the Prev xref // 4. Continue looking for Prev until not found. // -// The earlier xrefs have higher precedance. If objects already +// The earlier xrefs have higher precedence. If objects already // loaded will ignore older versions. 
// func (this *PdfParser) loadXrefs() (*PdfObjectDictionary, error) { @@ -1298,8 +1300,9 @@ func (this *PdfParser) traceStreamLength(lengthObj PdfObject) (PdfObject, error) return slo, nil } -// Parse an indirect object from the input stream. -// Can also be an object stream. +// Parse an indirect object from the input stream. Can also be an object stream. +// Returns the indirect object (*PdfIndirectObject) or the stream object (*PdfObjectStream). +// TODO: Unexport (v3). func (this *PdfParser) ParseIndirectObject() (PdfObject, error) { indirect := PdfIndirectObject{} @@ -1475,6 +1478,7 @@ func (this *PdfParser) ParseIndirectObject() (PdfObject, error) { } // For testing purposes. +// TODO: Unexport (v3) or move to test files, if needed by external test cases. func NewParserFromString(txt string) *PdfParser { parser := PdfParser{} buf := []byte(txt) @@ -1490,8 +1494,8 @@ func NewParserFromString(txt string) *PdfParser { return &parser } -// Creates a new parser for a PDF file via ReadSeeker. Loads the -// cross reference stream and trailer. +// NewParser creates a new parser for a PDF file via ReadSeeker. Loads the cross reference stream and trailer. +// An error is returned on failure. func NewParser(rs io.ReadSeeker) (*PdfParser, error) { parser := &PdfParser{} @@ -1499,22 +1503,19 @@ func NewParser(rs io.ReadSeeker) (*PdfParser, error) { parser.ObjCache = make(ObjectCache) parser.streamLengthReferenceLookupInProgress = map[int64]bool{} - // Start by reading xrefs from bottom + // Start by reading the xrefs (from bottom). trailer, err := parser.loadXrefs() if err != nil { common.Log.Debug("ERROR: Failed to load xref table! %s", err) - // Try to rebuild entire xref table? return nil, err } common.Log.Trace("Trailer: %s", trailer) if len(parser.xrefs) == 0 { - return nil, fmt.Errorf("Empty XREF table. 
Invalid.") + return nil, fmt.Errorf("Empty XREF table - Invalid") } - // printXrefTable(parser.xrefs) - majorVersion, minorVersion, err := parser.parsePdfVersion() if err != nil { common.Log.Error("Unable to parse version: %v", err) @@ -1528,11 +1529,10 @@ func NewParser(rs io.ReadSeeker) (*PdfParser, error) { return parser, nil } -// Check if the document is encrypted. First time when called, will -// check if the Encrypt dictionary is accessible through the trailer -// dictionary. -// If encrypted, prepares a crypt datastructure which can be used to -// authenticate and decrypt the document. +// IsEncrypted checks if the document is encrypted. A bool flag is returned indicating the result. +// First time when called, will check if the Encrypt dictionary is accessible through the trailer dictionary. +// If encrypted, prepares a crypt datastructure which can be used to authenticate and decrypt the document. +// On failure, an error is returned. func (this *PdfParser) IsEncrypted() (bool, error) { if this.crypter != nil { return true, nil @@ -1574,9 +1574,9 @@ func (this *PdfParser) IsEncrypted() (bool, error) { return false, nil } -// Decrypt the PDF file with a specified password. Also tries to -// decrypt with an empty password. Returns true if successful, -// false otherwise. +// Decrypt attempts to decrypt the PDF file with a specified password. Also tries to +// decrypt with an empty password. Returns true if successful, false otherwise. +// An error is returned when there is a problem with decrypting. func (this *PdfParser) Decrypt(password []byte) (bool, error) { // Also build the encryption/decryption key. if this.crypter == nil { @@ -1595,8 +1595,8 @@ func (this *PdfParser) Decrypt(password []byte) (bool, error) { return authenticated, err } -// Check access rights and permissions for a specified password. If either user/owner password is specified, -// full rights are granted, otherwise the access rights are specified by the Permissions flag. 
+// CheckAccessRights checks access rights and permissions for a specified password. If either user/owner password is +// specified, full rights are granted, otherwise the access rights are specified by the Permissions flag. // // The bool flag indicates that the user can access and view the file. // The AccessPermissions shows what access the user has for editing etc. diff --git a/pdf/core/primitives.go b/pdf/core/primitives.go index 86d7db84..3001e9dc 100644 --- a/pdf/core/primitives.go +++ b/pdf/core/primitives.go @@ -3,11 +3,6 @@ * file 'LICENSE.md', which is part of this source code package. */ -// Defines PDF primitive objects as per the standard. Also defines a PdfObject -// interface allowing to universally work with these objects. It allows -// recursive writing of the objects to file as well and stringifying for -// debug purposes. - package core import ( @@ -17,41 +12,63 @@ import ( "github.com/unidoc/unidoc/common" ) -// PDF Primitives implement the PdfObject interface. +// PdfObject is an interface which all primitive PDF objects must implement. type PdfObject interface { - String() string // Output a string representation of the primitive (for debugging). - DefaultWriteString() string // Output the PDF primitive as expected by the standard. + // Output a string representation of the primitive (for debugging). + String() string + + // Output the PDF primitive as written to file as expected by the standard. + DefaultWriteString() string } +// PdfObjectBool represents the primitive PDF boolean object. type PdfObjectBool bool + +// PdfObjectInteger represents the primitive PDF integer numerical object. type PdfObjectInteger int64 + +// PdfObjectFloat represents the primitive PDF floating point numerical object. type PdfObjectFloat float64 + +// PdfObjectString represents the primitive PDF string object. +// TODO (v3): Change to a struct and add a flag for hex/plaintext. 
type PdfObjectString string + +// PdfObjectName represents the primitive PDF name object. type PdfObjectName string + +// PdfObjectArray represents the primitive PDF array object. type PdfObjectArray []PdfObject + +// PdfObjectDictionary represents the primitive PDF dictionary/map object. type PdfObjectDictionary struct { dict map[PdfObjectName]PdfObject keys []PdfObjectName } + +// PdfObjectNull represents the primitive PDF null object. type PdfObjectNull struct{} +// PdfObjectReference represents the primitive PDF reference object. type PdfObjectReference struct { ObjectNumber int64 GenerationNumber int64 } +// PdfIndirectObject represents the primitive PDF indirect object. type PdfIndirectObject struct { PdfObjectReference PdfObject } +// PdfObjectStream represents the primitive PDF Object stream. type PdfObjectStream struct { PdfObjectReference *PdfObjectDictionary Stream []byte } -// Quick functions to make pdf objects form primitive objects. +// MakeDict creates and returns an empty PdfObjectDictionary. func MakeDict() *PdfObjectDictionary { d := &PdfObjectDictionary{} d.dict = map[PdfObjectName]PdfObject{} @@ -59,16 +76,19 @@ func MakeDict() *PdfObjectDictionary { return d } +// MakeName creates a PdfObjectName from a string. func MakeName(s string) *PdfObjectName { name := PdfObjectName(s) return &name } +// MakeInteger creates a PdfObjectInteger from an int64. func MakeInteger(val int64) *PdfObjectInteger { num := PdfObjectInteger(val) return &num } +// MakeArray creates a PdfObjectArray from a list of PdfObjects. func MakeArray(objects ...PdfObject) *PdfObjectArray { array := PdfObjectArray{} for _, obj := range objects { @@ -77,6 +97,8 @@ func MakeArray(objects ...PdfObject) *PdfObjectArray { return &array } +// MakeArrayFromIntegers creates a PdfObjectArray from a slice of ints, where each array element is +// a PdfObjectInteger. 
func MakeArrayFromIntegers(vals []int) *PdfObjectArray { array := PdfObjectArray{} for _, val := range vals { @@ -85,6 +107,8 @@ func MakeArrayFromIntegers(vals []int) *PdfObjectArray { return &array } +// MakeArrayFromIntegers64 creates a PdfObjectArray from a slice of int64s, where each array element +// is a PdfObjectInteger. func MakeArrayFromIntegers64(vals []int64) *PdfObjectArray { array := PdfObjectArray{} for _, val := range vals { @@ -93,6 +117,8 @@ func MakeArrayFromIntegers64(vals []int64) *PdfObjectArray { return &array } +// MakeArrayFromFloats creates a PdfObjectArray from a slice of float64s, where each array element is a +// PdfObjectFloat. func MakeArrayFromFloats(vals []float64) *PdfObjectArray { array := PdfObjectArray{} for _, val := range vals { @@ -101,27 +127,33 @@ func MakeArrayFromFloats(vals []float64) *PdfObjectArray { return &array } +// MakeFloat creates a PdfObjectFloat from a float64. func MakeFloat(val float64) *PdfObjectFloat { num := PdfObjectFloat(val) return &num } +// MakeString creates a PdfObjectString from a string. func MakeString(s string) *PdfObjectString { str := PdfObjectString(s) return &str } +// MakeNull creates a PdfObjectNull. func MakeNull() *PdfObjectNull { null := PdfObjectNull{} return &null } +// MakeIndirectObject creates a PdfIndirectObject with a specified direct object PdfObject. func MakeIndirectObject(obj PdfObject) *PdfIndirectObject { ind := &PdfIndirectObject{} ind.PdfObject = obj return ind } +// MakeStream creates a PdfObjectStream with specified contents and encoding. If encoding is nil, then raw encoding +// will be used (i.e. no encoding applied). func MakeStream(contents []byte, encoder StreamEncoder) (*PdfObjectStream, error) { stream := &PdfObjectStream{} @@ -149,6 +181,7 @@ func (this *PdfObjectBool) String() string { } } +// DefaultWriteString outputs the object as it is to be written to file. 
func (this *PdfObjectBool) DefaultWriteString() string { if *this { return "true" @@ -161,6 +194,7 @@ func (this *PdfObjectInteger) String() string { return fmt.Sprintf("%d", *this) } +// DefaultWriteString outputs the object as it is to be written to file. func (this *PdfObjectInteger) DefaultWriteString() string { return fmt.Sprintf("%d", *this) } @@ -169,6 +203,7 @@ func (this *PdfObjectFloat) String() string { return fmt.Sprintf("%f", *this) } +// DefaultWriteString outputs the object as it is to be written to file. func (this *PdfObjectFloat) DefaultWriteString() string { return fmt.Sprintf("%f", *this) } @@ -177,6 +212,7 @@ func (this *PdfObjectString) String() string { return fmt.Sprintf("%s", string(*this)) } +// DefaultWriteString outputs the object as it is to be written to file. func (this *PdfObjectString) DefaultWriteString() string { var output bytes.Buffer @@ -209,6 +245,7 @@ func (this *PdfObjectName) String() string { return fmt.Sprintf("%s", string(*this)) } +// DefaultWriteString outputs the object as it is to be written to file. func (this *PdfObjectName) DefaultWriteString() string { var output bytes.Buffer @@ -229,6 +266,8 @@ func (this *PdfObjectName) DefaultWriteString() string { return output.String() } +// ToFloat64Array returns a slice of all elements in the array as a float64 slice. An error is returned if the array +// contains non-numeric objects (each element can be either PdfObjectInteger or PdfObjectFloat). func (this *PdfObjectArray) ToFloat64Array() ([]float64, error) { vals := []float64{} @@ -245,6 +284,8 @@ func (this *PdfObjectArray) ToFloat64Array() ([]float64, error) { return vals, nil } +// ToIntegerArray returns a slice of all array elements as an int slice. An error is returned if the array contains +// non-integer objects. Each element can only be PdfObjectInteger. 
func (this *PdfObjectArray) ToIntegerArray() ([]int, error) { vals := []int{} @@ -271,6 +312,7 @@ func (this *PdfObjectArray) String() string { return outStr } +// DefaultWriteString outputs the object as it is to be written to file. func (this *PdfObjectArray) DefaultWriteString() string { outStr := "[" for ind, o := range *this { @@ -283,6 +325,7 @@ func (this *PdfObjectArray) DefaultWriteString() string { return outStr } +// Append adds a PdfObject to the array. func (this *PdfObjectArray) Append(obj PdfObject) { *this = append(*this, obj) } @@ -299,8 +342,8 @@ func getNumberAsFloat(obj PdfObject) (float64, error) { return 0, fmt.Errorf("Not a number") } -// For numeric array: Get the array in []float64 slice representation. -// Will return error if not entirely numeric. +// GetAsFloat64Slice returns the array as a []float64 slice. +// Returns an error if not entirely numeric (only PdfObjectIntegers, PdfObjectFloats). func (this *PdfObjectArray) GetAsFloat64Slice() ([]float64, error) { slice := []float64{} @@ -316,7 +359,7 @@ func (this *PdfObjectArray) GetAsFloat64Slice() ([]float64, error) { return slice, nil } -// Merge in key/values from another dictionary. Overwriting if has same keys. +// Merge merges in key/values from another dictionary, overwriting any existing entries that have the same keys. func (this *PdfObjectDictionary) Merge(another *PdfObjectDictionary) { if another != nil { for _, key := range another.Keys() { @@ -336,6 +379,7 @@ func (this *PdfObjectDictionary) String() string { return outStr } +// DefaultWriteString outputs the object as it is to be written to file. func (this *PdfObjectDictionary) DefaultWriteString() string { outStr := "<<" for _, k := range this.keys { @@ -349,6 +393,7 @@ func (this *PdfObjectDictionary) DefaultWriteString() string { return outStr } +// Set sets the dictionary's key -> val mapping entry. Overwrites if key already set. 
func (d *PdfObjectDictionary) Set(key PdfObjectName, val PdfObject) { found := false for _, k := range d.keys { @@ -365,7 +410,7 @@ func (d *PdfObjectDictionary) Set(key PdfObjectName, val PdfObject) { d.dict[key] = val } -// Get PdfObject corresponding to the specified key. +// Get returns the PdfObject corresponding to the specified key. // Returns a nil value if the key is not set. // // The design is such that we only return 1 value. @@ -380,12 +425,12 @@ func (d *PdfObjectDictionary) Get(key PdfObjectName) PdfObject { return val } -// Get the list of keys. +// Keys returns the list of keys in the dictionary. func (d *PdfObjectDictionary) Keys() []PdfObjectName { return d.keys } -// Remove an element specified by key. +// Remove removes an element specified by key. func (d *PdfObjectDictionary) Remove(key PdfObjectName) { idx := -1 for i, k := range d.keys { @@ -402,9 +447,7 @@ func (d *PdfObjectDictionary) Remove(key PdfObjectName) { } } -// Check if the value's PdfObject interface, or its containing value is nil. Only set the -// key/value pair if not nil. -// +// SetIfNotNil sets the dictionary's key -> val mapping entry -IF- val is not nil. // Note that we take care to perform a type switch. Otherwise if we would supply a nil value // of another type, e.g. (PdfObjectArray*)(nil), then it would not be a PdfObject(nil) and thus // would get set. @@ -466,6 +509,7 @@ func (this *PdfObjectReference) String() string { return fmt.Sprintf("Ref(%d %d)", this.ObjectNumber, this.GenerationNumber) } +// DefaultWriteString outputs the object as it is to be written to file. func (this *PdfObjectReference) DefaultWriteString() string { return fmt.Sprintf("%d %d R", this.ObjectNumber, this.GenerationNumber) } @@ -476,6 +520,7 @@ func (this *PdfIndirectObject) String() string { return fmt.Sprintf("IObject:%d", (*this).ObjectNumber) } +// DefaultWriteString outputs the object as it is to be written to file. 
func (this *PdfIndirectObject) DefaultWriteString() string { outStr := fmt.Sprintf("%d 0 R", (*this).ObjectNumber) return outStr @@ -485,6 +530,7 @@ func (this *PdfObjectStream) String() string { return fmt.Sprintf("Object stream %d: %s", this.ObjectNumber, this.PdfObjectDictionary) } +// DefaultWriteString outputs the object as it is to be written to file. func (this *PdfObjectStream) DefaultWriteString() string { outStr := fmt.Sprintf("%d 0 R", (*this).ObjectNumber) return outStr @@ -494,18 +540,20 @@ func (this *PdfObjectNull) String() string { return "null" } +// DefaultWriteString outputs the object as it is to be written to file. func (this *PdfObjectNull) DefaultWriteString() string { return "null" } // Handy functions to work with primitive objects. -// Traces a pdf object to a direct object. For example contained -// in indirect objects (can be double referenced even). -// -// Note: This function does not trace/resolve references. -// That needs to be done beforehand. + +// TraceMaxDepth specifies the maximum recursion depth allowed. const TraceMaxDepth = 20 +// TraceToDirectObject traces a PdfObject to a direct object. For example direct objects contained +// in indirect objects (can be double referenced even). +// +// Note: This function does not trace/resolve references. That needs to be done beforehand. func TraceToDirectObject(obj PdfObject) PdfObject { iobj, isIndirectObj := obj.(*PdfIndirectObject) depth := 0