diff --git a/.gitignore b/.gitignore index 990efc43..5f5280dc 100755 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +*.gox .idea *.mdb *.userprefs diff --git a/pdf/contentstream/inline-image.go b/pdf/contentstream/inline-image.go index 323a0f03..cab7e1e4 100644 --- a/pdf/contentstream/inline-image.go +++ b/pdf/contentstream/inline-image.go @@ -33,11 +33,8 @@ type ContentStreamInlineImage struct { // Make a new content stream inline image object from an image. func NewInlineImageFromImage(img Image, encoder StreamEncoder) (*ContentStreamInlineImage, error) { - filterName := "" if encoder == nil { encoder = NewRawEncoder() - } else { - filterName = encoder.GetFilterName() } common.Log.Debug("NewInlineImageFromImage: encoder=%T", encoder) @@ -62,7 +59,9 @@ func NewInlineImageFromImage(img Image, encoder StreamEncoder) (*ContentStreamIn } inlineImage.stream = encoded - if len(filterName) > 0 { + + filterName := encoder.GetFilterName() + if filterName != StreamEncodingFilterNameRaw { inlineImage.Filter = MakeName(filterName) } // XXX/FIXME: Add decode params? diff --git a/pdf/contentstream/processor.go b/pdf/contentstream/processor.go index 50f0a25a..550702c7 100644 --- a/pdf/contentstream/processor.go +++ b/pdf/contentstream/processor.go @@ -127,7 +127,7 @@ func (csp *ContentStreamProcessor) getColorspace(name string, resources *PdfPage } // Otherwise unsupported. 
- common.Log.Debug("Unknown colorspace requested: %s", name) + common.Log.Error("Unknown colorspace requested: %s", name) return nil, errors.New("Unsupported colorspace") } @@ -140,6 +140,10 @@ func (csp *ContentStreamProcessor) getInitialColor(cs PdfColorspace) (PdfColor, return NewPdfColorDeviceRGB(0.0, 0.0, 0.0), nil case *PdfColorspaceDeviceCMYK: return NewPdfColorDeviceCMYK(0.0, 0.0, 0.0, 1.0), nil + case *PdfColorspaceCalGray: + return NewPdfColorCalGray(0.0), nil + case *PdfColorspaceCalRGB: + return NewPdfColorCalRGB(0.0, 0.0, 0.0), nil case *PdfColorspaceLab: l := 0.0 a := 0.0 @@ -190,7 +194,7 @@ func (csp *ContentStreamProcessor) getInitialColor(cs PdfColorspace) (PdfColor, return nil, nil } - common.Log.Debug("Unable to determine initial color for unknown colorspace: %T", cs) + common.Log.Error("Unable to determine initial color for unknown colorspace: %T", cs) return nil, errors.New("Unsupported colorspace") } @@ -364,8 +368,8 @@ func (this *ContentStreamProcessor) handleCommand_SC(op *ContentStreamOperation, cs := this.graphicsState.ColorspaceStroking if len(op.Params) != cs.GetNumComponents() { - common.Log.Debug("Invalid number of parameters for SC") - common.Log.Debug("Number %d not matching colorspace %T", len(op.Params), cs) + common.Log.Error("Invalid number of parameters for SC") + common.Log.Error("Number %d not matching colorspace %T", len(op.Params), cs) return errors.New("Invalid number of parameters") } @@ -389,8 +393,8 @@ func (this *ContentStreamProcessor) handleCommand_SCN(op *ContentStreamOperation if !isPatternCS(cs) { if len(op.Params) != cs.GetNumComponents() { - common.Log.Debug("Invalid number of parameters for SC") - common.Log.Debug("Number %d not matching colorspace %T", len(op.Params), cs) + common.Log.Error("Invalid number of parameters for SC") + common.Log.Error("Number %d not matching colorspace %T", len(op.Params), cs) return errors.New("Invalid number of parameters") } } @@ -411,8 +415,8 @@ func (this 
*ContentStreamProcessor) handleCommand_sc(op *ContentStreamOperation, if !isPatternCS(cs) { if len(op.Params) != cs.GetNumComponents() { - common.Log.Debug("Invalid number of parameters for SC") - common.Log.Debug("Number %d not matching colorspace %T", len(op.Params), cs) + common.Log.Error("Invalid number of parameters for SC") + common.Log.Error("Number %d not matching colorspace %T", len(op.Params), cs) return errors.New("Invalid number of parameters") } } @@ -433,8 +437,8 @@ func (this *ContentStreamProcessor) handleCommand_scn(op *ContentStreamOperation if !isPatternCS(cs) { if len(op.Params) != cs.GetNumComponents() { - common.Log.Debug("Invalid number of parameters for SC") - common.Log.Debug("Number %d not matching colorspace %T", len(op.Params), cs) + common.Log.Error("Invalid number of parameters for SC") + common.Log.Error("Number %d not matching colorspace %T", len(op.Params), cs) return errors.New("Invalid number of parameters") } } @@ -454,8 +458,8 @@ func (this *ContentStreamProcessor) handleCommand_scn(op *ContentStreamOperation func (this *ContentStreamProcessor) handleCommand_G(op *ContentStreamOperation, resources *PdfPageResources) error { cs := NewPdfColorspaceDeviceGray() if len(op.Params) != cs.GetNumComponents() { - common.Log.Debug("Invalid number of parameters for SC") - common.Log.Debug("Number %d not matching colorspace %T", len(op.Params), cs) + common.Log.Error("Invalid number of parameters for SC") + common.Log.Error("Number %d not matching colorspace %T", len(op.Params), cs) return errors.New("Invalid number of parameters") } @@ -475,8 +479,8 @@ func (this *ContentStreamProcessor) handleCommand_G(op *ContentStreamOperation, func (this *ContentStreamProcessor) handleCommand_g(op *ContentStreamOperation, resources *PdfPageResources) error { cs := NewPdfColorspaceDeviceGray() if len(op.Params) != cs.GetNumComponents() { - common.Log.Debug("Invalid number of parameters for SC") - common.Log.Debug("Number %d not matching colorspace %T", 
len(op.Params), cs) + common.Log.Error("Invalid number of parameters for SC") + common.Log.Error("Number %d not matching colorspace %T", len(op.Params), cs) return errors.New("Invalid number of parameters") } @@ -496,8 +500,8 @@ func (this *ContentStreamProcessor) handleCommand_g(op *ContentStreamOperation, func (this *ContentStreamProcessor) handleCommand_RG(op *ContentStreamOperation, resources *PdfPageResources) error { cs := NewPdfColorspaceDeviceRGB() if len(op.Params) != cs.GetNumComponents() { - common.Log.Debug("Invalid number of parameters for SC") - common.Log.Debug("Number %d not matching colorspace %T", len(op.Params), cs) + common.Log.Error("Invalid number of parameters for SC") + common.Log.Error("Number %d not matching colorspace %T", len(op.Params), cs) return errors.New("Invalid number of parameters") } @@ -516,8 +520,8 @@ func (this *ContentStreamProcessor) handleCommand_RG(op *ContentStreamOperation, func (this *ContentStreamProcessor) handleCommand_rg(op *ContentStreamOperation, resources *PdfPageResources) error { cs := NewPdfColorspaceDeviceRGB() if len(op.Params) != cs.GetNumComponents() { - common.Log.Debug("Invalid number of parameters for SC") - common.Log.Debug("Number %d not matching colorspace %T", len(op.Params), cs) + common.Log.Error("Invalid number of parameters for SC") + common.Log.Error("Number %d not matching colorspace %T", len(op.Params), cs) return errors.New("Invalid number of parameters") } @@ -537,8 +541,8 @@ func (this *ContentStreamProcessor) handleCommand_rg(op *ContentStreamOperation, func (this *ContentStreamProcessor) handleCommand_K(op *ContentStreamOperation, resources *PdfPageResources) error { cs := NewPdfColorspaceDeviceCMYK() if len(op.Params) != cs.GetNumComponents() { - common.Log.Debug("Invalid number of parameters for SC") - common.Log.Debug("Number %d not matching colorspace %T", len(op.Params), cs) + common.Log.Error("Invalid number of parameters for SC") + common.Log.Error("Number %d not matching 
colorspace %T", len(op.Params), cs) return errors.New("Invalid number of parameters") } @@ -557,8 +561,8 @@ func (this *ContentStreamProcessor) handleCommand_K(op *ContentStreamOperation, func (this *ContentStreamProcessor) handleCommand_k(op *ContentStreamOperation, resources *PdfPageResources) error { cs := NewPdfColorspaceDeviceCMYK() if len(op.Params) != cs.GetNumComponents() { - common.Log.Debug("Invalid number of parameters for SC") - common.Log.Debug("Number %d not matching colorspace %T", len(op.Params), cs) + common.Log.Error("Invalid number of parameters for SC") + common.Log.Error("Number %d not matching colorspace %T", len(op.Params), cs) return errors.New("Invalid number of parameters") } diff --git a/pdf/core/const.go b/pdf/core/const.go new file mode 100644 index 00000000..a1eadec1 --- /dev/null +++ b/pdf/core/const.go @@ -0,0 +1,12 @@ +/* + * This file is subject to the terms and conditions defined in + * file 'LICENSE.md', which is part of this source code package. + */ + +package core + +import "errors" + +var ( + ErrUnsupportedEncodingParameters = errors.New("Unsupported encoding parameters") +) diff --git a/pdf/core/crypt.go b/pdf/core/crypt.go index c7b08d33..aff8ce85 100644 --- a/pdf/core/crypt.go +++ b/pdf/core/crypt.go @@ -519,6 +519,7 @@ func (this *PdfCrypt) decryptBytes(buf []byte, filter string, okey []byte) ([]by common.Log.Debug("ERROR AES invalid buf %s", buf) return buf, fmt.Errorf("AES: Buf len < 16 (%d)", len(buf)) } + iv := buf[:16] buf = buf[16:] @@ -534,8 +535,20 @@ func (this *PdfCrypt) decryptBytes(buf []byte, filter string, okey []byte) ([]by common.Log.Trace("chop AES Decrypt (%d): % x", len(buf), buf) mode.CryptBlocks(buf, buf) common.Log.Trace("to (%d): % x", len(buf), buf) - //copy(buf[0:], buf[16:]) - //common.Log.Debug("chop to (%d): % x", len(buf), buf) + + if len(buf) == 0 { + common.Log.Trace("Empty buf, returning empty string") + return buf, nil + } + + // The padded length is indicated by the last values. 
Remove those. + padLen := int(buf[len(buf)-1]) + if padLen >= len(buf) { + common.Log.Debug("Illegal pad length") + return buf, fmt.Errorf("Invalid pad length") + } + buf = buf[:len(buf)-padLen] + return buf, nil } return nil, fmt.Errorf("Unsupported crypt filter method (%s)", cfMethod) diff --git a/pdf/core/encoding.go b/pdf/core/encoding.go index 068c5a12..99568a3e 100644 --- a/pdf/core/encoding.go +++ b/pdf/core/encoding.go @@ -36,6 +36,11 @@ const ( StreamEncodingFilterNameDCT = "DCTDecode" StreamEncodingFilterNameASCIIHex = "ASCIIHexDecode" StreamEncodingFilterNameASCII85 = "ASCII85Decode" + StreamEncodingFilterNameRaw = "Raw" +) + +const ( + DefaultJPEGQuality = 75 ) type StreamEncoder interface { @@ -314,6 +319,41 @@ func (this *FlateEncoder) DecodeStream(streamObj *PdfObjectStream) ([]byte, erro for j := 1; j < rowLength; j++ { rowData[j] = byte(int(rowData[j]+prevRowData[j]) % 256) } + case 3: + // Avg: Predicts the same as the average of the sample to the left and above. + for j := 1; j < rowLength; j++ { + if j == 1 { + rowData[j] = byte(int(rowData[j]+prevRowData[j]) % 256) + } else { + avg := (rowData[j-1] + prevRowData[j]) / 2 + rowData[j] = byte(int(rowData[j]+avg) % 256) + } + } + case 4: + // Paeth: a nonlinear function of the sample above, the sample to the left and the sample + // to the upper left. + for j := 2; j < rowLength; j++ { + a := rowData[j-1] // left + b := prevRowData[j] // above + c := prevRowData[j-1] // upper left + + p := int(a + b - c) + pa := absInt(p - int(a)) + pb := absInt(p - int(b)) + pc := absInt(p - int(c)) + + if pa <= pb && pa <= pc { + // Use a (left). + rowData[j] = byte(int(rowData[j]+a) % 256) + } else if pb <= pc { + // Use b (upper). + rowData[j] = byte(int(rowData[j]+b) % 256) + } else { + // Use c (upper left). 
+ rowData[j] = byte(int(rowData[j]+c) % 256) + } + } + default: common.Log.Debug("ERROR: Invalid filter byte (%d) @row %d", fb, i) return nil, fmt.Errorf("Invalid filter byte (%d)", fb) @@ -337,9 +377,10 @@ func (this *FlateEncoder) DecodeStream(streamObj *PdfObjectStream) ([]byte, erro // Encode a bytes array and return the encoded value based on the encoder parameters. func (this *FlateEncoder) EncodeBytes(data []byte) ([]byte, error) { - if this.Predictor != 1 && !(11 <= this.Predictor && this.Predictor <= 15) { - common.Log.Error("FlateEncoder: Predictor=%d. Only 1, 11-15 supported", this.Predictor) - return nil, fmt.Errorf("FlateEncoder Predictor = 1, 11-15 only supported") + if this.Predictor != 1 && !(11 <= this.Predictor && this.Predictor <= 11) { + common.Log.Debug("Encoding error: FlateEncoder Predictor=%d. Only 1, 11 supported", + this.Predictor) + return nil, ErrUnsupportedEncodingParameters } if 11 <= this.Predictor && this.Predictor <= 15 { @@ -721,7 +762,7 @@ func NewDCTEncoder() *DCTEncoder { encoder.ColorComponents = 3 encoder.BitsPerComponent = 8 - encoder.Quality = 75 + encoder.Quality = DefaultJPEGQuality return encoder } @@ -806,7 +847,7 @@ func newDCTEncoderFromStream(streamObj *PdfObjectStream, multiEnc *MultiEncoder) encoder.Width = cfg.Width encoder.Height = cfg.Height common.Log.Trace("DCT Encoder: %+v", encoder) - encoder.Quality = 75 + encoder.Quality = DefaultJPEGQuality return encoder, nil } @@ -1273,7 +1314,7 @@ func NewRawEncoder() *RawEncoder { } func (this *RawEncoder) GetFilterName() string { - return "Raw (no encoding)" + return StreamEncodingFilterNameRaw } func (this *RawEncoder) MakeDecodeParams() PdfObject { diff --git a/pdf/core/parser.go b/pdf/core/parser.go index b5d9de51..ed7ae4b3 100644 --- a/pdf/core/parser.go +++ b/pdf/core/parser.go @@ -647,7 +647,13 @@ func (this *PdfParser) parsePdfVersion() (int, int, error) { result1 := rePdfVersion.FindStringSubmatch(string(b)) if len(result1) < 3 { - common.Log.Debug("Error: PDF 
Version not found!") + major, minor, err := this.seekPdfVersionTopDown() + if err == nil { + common.Log.Debug("Failed recovery - unable to find version") + return 0, 0, err + } + + return major, minor, nil return 0, 0, errors.New("PDF version not found") } @@ -742,6 +748,7 @@ func (this *PdfParser) parseXrefTable() (*PdfObjectDictionary, error) { continue } if (len(txt) > 6) && (txt[:7] == "trailer") { + common.Log.Trace("Found trailer - %s", txt) // Sometimes get "trailer << ...." // Need to rewind to end of trailer text. if len(txt) > 9 { @@ -1001,13 +1008,64 @@ func (this *PdfParser) parseXref() (*PdfObjectDictionary, error) { return nil, err } } else { - common.Log.Debug("ERROR: Invalid xref.... starting with \"%s\"", string(bb)) - return nil, errors.New("Invalid xref format") + common.Log.Debug("Warning: Unable to find xref table or stream. Repair attempted: Looking for earliest xref from bottom.") + err := this.repairSeekXrefMarker() + if err != nil { + common.Log.Debug("Repair failed - %v", err) + return nil, err + } + + trailerDict, err = this.parseXrefTable() + if err != nil { + return nil, err + } } return trailerDict, err } +// Look for EOF marker and seek to its beginning. +// Define an offset position from the end of the file. +func (this *PdfParser) seekToEOFMarker(fSize int64) error { + // Define the starting point (from the end of the file) to search from. + var offset int64 = 0 + + // Define a buffer length in terms of how many bytes to read from the end of the file. + var buflen int64 = 1000 + + for offset < fSize { + if fSize <= (buflen + offset) { + buflen = fSize - offset + } + + // Move back enough (as we need to read forward). + _, err := this.rs.Seek(-offset-buflen, os.SEEK_END) + if err != nil { + return err + } + + // Read the data. + b1 := make([]byte, buflen) + this.rs.Read(b1) + common.Log.Trace("Looking for EOF marker: \"%s\"", string(b1)) + ind := reEOF.FindAllStringIndex(string(b1), -1) + if ind != nil { + // Found it.
+ lastInd := ind[len(ind)-1] + common.Log.Trace("Ind: % d", ind) + this.rs.Seek(-offset-buflen+int64(lastInd[0]), os.SEEK_END) + return nil + } else { + common.Log.Debug("Warning: EOF marker not found! - continue seeking") + } + + offset += buflen + } + + common.Log.Debug("Error: EOF marker was not found.") + return errors.New("EOF not found") +} + // // Load the xrefs from the bottom of file prior to parsing the file. // 1. Look for %%EOF marker, then @@ -1031,39 +1089,29 @@ func (this *PdfParser) loadXrefs() (*PdfObjectDictionary, error) { this.xrefs = make(XrefTable) this.objstms = make(ObjectStreams) - // Look for EOF marker and seek to its beginning. - // Define an offset position from the end of the file. - var offset int64 = 1000 // Get the file size. fSize, err := this.rs.Seek(0, os.SEEK_END) if err != nil { return nil, err } common.Log.Trace("fsize: %d", fSize) - if fSize <= offset { - offset = fSize - } - _, err = this.rs.Seek(-offset, os.SEEK_END) + + // Seek the EOF marker. + err = this.seekToEOFMarker(fSize) if err != nil { + common.Log.Debug("Failed seek to eof marker: %v", err) return nil, err } - b1 := make([]byte, offset) - this.rs.Read(b1) - common.Log.Trace("Looking for EOF marker: \"%s\"", string(b1)) - ind := reEOF.FindAllStringIndex(string(b1), -1) - if ind == nil { - common.Log.Debug("Error: EOF marker not found!") - return nil, errors.New("EOF marker not found") - } - lastInd := ind[len(ind)-1] - common.Log.Trace("Ind: % d", ind) - this.rs.Seek(-offset+int64(lastInd[0]), os.SEEK_END) // Look for startxref and get the xref offset. 
- offset = 64 + var offset int64 = 64 this.rs.Seek(-offset, os.SEEK_CUR) b2 := make([]byte, offset) - this.rs.Read(b2) + _, err = this.rs.Read(b2) + if err != nil { + common.Log.Debug("Failed reading while looking for startxref: %v", err) + return nil, err + } result := reStartXref.FindStringSubmatch(string(b2)) if len(result) < 2 { @@ -1071,7 +1119,6 @@ func (this *PdfParser) loadXrefs() (*PdfObjectDictionary, error) { return nil, errors.New("Startxref not found") } if len(result) > 2 { - // GH: Take the last one? Make a test case. common.Log.Debug("ERROR: Multiple startxref (%s)!", b2) return nil, errors.New("Multiple startxref entries?") } @@ -1133,8 +1180,9 @@ func (this *PdfParser) loadXrefs() (*PdfObjectDictionary, error) { ptrailerDict, err := this.parseXref() if err != nil { - common.Log.Debug("ERROR: Failed loading another (Prev) trailer") - return nil, err + common.Log.Debug("Warning: Error - Failed loading another (Prev) trailer") + common.Log.Debug("Attempting to continue by ignoring it") + break } xx, present = (*ptrailerDict)["Prev"] diff --git a/pdf/core/repairs.go b/pdf/core/repairs.go index 9c3d0674..94be6dcb 100644 --- a/pdf/core/repairs.go +++ b/pdf/core/repairs.go @@ -13,6 +13,10 @@ import ( "os" "regexp" + "bufio" + "io" + "strconv" + "github.com/unidoc/unidoc/common" ) @@ -77,9 +81,22 @@ func (this *PdfParser) rebuildXrefTable() error { return nil } +// Parses and returns the object and generation number from a string such as "12 0 obj" -> (12,0,nil). +func parseObjectNumberFromString(str string) (int, int, error) { + result := reIndirectObject.FindStringSubmatch(str) + if len(result) < 3 { + return 0, 0, errors.New("Unable to detect indirect object signature") + } + + on, _ := strconv.Atoi(result[1]) + gn, _ := strconv.Atoi(result[2]) + + return on, gn, nil +} + // Parse the entire file from top down. -// Currently not supporting object streams... -// Also need to detect object streams and load the object numbers. 
+// Goes through the file byte-by-byte looking for " obj" patterns. +// N.B. This collects the XREF_TABLE_ENTRY data only. func (this *PdfParser) repairRebuildXrefsTopDown() (*XrefTable, error) { if this.repairsAttempted { // Avoid multiple repairs (only try once). @@ -87,60 +104,183 @@ func (this *PdfParser) repairRebuildXrefsTopDown() (*XrefTable, error) { } this.repairsAttempted = true - reRepairIndirectObject := regexp.MustCompile(`^(\d+)\s+(\d+)\s+obj`) + // Go to beginning, reset reader. + this.rs.Seek(0, os.SEEK_SET) + this.reader = bufio.NewReader(this.rs) - this.SetFileOffset(0) + // Keep a running buffer of last bytes. + bufLen := 20 + last := make([]byte, bufLen) xrefTable := XrefTable{} for { - this.skipComments() - - curOffset := this.GetFileOffset() - - peakBuf, err := this.reader.Peek(10) + b, err := this.reader.ReadByte() if err != nil { - // EOF - break + if err == io.EOF { + break + } else { + return nil, err + } } - // Indirect object? - results := reRepairIndirectObject.FindIndex(peakBuf) - if len(results) > 0 { - obj, err := this.ParseIndirectObject() + // Format: + // object number - whitespace - generation number - obj + // e.g. "12 0 obj" + if b == 'j' && last[bufLen-1] == 'b' && last[bufLen-2] == 'o' && IsWhiteSpace(last[bufLen-3]) { + i := bufLen - 4 + // Go past whitespace + for IsWhiteSpace(last[i]) && i > 0 { + i-- + } + if i == 0 || !IsDecimalDigit(last[i]) { + continue + } + // Go past generation number + for IsDecimalDigit(last[i]) && i > 0 { + i-- + } + if i == 0 || !IsWhiteSpace(last[i]) { + continue + } + // Go past whitespace + for IsWhiteSpace(last[i]) && i > 0 { + i-- + } + if i == 0 || !IsDecimalDigit(last[i]) { + continue + } + // Go past object number. + for IsDecimalDigit(last[i]) && i > 0 { + i-- + } + if i == 0 { + continue // Probably too long to be a valid object... 
+ } + + objOffset := this.GetFileOffset() - int64(bufLen-i) + + objstr := append(last[i+1:], b) + objNum, genNum, err := parseObjectNumberFromString(string(objstr)) if err != nil { - common.Log.Debug("ERROR: Unable to parse indirect object (%s)", err) + common.Log.Debug("Unable to parse object number: %v", err) return nil, err } - if indObj, ok := obj.(*PdfIndirectObject); ok { + // Create and insert the XREF entry if not existing, or the generation number is higher. + if curXref, has := xrefTable[objNum]; !has || curXref.generation < genNum { // Make the entry for the cross ref table. xrefEntry := XrefObject{} xrefEntry.xtype = XREF_TABLE_ENTRY - xrefEntry.objectNumber = int(indObj.ObjectNumber) - xrefEntry.generation = int(indObj.GenerationNumber) - xrefEntry.offset = curOffset - xrefTable[int(indObj.ObjectNumber)] = xrefEntry - } else if streamObj, ok := obj.(*PdfObjectStream); ok { - // Make the entry for the cross ref table. - xrefEntry := XrefObject{} - xrefEntry.xtype = XREF_TABLE_ENTRY - xrefEntry.objectNumber = int(streamObj.ObjectNumber) - xrefEntry.generation = int(streamObj.GenerationNumber) - xrefEntry.offset = curOffset - xrefTable[int(streamObj.ObjectNumber)] = xrefEntry - } else { - return nil, fmt.Errorf("Not an indirect object or stream (%T)", obj) // Should never happen. + xrefEntry.objectNumber = int(objNum) + xrefEntry.generation = int(genNum) + xrefEntry.offset = objOffset + xrefTable[objNum] = xrefEntry } - } else if string(peakBuf[0:6]) == "endobj" { - this.reader.Discard(6) - } else { - // Stop once we reach xrefs/trailer section etc. Technically this could fail for complex - // cases, but lets keep it simple for now. Add more complexity when needed (problematic user committed files). - // In general more likely that more complex files would have better understanding of the PDF standard. 
- common.Log.Debug("Not an object - stop repair rebuilding xref here (%s)", peakBuf) - break } + + last = append(last[1:bufLen], b) } return &xrefTable, nil } + +// Look for first sign of xref table from end of file. +func (this *PdfParser) repairSeekXrefMarker() error { + // Get the file size. + fSize, err := this.rs.Seek(0, os.SEEK_END) + if err != nil { + return err + } + + reXrefTableStart := regexp.MustCompile(`\sxref\s*`) + + // Define the starting point (from the end of the file) to search from. + var offset int64 = 0 + + // Define a buffer length in terms of how many bytes to read from the end of the file. + var buflen int64 = 1000 + + for offset < fSize { + if fSize <= (buflen + offset) { + buflen = fSize - offset + } + + // Move back enough (as we need to read forward). + _, err := this.rs.Seek(-offset-buflen, os.SEEK_END) + if err != nil { + return err + } + + // Read the data. + b1 := make([]byte, buflen) + this.rs.Read(b1) + + common.Log.Trace("Looking for xref : \"%s\"", string(b1)) + ind := reXrefTableStart.FindAllStringIndex(string(b1), -1) + if ind != nil { + // Found it. + lastInd := ind[len(ind)-1] + common.Log.Trace("Ind: % d", ind) + this.rs.Seek(-offset-buflen+int64(lastInd[0]), os.SEEK_END) + this.reader = bufio.NewReader(this.rs) + // Go past whitespace, finish at 'x'. + for { + bb, err := this.reader.Peek(1) + if err != nil { + return err + } + common.Log.Trace("B: %d %c", bb[0], bb[0]) + if !IsWhiteSpace(bb[0]) { + break + } + this.reader.Discard(1) + } + + return nil + } else { + common.Log.Debug("Warning: EOF marker not found! - continue seeking") + } + + offset += buflen + } + + common.Log.Debug("Error: Xref table marker was not found.") + return errors.New("xref not found ") +} + +// Called when Pdf version not found normally. Looks for the PDF version by scanning top-down. +// %PDF-1.7 +func (this *PdfParser) seekPdfVersionTopDown() (int, int, error) { + // Go to beginning, reset reader.
+ this.rs.Seek(0, os.SEEK_SET) + this.reader = bufio.NewReader(this.rs) + + // Keep a running buffer of last bytes. + bufLen := 20 + last := make([]byte, bufLen) + + for { + b, err := this.reader.ReadByte() + if err != nil { + if err == io.EOF { + break + } else { + return 0, 0, err + } + } + + // Format: + // "PDF-" - major version digit - '.' - minor version digit (the leading '%' is not checked) + // e.g. "PDF-1.7" + if IsDecimalDigit(b) && last[bufLen-1] == '.' && IsDecimalDigit(last[bufLen-2]) && last[bufLen-3] == '-' && + last[bufLen-4] == 'F' && last[bufLen-5] == 'D' && last[bufLen-6] == 'P' { + major := int(last[bufLen-2] - '0') + minor := int(b - '0') + return major, minor, nil + } + + last = append(last[1:bufLen], b) + } + + return 0, 0, errors.New("Version not found") +} diff --git a/pdf/core/utils.go b/pdf/core/utils.go index 4ec316fb..88bb58d5 100644 --- a/pdf/core/utils.go +++ b/pdf/core/utils.go @@ -143,3 +143,11 @@ func (this *PdfParser) inspect() (map[string]int, error) { return objTypes, nil } + +func absInt(x int) int { + if x < 0 { + return -x + } else { + return x + } +} diff --git a/pdf/model/annotations.go b/pdf/model/annotations.go index ba89a05d..99d47bff 100644 --- a/pdf/model/annotations.go +++ b/pdf/model/annotations.go @@ -680,10 +680,13 @@ func (r *PdfReader) newPdfAnnotationFromIndirectObject(container *PdfIndirectObj subtypeObj, has := (*d)["Subtype"] if !has { - return nil, fmt.Errorf("Missing Subtype") + common.Log.Debug("WARNING: Compatibility issue - annotation Subtype missing - assuming no subtype") + annot.context = nil + return annot, nil } subtype, ok := subtypeObj.(*PdfObjectName) if !ok { + common.Log.Debug("ERROR: Invalid Subtype object type != name (%T)", subtypeObj) return nil, fmt.Errorf("Invalid Subtype object type != name (%T)", subtypeObj) } switch *subtype { @@ -1597,6 +1600,8 @@ func (this *PdfAnnotation) GetContainingPdfObject() PdfObject { return this.primitive } +// Note: Call the sub-annotation's ToPdfObject to set both the generic and
non-generic information. +// TODO/FIXME: Consider doing it here instead. func (this *PdfAnnotation) ToPdfObject() PdfObject { container := this.primitive d := container.PdfObject.(*PdfObjectDictionary) diff --git a/pdf/model/colorspace.go b/pdf/model/colorspace.go index ef1bbb05..f4a6572f 100644 --- a/pdf/model/colorspace.go +++ b/pdf/model/colorspace.go @@ -354,6 +354,7 @@ func (this *PdfColorspaceDeviceRGB) ImageToRGB(img Image) (Image, error) { } func (this *PdfColorspaceDeviceRGB) ImageToGray(img Image) (Image, error) { + grayImage := img samples := img.GetSamples() @@ -374,7 +375,9 @@ func (this *PdfColorspaceDeviceRGB) ImageToGray(img Image) (Image, error) { // Convert to uint32 val := uint32(grayValue * maxVal) + graySamples = append(graySamples, val) + } grayImage.SetSamples(graySamples) grayImage.ColorComponents = 1 @@ -893,7 +896,7 @@ func (this *PdfColorspaceCalRGB) String() string { } func (this *PdfColorspaceCalRGB) GetNumComponents() int { - return 1 + return 3 } func newPdfColorspaceCalRGBFromPdfObject(obj PdfObject) (*PdfColorspaceCalRGB, error) { @@ -1119,7 +1122,7 @@ func (this *PdfColorspaceCalRGB) ImageToRGB(img Image) (Image, error) { maxVal := math.Pow(2, float64(img.BitsPerComponent)) - 1 rgbSamples := []uint32{} - for i := 0; i < len(samples); i++ { + for i := 0; i < len(samples)-2; i++ { // A, B, C in range 0.0 to 1.0 aVal := float64(samples[i]) / maxVal bVal := float64(samples[i+1]) / maxVal @@ -1916,6 +1919,7 @@ func (this *PdfColorspaceSpecialPattern) ColorFromFloats(vals []float64) (PdfCol // the name of the pattern. 
func (this *PdfColorspaceSpecialPattern) ColorFromPdfObjects(objects []PdfObject) (PdfColor, error) { if len(objects) < 1 { + common.Log.Error("ColorFromPdfObjects: len(objects)=%d", len(objects)) return nil, errors.New("Invalid number of parameters") } patternColor := &PdfColorPattern{} diff --git a/pdf/model/forms.go b/pdf/model/forms.go index bbbc4177..c2f5fbda 100644 --- a/pdf/model/forms.go +++ b/pdf/model/forms.go @@ -17,12 +17,12 @@ import ( // type PdfAcroForm struct { Fields *[]*PdfField - NeedAppearances PdfObject - SigFlags PdfObject - CO PdfObject - DR PdfObject - DA PdfObject - Q PdfObject + NeedAppearances *PdfObjectBool + SigFlags *PdfObjectInteger + CO *PdfObjectArray + DR *PdfPageResources + DA *PdfObjectString + Q *PdfObjectInteger XFA PdfObject primitive *PdfIndirectObject @@ -78,23 +78,66 @@ func (r *PdfReader) newPdfAcroFormFromDict(d *PdfObjectDictionary) (*PdfAcroForm } if obj, has := (*d)["NeedAppearances"]; has { - acroForm.NeedAppearances = obj + val, ok := obj.(*PdfObjectBool) + if ok { + acroForm.NeedAppearances = val + } else { + common.Log.Debug("ERROR: NeedAppearances invalid (got %T)", obj) + } } + if obj, has := (*d)["SigFlags"]; has { - acroForm.SigFlags = obj + val, ok := obj.(*PdfObjectInteger) + if ok { + acroForm.SigFlags = val + } else { + common.Log.Debug("ERROR: SigFlags invalid (got %T)", obj) + } } + if obj, has := (*d)["CO"]; has { - acroForm.CO = obj + obj = TraceToDirectObject(obj) + arr, ok := obj.(*PdfObjectArray) + if ok { + acroForm.CO = arr + } else { + common.Log.Debug("ERROR: CO invalid (got %T)", obj) + } } + if obj, has := (*d)["DR"]; has { - acroForm.DR = obj + obj = TraceToDirectObject(obj) + if d, ok := obj.(*PdfObjectDictionary); ok { + resources, err := NewPdfPageResourcesFromDict(d) + if err != nil { + common.Log.Error("Invalid DR: %v", err) + return nil, err + } + + acroForm.DR = resources + } else { + common.Log.Debug("ERROR: DR invalid (got %T)", obj) + } } + if obj, has := (*d)["DA"]; has { - 
acroForm.DA = obj + str, ok := obj.(*PdfObjectString) + if ok { + acroForm.DA = str + } else { + common.Log.Debug("ERROR: DA invalid (got %T)", obj) + } } + if obj, has := (*d)["Q"]; has { - acroForm.Q = obj + val, ok := obj.(*PdfObjectInteger) + if ok { + acroForm.Q = val + } else { + common.Log.Debug("ERROR: Q invalid (got %T)", obj) + } } + if obj, has := (*d)["XFA"]; has { acroForm.XFA = obj } @@ -128,7 +171,7 @@ func (this *PdfAcroForm) ToPdfObject() PdfObject { (*dict)["CO"] = this.CO } if this.DR != nil { - (*dict)["DR"] = this.DR + (*dict)["DR"] = this.DR.ToPdfObject() } if this.DA != nil { (*dict)["DA"] = this.DA diff --git a/pdf/model/page.go b/pdf/model/page.go index 0f1264ac..c327e27c 100644 --- a/pdf/model/page.go +++ b/pdf/model/page.go @@ -5,7 +5,7 @@ // // Allow higher level manipulation of PDF files and pages. -// This can be continously expanded to support more and more features. +// This can be continuously expanded to support more and more features. // Generic handling can be done by defining elements as PdfObject which // can later be replaced and fully defined. // @@ -17,29 +17,27 @@ import ( "fmt" "strings" - "github.com/unidoc/unidoc/common" . "github.com/unidoc/unidoc/pdf/core" ) // PDF page object (7.7.3.3 - Table 30). 
type PdfPage struct { - Parent PdfObject - LastModified *PdfDate - Resources *PdfPageResources - CropBox *PdfRectangle - MediaBox *PdfRectangle - BleedBox *PdfRectangle - TrimBox *PdfRectangle - ArtBox *PdfRectangle - BoxColorInfo PdfObject - Contents PdfObject - Rotate *int64 - Group PdfObject - Thumb PdfObject - B PdfObject - Dur PdfObject - Trans PdfObject - //Annots PdfObject + Parent PdfObject + LastModified *PdfDate + Resources *PdfPageResources + CropBox *PdfRectangle + MediaBox *PdfRectangle + BleedBox *PdfRectangle + TrimBox *PdfRectangle + ArtBox *PdfRectangle + BoxColorInfo PdfObject + Contents PdfObject + Rotate *int64 + Group PdfObject + Thumb PdfObject + B PdfObject + Dur PdfObject + Trans PdfObject AA PdfObject Metadata PdfObject PieceInfo PdfObject @@ -52,7 +50,7 @@ type PdfPage struct { PresSteps PdfObject UserUnit PdfObject VP PdfObject - //Annotations + Annotations []*PdfAnnotation // Primitive container. @@ -477,7 +475,12 @@ func (this *PdfPage) GetPageDict() *PdfObjectDictionary { if this.Annotations != nil { arr := PdfObjectArray{} for _, annot := range this.Annotations { - arr = append(arr, annot.GetContext().ToPdfObject()) + if subannot := annot.GetContext(); subannot != nil { + arr = append(arr, subannot.ToPdfObject()) + } else { + // Generic annotation dict (without subtype). + arr = append(arr, annot.ToPdfObject()) + } } p.Set("Annots", &arr) } @@ -760,6 +763,7 @@ func (this *PdfPage) GetAllContentStreams() (string, error) { return strings.Join(cstreams, " "), nil } +// Needs to have matching name and colorspace map entry. The Names define the order. 
type PdfPageResourcesColorspaces struct { Names []string Colorspaces map[string]PdfColorspace @@ -767,19 +771,7 @@ type PdfPageResourcesColorspaces struct { container *PdfIndirectObject } -func NewPdfPageResourcesColorspaces() *PdfPageResourcesColorspaces { - return &PdfPageResourcesColorspaces{ - Names: []string{}, - Colorspaces: map[string]PdfColorspace{}, - } -} -func (this *PdfPageResourcesColorspaces) Add(name string, colorspace PdfColorspace) { - if _, ok := this.Colorspaces[name]; ok { - common.Log.Error("Colorspace name=%#q already exists", name) - } - this.Names = append(this.Names, name) - this.Colorspaces[name] = colorspace -} + func newPdfPageResourcesColorspacesFromPdfObject(obj PdfObject) (*PdfPageResourcesColorspaces, error) { colorspaces := &PdfPageResourcesColorspaces{} @@ -821,305 +813,3 @@ func (this *PdfPageResourcesColorspaces) ToPdfObject() PdfObject { return dict } - -// Page resources model. -// Implements PdfModel. -type PdfPageResources struct { - ExtGState PdfObject - //ColorSpace PdfObject - ColorSpace *PdfPageResourcesColorspaces - Pattern PdfObject - Shading PdfObject - XObject PdfObject - Font PdfObject - ProcSet PdfObject - Properties PdfObject - // Primitive. 
- primitive *PdfObjectDictionary -} - -func NewPdfPageResources() *PdfPageResources { - r := &PdfPageResources{} - r.primitive = &PdfObjectDictionary{} - return r -} - -func NewPdfPageResourcesFromDict(dict *PdfObjectDictionary) (*PdfPageResources, error) { - r := NewPdfPageResources() - - if obj, isDefined := (*dict)["ExtGState"]; isDefined { - r.ExtGState = obj - } - if obj, isDefined := (*dict)["ColorSpace"]; isDefined { - //r.ColorSpace = obj - - colorspaces, err := newPdfPageResourcesColorspacesFromPdfObject(obj) - if err != nil { - return nil, err - } - r.ColorSpace = colorspaces - } - if obj, isDefined := (*dict)["Pattern"]; isDefined { - r.Pattern = obj - } - if obj, isDefined := (*dict)["Shading"]; isDefined { - r.Shading = obj - } - if obj, isDefined := (*dict)["XObject"]; isDefined { - r.XObject = obj - } - if obj, isDefined := (*dict)["Font"]; isDefined { - r.Font = obj - } - if obj, isDefined := (*dict)["ProcSet"]; isDefined { - r.ProcSet = obj - } - if obj, isDefined := (*dict)["Properties"]; isDefined { - r.Properties = obj - } - - return r, nil -} - -func (r *PdfPageResources) GetContainingPdfObject() PdfObject { - return r.primitive -} - -func (r *PdfPageResources) ToPdfObject() PdfObject { - d := r.primitive - d.SetIfNotNil("ExtGState", r.ExtGState) - if r.ColorSpace != nil { - d.SetIfNotNil("ColorSpace", r.ColorSpace.ToPdfObject()) - } - d.SetIfNotNil("Pattern", r.Pattern) - d.SetIfNotNil("Shading", r.Shading) - d.SetIfNotNil("XObject", r.XObject) - d.SetIfNotNil("Font", r.Font) - d.SetIfNotNil("ProcSet", r.ProcSet) - d.SetIfNotNil("Properties", r.Properties) - - return d -} - -// Add External Graphics State (GState). The gsDict can be specified either directly as a dictionary or an indirect -// object containing a dictionary. 
-func (r *PdfPageResources) AddExtGState(gsName PdfObjectName, gsDict PdfObject) error { - if r.ExtGState == nil { - r.ExtGState = &PdfObjectDictionary{} - } - - obj := r.ExtGState - dict, ok := TraceToDirectObject(obj).(*PdfObjectDictionary) - if !ok { - common.Log.Debug("ExtGState type error (got %T/%T)", obj, TraceToDirectObject(obj)) - return ErrTypeError - } - - (*dict)[gsName] = gsDict - return nil -} - -// Get the shading specified by keyName. Returns nil if not existing. The bool flag indicated whether it was found -// or not. -func (r *PdfPageResources) GetShadingByName(keyName string) (*PdfShading, bool) { - if r.Shading == nil { - return nil, false - } - - shadingDict, ok := r.Shading.(*PdfObjectDictionary) - if !ok { - common.Log.Debug("ERROR: Invalid Shading entry - not a dict (got %T)", r.Shading) - return nil, false - } - - if obj, has := (*shadingDict)[PdfObjectName(keyName)]; has { - shading, err := newPdfShadingFromPdfObject(obj) - if err != nil { - common.Log.Debug("ERROR: failed to load pdf shading: %v", err) - return nil, false - } - return shading, true - } else { - return nil, false - } -} - -// Set a shading resource specified by keyName. -func (r *PdfPageResources) SetShadingByName(keyName string, shadingObj PdfObject) error { - if r.Shading == nil { - r.Shading = &PdfObjectDictionary{} - } - - shadingDict, has := r.Shading.(*PdfObjectDictionary) - if !has { - return ErrTypeError - } - - (*shadingDict)[PdfObjectName(keyName)] = shadingObj - return nil -} - -// Get the pattern specified by keyName. Returns nil if not existing. The bool flag indicated whether it was found -// or not. 
-func (r *PdfPageResources) GetPatternByName(keyName string) (*PdfPattern, bool) { - if r.Pattern == nil { - return nil, false - } - - patternDict, ok := r.Pattern.(*PdfObjectDictionary) - if !ok { - common.Log.Debug("ERROR: Invalid Pattern entry - not a dict (got %T)", r.Pattern) - return nil, false - } - - if obj, has := (*patternDict)[PdfObjectName(keyName)]; has { - pattern, err := newPdfPatternFromPdfObject(obj) - if err != nil { - common.Log.Debug("ERROR: failed to load pdf pattern: %v", err) - return nil, false - } - - return pattern, true - } else { - return nil, false - } -} - -// Set a pattern resource specified by keyName. -func (r *PdfPageResources) SetPatternByName(keyName string, pattern PdfObject) error { - if r.Pattern == nil { - r.Pattern = &PdfObjectDictionary{} - } - - patternDict, has := r.Pattern.(*PdfObjectDictionary) - if !has { - return ErrTypeError - } - - (*patternDict)[PdfObjectName(keyName)] = pattern - return nil -} - -// Check if an XObject with a specified keyName is defined. -func (r *PdfPageResources) HasXObjectByName(keyName string) bool { - obj, _ := r.GetXObjectByName(keyName) - if obj != nil { - return true - } else { - return false - } -} - -type XObjectType int - -const ( - XObjectTypeUndefined XObjectType = iota - XObjectTypeImage XObjectType = iota - XObjectTypeForm XObjectType = iota - XObjectTypePS XObjectType = iota - XObjectTypeUnknown XObjectType = iota -) - -// Returns the XObject with the specified keyName and the object type. -func (r *PdfPageResources) GetXObjectByName(keyName string) (*PdfObjectStream, XObjectType) { - if r.XObject == nil { - return nil, XObjectTypeUndefined - } - - xresDict, has := TraceToDirectObject(r.XObject).(*PdfObjectDictionary) - - if !has { - common.Log.Debug("ERROR: XObject not a dictionary! 
(got %T)", TraceToDirectObject(r.XObject)) - return nil, XObjectTypeUndefined - } - - if obj, has := (*xresDict)[PdfObjectName(keyName)]; has { - stream, ok := obj.(*PdfObjectStream) - if !ok { - common.Log.Debug("XObject not pointing to a stream %T", obj) - return nil, XObjectTypeUndefined - } - dict := stream.PdfObjectDictionary - - name, ok := (*dict)["Subtype"].(*PdfObjectName) - if !ok { - common.Log.Debug("XObject Subtype not a Name, dict: %s", dict.String()) - return nil, XObjectTypeUndefined - } - - if *name == "Image" { - return stream, XObjectTypeImage - } else if *name == "Form" { - return stream, XObjectTypeForm - } else if *name == "PS" { - return stream, XObjectTypePS - } else { - common.Log.Debug("XObject Subtype not known (%s)", *name) - return nil, XObjectTypeUndefined - } - } else { - return nil, XObjectTypeUndefined - } -} - -func (r *PdfPageResources) setXObjectByName(keyName string, stream *PdfObjectStream) error { - if r.XObject == nil { - r.XObject = &PdfObjectDictionary{} - } - - obj := TraceToDirectObject(r.XObject) - xresDict, has := obj.(*PdfObjectDictionary) - if !has { - common.Log.Debug("Invalid XObject, got %T/%T", r.XObject, obj) - return errors.New("Type check error") - } - - (*xresDict)[PdfObjectName(keyName)] = stream - return nil -} - -func (r *PdfPageResources) GetXObjectImageByName(keyName string) (*XObjectImage, error) { - stream, xtype := r.GetXObjectByName(keyName) - if stream == nil { - return nil, nil - } - if xtype != XObjectTypeImage { - return nil, errors.New("Not an image") - } - - ximg, err := NewXObjectImageFromStream(stream) - if err != nil { - return nil, err - } - - return ximg, nil -} - -func (r *PdfPageResources) SetXObjectImageByName(keyName string, ximg *XObjectImage) error { - stream := ximg.ToPdfObject().(*PdfObjectStream) - err := r.setXObjectByName(keyName, stream) - return err -} - -func (r *PdfPageResources) GetXObjectFormByName(keyName string) (*XObjectForm, error) { - stream, xtype := 
r.GetXObjectByName(keyName) - if stream == nil { - return nil, nil - } - if xtype != XObjectTypeForm { - return nil, errors.New("Not a form") - } - - xform, err := NewXObjectFormFromStream(stream) - if err != nil { - return nil, err - } - - return xform, nil -} - -func (r *PdfPageResources) SetXObjectFormByName(keyName string, xform *XObjectForm) error { - stream := xform.ToPdfObject().(*PdfObjectStream) - err := r.setXObjectByName(keyName, stream) - return err -} diff --git a/pdf/model/reader.go b/pdf/model/reader.go index bfb12d4a..352c9bf3 100644 --- a/pdf/model/reader.go +++ b/pdf/model/reader.go @@ -24,7 +24,6 @@ type PdfReader struct { pageCount int catalog *PdfObjectDictionary outlineTree *PdfOutlineTreeNode - forms *PdfObjectDictionary AcroForm *PdfAcroForm modelManager *ModelManager @@ -199,13 +198,7 @@ func (this *PdfReader) loadStructure() error { return err } - // Get forms. - this.forms, err = this.GetForms() - if err != nil { - return err - } - - // Get fields + // Load interactive forms and fields. this.AcroForm, err = this.loadForms() if err != nil { return err @@ -437,48 +430,6 @@ func (this *PdfReader) GetOutlinesFlattened() ([]*PdfOutlineTreeNode, []string, return outlineNodeList, flattenedTitleList, nil } -// Get document form data. -func (this *PdfReader) GetForms() (*PdfObjectDictionary, error) { - if this.parser.GetCrypter() != nil && !this.parser.IsAuthenticated() { - return nil, fmt.Errorf("File need to be decrypted first") - } - // Has forms? 
- catalog := this.catalog - - var formsDict *PdfObjectDictionary - - if dict, hasFormsDict := (*catalog)["AcroForm"].(*PdfObjectDictionary); hasFormsDict { - common.Log.Trace("Has Acro forms - dictionary under Catalog") - formsDict = dict - } else if formsRef, hasFormsRef := (*catalog)["AcroForm"].(*PdfObjectReference); hasFormsRef { - common.Log.Trace("Has Acro forms - Indirect object") - formsObj, err := this.parser.LookupByReference(*formsRef) - if err != nil { - common.Log.Debug("ERROR: Failed to read forms") - return nil, err - } - if iobj, ok := formsObj.(*PdfIndirectObject); ok { - if dict, ok := iobj.PdfObject.(*PdfObjectDictionary); ok { - formsDict = dict - } - } - } - if formsDict == nil { - common.Log.Trace("Does not have forms") - return nil, nil - } - - common.Log.Trace("Has Acro forms") - common.Log.Trace("Traverse the Acroforms structure") - err := this.traverseObjectData(formsDict) - if err != nil { - common.Log.Debug("ERROR: Unable to traverse AcroForms (%s)", err) - return nil, err - } - - return formsDict, nil -} - func (this *PdfReader) loadForms() (*PdfAcroForm, error) { if this.parser.GetCrypter() != nil && !this.parser.IsAuthenticated() { return nil, fmt.Errorf("File need to be decrypted first") @@ -511,6 +462,15 @@ func (this *PdfReader) loadForms() (*PdfAcroForm, error) { common.Log.Trace("Has Acro forms") // Load it. + // Ensure we have access to everything. + common.Log.Trace("Traverse the Acroforms structure") + err = this.traverseObjectData(formsDict) + if err != nil { + common.Log.Debug("ERROR: Unable to traverse AcroForms (%s)", err) + return nil, err + } + + // Create the acro forms object. 
acroForm, err := this.newPdfAcroFormFromDict(formsDict) if err != nil { return nil, err diff --git a/pdf/model/resources.go b/pdf/model/resources.go new file mode 100644 index 00000000..cc479f53 --- /dev/null +++ b/pdf/model/resources.go @@ -0,0 +1,312 @@ +/* + * This file is subject to the terms and conditions defined in + * file 'LICENSE.md', which is part of this source code package. + */ + +package model + +import ( + "errors" + + "github.com/unidoc/unidoc/common" + . "github.com/unidoc/unidoc/pdf/core" +) + +// Page resources model. +// Implements PdfModel. +type PdfPageResources struct { + ExtGState PdfObject + //ColorSpace PdfObject + ColorSpace *PdfPageResourcesColorspaces + Pattern PdfObject + Shading PdfObject + XObject PdfObject + Font PdfObject + ProcSet PdfObject + Properties PdfObject + // Primitive reource container. + primitive *PdfObjectDictionary +} + +func NewPdfPageResources() *PdfPageResources { + r := &PdfPageResources{} + r.primitive = &PdfObjectDictionary{} + return r +} + +func NewPdfPageResourcesFromDict(dict *PdfObjectDictionary) (*PdfPageResources, error) { + r := NewPdfPageResources() + + if obj, isDefined := (*dict)["ExtGState"]; isDefined { + r.ExtGState = obj + } + if obj, isDefined := (*dict)["ColorSpace"]; isDefined && !isNullObject(obj) { + colorspaces, err := newPdfPageResourcesColorspacesFromPdfObject(obj) + if err != nil { + return nil, err + } + r.ColorSpace = colorspaces + } + if obj, isDefined := (*dict)["Pattern"]; isDefined { + r.Pattern = obj + } + if obj, isDefined := (*dict)["Shading"]; isDefined { + r.Shading = obj + } + if obj, isDefined := (*dict)["XObject"]; isDefined { + r.XObject = obj + } + if obj, isDefined := (*dict)["Font"]; isDefined { + r.Font = obj + } + if obj, isDefined := (*dict)["ProcSet"]; isDefined { + r.ProcSet = obj + } + if obj, isDefined := (*dict)["Properties"]; isDefined { + r.Properties = obj + } + + return r, nil +} + +func (r *PdfPageResources) GetContainingPdfObject() PdfObject { + return 
r.primitive +} + +func (r *PdfPageResources) ToPdfObject() PdfObject { + d := r.primitive + d.SetIfNotNil("ExtGState", r.ExtGState) + if r.ColorSpace != nil { + d.SetIfNotNil("ColorSpace", r.ColorSpace.ToPdfObject()) + } + d.SetIfNotNil("Pattern", r.Pattern) + d.SetIfNotNil("Shading", r.Shading) + d.SetIfNotNil("XObject", r.XObject) + d.SetIfNotNil("Font", r.Font) + d.SetIfNotNil("ProcSet", r.ProcSet) + d.SetIfNotNil("Properties", r.Properties) + + return d +} + +// Add External Graphics State (GState). The gsDict can be specified either directly as a dictionary or an indirect +// object containing a dictionary. +func (r *PdfPageResources) AddExtGState(gsName PdfObjectName, gsDict PdfObject) error { + if r.ExtGState == nil { + r.ExtGState = &PdfObjectDictionary{} + } + + obj := r.ExtGState + dict, ok := TraceToDirectObject(obj).(*PdfObjectDictionary) + if !ok { + common.Log.Debug("ExtGState type error (got %T/%T)", obj, TraceToDirectObject(obj)) + return ErrTypeError + } + + (*dict)[gsName] = gsDict + return nil +} + +// Get the shading specified by keyName. Returns nil if not existing. The bool flag indicated whether it was found +// or not. +func (r *PdfPageResources) GetShadingByName(keyName string) (*PdfShading, bool) { + if r.Shading == nil { + return nil, false + } + + shadingDict, ok := r.Shading.(*PdfObjectDictionary) + if !ok { + common.Log.Debug("ERROR: Invalid Shading entry - not a dict (got %T)", r.Shading) + return nil, false + } + + if obj, has := (*shadingDict)[PdfObjectName(keyName)]; has { + shading, err := newPdfShadingFromPdfObject(obj) + if err != nil { + common.Log.Debug("ERROR: failed to load pdf shading: %v", err) + return nil, false + } + return shading, true + } else { + return nil, false + } +} + +// Set a shading resource specified by keyName. 
+func (r *PdfPageResources) SetShadingByName(keyName string, shadingObj PdfObject) error { + if r.Shading == nil { + r.Shading = &PdfObjectDictionary{} + } + + shadingDict, has := r.Shading.(*PdfObjectDictionary) + if !has { + return ErrTypeError + } + + (*shadingDict)[PdfObjectName(keyName)] = shadingObj + return nil +} + +// Get the pattern specified by keyName. Returns nil if not existing. The bool flag indicated whether it was found +// or not. +func (r *PdfPageResources) GetPatternByName(keyName string) (*PdfPattern, bool) { + if r.Pattern == nil { + return nil, false + } + + patternDict, ok := r.Pattern.(*PdfObjectDictionary) + if !ok { + common.Log.Debug("ERROR: Invalid Pattern entry - not a dict (got %T)", r.Pattern) + return nil, false + } + + if obj, has := (*patternDict)[PdfObjectName(keyName)]; has { + pattern, err := newPdfPatternFromPdfObject(obj) + if err != nil { + common.Log.Debug("ERROR: failed to load pdf pattern: %v", err) + return nil, false + } + + return pattern, true + } else { + return nil, false + } +} + +// Set a pattern resource specified by keyName. +func (r *PdfPageResources) SetPatternByName(keyName string, pattern PdfObject) error { + if r.Pattern == nil { + r.Pattern = &PdfObjectDictionary{} + } + + patternDict, has := r.Pattern.(*PdfObjectDictionary) + if !has { + return ErrTypeError + } + + (*patternDict)[PdfObjectName(keyName)] = pattern + return nil +} + +// Check if an XObject with a specified keyName is defined. +func (r *PdfPageResources) HasXObjectByName(keyName string) bool { + obj, _ := r.GetXObjectByName(keyName) + if obj != nil { + return true + } else { + return false + } +} + +type XObjectType int + +const ( + XObjectTypeUndefined XObjectType = iota + XObjectTypeImage XObjectType = iota + XObjectTypeForm XObjectType = iota + XObjectTypePS XObjectType = iota + XObjectTypeUnknown XObjectType = iota +) + +// Returns the XObject with the specified keyName and the object type. 
+func (r *PdfPageResources) GetXObjectByName(keyName string) (*PdfObjectStream, XObjectType) { + if r.XObject == nil { + return nil, XObjectTypeUndefined + } + + xresDict, has := TraceToDirectObject(r.XObject).(*PdfObjectDictionary) + if !has { + common.Log.Debug("ERROR: XObject not a dictionary! (got %T)", TraceToDirectObject(r.XObject)) + return nil, XObjectTypeUndefined + } + + if obj, has := (*xresDict)[PdfObjectName(keyName)]; has { + stream, ok := obj.(*PdfObjectStream) + if !ok { + common.Log.Debug("XObject not pointing to a stream %T", obj) + return nil, XObjectTypeUndefined + } + dict := stream.PdfObjectDictionary + + name, ok := (*dict)["Subtype"].(*PdfObjectName) + if !ok { + common.Log.Debug("XObject Subtype not a Name, dict: %s", dict.String()) + return nil, XObjectTypeUndefined + } + + if *name == "Image" { + return stream, XObjectTypeImage + } else if *name == "Form" { + return stream, XObjectTypeForm + } else if *name == "PS" { + return stream, XObjectTypePS + } else { + common.Log.Debug("XObject Subtype not known (%s)", *name) + return nil, XObjectTypeUndefined + } + } else { + return nil, XObjectTypeUndefined + } +} + +func (r *PdfPageResources) setXObjectByName(keyName string, stream *PdfObjectStream) error { + if r.XObject == nil { + r.XObject = &PdfObjectDictionary{} + } + + obj := TraceToDirectObject(r.XObject) + xresDict, has := obj.(*PdfObjectDictionary) + if !has { + common.Log.Debug("Invalid XObject, got %T/%T", r.XObject, obj) + return errors.New("Type check error") + } + + (*xresDict)[PdfObjectName(keyName)] = stream + return nil +} + +func (r *PdfPageResources) GetXObjectImageByName(keyName string) (*XObjectImage, error) { + stream, xtype := r.GetXObjectByName(keyName) + if stream == nil { + return nil, nil + } + if xtype != XObjectTypeImage { + return nil, errors.New("Not an image") + } + + ximg, err := NewXObjectImageFromStream(stream) + if err != nil { + return nil, err + } + + return ximg, nil +} + +func (r *PdfPageResources) 
SetXObjectImageByName(keyName string, ximg *XObjectImage) error { + stream := ximg.ToPdfObject().(*PdfObjectStream) + err := r.setXObjectByName(keyName, stream) + return err +} + +func (r *PdfPageResources) GetXObjectFormByName(keyName string) (*XObjectForm, error) { + stream, xtype := r.GetXObjectByName(keyName) + if stream == nil { + return nil, nil + } + if xtype != XObjectTypeForm { + return nil, errors.New("Not a form") + } + + xform, err := NewXObjectFormFromStream(stream) + if err != nil { + return nil, err + } + + return xform, nil +} + +func (r *PdfPageResources) SetXObjectFormByName(keyName string, xform *XObjectForm) error { + stream := xform.ToPdfObject().(*PdfObjectStream) + err := r.setXObjectByName(keyName, stream) + return err +} diff --git a/pdf/model/shading.go b/pdf/model/shading.go index 53478bca..a95f9218 100644 --- a/pdf/model/shading.go +++ b/pdf/model/shading.go @@ -420,8 +420,8 @@ func newPdfShadingType3FromDictionary(dict *PdfObjectDictionary) (*PdfShadingTyp common.Log.Debug("Coords not an array (got %T)", obj) return nil, ErrTypeError } - if len(*arr) != 4 { - common.Log.Debug("Coords length not 4 (got %d)", len(*arr)) + if len(*arr) != 6 { + common.Log.Debug("Coords length not 6 (got %d)", len(*arr)) return nil, ErrInvalidAttribute } shading.Coords = arr diff --git a/pdf/model/utils.go b/pdf/model/utils.go index 1678f95d..14e3650b 100644 --- a/pdf/model/utils.go +++ b/pdf/model/utils.go @@ -28,6 +28,14 @@ func getNumberAsFloat(obj PdfObject) (float64, error) { return 0, errors.New("Not a number") } +func isNullObject(obj PdfObject) bool { + if _, isNull := obj.(*PdfObjectNull); isNull { + return true + } else { + return false + } +} + // Convert a list of pdf objects representing floats or integers to a slice of float64 values. 
func getNumbersAsFloat(objects []PdfObject) ([]float64, error) { floats := []float64{} @@ -73,3 +81,20 @@ func getNumberAsFloatOrNull(obj PdfObject) (*float64, error) { return nil, errors.New("Not a number") } + +// Handy function for debugging in development. +func debugObject(obj PdfObject) { + common.Log.Debug("obj: %T %s", obj, obj.String()) + + if stream, is := obj.(*PdfObjectStream); is { + decoded, err := DecodeStream(stream) + if err != nil { + common.Log.Debug("Error: %v", err) + return + } + common.Log.Debug("Decoded: %s", decoded) + } else if indObj, is := obj.(*PdfIndirectObject); is { + common.Log.Debug("%T %v", indObj.PdfObject, indObj.PdfObject) + common.Log.Debug("%s", indObj.PdfObject.String()) + } +} diff --git a/pdf/model/writer.go b/pdf/model/writer.go index dec118fb..14f203d1 100644 --- a/pdf/model/writer.go +++ b/pdf/model/writer.go @@ -224,7 +224,7 @@ func (this *PdfWriter) addObjects(obj PdfObject) error { } if hasObj := this.hasObject(v); !hasObj { - common.Log.Debug("Parent obj is missing!! %T %p %v\n", v, v, v) + common.Log.Debug("Parent obj is missing!! %T %p %v", v, v, v) this.pendingObjects[v] = dict // Although it is missing at this point, it could be added later... } @@ -400,84 +400,8 @@ func (this *PdfWriter) seekByName(obj PdfObject, followKeys []string, key string return list, nil } -// Add Acroforms to a PDF file. -func (this *PdfWriter) AddForms(forms *PdfObjectDictionary) error { - // Traverse the forms object... - // Keep a list of stuff? - - // Forms dictionary should have: - // Fields array. 
- if forms == nil { - return errors.New("forms == nil") - } - - // For now, support only regular forms with fields - var fieldsArray *PdfObjectArray - if fields, hasFields := (*forms)["Fields"]; hasFields { - if arr, isArray := fields.(*PdfObjectArray); isArray { - fieldsArray = arr - } else if ind, isInd := fields.(*PdfIndirectObject); isInd { - if arr, isArray := ind.PdfObject.(*PdfObjectArray); isArray { - fieldsArray = arr - } - } - } - if fieldsArray == nil { - common.Log.Debug("Writer - no fields to be added to forms") - return nil - } - - // Add the fields. - for _, field := range *fieldsArray { - fieldObj, ok := field.(*PdfIndirectObject) - if !ok { - return errors.New("Field not pointing indirect object") - } - - followKeys := []string{"Fields", "Kids"} - list, err := this.seekByName(fieldObj, followKeys, "P") - common.Log.Trace("Done seeking!") - if err != nil { - return err - } - common.Log.Trace("List of P objects %d", len(list)) - if len(list) < 1 { - continue - } - - includeField := false - for _, p := range list { - if po, ok := p.(*PdfIndirectObject); ok { - common.Log.Trace("P entry is an indirect object (page)") - if this.hasObject(po) { - includeField = true - } else { - return errors.New("P pointing outside of write pages") - } - } else { - common.Log.Debug("ERROR: P entry not an indirect object (%T)", p) - } - } - - // This won't work. There can be many sub objects. - // Need to specifically go and check the page object! - // P or the appearance dictionary. - if includeField { - common.Log.Trace("Add the field! (%T)", field) - // Add if nothing referenced outside of the writer. - // Probably need to add some objects first... - this.addObject(field) - this.fields = append(this.fields, field) - } else { - common.Log.Trace("Field not relevant!") - } - } - return nil -} - -// Add Acroforms to a PDF file. -func (this *PdfWriter) AddForms2(form *PdfAcroForm) error { - //form.ToPdfObject(true) +// Add Acroforms to a PDF file. 
Sets the specified form for writing. +func (this *PdfWriter) SetForms(form *PdfAcroForm) error { this.acroForm = form return nil } @@ -613,24 +537,8 @@ func (this *PdfWriter) Write(ws io.WriteSeeker) error { return err } } + // Form fields. - /* - if len(this.fields) > 0 { - forms := PdfIndirectObject{} - formsDict := PdfObjectDictionary{} - forms.PdfObject = &formsDict - fieldsArray := PdfObjectArray{} - for _, field := range this.fields { - fieldsArray = append(fieldsArray, field) - } - formsDict[PdfObjectName("Fields")] = &fieldsArray - (*this.catalog)[PdfObjectName("AcroForm")] = &forms - err := this.addObjects(&forms) - if err != nil { - return err - } - }*/ - // Acroform. if this.acroForm != nil { common.Log.Trace("Writing acro forms") indObj := this.acroForm.ToPdfObject() diff --git a/pdf/model/xobject.go b/pdf/model/xobject.go index 44b40b63..90814aa5 100644 --- a/pdf/model/xobject.go +++ b/pdf/model/xobject.go @@ -242,13 +242,30 @@ func NewXObjectImage() *XObjectImage { return xobj } -// Creates a new XObject Image from an image object with default -// options. -func NewXObjectImageFromImage(name PdfObjectName, img *Image, cs PdfColorspace) (*XObjectImage, error) { - xobj := NewXObjectImage() +// Creates a new XObject Image from an image object with default options. +// If encoder is nil, uses raw encoding (none). 
+func NewXObjectImageFromImage(name PdfObjectName, img *Image, cs PdfColorspace, encoder StreamEncoder) (*XObjectImage, error) { + baseXObj := NewXObjectImage() + return UpdateXObjectImageFromImage(baseXObj, name, img, cs, encoder) +} + +func UpdateXObjectImageFromImage(baseXObj *XObjectImage, name PdfObjectName, img *Image, cs PdfColorspace, encoder StreamEncoder) (*XObjectImage, error) { + dupObj := *baseXObj + xobj := &dupObj + + if encoder == nil { + encoder = NewRawEncoder() + } + + encoded, err := encoder.EncodeBytes(img.Data) + if err != nil { + common.Log.Debug("Error with encoding: %v", err) + return nil, err + } xobj.Name = &name - xobj.Stream = img.Data + xobj.Filter = encoder + xobj.Stream = encoded // Width and height. imWidth := img.Width @@ -416,6 +433,24 @@ func (ximg *XObjectImage) SetImage(img *Image, cs PdfColorspace) error { return nil } +// Set compression filter. Decodes with current filter sets and encodes the data with the new filter. +func (ximg *XObjectImage) SetFilter(encoder StreamEncoder) error { + encoded := ximg.Stream + decoded, err := ximg.Filter.DecodeBytes(encoded) + if err != nil { + return err + } + + ximg.Filter = encoder + encoded, err = encoder.EncodeBytes(decoded) + if err != nil { + return err + } + + ximg.Stream = encoded + return nil +} + // Compress with default settings, updating the underlying stream also. // XXX/TODO: Add flate encoding as an option (although lossy). Need to be able // to set default settings and override.