diff --git a/pdf/core/encoding.go b/pdf/core/encoding.go
index 01a21b41..fecdd586 100644
--- a/pdf/core/encoding.go
+++ b/pdf/core/encoding.go
@@ -15,6 +15,7 @@ import (
 	"bytes"
 	"compress/zlib"
 	"encoding/hex"
+	"errors"
 	"fmt"
 	"io"
 
@@ -29,6 +30,7 @@ const (
 	StreamEncodingFilterNameFlate    = "FlateDecode"
 	StreamEncodingFilterNameLZW      = "LZWDecode"
 	StreamEncodingFilterNameASCIIHex = "ASCIIHexDecode"
+	StreamEncodingFilterNameASCII85  = "ASCII85Decode"
 )
 
 type StreamEncoder interface {
@@ -67,7 +69,7 @@ func NewFlateEncoder() *FlateEncoder {
 }
 
 func (this *FlateEncoder) GetFilterName() string {
-	return "FlateDecode"
+	return StreamEncodingFilterNameFlate
 }
 
 func (this *FlateEncoder) MakeDecodeParams() PdfObject {
@@ -351,7 +353,7 @@ func NewLZWEncoder() *LZWEncoder {
 }
 
 func (this *LZWEncoder) GetFilterName() string {
-	return "LZWDecode"
+	return StreamEncodingFilterNameLZW
 }
 
 func (this *LZWEncoder) MakeDecodeParams() PdfObject {
@@ -646,14 +648,14 @@ func (this *LZWEncoder) EncodeBytes(data []byte) ([]byte, error) {
 type ASCIIHexEncoder struct {
 }
 
-// Make a new LZW encoder with default parameters.
+// Make a new ASCII hex encoder.
 func NewASCIIHexEncoder() *ASCIIHexEncoder {
 	encoder := &ASCIIHexEncoder{}
 	return encoder
 }
 
 func (this *ASCIIHexEncoder) GetFilterName() string {
-	return "ASCIIHexDecode"
+	return StreamEncodingFilterNameASCIIHex
 }
 
 func (this *ASCIIHexEncoder) MakeDecodeParams() PdfObject {
@@ -717,6 +719,176 @@ func (this *ASCIIHexEncoder) EncodeBytes(data []byte) ([]byte, error) {
 	return encoded.Bytes(), nil
 }
 
+//
+// ASCII85 encoder/decoder.
+//
+type ASCII85Encoder struct {
+}
+
+// Make a new ASCII85 encoder.
+func NewASCII85Encoder() *ASCII85Encoder {
+	encoder := &ASCII85Encoder{}
+	return encoder
+}
+
+// Get the name of the PDF stream filter implemented by this encoder.
+func (this *ASCII85Encoder) GetFilterName() string {
+	return StreamEncodingFilterNameASCII85
+}
+
+// No decode parameters are produced for this filter; always returns nil.
+func (this *ASCII85Encoder) MakeDecodeParams() PdfObject {
+	return nil
+}
+
+// Make a new instance of an encoding dictionary for a stream object.
+func (this *ASCII85Encoder) MakeStreamDict() *PdfObjectDictionary {
+	dict := PdfObjectDictionary{}
+
+	dict["Filter"] = MakeName(this.GetFilterName())
+	return &dict
+}
+
+// Decode ASCII85 data: 5 ASCII characters -> 4 raw binary bytes.
+// Whitespace is skipped and 'z' at the start of a group stands for 4 zero bytes.  Decoding stops
+// at the "~>" EOD marker or, when the marker is missing, at the end of the input.  A final group
+// of n < 5 characters yields n-1 bytes.  Returns an error for characters that are not valid codes
+// and for groups whose value exceeds 2^32-1.
+func (this *ASCII85Encoder) DecodeBytes(encoded []byte) ([]byte, error) {
+	decoded := []byte{}
+
+	i := 0
+	eod := false
+
+	for i < len(encoded) && !eod {
+		codes := [5]byte{0, 0, 0, 0, 0}
+		spaces := 0 // offset due to whitespace.
+		j := 0
+		toWrite := 4
+		for j < 5+spaces {
+			// The data ends at the "~>" EOD marker or, when the marker is missing, with the input.
+			atEnd := i+j == len(encoded)
+			if !atEnd && encoded[i+j] == '~' && i+j+1 < len(encoded) && encoded[i+j+1] == '>' {
+				atEnd = true
+				eod = true
+			}
+			if atEnd {
+				// The codes collected so far form the final group: n codes carry n-1 bytes.
+				toWrite = (j - spaces) - 1
+				if toWrite < 0 {
+					toWrite = 0
+				}
+				break
+			}
+
+			code := encoded[i+j]
+			if IsWhiteSpace(code) {
+				// Skip whitespace.
+				spaces++
+				j++
+				continue
+			} else if code >= '!' && code <= 'u' {
+				// Valid code.
+				code -= '!'
+			} else if code == 'z' && j-spaces == 0 {
+				// 'z' in beginning of the byte sequence means that all 5 codes are 0.
+				// Already all 0 initialized, so can break here.
+				toWrite = 4
+				j++
+				break
+			} else {
+				common.Log.Error("Failed decoding, invalid code")
+				return nil, errors.New("Invalid code encountered")
+			}
+
+			codes[j-spaces] = code
+			j++
+		}
+		i += j
+
+		if toWrite == 0 {
+			// No bytes to emit: no codes were collected, or only a single (incomplete) one.
+			continue
+		}
+
+		// Pad with 'u' 84 (unused ones)
+		// Takes care of issues at ends for input data that is not a multiple of 4-bytes.
+		for m := toWrite + 1; m < 5; m++ {
+			codes[m] = 84
+		}
+
+		// Accumulate in 64 bits so that a value above 2^32-1 is detected instead of wrapping around.
+		value := uint64(0)
+		for _, code := range codes {
+			value = value*85 + uint64(code)
+		}
+		if value > 0xffffffff {
+			common.Log.Error("Failed decoding, group value out of range")
+			return nil, errors.New("Invalid group value (exceeds 2^32-1)")
+		}
+
+		// Convert to 4 bytes.
+		decodedBytes := []byte{
+			byte((value >> 24) & 0xff),
+			byte((value >> 16) & 0xff),
+			byte((value >> 8) & 0xff),
+			byte(value & 0xff)}
+
+		// A partial final group carries only toWrite bytes; the rest results from the padding.
+		decoded = append(decoded, decodedBytes[:toWrite]...)
+	}
+
+	return decoded, nil
+}
+
+// ASCII85 stream decoding.
+func (this *ASCII85Encoder) DecodeStream(streamObj *PdfObjectStream) ([]byte, error) {
+	return this.DecodeBytes(streamObj.Stream)
+}
+
+// Convert a base 256 number to a series of base 85 values (5 codes).
+// 85^5 = 4437053125 > 256^4 = 4294967296
+// So 5 base-85 numbers will always be enough to cover 4 base-256 numbers.
+// The base 256 value is already converted to an uint32 value.
+func (this *ASCII85Encoder) base256Tobase85(base256val uint32) [5]byte {
+	base85 := [5]byte{0, 0, 0, 0, 0}
+	// Fill from the least significant digit upwards.
+	for i := 4; i >= 0; i-- {
+		base85[i] = byte(base256val % 85)
+		base256val /= 85
+	}
+	return base85
+}
+
+// Encode data into ASCII85 encoded format.
+// Every group of 4 bytes is written as 5 characters, or as 'z' if all 4 bytes are 0.  A final
+// group of n < 4 bytes is zero padded and written as n+1 characters, followed by the "~>" EOD.
+func (this *ASCII85Encoder) EncodeBytes(data []byte) ([]byte, error) {
+	var encoded bytes.Buffer
+
+	for i := 0; i < len(data); i += 4 {
+		// Take up to 4 bytes; bytes missing from the final group remain 0.
+		var group [4]byte
+		n := copy(group[:], data[i:])
+
+		// Convert to a uint32 number.
+		base256 := (uint32(group[0]) << 24) | (uint32(group[1]) << 16) | (uint32(group[2]) << 8) | uint32(group[3])
+		if base256 == 0 && n == 4 {
+			// 'z' always decodes to 4 bytes, so it can only replace a complete group.
+			encoded.WriteByte('z')
+		} else {
+			base85vals := this.base256Tobase85(base256)
+			for _, val := range base85vals[:n+1] {
+				encoded.WriteByte(val + '!')
+			}
+		}
+	}
+
+	// EOD.
+	encoded.WriteString("~>")
+	return encoded.Bytes(), nil
+}
+
 //
 // Raw encoder/decoder (no encoding, pass through)
 //
@@ -853,6 +1025,9 @@ func newMultiEncoderFromStream(streamObj *PdfObjectStream) (*MultiEncoder, error
 		} else if *name == StreamEncodingFilterNameASCIIHex {
 			encoder := NewASCIIHexEncoder()
 			mencoder.AddEncoder(encoder)
+		} else if *name == StreamEncodingFilterNameASCII85 {
+			encoder := NewASCII85Encoder()
+			mencoder.AddEncoder(encoder)
 		} else {
 			common.Log.Error("Unsupported filter %s", *name)
 			return nil, fmt.Errorf("Invalid filter in multi filter array")
diff --git a/pdf/core/encoding_test.go b/pdf/core/encoding_test.go
index de6cce11..528f01b6 100644
--- a/pdf/core/encoding_test.go
+++ b/pdf/core/encoding_test.go
@@ -6,6 +6,7 @@
 package core
 
 import (
+	"encoding/base64"
 	"testing"
 
 	"github.com/unidoc/unidoc/common"
@@ -90,6 +91,163 @@ func TestASCIIHexEncoding(t *testing.T) {
 	}
 }
 
+// ASCII85.
+func TestASCII85EncodingWikipediaExample(t *testing.T) {
+	expected := `Man is distinguished, not only by his reason, but by this singular passion from other animals, which is a lust of the mind, that by a perseverance of delight in the continued and indefatigable generation of knowledge, exceeds the short vehemence of any carnal pleasure.`
+	// Base 64 encoded, Ascii85 encoded version (wikipedia).
+	encodedInBase64 := `OWpxb15CbGJELUJsZUIxREorKitGKGYscS8wSmhLRjxHTD5DakAuNEdwJGQ3RiEsTDdAPDZAKS8wSkRFRjxHJTwrRVY6MkYhLE88REorKi5APCpLMEA8NkwoRGYtXDBFYzVlO0RmZlooRVplZS5CbC45cEYiQUdYQlBDc2krREdtPkAzQkIvRiomT0NBZnUyL0FLWWkoREliOkBGRCwqKStDXVU9QDNCTiNFY1lmOEFURDNzQHE/ZCRBZnRWcUNoW05xRjxHOjgrRVY6LitDZj4tRkQ1VzhBUmxvbERJYWwoRElkPGpAPD8zckA6RiVhK0Q1OCdBVEQ0JEJsQGwzRGU6LC1ESnNgOEFSb0ZiLzBKTUtAcUI0XkYhLFI8QUtaJi1EZlRxQkclRz51RC5SVHBBS1lvJytDVC81K0NlaSNESUk/KEUsOSlvRioyTTcvY34+`
+	encoded, err := base64.StdEncoding.DecodeString(encodedInBase64)
+	if err != nil {
+		t.Errorf("Fail, error: %v", err)
+		return
+	}
+
+	encoder := NewASCII85Encoder()
+	enc1, err := encoder.EncodeBytes([]byte(expected))
+	if err != nil {
+		t.Errorf("Fail")
+		return
+	}
+	if string(enc1) != string(encoded) {
+		t.Errorf("ASCII85 encoding wiki example fail")
+		return
+	}
+
+	decoded, err := encoder.DecodeBytes([]byte(encoded))
+	if err != nil {
+		t.Errorf("Fail, error: %v", err)
+		return
+	}
+	if expected != string(decoded) {
+		t.Errorf("Mismatch! '%s' vs '%s'", decoded, expected)
+		return
+	}
+}
+
+func TestASCII85Encoding(t *testing.T) {
+	encoded := `FD,B0+EVmJAKYo'+D#G#De*R"B-:o0+E_a:A0>T(+AbuZ@;]Tu:ddbqAnc'mEr~>`
+	expected := "this type of encoding is used in PS and PDF files"
+
+	encoder := NewASCII85Encoder()
+
+	enc1, err := encoder.EncodeBytes([]byte(expected))
+	if err != nil {
+		t.Errorf("Fail")
+		return
+	}
+	if encoded != string(enc1) {
+		t.Errorf("Encoding error")
+		return
+	}
+
+	decoded, err := encoder.DecodeBytes([]byte(encoded))
+	if err != nil {
+		t.Errorf("Fail, error: %v", err)
+		return
+	}
+	if expected != string(decoded) {
+		t.Errorf("Mismatch! '%s' vs '%s'", decoded, expected)
+		return
+	}
+}
+
+type TestASCII85DecodingTestCase struct {
+	Encoded  string
+	Expected string
+}
+
+func TestASCII85Decoding(t *testing.T) {
+	// Map encoded -> Decoded
+	testcases := []TestASCII85DecodingTestCase{
+		{"z~>", "\x00\x00\x00\x00"},
+		{"z ~>", "\x00\x00\x00\x00"},
+		{"zz~>", "\x00\x00\x00\x00\x00\x00\x00\x00"},
+		{" zz~>", "\x00\x00\x00\x00\x00\x00\x00\x00"},
+		{" z z~>", "\x00\x00\x00\x00\x00\x00\x00\x00"},
+		{" z z ~>", "\x00\x00\x00\x00\x00\x00\x00\x00"},
+		{"+T~>", `!`},
+		{"+`d~>", `!s`},
+		{"+`hr~>", `!sz`},
+		{"+`hsS~>", `!szx`},
+		{"+`hsS+T~>", `!szx!`},
+		{"+ `hs S +T ~>", `!szx!`},
+		{"s8W-!~>", "\xff\xff\xff\xff"},
+		// Input lacking the EOD marker.
+		{"+T", `!`},
+		{"z ", "\x00\x00\x00\x00"},
+	}
+
+	encoder := NewASCII85Encoder()
+
+	for _, testcase := range testcases {
+		encoded := testcase.Encoded
+		expected := testcase.Expected
+		decoded, err := encoder.DecodeBytes([]byte(encoded))
+		if err != nil {
+			t.Errorf("Fail, error: %v", err)
+			return
+		}
+		if expected != string(decoded) {
+			t.Errorf("Mismatch! '%s' vs '%s'", decoded, expected)
+			return
+		}
+	}
+}
+
+// Zero bytes: only a full group may be shortened to 'z', as 'z' always decodes to 4 bytes.
+func TestASCII85ZeroBytes(t *testing.T) {
+	testcases := []TestASCII85DecodingTestCase{
+		{"!!~>", "\x00"},
+		{"!!!~>", "\x00\x00"},
+		{"!!!!~>", "\x00\x00\x00"},
+		{"z~>", "\x00\x00\x00\x00"},
+		{"z!!~>", "\x00\x00\x00\x00\x00"},
+	}
+
+	encoder := NewASCII85Encoder()
+
+	for _, testcase := range testcases {
+		encoded, err := encoder.EncodeBytes([]byte(testcase.Expected))
+		if err != nil {
+			t.Errorf("Fail, error: %v", err)
+			return
+		}
+		if testcase.Encoded != string(encoded) {
+			t.Errorf("Mismatch! '%s' vs '%s'", encoded, testcase.Encoded)
+			return
+		}
+
+		decoded, err := encoder.DecodeBytes(encoded)
+		if err != nil {
+			t.Errorf("Fail, error: %v", err)
+			return
+		}
+		if testcase.Expected != string(decoded) {
+			t.Errorf("Mismatch! '%x' vs '%x'", decoded, testcase.Expected)
+			return
+		}
+	}
+}
+
+// Invalid input must be rejected with an error.
+func TestASCII85DecodingInvalid(t *testing.T) {
+	testcases := []string{
+		"uuuuu~>", // Group value exceeds 2^32-1.
+		"+z~>",    // 'z' inside a group.
+		"+{~>",    // Character outside the '!'..'u' range.
+	}
+
+	encoder := NewASCII85Encoder()
+
+	for _, encoded := range testcases {
+		_, err := encoder.DecodeBytes([]byte(encoded))
+		if err == nil {
+			t.Errorf("Fail, expected error decoding '%s'", encoded)
+			return
+		}
+	}
+}
+
 // Test multi encoder with FlateDecode and ASCIIHexDecode.
 func TestMultiEncoder(t *testing.T) {
 	rawStream := []byte("this is a dummy text with some \x01\x02\x03 binary data")
diff --git a/pdf/core/stream.go b/pdf/core/stream.go
index 80b9f572..3f5e18e9 100644
--- a/pdf/core/stream.go
+++ b/pdf/core/stream.go
@@ -57,10 +57,12 @@ func NewEncoderFromStream(streamObj *PdfObjectStream) (StreamEncoder, error) {
 		return NewASCIIHexEncoder(), nil
 	} else if *method == StreamEncodingFilterNameLZW {
 		return newLZWEncoderFromStream(streamObj, nil)
+	} else if *method == StreamEncodingFilterNameASCII85 {
+		return NewASCII85Encoder(), nil
+	} else {
+		common.Log.Debug("ERROR: Unsupported encoding method!")
+		return nil, fmt.Errorf("Unsupported encoding method (%s)", *method)
 	}
-
-	common.Log.Debug("ERROR: Unsupported encoding method!")
-	return nil, fmt.Errorf("Unsupported encoding method (%s)", *method)
 }
 
 // Decodes the stream.