mirror of
https://github.com/unidoc/unipdf.git
synced 2025-05-02 22:17:06 +08:00
Refactored contentstream into a separate package.
This commit is contained in:
parent
9247f5d954
commit
4aa6845e27
1
.gitignore
vendored
1
.gitignore
vendored
@ -1,3 +1,4 @@
|
|||||||
|
.idea
|
||||||
*.mdb
|
*.mdb
|
||||||
*.userprefs
|
*.userprefs
|
||||||
*.pidb
|
*.pidb
|
||||||
|
68
pdf/contentstream/contentstream.go
Normal file
68
pdf/contentstream/contentstream.go
Normal file
@ -0,0 +1,68 @@
|
|||||||
|
/*
|
||||||
|
* This file is subject to the terms and conditions defined in
|
||||||
|
* file 'LICENSE.md', which is part of this source code package.
|
||||||
|
*/
|
||||||
|
|
||||||
|
// The content stream parser provides functionality to parse the content stream into a list of
|
||||||
|
// operands that can then be processed further for rendering or extraction of information.
|
||||||
|
// The contentstream package uses the core and model packages.
|
||||||
|
|
||||||
|
package contentstream
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
|
||||||
|
. "github.com/unidoc/unidoc/pdf/core"
|
||||||
|
)
|
||||||
|
|
||||||
|
// ContentStreamOperation is a single entry of a parsed content stream: an
// operand keyword together with the parameter objects that belong to it.
type ContentStreamOperation struct {
	// Params holds the parameter objects of the operation.
	Params []PdfObject
	// Operand is the operation keyword, e.g. "BT", "ET", "Tj" or "TJ".
	Operand string
}
|
||||||
|
|
||||||
|
// Parses and extracts all text data in content streams and returns as a string.
|
||||||
|
// Does not take into account Encoding table, the output is simply the character codes.
|
||||||
|
func (this *ContentStreamParser) ExtractText() (string, error) {
|
||||||
|
operations, err := this.Parse()
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
inText := false
|
||||||
|
txt := ""
|
||||||
|
for _, op := range operations {
|
||||||
|
if op.Operand == "BT" {
|
||||||
|
inText = true
|
||||||
|
} else if op.Operand == "ET" {
|
||||||
|
inText = false
|
||||||
|
}
|
||||||
|
if op.Operand == "Td" || op.Operand == "TD" || op.Operand == "T*" {
|
||||||
|
// Move to next line...
|
||||||
|
txt += "\n"
|
||||||
|
}
|
||||||
|
if inText && op.Operand == "TJ" {
|
||||||
|
if len(op.Params) < 1 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
paramList, ok := op.Params[0].(*PdfObjectArray)
|
||||||
|
if !ok {
|
||||||
|
return "", fmt.Errorf("Invalid parameter type, no array (%T)", op.Params[0])
|
||||||
|
}
|
||||||
|
for _, obj := range *paramList {
|
||||||
|
if strObj, ok := obj.(*PdfObjectString); ok {
|
||||||
|
txt += string(*strObj)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else if inText && op.Operand == "Tj" {
|
||||||
|
if len(op.Params) < 1 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
param, ok := op.Params[0].(*PdfObjectString)
|
||||||
|
if !ok {
|
||||||
|
return "", fmt.Errorf("Invalid parameter type, not string (%T)", op.Params[0])
|
||||||
|
}
|
||||||
|
txt += string(*param)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return txt, nil
|
||||||
|
}
|
386
pdf/contentstream/encoding.go
Normal file
386
pdf/contentstream/encoding.go
Normal file
@ -0,0 +1,386 @@
|
|||||||
|
/*
|
||||||
|
* This file is subject to the terms and conditions defined in
|
||||||
|
* file 'LICENSE.md', which is part of this source code package.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package contentstream
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
gocolor "image/color"
|
||||||
|
"image/jpeg"
|
||||||
|
|
||||||
|
"github.com/unidoc/unidoc/common"
|
||||||
|
. "github.com/unidoc/unidoc/pdf/core"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Creates the encoder for the inline image's Filter and DecodeParms.
|
||||||
|
func NewEncoderFromInlineImage(inlineImage *ContentStreamInlineImage) (StreamEncoder, error) {
|
||||||
|
if inlineImage.Filter == nil {
|
||||||
|
// No filter, return raw data back.
|
||||||
|
return NewRawEncoder(), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// The filter should be a name or an array with a list of filter names.
|
||||||
|
filterName, ok := inlineImage.Filter.(*PdfObjectName)
|
||||||
|
if !ok {
|
||||||
|
array, ok := inlineImage.Filter.(*PdfObjectArray)
|
||||||
|
if !ok {
|
||||||
|
return nil, fmt.Errorf("Filter not a Name or Array object")
|
||||||
|
}
|
||||||
|
if len(*array) == 0 {
|
||||||
|
// Empty array -> indicates raw filter (no filter).
|
||||||
|
return NewRawEncoder(), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(*array) != 1 {
|
||||||
|
menc, err := newMultiEncoderFromInlineImage(inlineImage)
|
||||||
|
if err != nil {
|
||||||
|
common.Log.Error("Failed creating multi encoder: %v", err)
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
common.Log.Trace("Multi enc: %s\n", menc)
|
||||||
|
return menc, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Single element.
|
||||||
|
filterObj := (*array)[0]
|
||||||
|
filterName, ok = filterObj.(*PdfObjectName)
|
||||||
|
if !ok {
|
||||||
|
return nil, fmt.Errorf("Filter array member not a Name object")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if *filterName == "AHx" {
|
||||||
|
return NewASCIIHexEncoder(), nil
|
||||||
|
} else if *filterName == "A85" {
|
||||||
|
return NewASCII85Encoder(), nil
|
||||||
|
} else if *filterName == "DCT" {
|
||||||
|
return newDCTEncoderFromInlineImage(inlineImage)
|
||||||
|
} else if *filterName == "Fl" {
|
||||||
|
return newFlateEncoderFromInlineImage(inlineImage, nil)
|
||||||
|
} else if *filterName == "LZW" {
|
||||||
|
return newLZWEncoderFromInlineImage(inlineImage, nil)
|
||||||
|
} else {
|
||||||
|
common.Log.Debug("Unsupported inline image encoding filter name : %s", *filterName)
|
||||||
|
return nil, errors.New("Unsupported inline encoding method")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create a new flate decoder from an inline image object, getting all the encoding parameters
|
||||||
|
// from the DecodeParms stream object dictionary entry that can be provided optionally, usually
|
||||||
|
// only when a multi filter is used.
|
||||||
|
func newFlateEncoderFromInlineImage(inlineImage *ContentStreamInlineImage, decodeParams *PdfObjectDictionary) (*FlateEncoder, error) {
|
||||||
|
encoder := NewFlateEncoder()
|
||||||
|
|
||||||
|
// If decodeParams not provided, see if we can get from the stream.
|
||||||
|
if decodeParams == nil {
|
||||||
|
obj := inlineImage.DecodeParms
|
||||||
|
if obj != nil {
|
||||||
|
dp, isDict := obj.(*PdfObjectDictionary)
|
||||||
|
if !isDict {
|
||||||
|
common.Log.Debug("Error: DecodeParms not a dictionary (%T)", obj)
|
||||||
|
return nil, fmt.Errorf("Invalid DecodeParms")
|
||||||
|
}
|
||||||
|
decodeParams = dp
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if decodeParams == nil {
|
||||||
|
// Can safely return here if no decode params, as the following depend on the decode params.
|
||||||
|
return encoder, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
common.Log.Trace("decode params: %s", decodeParams.String())
|
||||||
|
obj, has := (*decodeParams)["Predictor"]
|
||||||
|
if !has {
|
||||||
|
common.Log.Debug("Error: Predictor missing from DecodeParms - Continue with default (1)")
|
||||||
|
} else {
|
||||||
|
predictor, ok := obj.(*PdfObjectInteger)
|
||||||
|
if !ok {
|
||||||
|
common.Log.Debug("Error: Predictor specified but not numeric (%T)", obj)
|
||||||
|
return nil, fmt.Errorf("Invalid Predictor")
|
||||||
|
}
|
||||||
|
encoder.Predictor = int(*predictor)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Bits per component. Use default if not specified (8).
|
||||||
|
obj, has = (*decodeParams)["BitsPerComponent"]
|
||||||
|
if has {
|
||||||
|
bpc, ok := obj.(*PdfObjectInteger)
|
||||||
|
if !ok {
|
||||||
|
common.Log.Debug("ERROR: Invalid BitsPerComponent")
|
||||||
|
return nil, fmt.Errorf("Invalid BitsPerComponent")
|
||||||
|
}
|
||||||
|
encoder.BitsPerComponent = int(*bpc)
|
||||||
|
}
|
||||||
|
|
||||||
|
if encoder.Predictor > 1 {
|
||||||
|
// Columns.
|
||||||
|
encoder.Columns = 1
|
||||||
|
obj, has = (*decodeParams)["Columns"]
|
||||||
|
if has {
|
||||||
|
columns, ok := obj.(*PdfObjectInteger)
|
||||||
|
if !ok {
|
||||||
|
return nil, fmt.Errorf("Predictor column invalid")
|
||||||
|
}
|
||||||
|
|
||||||
|
encoder.Columns = int(*columns)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Colors.
|
||||||
|
// Number of interleaved color components per sample (Default 1 if not specified)
|
||||||
|
encoder.Colors = 1
|
||||||
|
obj, has = (*decodeParams)["Colors"]
|
||||||
|
if has {
|
||||||
|
colors, ok := obj.(*PdfObjectInteger)
|
||||||
|
if !ok {
|
||||||
|
return nil, fmt.Errorf("Predictor colors not an integer")
|
||||||
|
}
|
||||||
|
encoder.Colors = int(*colors)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return encoder, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create a new LZW encoder/decoder based on an inline image object, getting all the encoding parameters
|
||||||
|
// from the DecodeParms stream object dictionary entry.
|
||||||
|
func newLZWEncoderFromInlineImage(inlineImage *ContentStreamInlineImage, decodeParams *PdfObjectDictionary) (*LZWEncoder, error) {
|
||||||
|
// Start with default settings.
|
||||||
|
encoder := NewLZWEncoder()
|
||||||
|
|
||||||
|
// If decodeParams not provided, see if we can get from the inline image directly.
|
||||||
|
if decodeParams == nil {
|
||||||
|
if inlineImage.DecodeParms != nil {
|
||||||
|
dp, isDict := inlineImage.DecodeParms.(*PdfObjectDictionary)
|
||||||
|
if !isDict {
|
||||||
|
common.Log.Debug("Error: DecodeParms not a dictionary (%T)", inlineImage.DecodeParms)
|
||||||
|
return nil, fmt.Errorf("Invalid DecodeParms")
|
||||||
|
}
|
||||||
|
decodeParams = dp
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if decodeParams == nil {
|
||||||
|
// No decode parameters. Can safely return here if not set as the following options
|
||||||
|
// are related to the decode Params.
|
||||||
|
return encoder, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// The EarlyChange indicates when to increase code length, as different
|
||||||
|
// implementations use a different mechanisms. Essentially this chooses
|
||||||
|
// which LZW implementation to use.
|
||||||
|
// The default is 1 (one code early)
|
||||||
|
//
|
||||||
|
// The EarlyChange parameter is specified in the object stream dictionary for regular streams,
|
||||||
|
// but it is not specified explicitly where to check for it in the case of inline images.
|
||||||
|
// We will check in the decodeParms for now, we can adjust later if we come across cases of this.
|
||||||
|
obj, has := (*decodeParams)["EarlyChange"]
|
||||||
|
if has {
|
||||||
|
earlyChange, ok := obj.(*PdfObjectInteger)
|
||||||
|
if !ok {
|
||||||
|
common.Log.Debug("Error: EarlyChange specified but not numeric (%T)", obj)
|
||||||
|
return nil, fmt.Errorf("Invalid EarlyChange")
|
||||||
|
}
|
||||||
|
if *earlyChange != 0 && *earlyChange != 1 {
|
||||||
|
return nil, fmt.Errorf("Invalid EarlyChange value (not 0 or 1)")
|
||||||
|
}
|
||||||
|
|
||||||
|
encoder.EarlyChange = int(*earlyChange)
|
||||||
|
} else {
|
||||||
|
encoder.EarlyChange = 1 // default
|
||||||
|
}
|
||||||
|
|
||||||
|
obj, has = (*decodeParams)["Predictor"]
|
||||||
|
if has {
|
||||||
|
predictor, ok := obj.(*PdfObjectInteger)
|
||||||
|
if !ok {
|
||||||
|
common.Log.Debug("Error: Predictor specified but not numeric (%T)", obj)
|
||||||
|
return nil, fmt.Errorf("Invalid Predictor")
|
||||||
|
}
|
||||||
|
encoder.Predictor = int(*predictor)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Bits per component. Use default if not specified (8).
|
||||||
|
obj, has = (*decodeParams)["BitsPerComponent"]
|
||||||
|
if has {
|
||||||
|
bpc, ok := obj.(*PdfObjectInteger)
|
||||||
|
if !ok {
|
||||||
|
common.Log.Debug("ERROR: Invalid BitsPerComponent")
|
||||||
|
return nil, fmt.Errorf("Invalid BitsPerComponent")
|
||||||
|
}
|
||||||
|
encoder.BitsPerComponent = int(*bpc)
|
||||||
|
}
|
||||||
|
|
||||||
|
if encoder.Predictor > 1 {
|
||||||
|
// Columns.
|
||||||
|
encoder.Columns = 1
|
||||||
|
obj, has = (*decodeParams)["Columns"]
|
||||||
|
if has {
|
||||||
|
columns, ok := obj.(*PdfObjectInteger)
|
||||||
|
if !ok {
|
||||||
|
return nil, fmt.Errorf("Predictor column invalid")
|
||||||
|
}
|
||||||
|
|
||||||
|
encoder.Columns = int(*columns)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Colors.
|
||||||
|
// Number of interleaved color components per sample (Default 1 if not specified)
|
||||||
|
encoder.Colors = 1
|
||||||
|
obj, has = (*decodeParams)["Colors"]
|
||||||
|
if has {
|
||||||
|
colors, ok := obj.(*PdfObjectInteger)
|
||||||
|
if !ok {
|
||||||
|
return nil, fmt.Errorf("Predictor colors not an integer")
|
||||||
|
}
|
||||||
|
encoder.Colors = int(*colors)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
common.Log.Trace("decode params: %s", decodeParams.String())
|
||||||
|
return encoder, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create a new DCT encoder/decoder based on an inline image, getting all the encoding parameters
|
||||||
|
// from the stream object dictionary entry and the image data itself.
|
||||||
|
func newDCTEncoderFromInlineImage(inlineImage *ContentStreamInlineImage) (*DCTEncoder, error) {
|
||||||
|
// Start with default settings.
|
||||||
|
encoder := NewDCTEncoder()
|
||||||
|
|
||||||
|
bufReader := bytes.NewReader(inlineImage.stream)
|
||||||
|
|
||||||
|
cfg, err := jpeg.DecodeConfig(bufReader)
|
||||||
|
//img, _, err := goimage.Decode(bufReader)
|
||||||
|
if err != nil {
|
||||||
|
common.Log.Debug("Error decoding file: %s", err)
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
switch cfg.ColorModel {
|
||||||
|
case gocolor.RGBAModel:
|
||||||
|
encoder.BitsPerComponent = 8
|
||||||
|
encoder.ColorComponents = 3 // alpha is not included in pdf.
|
||||||
|
case gocolor.RGBA64Model:
|
||||||
|
encoder.BitsPerComponent = 16
|
||||||
|
encoder.ColorComponents = 3
|
||||||
|
case gocolor.GrayModel:
|
||||||
|
encoder.BitsPerComponent = 8
|
||||||
|
encoder.ColorComponents = 1
|
||||||
|
case gocolor.Gray16Model:
|
||||||
|
encoder.BitsPerComponent = 16
|
||||||
|
encoder.ColorComponents = 1
|
||||||
|
case gocolor.CMYKModel:
|
||||||
|
encoder.BitsPerComponent = 8
|
||||||
|
encoder.ColorComponents = 4
|
||||||
|
case gocolor.YCbCrModel:
|
||||||
|
// YCbCr is not supported by PDF, but it could be a different colorspace
|
||||||
|
// with 3 components. Would be specified by the ColorSpace entry.
|
||||||
|
encoder.BitsPerComponent = 8
|
||||||
|
encoder.ColorComponents = 3
|
||||||
|
default:
|
||||||
|
return nil, errors.New("Unsupported color model")
|
||||||
|
}
|
||||||
|
encoder.Width = cfg.Width
|
||||||
|
encoder.Height = cfg.Height
|
||||||
|
common.Log.Trace("DCT Encoder: %+v", encoder)
|
||||||
|
|
||||||
|
return encoder, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create a new multi-filter encoder/decoder based on an inline image, getting all the encoding parameters
|
||||||
|
// from the filter specification and the DecodeParms (DP) dictionaries.
|
||||||
|
func newMultiEncoderFromInlineImage(inlineImage *ContentStreamInlineImage) (*MultiEncoder, error) {
|
||||||
|
mencoder := NewMultiEncoder()
|
||||||
|
|
||||||
|
// Prepare the decode params array (one for each filter type)
|
||||||
|
// Optional, not always present.
|
||||||
|
var decodeParamsDict *PdfObjectDictionary
|
||||||
|
decodeParamsArray := []PdfObject{}
|
||||||
|
if obj := inlineImage.DecodeParms; obj != nil {
|
||||||
|
// If it is a dictionary, assume it applies to all
|
||||||
|
dict, isDict := obj.(*PdfObjectDictionary)
|
||||||
|
if isDict {
|
||||||
|
decodeParamsDict = dict
|
||||||
|
}
|
||||||
|
|
||||||
|
// If it is an array, assume there is one for each
|
||||||
|
arr, isArray := obj.(*PdfObjectArray)
|
||||||
|
if isArray {
|
||||||
|
for _, dictObj := range *arr {
|
||||||
|
if dict, is := dictObj.(*PdfObjectDictionary); is {
|
||||||
|
decodeParamsArray = append(decodeParamsArray, dict)
|
||||||
|
} else {
|
||||||
|
decodeParamsArray = append(decodeParamsArray, nil)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
obj := inlineImage.Filter
|
||||||
|
if obj == nil {
|
||||||
|
return nil, fmt.Errorf("Filter missing")
|
||||||
|
}
|
||||||
|
|
||||||
|
array, ok := obj.(*PdfObjectArray)
|
||||||
|
if !ok {
|
||||||
|
return nil, fmt.Errorf("Multi filter can only be made from array")
|
||||||
|
}
|
||||||
|
|
||||||
|
for idx, obj := range *array {
|
||||||
|
name, ok := obj.(*PdfObjectName)
|
||||||
|
if !ok {
|
||||||
|
return nil, fmt.Errorf("Multi filter array element not a name")
|
||||||
|
}
|
||||||
|
|
||||||
|
var dp PdfObject
|
||||||
|
|
||||||
|
// If decode params dict is set, use it. Otherwise take from array..
|
||||||
|
if decodeParamsDict != nil {
|
||||||
|
dp = decodeParamsDict
|
||||||
|
} else {
|
||||||
|
// Only get the dp if provided. Oftentimes there is no decode params dict
|
||||||
|
// provided.
|
||||||
|
if len(decodeParamsArray) > 0 {
|
||||||
|
if idx >= len(decodeParamsArray) {
|
||||||
|
return nil, fmt.Errorf("Missing elements in decode params array")
|
||||||
|
}
|
||||||
|
dp = decodeParamsArray[idx]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
var dParams *PdfObjectDictionary
|
||||||
|
if dict, is := dp.(*PdfObjectDictionary); is {
|
||||||
|
dParams = dict
|
||||||
|
}
|
||||||
|
|
||||||
|
if *name == StreamEncodingFilterNameFlate {
|
||||||
|
// XXX: need to separate out the DecodeParms..
|
||||||
|
encoder, err := newFlateEncoderFromInlineImage(inlineImage, dParams)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
mencoder.AddEncoder(encoder)
|
||||||
|
} else if *name == StreamEncodingFilterNameLZW {
|
||||||
|
encoder, err := newLZWEncoderFromInlineImage(inlineImage, dParams)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
mencoder.AddEncoder(encoder)
|
||||||
|
} else if *name == StreamEncodingFilterNameASCIIHex {
|
||||||
|
encoder := NewASCIIHexEncoder()
|
||||||
|
mencoder.AddEncoder(encoder)
|
||||||
|
} else if *name == StreamEncodingFilterNameASCII85 {
|
||||||
|
encoder := NewASCII85Encoder()
|
||||||
|
mencoder.AddEncoder(encoder)
|
||||||
|
} else {
|
||||||
|
common.Log.Error("Unsupported filter %s", *name)
|
||||||
|
return nil, fmt.Errorf("Invalid filter in multi filter array")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return mencoder, nil
|
||||||
|
}
|
314
pdf/contentstream/inline-image.go
Normal file
314
pdf/contentstream/inline-image.go
Normal file
@ -0,0 +1,314 @@
|
|||||||
|
/*
|
||||||
|
* This file is subject to the terms and conditions defined in
|
||||||
|
* file 'LICENSE.md', which is part of this source code package.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package contentstream
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
|
||||||
|
"github.com/unidoc/unidoc/common"
|
||||||
|
. "github.com/unidoc/unidoc/pdf/core"
|
||||||
|
. "github.com/unidoc/unidoc/pdf/model"
|
||||||
|
)
|
||||||
|
|
||||||
|
// A representation of an inline image in a Content stream. Everything between the BI and EI operands.
// ContentStreamInlineImage implements the PdfObject interface although strictly it is not a PDF object.
//
// Each exported field holds the value object of one inline image property, or
// nil when the property is absent. The key noted next to each field is the
// property name ParseInlineImage matches it against.
type ContentStreamInlineImage struct {
	BitsPerComponent PdfObject // "BPC"
	ColorSpace       PdfObject // "CS"
	Decode           PdfObject // "D"
	DecodeParms      PdfObject // "DP"
	Filter           PdfObject // "F"
	Height           PdfObject // "H"
	ImageMask        PdfObject // "IM"
	Intent           PdfObject // "Intent"
	Interpolate      PdfObject // "I"
	Width            PdfObject // "W"
	stream           []byte    // Image data read after the ID operand, still encoded.
}
|
||||||
|
|
||||||
|
func (this *ContentStreamInlineImage) String() string {
|
||||||
|
s := fmt.Sprintf("InlineImage(len=%d)\n", len(this.stream))
|
||||||
|
if this.BitsPerComponent != nil {
|
||||||
|
s += "- BPC " + this.BitsPerComponent.DefaultWriteString() + "\n"
|
||||||
|
}
|
||||||
|
if this.ColorSpace != nil {
|
||||||
|
s += "- CS " + this.ColorSpace.DefaultWriteString() + "\n"
|
||||||
|
}
|
||||||
|
if this.Decode != nil {
|
||||||
|
s += "- D " + this.Decode.DefaultWriteString() + "\n"
|
||||||
|
}
|
||||||
|
if this.DecodeParms != nil {
|
||||||
|
s += "- DP " + this.DecodeParms.DefaultWriteString() + "\n"
|
||||||
|
}
|
||||||
|
if this.Filter != nil {
|
||||||
|
s += "- F " + this.Filter.DefaultWriteString() + "\n"
|
||||||
|
}
|
||||||
|
if this.Height != nil {
|
||||||
|
s += "- H " + this.Height.DefaultWriteString() + "\n"
|
||||||
|
}
|
||||||
|
if this.ImageMask != nil {
|
||||||
|
s += "- IM " + this.ImageMask.DefaultWriteString() + "\n"
|
||||||
|
}
|
||||||
|
if this.Intent != nil {
|
||||||
|
s += "- Intent " + this.Intent.DefaultWriteString() + "\n"
|
||||||
|
}
|
||||||
|
if this.Interpolate != nil {
|
||||||
|
s += "- I " + this.Interpolate.DefaultWriteString() + "\n"
|
||||||
|
}
|
||||||
|
if this.Width != nil {
|
||||||
|
s += "- W " + this.Width.DefaultWriteString() + "\n"
|
||||||
|
}
|
||||||
|
return s
|
||||||
|
}
|
||||||
|
|
||||||
|
func (this *ContentStreamInlineImage) DefaultWriteString() string {
|
||||||
|
var output bytes.Buffer
|
||||||
|
|
||||||
|
// We do not start with "BI" as that is the operand and is written out separately.
|
||||||
|
// Write out the parameters
|
||||||
|
s := "BPC " + this.BitsPerComponent.DefaultWriteString() + "\n"
|
||||||
|
s += "CS " + this.ColorSpace.DefaultWriteString() + "\n"
|
||||||
|
s += "D " + this.Decode.DefaultWriteString() + "\n"
|
||||||
|
s += "DP " + this.DecodeParms.DefaultWriteString() + "\n"
|
||||||
|
s += "F " + this.Filter.DefaultWriteString() + "\n"
|
||||||
|
s += "H " + this.Height.DefaultWriteString() + "\n"
|
||||||
|
s += "IM " + this.ImageMask.DefaultWriteString() + "\n"
|
||||||
|
s += "Intent " + this.Intent.DefaultWriteString() + "\n"
|
||||||
|
s += "I " + this.Interpolate.DefaultWriteString() + "\n"
|
||||||
|
s += "W " + this.Width.DefaultWriteString() + "\n"
|
||||||
|
output.WriteString(s)
|
||||||
|
|
||||||
|
output.WriteString("ID ")
|
||||||
|
output.Write(this.stream)
|
||||||
|
|
||||||
|
return output.String()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (this *ContentStreamInlineImage) GetColorSpace() (PdfColorspace, error) {
|
||||||
|
if this.ColorSpace == nil {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Can also refer to a name in the PDF page resources...
|
||||||
|
|
||||||
|
name, ok := this.ColorSpace.(*PdfObjectName)
|
||||||
|
if !ok {
|
||||||
|
common.Log.Debug("Error: Invalid object type")
|
||||||
|
return nil, errors.New("Invalid type")
|
||||||
|
}
|
||||||
|
|
||||||
|
if *name == "G" {
|
||||||
|
return NewPdfColorspaceDeviceGray(), nil
|
||||||
|
} else if *name == "RGB" {
|
||||||
|
return NewPdfColorspaceDeviceRGB(), nil
|
||||||
|
} else if *name == "CMYK" {
|
||||||
|
return NewPdfColorspaceDeviceCMYK(), nil
|
||||||
|
//} else if *name == "I" {
|
||||||
|
// cs := NewPdfColorspaceSpecialIndexed()
|
||||||
|
// return cs, nil
|
||||||
|
} else {
|
||||||
|
common.Log.Debug("Error, unsupported inline image colorspace: %s", *name)
|
||||||
|
return nil, errors.New("Invalid parameter")
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
// Export the inline image to Image which can be transformed or exported easily.
|
||||||
|
func (this *ContentStreamInlineImage) ToImage() (*Image, error) {
|
||||||
|
// Decode the imaging data if encoded.
|
||||||
|
encoder, err := NewEncoderFromInlineImage(this)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
common.Log.Trace("encoder: %+v %T", encoder, encoder)
|
||||||
|
|
||||||
|
decoded, err := encoder.DecodeBytes(this.stream)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
image := &Image{}
|
||||||
|
|
||||||
|
// Height.
|
||||||
|
if this.Height == nil {
|
||||||
|
return nil, errors.New("Height attribute missing")
|
||||||
|
}
|
||||||
|
height, ok := this.Height.(*PdfObjectInteger)
|
||||||
|
if !ok {
|
||||||
|
return nil, errors.New("Invalid height")
|
||||||
|
}
|
||||||
|
image.Height = int64(*height)
|
||||||
|
|
||||||
|
// Width.
|
||||||
|
if this.Width == nil {
|
||||||
|
return nil, errors.New("Width attribute missing")
|
||||||
|
}
|
||||||
|
width, ok := this.Width.(*PdfObjectInteger)
|
||||||
|
if !ok {
|
||||||
|
return nil, errors.New("Invalid width")
|
||||||
|
}
|
||||||
|
image.Width = int64(*width)
|
||||||
|
|
||||||
|
// BPC.
|
||||||
|
if this.BitsPerComponent == nil {
|
||||||
|
common.Log.Debug("Inline Bits per component missing - assuming 8")
|
||||||
|
image.BitsPerComponent = 8
|
||||||
|
} else {
|
||||||
|
bpc, ok := this.BitsPerComponent.(*PdfObjectInteger)
|
||||||
|
if !ok {
|
||||||
|
common.Log.Debug("Error invalid bits per component value, type %T", this.BitsPerComponent)
|
||||||
|
return nil, errors.New("BPC Type error")
|
||||||
|
}
|
||||||
|
image.BitsPerComponent = int64(*bpc)
|
||||||
|
}
|
||||||
|
|
||||||
|
image.Data = decoded
|
||||||
|
|
||||||
|
return image, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Parse an inline image from a content stream, both read its properties and binary data.
|
||||||
|
// When called, "BI" has already been read from the stream. This function
|
||||||
|
// finishes reading through "EI" and then returns the ContentStreamInlineImage.
|
||||||
|
func (this *ContentStreamParser) ParseInlineImage() (*ContentStreamInlineImage, error) {
|
||||||
|
// Reading parameters.
|
||||||
|
im := ContentStreamInlineImage{}
|
||||||
|
|
||||||
|
for {
|
||||||
|
this.skipSpaces()
|
||||||
|
obj, err, isOperand := this.parseObject()
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
if !isOperand {
|
||||||
|
// Not an operand.. Read key value properties..
|
||||||
|
param, ok := obj.(*PdfObjectName)
|
||||||
|
if !ok {
|
||||||
|
return nil, fmt.Errorf("Invalid inline image property (expecting name) - %T", obj)
|
||||||
|
}
|
||||||
|
|
||||||
|
valueObj, err, isOperand := this.parseObject()
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if isOperand {
|
||||||
|
return nil, fmt.Errorf("Not expecting an operand")
|
||||||
|
}
|
||||||
|
|
||||||
|
if *param == "BPC" {
|
||||||
|
im.BitsPerComponent = valueObj
|
||||||
|
} else if *param == "CS" {
|
||||||
|
im.ColorSpace = valueObj
|
||||||
|
} else if *param == "D" {
|
||||||
|
im.Decode = valueObj
|
||||||
|
} else if *param == "DP" {
|
||||||
|
im.DecodeParms = valueObj
|
||||||
|
} else if *param == "F" {
|
||||||
|
im.Filter = valueObj
|
||||||
|
} else if *param == "H" {
|
||||||
|
im.Height = valueObj
|
||||||
|
} else if *param == "IM" {
|
||||||
|
im.ImageMask = valueObj
|
||||||
|
} else if *param == "Intent" {
|
||||||
|
im.Intent = valueObj
|
||||||
|
} else if *param == "I" {
|
||||||
|
im.Interpolate = valueObj
|
||||||
|
} else if *param == "W" {
|
||||||
|
im.Width = valueObj
|
||||||
|
} else {
|
||||||
|
return nil, fmt.Errorf("Unknown inline image parameter %s", *param)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if isOperand {
|
||||||
|
operand, ok := obj.(*PdfObjectString)
|
||||||
|
if !ok {
|
||||||
|
return nil, fmt.Errorf("Failed to read inline image - invalid operand")
|
||||||
|
}
|
||||||
|
|
||||||
|
if *operand == "EI" {
|
||||||
|
// Image fully defined
|
||||||
|
common.Log.Debug("Inline image finished...")
|
||||||
|
return &im, nil
|
||||||
|
} else if *operand == "ID" {
|
||||||
|
// Inline image data.
|
||||||
|
// Should get a single space (0x20) followed by the data and then EI.
|
||||||
|
common.Log.Debug("ID start")
|
||||||
|
|
||||||
|
// Skip the space if its there.
|
||||||
|
b, err := this.reader.Peek(1)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if IsWhiteSpace(b[0]) {
|
||||||
|
this.reader.Discard(1)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Unfortunately there is no good way to know how many bytes to read since it
|
||||||
|
// depends on the Filter and encoding etc.
|
||||||
|
// Therefore we will simply read until we find "<ws>EI<ws>" where <ws> is whitespace
|
||||||
|
// although of course that could be a part of the data (even if unlikely).
|
||||||
|
im.stream = []byte{}
|
||||||
|
state := 0
|
||||||
|
var skipBytes []byte
|
||||||
|
for {
|
||||||
|
c, err := this.reader.ReadByte()
|
||||||
|
if err != nil {
|
||||||
|
common.Log.Debug("Unable to find end of image EI in inline image data")
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
if state == 0 {
|
||||||
|
if IsWhiteSpace(c) {
|
||||||
|
skipBytes = []byte{}
|
||||||
|
skipBytes = append(skipBytes, c)
|
||||||
|
state = 1
|
||||||
|
} else {
|
||||||
|
im.stream = append(im.stream, c)
|
||||||
|
}
|
||||||
|
} else if state == 1 {
|
||||||
|
skipBytes = append(skipBytes, c)
|
||||||
|
if c == 'E' {
|
||||||
|
state = 2
|
||||||
|
} else {
|
||||||
|
im.stream = append(im.stream, skipBytes...)
|
||||||
|
// Need an extra check to decide if we fall back to state 0 or 1.
|
||||||
|
if IsWhiteSpace(c) {
|
||||||
|
state = 1
|
||||||
|
} else {
|
||||||
|
state = 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else if state == 2 {
|
||||||
|
skipBytes = append(skipBytes, c)
|
||||||
|
if c == 'I' {
|
||||||
|
state = 3
|
||||||
|
} else {
|
||||||
|
im.stream = append(im.stream, skipBytes...)
|
||||||
|
state = 0
|
||||||
|
}
|
||||||
|
} else if state == 3 {
|
||||||
|
skipBytes = append(skipBytes, c)
|
||||||
|
if IsWhiteSpace(c) {
|
||||||
|
// image data finished.
|
||||||
|
common.Log.Debug("Image stream (%d): % x", len(im.stream), im.stream)
|
||||||
|
// Exit point.
|
||||||
|
return &im, nil
|
||||||
|
} else {
|
||||||
|
// Seems like "<ws>EI" was part of the data.
|
||||||
|
im.stream = append(im.stream, skipBytes...)
|
||||||
|
state = 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Never reached (exit point is at end of EI).
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
@ -3,10 +3,7 @@
|
|||||||
* file 'LICENSE.md', which is part of this source code package.
|
* file 'LICENSE.md', which is part of this source code package.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
// The content stream parser provides functionality to parse the content stream into a list of
|
package contentstream
|
||||||
// operands that can then be processed further for rendering or extraction of information.
|
|
||||||
|
|
||||||
package model
|
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bufio"
|
"bufio"
|
||||||
@ -26,11 +23,6 @@ type ContentStreamParser struct {
|
|||||||
reader *bufio.Reader
|
reader *bufio.Reader
|
||||||
}
|
}
|
||||||
|
|
||||||
type ContentStreamOperation struct {
|
|
||||||
Params []PdfObject
|
|
||||||
Operand string
|
|
||||||
}
|
|
||||||
|
|
||||||
// Create a new instance of the content stream parser from an input content
|
// Create a new instance of the content stream parser from an input content
|
||||||
// stream string.
|
// stream string.
|
||||||
func NewContentStreamParser(contentStr string) *ContentStreamParser {
|
func NewContentStreamParser(contentStr string) *ContentStreamParser {
|
||||||
@ -44,199 +36,6 @@ func NewContentStreamParser(contentStr string) *ContentStreamParser {
|
|||||||
return &parser
|
return &parser
|
||||||
}
|
}
|
||||||
|
|
||||||
// A representation of an inline image in a Content stream.
|
|
||||||
// Everything between the BI and EI operands.
|
|
||||||
// ContentStreamInlineImage implements the PdfObject interface
|
|
||||||
// although strictly it is not a PDF object.
|
|
||||||
type ContentStreamInlineImage struct {
|
|
||||||
BitsPerComponent PdfObject
|
|
||||||
ColorSpace PdfObject
|
|
||||||
Decode PdfObject
|
|
||||||
DecodeParms PdfObject
|
|
||||||
Filter PdfObject
|
|
||||||
Height PdfObject
|
|
||||||
ImageMask PdfObject
|
|
||||||
Intent PdfObject
|
|
||||||
Interpolate PdfObject
|
|
||||||
Width PdfObject
|
|
||||||
stream []byte
|
|
||||||
}
|
|
||||||
|
|
||||||
func (this *ContentStreamInlineImage) String() string {
|
|
||||||
str := fmt.Sprintf("InlineImage(len=%d)", len(this.stream))
|
|
||||||
return str
|
|
||||||
}
|
|
||||||
|
|
||||||
func (this *ContentStreamInlineImage) DefaultWriteString() string {
|
|
||||||
var output bytes.Buffer
|
|
||||||
|
|
||||||
// We do not start with "BI" as that is the operand and is written out separately.
|
|
||||||
// Write out the parameters
|
|
||||||
s := "BPC " + this.BitsPerComponent.DefaultWriteString() + "\n"
|
|
||||||
s += "CS " + this.ColorSpace.DefaultWriteString() + "\n"
|
|
||||||
s += "D " + this.Decode.DefaultWriteString() + "\n"
|
|
||||||
s += "DP " + this.DecodeParms.DefaultWriteString() + "\n"
|
|
||||||
s += "F " + this.Filter.DefaultWriteString() + "\n"
|
|
||||||
s += "H " + this.Height.DefaultWriteString() + "\n"
|
|
||||||
s += "IM " + this.ImageMask.DefaultWriteString() + "\n"
|
|
||||||
s += "Intent " + this.Intent.DefaultWriteString() + "\n"
|
|
||||||
s += "I " + this.Interpolate.DefaultWriteString() + "\n"
|
|
||||||
s += "W " + this.Width.DefaultWriteString() + "\n"
|
|
||||||
output.WriteString(s)
|
|
||||||
|
|
||||||
output.WriteString("ID ")
|
|
||||||
output.Write(this.stream)
|
|
||||||
|
|
||||||
return output.String()
|
|
||||||
}
|
|
||||||
|
|
||||||
// Export the inline image to Image which can be transformed or exported easily.
|
|
||||||
func (this *ContentStreamInlineImage) ToImage() (*Image, error) {
|
|
||||||
return nil, fmt.Errorf("Not implemented yet")
|
|
||||||
}
|
|
||||||
|
|
||||||
// Parse an inline image from a content stream, both read its properties and
|
|
||||||
// binary data.
|
|
||||||
// When called, "BI" has already been read from the stream. This function
|
|
||||||
// finishes reading through "EI" and then returns the ContentStreamInlineImage.
|
|
||||||
func (this *ContentStreamParser) ParseInlineImage() (*ContentStreamInlineImage, error) {
|
|
||||||
// Reading parameters.
|
|
||||||
im := ContentStreamInlineImage{}
|
|
||||||
|
|
||||||
for {
|
|
||||||
this.skipSpaces()
|
|
||||||
obj, err, isOperand := this.parseObject()
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
if !isOperand {
|
|
||||||
// Not an operand; read the key/value properties.
|
|
||||||
param, ok := obj.(*PdfObjectName)
|
|
||||||
if !ok {
|
|
||||||
return nil, fmt.Errorf("Invalid inline image property (expecting name) - %T", obj)
|
|
||||||
}
|
|
||||||
|
|
||||||
valueObj, err, isOperand := this.parseObject()
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
if isOperand {
|
|
||||||
return nil, fmt.Errorf("Not expecting an operand")
|
|
||||||
}
|
|
||||||
|
|
||||||
if *param == "BPC" {
|
|
||||||
im.BitsPerComponent = valueObj
|
|
||||||
} else if *param == "CS" {
|
|
||||||
im.ColorSpace = valueObj
|
|
||||||
} else if *param == "D" {
|
|
||||||
im.Decode = valueObj
|
|
||||||
} else if *param == "DP" {
|
|
||||||
im.DecodeParms = valueObj
|
|
||||||
} else if *param == "F" {
|
|
||||||
im.Filter = valueObj
|
|
||||||
} else if *param == "H" {
|
|
||||||
im.Height = valueObj
|
|
||||||
} else if *param == "IM" {
|
|
||||||
im.ImageMask = valueObj
|
|
||||||
} else if *param == "Intent" {
|
|
||||||
im.Intent = valueObj
|
|
||||||
} else if *param == "I" {
|
|
||||||
im.Interpolate = valueObj
|
|
||||||
} else if *param == "W" {
|
|
||||||
im.Width = valueObj
|
|
||||||
} else {
|
|
||||||
return nil, fmt.Errorf("Unknown inline image parameter %s", *param)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if isOperand {
|
|
||||||
operand, ok := obj.(*PdfObjectString)
|
|
||||||
if !ok {
|
|
||||||
return nil, fmt.Errorf("Failed to read inline image - invalid operand")
|
|
||||||
}
|
|
||||||
|
|
||||||
if *operand == "EI" {
|
|
||||||
// Image fully defined
|
|
||||||
common.Log.Debug("Inline image finished...")
|
|
||||||
return &im, nil
|
|
||||||
} else if *operand == "ID" {
|
|
||||||
// Inline image data.
|
|
||||||
// Should get a single space (0x20) followed by the data and then EI.
|
|
||||||
common.Log.Debug("ID start")
|
|
||||||
|
|
||||||
// Skip the space if it's there.
|
|
||||||
b, err := this.reader.Peek(1)
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
if IsWhiteSpace(b[0]) {
|
|
||||||
this.reader.Discard(1)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Unfortunately there is no good way to know how many bytes to read since it
|
|
||||||
// depends on the Filter and encoding etc.
|
|
||||||
// Therefore we will simply read until we find "<ws>EI<ws>" where <ws> is whitespace
|
|
||||||
// although of course that could be a part of the data (even if unlikely).
|
|
||||||
im.stream = []byte{}
|
|
||||||
state := 0
|
|
||||||
var skipBytes []byte
|
|
||||||
for {
|
|
||||||
c, err := this.reader.ReadByte()
|
|
||||||
if err != nil {
|
|
||||||
common.Log.Debug("Unable to find end of image EI in inline image data")
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
if state == 0 {
|
|
||||||
if IsWhiteSpace(c) {
|
|
||||||
skipBytes = []byte{}
|
|
||||||
skipBytes = append(skipBytes, c)
|
|
||||||
state = 1
|
|
||||||
} else {
|
|
||||||
im.stream = append(im.stream, c)
|
|
||||||
}
|
|
||||||
} else if state == 1 {
|
|
||||||
skipBytes = append(skipBytes, c)
|
|
||||||
if c == 'E' {
|
|
||||||
state = 2
|
|
||||||
} else {
|
|
||||||
im.stream = append(im.stream, skipBytes...)
|
|
||||||
// Need an extra check to decide if we fall back to state 0 or 1.
|
|
||||||
if IsWhiteSpace(c) {
|
|
||||||
state = 1
|
|
||||||
} else {
|
|
||||||
state = 0
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else if state == 2 {
|
|
||||||
skipBytes = append(skipBytes, c)
|
|
||||||
if c == 'I' {
|
|
||||||
state = 3
|
|
||||||
} else {
|
|
||||||
im.stream = append(im.stream, skipBytes...)
|
|
||||||
state = 0
|
|
||||||
}
|
|
||||||
} else if state == 3 {
|
|
||||||
skipBytes = append(skipBytes, c)
|
|
||||||
if IsWhiteSpace(c) {
|
|
||||||
// image data finished.
|
|
||||||
common.Log.Debug("Image stream (%d): % x", len(im.stream), im.stream)
|
|
||||||
// Exit point.
|
|
||||||
return &im, nil
|
|
||||||
} else {
|
|
||||||
// Seems like "<ws>EI" was part of the data.
|
|
||||||
im.stream = append(im.stream, skipBytes...)
|
|
||||||
state = 0
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// Never reached (exit point is at end of EI).
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Parses all commands in content stream, returning a list of operation data.
|
// Parses all commands in content stream, returning a list of operation data.
|
||||||
func (this *ContentStreamParser) Parse() ([]*ContentStreamOperation, error) {
|
func (this *ContentStreamParser) Parse() ([]*ContentStreamOperation, error) {
|
||||||
operations := []*ContentStreamOperation{}
|
operations := []*ContentStreamOperation{}
|
||||||
@ -276,53 +75,6 @@ func (this *ContentStreamParser) Parse() ([]*ContentStreamOperation, error) {
|
|||||||
return operations, nil
|
return operations, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// Parses and extracts all text data in content streams and returns as a string.
|
|
||||||
// Does not take into account Encoding table, the output is simply the character codes.
|
|
||||||
func (this *ContentStreamParser) ExtractText() (string, error) {
|
|
||||||
operations, err := this.Parse()
|
|
||||||
if err != nil {
|
|
||||||
return "", err
|
|
||||||
}
|
|
||||||
inText := false
|
|
||||||
txt := ""
|
|
||||||
for _, op := range operations {
|
|
||||||
if op.Operand == "BT" {
|
|
||||||
inText = true
|
|
||||||
} else if op.Operand == "ET" {
|
|
||||||
inText = false
|
|
||||||
}
|
|
||||||
if op.Operand == "Td" || op.Operand == "TD" || op.Operand == "T*" {
|
|
||||||
// Move to next line...
|
|
||||||
txt += "\n"
|
|
||||||
}
|
|
||||||
if inText && op.Operand == "TJ" {
|
|
||||||
if len(op.Params) < 1 {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
paramList, ok := op.Params[0].(*PdfObjectArray)
|
|
||||||
if !ok {
|
|
||||||
return "", fmt.Errorf("Invalid parameter type, no array (%T)", op.Params[0])
|
|
||||||
}
|
|
||||||
for _, obj := range *paramList {
|
|
||||||
if strObj, ok := obj.(*PdfObjectString); ok {
|
|
||||||
txt += string(*strObj)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else if inText && op.Operand == "Tj" {
|
|
||||||
if len(op.Params) < 1 {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
param, ok := op.Params[0].(*PdfObjectString)
|
|
||||||
if !ok {
|
|
||||||
return "", fmt.Errorf("Invalid parameter type, not string (%T)", op.Params[0])
|
|
||||||
}
|
|
||||||
txt += string(*param)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return txt, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// Skip over any spaces. Returns the number of spaces skipped and
|
// Skip over any spaces. Returns the number of spaces skipped and
|
||||||
// an error if any.
|
// an error if any.
|
||||||
func (this *ContentStreamParser) skipSpaces() (int, error) {
|
func (this *ContentStreamParser) skipSpaces() (int, error) {
|
||||||
@ -446,7 +198,7 @@ func (this *ContentStreamParser) parseNumber() (PdfObject, error) {
|
|||||||
allowSigns := true
|
allowSigns := true
|
||||||
numStr := ""
|
numStr := ""
|
||||||
for {
|
for {
|
||||||
common.Log.Debug("Parsing number \"%s\"", numStr)
|
common.Log.Trace("Parsing number \"%s\"", numStr)
|
||||||
bb, err := this.reader.Peek(1)
|
bb, err := this.reader.Peek(1)
|
||||||
if err == io.EOF {
|
if err == io.EOF {
|
||||||
// GH: EOF handling. Handle EOF like end of line. Can happen with
|
// GH: EOF handling. Handle EOF like end of line. Can happen with
|
||||||
@ -529,7 +281,7 @@ func (this *ContentStreamParser) parseString() (PdfObjectString, error) {
|
|||||||
}
|
}
|
||||||
this.reader.Discard(len(numeric) - 1)
|
this.reader.Discard(len(numeric) - 1)
|
||||||
|
|
||||||
common.Log.Debug("Numeric string \"%s\"", numeric)
|
common.Log.Trace("Numeric string \"%s\"", numeric)
|
||||||
code, err := strconv.ParseUint(string(numeric), 8, 32)
|
code, err := strconv.ParseUint(string(numeric), 8, 32)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return PdfObjectString(bytes), err
|
return PdfObjectString(bytes), err
|
||||||
@ -668,7 +420,7 @@ func (this *ContentStreamParser) parseNull() (PdfObjectNull, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (this *ContentStreamParser) parseDict() (*PdfObjectDictionary, error) {
|
func (this *ContentStreamParser) parseDict() (*PdfObjectDictionary, error) {
|
||||||
common.Log.Debug("Reading content stream dict!")
|
common.Log.Trace("Reading content stream dict!")
|
||||||
|
|
||||||
dict := make(PdfObjectDictionary)
|
dict := make(PdfObjectDictionary)
|
||||||
|
|
||||||
@ -690,17 +442,17 @@ func (this *ContentStreamParser) parseDict() (*PdfObjectDictionary, error) {
|
|||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
common.Log.Debug("Dict peek: %s (% x)!", string(bb), string(bb))
|
common.Log.Trace("Dict peek: %s (% x)!", string(bb), string(bb))
|
||||||
if (bb[0] == '>') && (bb[1] == '>') {
|
if (bb[0] == '>') && (bb[1] == '>') {
|
||||||
common.Log.Debug("EOF dictionary")
|
common.Log.Trace("EOF dictionary")
|
||||||
this.reader.ReadByte()
|
this.reader.ReadByte()
|
||||||
this.reader.ReadByte()
|
this.reader.ReadByte()
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
common.Log.Debug("Parse the name!")
|
common.Log.Trace("Parse the name!")
|
||||||
|
|
||||||
keyName, err := this.parseName()
|
keyName, err := this.parseName()
|
||||||
common.Log.Debug("Key: %s", keyName)
|
common.Log.Trace("Key: %s", keyName)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
common.Log.Debug("ERROR Returning name err %s", err)
|
common.Log.Debug("ERROR Returning name err %s", err)
|
||||||
return nil, err
|
return nil, err
|
||||||
@ -710,8 +462,8 @@ func (this *ContentStreamParser) parseDict() (*PdfObjectDictionary, error) {
|
|||||||
// Some writers have a bug where the null is appended without
|
// Some writers have a bug where the null is appended without
|
||||||
// space. For example "/Boundsnull"
|
// space. For example "/Boundsnull"
|
||||||
newKey := keyName[0 : len(keyName)-4]
|
newKey := keyName[0 : len(keyName)-4]
|
||||||
common.Log.Debug("Taking care of null bug (%s)", keyName)
|
common.Log.Trace("Taking care of null bug (%s)", keyName)
|
||||||
common.Log.Debug("New key \"%s\" = null", newKey)
|
common.Log.Trace("New key \"%s\" = null", newKey)
|
||||||
this.skipSpaces()
|
this.skipSpaces()
|
||||||
bb, _ := this.reader.Peek(1)
|
bb, _ := this.reader.Peek(1)
|
||||||
if bb[0] == '/' {
|
if bb[0] == '/' {
|
||||||
@ -729,7 +481,7 @@ func (this *ContentStreamParser) parseDict() (*PdfObjectDictionary, error) {
|
|||||||
}
|
}
|
||||||
dict[keyName] = val
|
dict[keyName] = val
|
||||||
|
|
||||||
common.Log.Debug("dict[%s] = %s", keyName, val.String())
|
common.Log.Trace("dict[%s] = %s", keyName, val.String())
|
||||||
}
|
}
|
||||||
|
|
||||||
return &dict, nil
|
return &dict, nil
|
||||||
@ -772,40 +524,40 @@ func (this *ContentStreamParser) parseObject() (PdfObject, error, bool) {
|
|||||||
return nil, err, false
|
return nil, err, false
|
||||||
}
|
}
|
||||||
|
|
||||||
common.Log.Debug("Peek string: %s", string(bb))
|
common.Log.Trace("Peek string: %s", string(bb))
|
||||||
// Determine type.
|
// Determine type.
|
||||||
if bb[0] == '%' {
|
if bb[0] == '%' {
|
||||||
this.skipComments()
|
this.skipComments()
|
||||||
continue
|
continue
|
||||||
} else if bb[0] == '/' {
|
} else if bb[0] == '/' {
|
||||||
name, err := this.parseName()
|
name, err := this.parseName()
|
||||||
common.Log.Debug("->Name: '%s'", name)
|
common.Log.Trace("->Name: '%s'", name)
|
||||||
return &name, err, false
|
return &name, err, false
|
||||||
} else if bb[0] == '(' {
|
} else if bb[0] == '(' {
|
||||||
common.Log.Debug("->String!")
|
common.Log.Trace("->String!")
|
||||||
str, err := this.parseString()
|
str, err := this.parseString()
|
||||||
return &str, err, false
|
return &str, err, false
|
||||||
} else if bb[0] == '<' && bb[1] != '<' {
|
} else if bb[0] == '<' && bb[1] != '<' {
|
||||||
common.Log.Debug("->Hex String!")
|
common.Log.Trace("->Hex String!")
|
||||||
str, err := this.parseHexString()
|
str, err := this.parseHexString()
|
||||||
return &str, err, false
|
return &str, err, false
|
||||||
} else if bb[0] == '[' {
|
} else if bb[0] == '[' {
|
||||||
common.Log.Debug("->Array!")
|
common.Log.Trace("->Array!")
|
||||||
arr, err := this.parseArray()
|
arr, err := this.parseArray()
|
||||||
return &arr, err, false
|
return &arr, err, false
|
||||||
} else if IsDecimalDigit(bb[0]) || (bb[0] == '-' && IsDecimalDigit(bb[1])) {
|
} else if IsDecimalDigit(bb[0]) || (bb[0] == '-' && IsDecimalDigit(bb[1])) {
|
||||||
common.Log.Debug("->Number!")
|
common.Log.Trace("->Number!")
|
||||||
number, err := this.parseNumber()
|
number, err := this.parseNumber()
|
||||||
return number, err, false
|
return number, err, false
|
||||||
} else if bb[0] == '<' && bb[1] == '<' {
|
} else if bb[0] == '<' && bb[1] == '<' {
|
||||||
dict, err := this.parseDict()
|
dict, err := this.parseDict()
|
||||||
return dict, err, false
|
return dict, err, false
|
||||||
} else {
|
} else {
|
||||||
common.Log.Debug("->Operand or bool?")
|
common.Log.Trace("->Operand or bool?")
|
||||||
// Let's peek farther to find out.
|
// Let's peek farther to find out.
|
||||||
bb, _ = this.reader.Peek(5)
|
bb, _ = this.reader.Peek(5)
|
||||||
peekStr := string(bb)
|
peekStr := string(bb)
|
||||||
common.Log.Debug("Peek str: %s", peekStr)
|
common.Log.Trace("Peek str: %s", peekStr)
|
||||||
|
|
||||||
if (len(peekStr) > 3) && (peekStr[:4] == "null") {
|
if (len(peekStr) > 3) && (peekStr[:4] == "null") {
|
||||||
null, err := this.parseNull()
|
null, err := this.parseNull()
|
@ -20,7 +20,6 @@ func NewEncoderFromStream(streamObj *PdfObjectStream) (StreamEncoder, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// The filter should be a name or an array with a list of filter names.
|
// The filter should be a name or an array with a list of filter names.
|
||||||
// Currently only supporting a single filter.
|
|
||||||
method, ok := filterObj.(*PdfObjectName)
|
method, ok := filterObj.(*PdfObjectName)
|
||||||
if !ok {
|
if !ok {
|
||||||
array, ok := filterObj.(*PdfObjectArray)
|
array, ok := filterObj.(*PdfObjectArray)
|
||||||
|
@ -12,17 +12,8 @@ import (
|
|||||||
. "github.com/unidoc/unidoc/pdf/core"
|
. "github.com/unidoc/unidoc/pdf/core"
|
||||||
)
|
)
|
||||||
|
|
||||||
/*
|
// PdfAnnotation contains common attributes of an annotation. The context object contains the subannotation,
|
||||||
|
// which can be a markup annotation or other types.
|
||||||
markup annotations:
|
|
||||||
T, popup, ..., ExData
|
|
||||||
|
|
||||||
markup annotations are:
|
|
||||||
Text, FreeText, Line, Square, Circle, Polygon, PolyLine, Highlight, Underline, Squiggly, StrikeOut, Stamp, Caret, Ink, FileAttachment,
|
|
||||||
Sound, Redact
|
|
||||||
|
|
||||||
|
|
||||||
*/
|
|
||||||
type PdfAnnotation struct {
|
type PdfAnnotation struct {
|
||||||
context PdfModel // Sub-annotation.
|
context PdfModel // Sub-annotation.
|
||||||
Rect PdfObject
|
Rect PdfObject
|
||||||
|
@ -8,9 +8,164 @@ package model
|
|||||||
import (
|
import (
|
||||||
"errors"
|
"errors"
|
||||||
|
|
||||||
|
"github.com/unidoc/unidoc/common"
|
||||||
. "github.com/unidoc/unidoc/pdf/core"
|
. "github.com/unidoc/unidoc/pdf/core"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// XObjectForm (Table 95 in 8.10.2).
|
||||||
|
type XObjectForm struct {
|
||||||
|
Filter StreamEncoder
|
||||||
|
|
||||||
|
FormType PdfObject
|
||||||
|
BBox PdfObject
|
||||||
|
Matrix PdfObject
|
||||||
|
Resources PdfObject
|
||||||
|
Group PdfObject
|
||||||
|
Ref PdfObject
|
||||||
|
MetaData PdfObject
|
||||||
|
PieceInfo PdfObject
|
||||||
|
LastModified PdfObject
|
||||||
|
StructParent PdfObject
|
||||||
|
StructParents PdfObject
|
||||||
|
OPI PdfObject
|
||||||
|
OC PdfObject
|
||||||
|
Name PdfObject
|
||||||
|
// Stream data.
|
||||||
|
Stream []byte
|
||||||
|
// Primitive
|
||||||
|
primitive *PdfObjectStream
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create a brand new XObject Form. Creates a new underlying PDF object stream primitive.
|
||||||
|
func NewXObjectForm() *XObjectForm {
|
||||||
|
xobj := &XObjectForm{}
|
||||||
|
stream := &PdfObjectStream{}
|
||||||
|
stream.PdfObjectDictionary = &PdfObjectDictionary{}
|
||||||
|
xobj.primitive = stream
|
||||||
|
return xobj
|
||||||
|
}
|
||||||
|
|
||||||
|
// Build the Form XObject from a stream object.
|
||||||
|
// XXX: Should this be exposed? Consider different access points.
|
||||||
|
func NewXObjectFormFromStream(stream *PdfObjectStream) (*XObjectForm, error) {
|
||||||
|
form := &XObjectForm{}
|
||||||
|
form.primitive = stream
|
||||||
|
|
||||||
|
dict := *(stream.PdfObjectDictionary)
|
||||||
|
|
||||||
|
encoder, err := NewEncoderFromStream(stream)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
form.Filter = encoder
|
||||||
|
|
||||||
|
if obj, isDefined := dict["Subtype"]; isDefined {
|
||||||
|
name, ok := obj.(*PdfObjectName)
|
||||||
|
if !ok {
|
||||||
|
return nil, errors.New("Type error")
|
||||||
|
}
|
||||||
|
if *name != "Form" {
|
||||||
|
common.Log.Debug("Invalid form subtype")
|
||||||
|
return nil, errors.New("Invalid form subtype")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if obj, isDefined := dict["FormType"]; isDefined {
|
||||||
|
form.FormType = obj
|
||||||
|
}
|
||||||
|
if obj, isDefined := dict["BBox"]; isDefined {
|
||||||
|
form.BBox = obj
|
||||||
|
}
|
||||||
|
if obj, isDefined := dict["Matrix"]; isDefined {
|
||||||
|
form.Matrix = obj
|
||||||
|
}
|
||||||
|
if obj, isDefined := dict["Resources"]; isDefined {
|
||||||
|
form.Resources = obj
|
||||||
|
}
|
||||||
|
if obj, isDefined := dict["Group"]; isDefined {
|
||||||
|
form.Group = obj
|
||||||
|
}
|
||||||
|
if obj, isDefined := dict["Ref"]; isDefined {
|
||||||
|
form.Ref = obj
|
||||||
|
}
|
||||||
|
if obj, isDefined := dict["MetaData"]; isDefined {
|
||||||
|
form.MetaData = obj
|
||||||
|
}
|
||||||
|
if obj, isDefined := dict["PieceInfo"]; isDefined {
|
||||||
|
form.PieceInfo = obj
|
||||||
|
}
|
||||||
|
if obj, isDefined := dict["LastModified"]; isDefined {
|
||||||
|
form.LastModified = obj
|
||||||
|
}
|
||||||
|
if obj, isDefined := dict["StructParent"]; isDefined {
|
||||||
|
form.StructParent = obj
|
||||||
|
}
|
||||||
|
if obj, isDefined := dict["StructParents"]; isDefined {
|
||||||
|
form.StructParents = obj
|
||||||
|
}
|
||||||
|
if obj, isDefined := dict["OPI"]; isDefined {
|
||||||
|
form.OPI = obj
|
||||||
|
}
|
||||||
|
if obj, isDefined := dict["OC"]; isDefined {
|
||||||
|
form.OC = obj
|
||||||
|
}
|
||||||
|
if obj, isDefined := dict["Name"]; isDefined {
|
||||||
|
form.Name = obj
|
||||||
|
}
|
||||||
|
|
||||||
|
form.Stream = stream.Stream
|
||||||
|
|
||||||
|
return form, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (xform *XObjectForm) GetContainingPdfObject() PdfObject {
|
||||||
|
return xform.primitive
|
||||||
|
}
|
||||||
|
|
||||||
|
func (xform *XObjectForm) GetContentStream() ([]byte, error) {
|
||||||
|
decoded, err := DecodeStream(xform.primitive)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
return decoded, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Return a stream object.
|
||||||
|
func (xform *XObjectForm) ToPdfObject() PdfObject {
|
||||||
|
stream := xform.primitive
|
||||||
|
|
||||||
|
dict := stream.PdfObjectDictionary
|
||||||
|
if xform.Filter != nil {
|
||||||
|
// Pre-populate the stream dictionary with the
|
||||||
|
// encoding related fields.
|
||||||
|
dict = xform.Filter.MakeStreamDict()
|
||||||
|
stream.PdfObjectDictionary = dict
|
||||||
|
}
|
||||||
|
dict.Set("Type", MakeName("XObject"))
|
||||||
|
dict.Set("Subtype", MakeName("Form"))
|
||||||
|
|
||||||
|
dict.SetIfNotNil("FormType", xform.FormType)
|
||||||
|
dict.SetIfNotNil("BBox", xform.BBox)
|
||||||
|
dict.SetIfNotNil("Matrix", xform.Matrix)
|
||||||
|
dict.SetIfNotNil("Resources", xform.Resources)
|
||||||
|
dict.SetIfNotNil("Group", xform.Group)
|
||||||
|
dict.SetIfNotNil("Ref", xform.Ref)
|
||||||
|
dict.SetIfNotNil("MetaData", xform.MetaData)
|
||||||
|
dict.SetIfNotNil("PieceInfo", xform.PieceInfo)
|
||||||
|
dict.SetIfNotNil("LastModified", xform.LastModified)
|
||||||
|
dict.SetIfNotNil("StructParent", xform.StructParent)
|
||||||
|
dict.SetIfNotNil("StructParents", xform.StructParents)
|
||||||
|
dict.SetIfNotNil("OPI", xform.OPI)
|
||||||
|
dict.SetIfNotNil("OC", xform.OC)
|
||||||
|
dict.SetIfNotNil("Name", xform.Name)
|
||||||
|
|
||||||
|
dict.Set("Length", MakeInteger(int64(len(xform.Stream))))
|
||||||
|
stream.Stream = xform.Stream
|
||||||
|
|
||||||
|
return stream
|
||||||
|
}
|
||||||
|
|
||||||
// XObjectImage (Table 89 in 8.9.5.1).
|
// XObjectImage (Table 89 in 8.9.5.1).
|
||||||
// Implements PdfModel interface.
|
// Implements PdfModel interface.
|
||||||
type XObjectImage struct {
|
type XObjectImage struct {
|
||||||
@ -19,8 +174,8 @@ type XObjectImage struct {
|
|||||||
Height *int64
|
Height *int64
|
||||||
ColorSpace PdfColorspace
|
ColorSpace PdfColorspace
|
||||||
BitsPerComponent *int64
|
BitsPerComponent *int64
|
||||||
//Filter *PdfObjectName
|
Filter StreamEncoder
|
||||||
Filter StreamEncoder
|
|
||||||
Intent PdfObject
|
Intent PdfObject
|
||||||
ImageMask PdfObject
|
ImageMask PdfObject
|
||||||
Mask PdfObject
|
Mask PdfObject
|
||||||
|
Loading…
x
Reference in New Issue
Block a user