Refactored contentstream into a separate package.

2025-05-02 22:17:06 +08:00 · 2017-03-01 16:02:53 +00:00 · 2017-03-01 16:02:53 +00:00 · 4aa6845e27
commit 4aa6845e27
parent 9247f5d954
8 changed files with 947 additions and 281 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,3 +1,4 @@
+.idea
 *.mdb
 *.userprefs
 *.pidb
--- a/pdf/contentstream/contentstream.go
+++ b/pdf/contentstream/contentstream.go
@ -0,0 +1,68 @@
+/*
+ * This file is subject to the terms and conditions defined in
+ * file 'LICENSE.md', which is part of this source code package.
+ */
+
+// The content stream parser provides functionality to parse the content stream into a list of
+// operations that can then be processed further for rendering or extraction of information.
+// The contentstream package uses the core and model packages.
+
+package contentstream
+
+import (
+	"fmt"
+
+	. "github.com/unidoc/unidoc/pdf/core"
+)
+
+type ContentStreamOperation struct {
+	Params  []PdfObject
+	Operand string
+}
+
+// Parses and extracts all text data in content streams and returns as a string.
+// Does not take into account Encoding table, the output is simply the character codes.
+func (this *ContentStreamParser) ExtractText() (string, error) {
+	operations, err := this.Parse()
+	if err != nil {
+		return "", err
+	}
+	inText := false
+	txt := ""
+	for _, op := range operations {
+		if op.Operand == "BT" {
+			inText = true
+		} else if op.Operand == "ET" {
+			inText = false
+		}
+		if op.Operand == "Td" || op.Operand == "TD" || op.Operand == "T*" {
+			// Move to next line...
+			txt += "\n"
+		}
+		if inText && op.Operand == "TJ" {
+			if len(op.Params) < 1 {
+				continue
+			}
+			paramList, ok := op.Params[0].(*PdfObjectArray)
+			if !ok {
+				return "", fmt.Errorf("Invalid parameter type, no array (%T)", op.Params[0])
+			}
+			for _, obj := range *paramList {
+				if strObj, ok := obj.(*PdfObjectString); ok {
+					txt += string(*strObj)
+				}
+			}
+		} else if inText && op.Operand == "Tj" {
+			if len(op.Params) < 1 {
+				continue
+			}
+			param, ok := op.Params[0].(*PdfObjectString)
+			if !ok {
+				return "", fmt.Errorf("Invalid parameter type, not string (%T)", op.Params[0])
+			}
+			txt += string(*param)
+		}
+	}
+
+	return txt, nil
+}
--- a/pdf/contentstream/encoding.go
+++ b/pdf/contentstream/encoding.go
@ -0,0 +1,386 @@
+/*
+ * This file is subject to the terms and conditions defined in
+ * file 'LICENSE.md', which is part of this source code package.
+ */
+
+package contentstream
+
+import (
+	"bytes"
+	"errors"
+	"fmt"
+	gocolor "image/color"
+	"image/jpeg"
+
+	"github.com/unidoc/unidoc/common"
+	. "github.com/unidoc/unidoc/pdf/core"
+)
+
+// Creates the encoder for the inline image's Filter and DecodeParms.
+func NewEncoderFromInlineImage(inlineImage *ContentStreamInlineImage) (StreamEncoder, error) {
+	if inlineImage.Filter == nil {
+		// No filter, return raw data back.
+		return NewRawEncoder(), nil
+	}
+
+	// The filter should be a name or an array with a list of filter names.
+	filterName, ok := inlineImage.Filter.(*PdfObjectName)
+	if !ok {
+		array, ok := inlineImage.Filter.(*PdfObjectArray)
+		if !ok {
+			return nil, fmt.Errorf("Filter not a Name or Array object")
+		}
+		if len(*array) == 0 {
+			// Empty array -> indicates raw filter (no filter).
+			return NewRawEncoder(), nil
+		}
+
+		if len(*array) != 1 {
+			menc, err := newMultiEncoderFromInlineImage(inlineImage)
+			if err != nil {
+				common.Log.Error("Failed creating multi encoder: %v", err)
+				return nil, err
+			}
+
+			common.Log.Trace("Multi enc: %s\n", menc)
+			return menc, nil
+		}
+
+		// Single element.
+		filterObj := (*array)[0]
+		filterName, ok = filterObj.(*PdfObjectName)
+		if !ok {
+			return nil, fmt.Errorf("Filter array member not a Name object")
+		}
+	}
+
+	if *filterName == "AHx" {
+		return NewASCIIHexEncoder(), nil
+	} else if *filterName == "A85" {
+		return NewASCII85Encoder(), nil
+	} else if *filterName == "DCT" {
+		return newDCTEncoderFromInlineImage(inlineImage)
+	} else if *filterName == "Fl" {
+		return newFlateEncoderFromInlineImage(inlineImage, nil)
+	} else if *filterName == "LZW" {
+		return newLZWEncoderFromInlineImage(inlineImage, nil)
+	} else {
+		common.Log.Debug("Unsupported inline image encoding filter name : %s", *filterName)
+		return nil, errors.New("Unsupported inline encoding method")
+	}
+}
+
+// Create a new flate encoder/decoder for an inline image object.  The encoding parameters are
+// taken from decodeParams when provided (usually only when a multi filter is used), and
+// otherwise from the DecodeParms entry of the inline image itself, if present.
+func newFlateEncoderFromInlineImage(inlineImage *ContentStreamInlineImage, decodeParams *PdfObjectDictionary) (*FlateEncoder, error) {
+	encoder := NewFlateEncoder()
+
+	// If decodeParams not provided, see if we can get from the stream.
+	if decodeParams == nil {
+		obj := inlineImage.DecodeParms
+		if obj != nil {
+			dp, isDict := obj.(*PdfObjectDictionary)
+			if !isDict {
+				common.Log.Debug("Error: DecodeParms not a dictionary (%T)", obj)
+				return nil, fmt.Errorf("Invalid DecodeParms")
+			}
+			decodeParams = dp
+		}
+	}
+	if decodeParams == nil {
+		// Can safely return here if no decode params, as the following depend on the decode params.
+		return encoder, nil
+	}
+
+	common.Log.Trace("decode params: %s", decodeParams.String())
+	obj, has := (*decodeParams)["Predictor"]
+	if !has {
+		common.Log.Debug("Error: Predictor missing from DecodeParms - Continue with default (1)")
+	} else {
+		predictor, ok := obj.(*PdfObjectInteger)
+		if !ok {
+			common.Log.Debug("Error: Predictor specified but not numeric (%T)", obj)
+			return nil, fmt.Errorf("Invalid Predictor")
+		}
+		encoder.Predictor = int(*predictor)
+	}
+
+	// Bits per component.  Use default if not specified (8).
+	obj, has = (*decodeParams)["BitsPerComponent"]
+	if has {
+		bpc, ok := obj.(*PdfObjectInteger)
+		if !ok {
+			common.Log.Debug("ERROR: Invalid BitsPerComponent")
+			return nil, fmt.Errorf("Invalid BitsPerComponent")
+		}
+		encoder.BitsPerComponent = int(*bpc)
+	}
+
+	if encoder.Predictor > 1 {
+		// Columns.
+		encoder.Columns = 1
+		obj, has = (*decodeParams)["Columns"]
+		if has {
+			columns, ok := obj.(*PdfObjectInteger)
+			if !ok {
+				return nil, fmt.Errorf("Predictor column invalid")
+			}
+
+			encoder.Columns = int(*columns)
+		}
+
+		// Colors.
+		// Number of interleaved color components per sample (Default 1 if not specified)
+		encoder.Colors = 1
+		obj, has = (*decodeParams)["Colors"]
+		if has {
+			colors, ok := obj.(*PdfObjectInteger)
+			if !ok {
+				return nil, fmt.Errorf("Predictor colors not an integer")
+			}
+			encoder.Colors = int(*colors)
+		}
+	}
+
+	return encoder, nil
+}
+
+// Create a new LZW encoder/decoder based on an inline image object, getting all the encoding parameters
+// from the DecodeParms stream object dictionary entry.
+func newLZWEncoderFromInlineImage(inlineImage *ContentStreamInlineImage, decodeParams *PdfObjectDictionary) (*LZWEncoder, error) {
+	// Start with default settings.
+	encoder := NewLZWEncoder()
+
+	// If decodeParams not provided, see if we can get from the inline image directly.
+	if decodeParams == nil {
+		if inlineImage.DecodeParms != nil {
+			dp, isDict := inlineImage.DecodeParms.(*PdfObjectDictionary)
+			if !isDict {
+				common.Log.Debug("Error: DecodeParms not a dictionary (%T)", inlineImage.DecodeParms)
+				return nil, fmt.Errorf("Invalid DecodeParms")
+			}
+			decodeParams = dp
+		}
+	}
+
+	if decodeParams == nil {
+		// No decode parameters. Can safely return here if not set as the following options
+		// are related to the decode Params.
+		return encoder, nil
+	}
+
+	// The EarlyChange indicates when to increase code length, as different
+	// implementations use different mechanisms. Essentially this chooses
+	// which LZW implementation to use.
+	// The default is 1 (one code early)
+	//
+	// The EarlyChange parameter is specified in the object stream dictionary for regular streams,
+	// but it is not specified explicitly where to check for it in the case of inline images.
+	// We will check in the decodeParms for now, we can adjust later if we come across cases of this.
+	obj, has := (*decodeParams)["EarlyChange"]
+	if has {
+		earlyChange, ok := obj.(*PdfObjectInteger)
+		if !ok {
+			common.Log.Debug("Error: EarlyChange specified but not numeric (%T)", obj)
+			return nil, fmt.Errorf("Invalid EarlyChange")
+		}
+		if *earlyChange != 0 && *earlyChange != 1 {
+			return nil, fmt.Errorf("Invalid EarlyChange value (not 0 or 1)")
+		}
+
+		encoder.EarlyChange = int(*earlyChange)
+	} else {
+		encoder.EarlyChange = 1 // default
+	}
+
+	obj, has = (*decodeParams)["Predictor"]
+	if has {
+		predictor, ok := obj.(*PdfObjectInteger)
+		if !ok {
+			common.Log.Debug("Error: Predictor specified but not numeric (%T)", obj)
+			return nil, fmt.Errorf("Invalid Predictor")
+		}
+		encoder.Predictor = int(*predictor)
+	}
+
+	// Bits per component.  Use default if not specified (8).
+	obj, has = (*decodeParams)["BitsPerComponent"]
+	if has {
+		bpc, ok := obj.(*PdfObjectInteger)
+		if !ok {
+			common.Log.Debug("ERROR: Invalid BitsPerComponent")
+			return nil, fmt.Errorf("Invalid BitsPerComponent")
+		}
+		encoder.BitsPerComponent = int(*bpc)
+	}
+
+	if encoder.Predictor > 1 {
+		// Columns.
+		encoder.Columns = 1
+		obj, has = (*decodeParams)["Columns"]
+		if has {
+			columns, ok := obj.(*PdfObjectInteger)
+			if !ok {
+				return nil, fmt.Errorf("Predictor column invalid")
+			}
+
+			encoder.Columns = int(*columns)
+		}
+
+		// Colors.
+		// Number of interleaved color components per sample (Default 1 if not specified)
+		encoder.Colors = 1
+		obj, has = (*decodeParams)["Colors"]
+		if has {
+			colors, ok := obj.(*PdfObjectInteger)
+			if !ok {
+				return nil, fmt.Errorf("Predictor colors not an integer")
+			}
+			encoder.Colors = int(*colors)
+		}
+	}
+
+	common.Log.Trace("decode params: %s", decodeParams.String())
+	return encoder, nil
+}
+
+// Create a new DCT encoder/decoder based on an inline image, getting all the encoding parameters
+// from the stream object dictionary entry and the image data itself.
+func newDCTEncoderFromInlineImage(inlineImage *ContentStreamInlineImage) (*DCTEncoder, error) {
+	// Start with default settings.
+	encoder := NewDCTEncoder()
+
+	bufReader := bytes.NewReader(inlineImage.stream)
+
+	cfg, err := jpeg.DecodeConfig(bufReader)
+	//img, _, err := goimage.Decode(bufReader)
+	if err != nil {
+		common.Log.Debug("Error decoding file: %s", err)
+		return nil, err
+	}
+
+	switch cfg.ColorModel {
+	case gocolor.RGBAModel:
+		encoder.BitsPerComponent = 8
+		encoder.ColorComponents = 3 // alpha is not included in pdf.
+	case gocolor.RGBA64Model:
+		encoder.BitsPerComponent = 16
+		encoder.ColorComponents = 3
+	case gocolor.GrayModel:
+		encoder.BitsPerComponent = 8
+		encoder.ColorComponents = 1
+	case gocolor.Gray16Model:
+		encoder.BitsPerComponent = 16
+		encoder.ColorComponents = 1
+	case gocolor.CMYKModel:
+		encoder.BitsPerComponent = 8
+		encoder.ColorComponents = 4
+	case gocolor.YCbCrModel:
+		// YCbCr is not supported by PDF, but it could be a different colorspace
+		// with 3 components.  Would be specified by the ColorSpace entry.
+		encoder.BitsPerComponent = 8
+		encoder.ColorComponents = 3
+	default:
+		return nil, errors.New("Unsupported color model")
+	}
+	encoder.Width = cfg.Width
+	encoder.Height = cfg.Height
+	common.Log.Trace("DCT Encoder: %+v", encoder)
+
+	return encoder, nil
+}
+
+// Create a new multi-filter encoder/decoder based on an inline image, getting all the encoding parameters
+// from the filter specification and the DecodeParms (DP) dictionaries.
+func newMultiEncoderFromInlineImage(inlineImage *ContentStreamInlineImage) (*MultiEncoder, error) {
+	mencoder := NewMultiEncoder()
+
+	// Prepare the decode params array (one for each filter type)
+	// Optional, not always present.
+	var decodeParamsDict *PdfObjectDictionary
+	decodeParamsArray := []PdfObject{}
+	if obj := inlineImage.DecodeParms; obj != nil {
+		// If it is a dictionary, assume it applies to all
+		dict, isDict := obj.(*PdfObjectDictionary)
+		if isDict {
+			decodeParamsDict = dict
+		}
+
+		// If it is an array, assume there is one for each
+		arr, isArray := obj.(*PdfObjectArray)
+		if isArray {
+			for _, dictObj := range *arr {
+				if dict, is := dictObj.(*PdfObjectDictionary); is {
+					decodeParamsArray = append(decodeParamsArray, dict)
+				} else {
+					decodeParamsArray = append(decodeParamsArray, nil)
+				}
+			}
+		}
+	}
+
+	obj := inlineImage.Filter
+	if obj == nil {
+		return nil, fmt.Errorf("Filter missing")
+	}
+
+	array, ok := obj.(*PdfObjectArray)
+	if !ok {
+		return nil, fmt.Errorf("Multi filter can only be made from array")
+	}
+
+	for idx, obj := range *array {
+		name, ok := obj.(*PdfObjectName)
+		if !ok {
+			return nil, fmt.Errorf("Multi filter array element not a name")
+		}
+
+		var dp PdfObject
+
+		// If decode params dict is set, use it.  Otherwise take from array..
+		if decodeParamsDict != nil {
+			dp = decodeParamsDict
+		} else {
+			// Only get the dp if provided.  Oftentimes there is no decode params dict
+			// provided.
+			if len(decodeParamsArray) > 0 {
+				if idx >= len(decodeParamsArray) {
+					return nil, fmt.Errorf("Missing elements in decode params array")
+				}
+				dp = decodeParamsArray[idx]
+			}
+		}
+
+		var dParams *PdfObjectDictionary
+		if dict, is := dp.(*PdfObjectDictionary); is {
+			dParams = dict
+		}
+
+		if *name == StreamEncodingFilterNameFlate {
+			// XXX: need to separate out the DecodeParms..
+			encoder, err := newFlateEncoderFromInlineImage(inlineImage, dParams)
+			if err != nil {
+				return nil, err
+			}
+			mencoder.AddEncoder(encoder)
+		} else if *name == StreamEncodingFilterNameLZW {
+			encoder, err := newLZWEncoderFromInlineImage(inlineImage, dParams)
+			if err != nil {
+				return nil, err
+			}
+			mencoder.AddEncoder(encoder)
+		} else if *name == StreamEncodingFilterNameASCIIHex {
+			encoder := NewASCIIHexEncoder()
+			mencoder.AddEncoder(encoder)
+		} else if *name == StreamEncodingFilterNameASCII85 {
+			encoder := NewASCII85Encoder()
+			mencoder.AddEncoder(encoder)
+		} else {
+			common.Log.Error("Unsupported filter %s", *name)
+			return nil, fmt.Errorf("Invalid filter in multi filter array")
+		}
+	}
+
+	return mencoder, nil
+}
--- a/pdf/contentstream/inline-image.go
+++ b/pdf/contentstream/inline-image.go
@ -0,0 +1,314 @@
+/*
+ * This file is subject to the terms and conditions defined in
+ * file 'LICENSE.md', which is part of this source code package.
+ */
+
+package contentstream
+
+import (
+	"bytes"
+	"errors"
+	"fmt"
+
+	"github.com/unidoc/unidoc/common"
+	. "github.com/unidoc/unidoc/pdf/core"
+	. "github.com/unidoc/unidoc/pdf/model"
+)
+
+// A representation of an inline image in a Content stream. Everything between the BI and EI operands.
+// ContentStreamInlineImage implements the PdfObject interface although strictly it is not a PDF object.
+type ContentStreamInlineImage struct {
+	BitsPerComponent PdfObject
+	ColorSpace       PdfObject
+	Decode           PdfObject
+	DecodeParms      PdfObject
+	Filter           PdfObject
+	Height           PdfObject
+	ImageMask        PdfObject
+	Intent           PdfObject
+	Interpolate      PdfObject
+	Width            PdfObject
+	stream           []byte
+}
+
+func (this *ContentStreamInlineImage) String() string {
+	s := fmt.Sprintf("InlineImage(len=%d)\n", len(this.stream))
+	if this.BitsPerComponent != nil {
+		s += "- BPC " + this.BitsPerComponent.DefaultWriteString() + "\n"
+	}
+	if this.ColorSpace != nil {
+		s += "- CS " + this.ColorSpace.DefaultWriteString() + "\n"
+	}
+	if this.Decode != nil {
+		s += "- D " + this.Decode.DefaultWriteString() + "\n"
+	}
+	if this.DecodeParms != nil {
+		s += "- DP " + this.DecodeParms.DefaultWriteString() + "\n"
+	}
+	if this.Filter != nil {
+		s += "- F " + this.Filter.DefaultWriteString() + "\n"
+	}
+	if this.Height != nil {
+		s += "- H " + this.Height.DefaultWriteString() + "\n"
+	}
+	if this.ImageMask != nil {
+		s += "- IM " + this.ImageMask.DefaultWriteString() + "\n"
+	}
+	if this.Intent != nil {
+		s += "- Intent " + this.Intent.DefaultWriteString() + "\n"
+	}
+	if this.Interpolate != nil {
+		s += "- I " + this.Interpolate.DefaultWriteString() + "\n"
+	}
+	if this.Width != nil {
+		s += "- W " + this.Width.DefaultWriteString() + "\n"
+	}
+	return s
+}
+
+// DefaultWriteString returns the inline image in content stream form: one
+// "<key> <value>" line for each parameter that is set, followed by "ID " and
+// the raw image data.  The leading "BI" is not included, as that is the
+// operand and is written out separately.
+//
+// Parameters that are nil (not specified for the image) are skipped, in the
+// same way as String() does, so that a partially defined inline image can be
+// written out without dereferencing a nil PdfObject.
+func (this *ContentStreamInlineImage) DefaultWriteString() string {
+	var output bytes.Buffer
+
+	// Writes a single "key value" parameter line; unset parameters are omitted.
+	writeParam := func(key string, obj PdfObject) {
+		if obj == nil {
+			return
+		}
+		output.WriteString(key + " " + obj.DefaultWriteString() + "\n")
+	}
+
+	// We do not start with "BI" as that is the operand and is written out separately.
+	// Write out the parameters in a fixed order.
+	writeParam("BPC", this.BitsPerComponent)
+	writeParam("CS", this.ColorSpace)
+	writeParam("D", this.Decode)
+	writeParam("DP", this.DecodeParms)
+	writeParam("F", this.Filter)
+	writeParam("H", this.Height)
+	writeParam("IM", this.ImageMask)
+	writeParam("Intent", this.Intent)
+	writeParam("I", this.Interpolate)
+	writeParam("W", this.Width)
+
+	output.WriteString("ID ")
+	output.Write(this.stream)
+
+	return output.String()
+}
+
+func (this *ContentStreamInlineImage) GetColorSpace() (PdfColorspace, error) {
+	if this.ColorSpace == nil {
+		return nil, nil
+	}
+
+	// Can also refer to a name in the PDF page resources...
+
+	name, ok := this.ColorSpace.(*PdfObjectName)
+	if !ok {
+		common.Log.Debug("Error: Invalid object type")
+		return nil, errors.New("Invalid type")
+	}
+
+	if *name == "G" {
+		return NewPdfColorspaceDeviceGray(), nil
+	} else if *name == "RGB" {
+		return NewPdfColorspaceDeviceRGB(), nil
+	} else if *name == "CMYK" {
+		return NewPdfColorspaceDeviceCMYK(), nil
+		//} else if *name == "I" {
+		//	cs := NewPdfColorspaceSpecialIndexed()
+		//	return cs, nil
+	} else {
+		common.Log.Debug("Error, unsupported inline image colorspace: %s", *name)
+		return nil, errors.New("Invalid parameter")
+	}
+
+}
+
+// Export the inline image to Image which can be transformed or exported easily.
+func (this *ContentStreamInlineImage) ToImage() (*Image, error) {
+	// Decode the imaging data if encoded.
+	encoder, err := NewEncoderFromInlineImage(this)
+	if err != nil {
+		return nil, err
+	}
+	common.Log.Trace("encoder: %+v %T", encoder, encoder)
+
+	decoded, err := encoder.DecodeBytes(this.stream)
+	if err != nil {
+		return nil, err
+	}
+
+	image := &Image{}
+
+	// Height.
+	if this.Height == nil {
+		return nil, errors.New("Height attribute missing")
+	}
+	height, ok := this.Height.(*PdfObjectInteger)
+	if !ok {
+		return nil, errors.New("Invalid height")
+	}
+	image.Height = int64(*height)
+
+	// Width.
+	if this.Width == nil {
+		return nil, errors.New("Width attribute missing")
+	}
+	width, ok := this.Width.(*PdfObjectInteger)
+	if !ok {
+		return nil, errors.New("Invalid width")
+	}
+	image.Width = int64(*width)
+
+	// BPC.
+	if this.BitsPerComponent == nil {
+		common.Log.Debug("Inline Bits per component missing - assuming 8")
+		image.BitsPerComponent = 8
+	} else {
+		bpc, ok := this.BitsPerComponent.(*PdfObjectInteger)
+		if !ok {
+			common.Log.Debug("Error invalid bits per component value, type %T", this.BitsPerComponent)
+			return nil, errors.New("BPC Type error")
+		}
+		image.BitsPerComponent = int64(*bpc)
+	}
+
+	image.Data = decoded
+
+	return image, nil
+}
+
+// Parse an inline image from a content stream, both read its properties and binary data.
+// When called, "BI" has already been read from the stream.  This function
+// finishes reading through "EI" and then returns the ContentStreamInlineImage.
+//
+// The image data following "ID" carries no explicit length, so it is read until the
+// sequence "<ws>EI<ws>" is found, where <ws> is any whitespace byte.  The whitespace
+// byte directly preceding "EI" is treated as a delimiter and is not part of the data.
+//
+// Returns an error if a property key is not a name, a property is unknown, an operand
+// is found where a value is expected, or the input ends before the closing "EI".
+func (this *ContentStreamParser) ParseInlineImage() (*ContentStreamInlineImage, error) {
+	// Reading parameters.
+	im := ContentStreamInlineImage{}
+
+	for {
+		this.skipSpaces()
+		obj, err, isOperand := this.parseObject()
+		if err != nil {
+			return nil, err
+		}
+
+		if !isOperand {
+			// Not an operand. Read key value properties.
+			param, ok := obj.(*PdfObjectName)
+			if !ok {
+				return nil, fmt.Errorf("Invalid inline image property (expecting name) - %T", obj)
+			}
+
+			valueObj, err, isOperand := this.parseObject()
+			if err != nil {
+				return nil, err
+			}
+			if isOperand {
+				return nil, fmt.Errorf("Not expecting an operand")
+			}
+
+			if *param == "BPC" {
+				im.BitsPerComponent = valueObj
+			} else if *param == "CS" {
+				im.ColorSpace = valueObj
+			} else if *param == "D" {
+				im.Decode = valueObj
+			} else if *param == "DP" {
+				im.DecodeParms = valueObj
+			} else if *param == "F" {
+				im.Filter = valueObj
+			} else if *param == "H" {
+				im.Height = valueObj
+			} else if *param == "IM" {
+				im.ImageMask = valueObj
+			} else if *param == "Intent" {
+				im.Intent = valueObj
+			} else if *param == "I" {
+				im.Interpolate = valueObj
+			} else if *param == "W" {
+				im.Width = valueObj
+			} else {
+				return nil, fmt.Errorf("Unknown inline image parameter %s", *param)
+			}
+		}
+
+		if isOperand {
+			operand, ok := obj.(*PdfObjectString)
+			if !ok {
+				return nil, fmt.Errorf("Failed to read inline image - invalid operand")
+			}
+
+			if *operand == "EI" {
+				// Image fully defined
+				common.Log.Debug("Inline image finished...")
+				return &im, nil
+			} else if *operand == "ID" {
+				// Inline image data.
+				// Should get a single space (0x20) followed by the data and then EI.
+				common.Log.Debug("ID start")
+
+				// Skip the space if it's there.
+				b, err := this.reader.Peek(1)
+				if err != nil {
+					return nil, err
+				}
+				if IsWhiteSpace(b[0]) {
+					this.reader.Discard(1)
+				}
+
+				// Unfortunately there is no good way to know how many bytes to read since it
+				// depends on the Filter and encoding etc.
+				// Therefore we will simply read until we find "<ws>EI<ws>" where <ws> is whitespace
+				// although of course that could be a part of the data (even if unlikely).
+				im.stream = []byte{}
+
+				// pending holds the bytes that currently match a prefix of "<ws>EI" and are
+				// not yet known to be either the end marker or image data.
+				// state is the number of bytes of "<ws>EI" matched so far (0-3).
+				var pending []byte
+				state := 0
+				for {
+					c, err := this.reader.ReadByte()
+					if err != nil {
+						common.Log.Debug("Unable to find end of image EI in inline image data")
+						return nil, err
+					}
+
+					switch state {
+					case 0:
+						if IsWhiteSpace(c) {
+							pending = append(pending[:0], c)
+							state = 1
+						} else {
+							im.stream = append(im.stream, c)
+						}
+						continue
+					case 1:
+						if c == 'E' {
+							pending = append(pending, c)
+							state = 2
+							continue
+						}
+					case 2:
+						if c == 'I' {
+							pending = append(pending, c)
+							state = 3
+							continue
+						}
+					case 3:
+						if IsWhiteSpace(c) {
+							// image data finished.
+							common.Log.Debug("Image stream (%d): % x", len(im.stream), im.stream)
+							// Exit point.
+							return &im, nil
+						}
+					}
+
+					// Mismatch: the pending bytes were part of the data after all.  Flush
+					// them exactly once and clear the buffer, so that they cannot be
+					// appended to the stream again on a later mismatch.
+					im.stream = append(im.stream, pending...)
+					pending = pending[:0]
+
+					// The mismatching byte may itself be the whitespace that precedes the
+					// real "EI", so keep it pending rather than falling back to state 0.
+					if IsWhiteSpace(c) {
+						pending = append(pending, c)
+						state = 1
+					} else {
+						im.stream = append(im.stream, c)
+						state = 0
+					}
+				}
+				// Never reached (exit point is at end of EI).
+			}
+		}
+	}
+}
--- a/pdf/contentstream/parser.go
+++ b/pdf/contentstream/parser.go
@ -3,10 +3,7 @@
 * file 'LICENSE.md', which is part of this source code package.
 */

-// The content stream parser provides functionality to parse the content stream into a list of
-// operands that can then be processed further for rendering or extraction of information.
-
-package model
+package contentstream

 import (
 	"bufio"
@ -26,11 +23,6 @@ type ContentStreamParser struct {
 	reader *bufio.Reader
 }

-type ContentStreamOperation struct {
-	Params  []PdfObject
-	Operand string
-}
-
 // Create a new instance of the content stream parser from an input content
 // stream string.
 func NewContentStreamParser(contentStr string) *ContentStreamParser {
@ -44,199 +36,6 @@ func NewContentStreamParser(contentStr string) *ContentStreamParser {
 	return &parser
 }

-// A representation of an inline image in a Content stream.
-// Everything between the BI and EI operands.
-// ContentStreamInlineImage implements the PdfObject interface
-// although strictly it is not a PDF object.
-type ContentStreamInlineImage struct {
-	BitsPerComponent PdfObject
-	ColorSpace       PdfObject
-	Decode           PdfObject
-	DecodeParms      PdfObject
-	Filter           PdfObject
-	Height           PdfObject
-	ImageMask        PdfObject
-	Intent           PdfObject
-	Interpolate      PdfObject
-	Width            PdfObject
-	stream           []byte
-}
-
-func (this *ContentStreamInlineImage) String() string {
-	str := fmt.Sprintf("InlineImage(len=%d)", len(this.stream))
-	return str
-}
-
-func (this *ContentStreamInlineImage) DefaultWriteString() string {
-	var output bytes.Buffer
-
-	// We do not start with "BI" as that is the operand and is written out separately.
-	// Write out the parameters
-	s := "BPC " + this.BitsPerComponent.DefaultWriteString() + "\n"
-	s += "CS " + this.ColorSpace.DefaultWriteString() + "\n"
-	s += "D " + this.Decode.DefaultWriteString() + "\n"
-	s += "DP " + this.DecodeParms.DefaultWriteString() + "\n"
-	s += "F " + this.Filter.DefaultWriteString() + "\n"
-	s += "H " + this.Height.DefaultWriteString() + "\n"
-	s += "IM " + this.ImageMask.DefaultWriteString() + "\n"
-	s += "Intent " + this.Intent.DefaultWriteString() + "\n"
-	s += "I " + this.Interpolate.DefaultWriteString() + "\n"
-	s += "W " + this.Width.DefaultWriteString() + "\n"
-	output.WriteString(s)
-
-	output.WriteString("ID ")
-	output.Write(this.stream)
-
-	return output.String()
-}
-
-// Export the inline image to Image which can be transformed or exported easily.
-func (this *ContentStreamInlineImage) ToImage() (*Image, error) {
-	return nil, fmt.Errorf("Not implemented yet")
-}
-
-// Parse an inline image from a content stream, both read its properties and
-// binary data.
-// When called, "BI" has already been read from the stream.  This function
-// finishes reading through "EI" and then returns the ContentStreamInlineImage.
-func (this *ContentStreamParser) ParseInlineImage() (*ContentStreamInlineImage, error) {
-	// Reading parameters.
-	im := ContentStreamInlineImage{}
-
-	for {
-		this.skipSpaces()
-		obj, err, isOperand := this.parseObject()
-		if err != nil {
-			return nil, err
-		}
-
-		if !isOperand {
-			// Not an operand.. Read key value properties..
-			param, ok := obj.(*PdfObjectName)
-			if !ok {
-				return nil, fmt.Errorf("Invalid inline image property (expecting name) - %T", obj)
-			}
-
-			valueObj, err, isOperand := this.parseObject()
-			if err != nil {
-				return nil, err
-			}
-			if isOperand {
-				return nil, fmt.Errorf("Not expecting an operand")
-			}
-
-			if *param == "BPC" {
-				im.BitsPerComponent = valueObj
-			} else if *param == "CS" {
-				im.ColorSpace = valueObj
-			} else if *param == "D" {
-				im.Decode = valueObj
-			} else if *param == "DP" {
-				im.DecodeParms = valueObj
-			} else if *param == "F" {
-				im.Filter = valueObj
-			} else if *param == "H" {
-				im.Height = valueObj
-			} else if *param == "IM" {
-				im.ImageMask = valueObj
-			} else if *param == "Intent" {
-				im.Intent = valueObj
-			} else if *param == "I" {
-				im.Interpolate = valueObj
-			} else if *param == "W" {
-				im.Width = valueObj
-			} else {
-				return nil, fmt.Errorf("Unknown inline image parameter %s", *param)
-			}
-		}
-
-		if isOperand {
-			operand, ok := obj.(*PdfObjectString)
-			if !ok {
-				return nil, fmt.Errorf("Failed to read inline image - invalid operand")
-			}
-
-			if *operand == "EI" {
-				// Image fully defined
-				common.Log.Debug("Inline image finished...")
-				return &im, nil
-			} else if *operand == "ID" {
-				// Inline image data.
-				// Should get a single space (0x20) followed by the data and then EI.
-				common.Log.Debug("ID start")
-
-				// Skip the space if its there.
-				b, err := this.reader.Peek(1)
-				if err != nil {
-					return nil, err
-				}
-				if IsWhiteSpace(b[0]) {
-					this.reader.Discard(1)
-				}
-
-				// Unfortunately there is no good way to know how many bytes to read since it
-				// depends on the Filter and encoding etc.
-				// Therefore we will simply read until we find "<ws>EI<ws>" where <ws> is whitespace
-				// although of course that could be a part of the data (even if unlikely).
-				im.stream = []byte{}
-				state := 0
-				var skipBytes []byte
-				for {
-					c, err := this.reader.ReadByte()
-					if err != nil {
-						common.Log.Debug("Unable to find end of image EI in inline image data")
-						return nil, err
-					}
-
-					if state == 0 {
-						if IsWhiteSpace(c) {
-							skipBytes = []byte{}
-							skipBytes = append(skipBytes, c)
-							state = 1
-						} else {
-							im.stream = append(im.stream, c)
-						}
-					} else if state == 1 {
-						skipBytes = append(skipBytes, c)
-						if c == 'E' {
-							state = 2
-						} else {
-							im.stream = append(im.stream, skipBytes...)
-							// Need an extra check to decide if we fall back to state 0 or 1.
-							if IsWhiteSpace(c) {
-								state = 1
-							} else {
-								state = 0
-							}
-						}
-					} else if state == 2 {
-						skipBytes = append(skipBytes, c)
-						if c == 'I' {
-							state = 3
-						} else {
-							im.stream = append(im.stream, skipBytes...)
-							state = 0
-						}
-					} else if state == 3 {
-						skipBytes = append(skipBytes, c)
-						if IsWhiteSpace(c) {
-							// image data finished.
-							common.Log.Debug("Image stream (%d): % x", len(im.stream), im.stream)
-							// Exit point.
-							return &im, nil
-						} else {
-							// Seems like "<ws>EI" was part of the data.
-							im.stream = append(im.stream, skipBytes...)
-							state = 0
-						}
-					}
-				}
-				// Never reached (exit point is at end of EI).
-			}
-		}
-	}
-}
-
 // Parses all commands in content stream, returning a list of operation data.
 func (this *ContentStreamParser) Parse() ([]*ContentStreamOperation, error) {
 	operations := []*ContentStreamOperation{}
@ -276,53 +75,6 @@ func (this *ContentStreamParser) Parse() ([]*ContentStreamOperation, error) {
 	return operations, nil
 }

-// Parses and extracts all text data in content streams and returns as a string.
-// Does not take into account Encoding table, the output is simply the character codes.
-func (this *ContentStreamParser) ExtractText() (string, error) {
-	operations, err := this.Parse()
-	if err != nil {
-		return "", err
-	}
-	inText := false
-	txt := ""
-	for _, op := range operations {
-		if op.Operand == "BT" {
-			inText = true
-		} else if op.Operand == "ET" {
-			inText = false
-		}
-		if op.Operand == "Td" || op.Operand == "TD" || op.Operand == "T*" {
-			// Move to next line...
-			txt += "\n"
-		}
-		if inText && op.Operand == "TJ" {
-			if len(op.Params) < 1 {
-				continue
-			}
-			paramList, ok := op.Params[0].(*PdfObjectArray)
-			if !ok {
-				return "", fmt.Errorf("Invalid parameter type, no array (%T)", op.Params[0])
-			}
-			for _, obj := range *paramList {
-				if strObj, ok := obj.(*PdfObjectString); ok {
-					txt += string(*strObj)
-				}
-			}
-		} else if inText && op.Operand == "Tj" {
-			if len(op.Params) < 1 {
-				continue
-			}
-			param, ok := op.Params[0].(*PdfObjectString)
-			if !ok {
-				return "", fmt.Errorf("Invalid parameter type, not string (%T)", op.Params[0])
-			}
-			txt += string(*param)
-		}
-	}
-
-	return txt, nil
-}
-
 // Skip over any spaces.  Returns the number of spaces skipped and
 // an error if any.
 func (this *ContentStreamParser) skipSpaces() (int, error) {
@ -446,7 +198,7 @@ func (this *ContentStreamParser) parseNumber() (PdfObject, error) {
 	allowSigns := true
 	numStr := ""
 	for {
-		common.Log.Debug("Parsing number \"%s\"", numStr)
+		common.Log.Trace("Parsing number \"%s\"", numStr)
 		bb, err := this.reader.Peek(1)
 		if err == io.EOF {
 			// GH: EOF handling.  Handle EOF like end of line.  Can happen with
@ -529,7 +281,7 @@ func (this *ContentStreamParser) parseString() (PdfObjectString, error) {
 				}
 				this.reader.Discard(len(numeric) - 1)

-				common.Log.Debug("Numeric string \"%s\"", numeric)
+				common.Log.Trace("Numeric string \"%s\"", numeric)
 				code, err := strconv.ParseUint(string(numeric), 8, 32)
 				if err != nil {
 					return PdfObjectString(bytes), err
@ -668,7 +420,7 @@ func (this *ContentStreamParser) parseNull() (PdfObjectNull, error) {
 }

 func (this *ContentStreamParser) parseDict() (*PdfObjectDictionary, error) {
-	common.Log.Debug("Reading content stream dict!")
+	common.Log.Trace("Reading content stream dict!")

 	dict := make(PdfObjectDictionary)

@ -690,17 +442,17 @@ func (this *ContentStreamParser) parseDict() (*PdfObjectDictionary, error) {
 			return nil, err
 		}

-		common.Log.Debug("Dict peek: %s (% x)!", string(bb), string(bb))
+		common.Log.Trace("Dict peek: %s (% x)!", string(bb), string(bb))
 		if (bb[0] == '>') && (bb[1] == '>') {
-			common.Log.Debug("EOF dictionary")
+			common.Log.Trace("EOF dictionary")
 			this.reader.ReadByte()
 			this.reader.ReadByte()
 			break
 		}
-		common.Log.Debug("Parse the name!")
+		common.Log.Trace("Parse the name!")

 		keyName, err := this.parseName()
-		common.Log.Debug("Key: %s", keyName)
+		common.Log.Trace("Key: %s", keyName)
 		if err != nil {
 			common.Log.Debug("ERROR Returning name err %s", err)
 			return nil, err
@ -710,8 +462,8 @@ func (this *ContentStreamParser) parseDict() (*PdfObjectDictionary, error) {
 			// Some writers have a bug where the null is appended without
 			// space.  For example "\Boundsnull"
 			newKey := keyName[0 : len(keyName)-4]
-			common.Log.Debug("Taking care of null bug (%s)", keyName)
-			common.Log.Debug("New key \"%s\" = null", newKey)
+			common.Log.Trace("Taking care of null bug (%s)", keyName)
+			common.Log.Trace("New key \"%s\" = null", newKey)
 			this.skipSpaces()
 			bb, _ := this.reader.Peek(1)
 			if bb[0] == '/' {
@ -729,7 +481,7 @@ func (this *ContentStreamParser) parseDict() (*PdfObjectDictionary, error) {
 		}
 		dict[keyName] = val

-		common.Log.Debug("dict[%s] = %s", keyName, val.String())
+		common.Log.Trace("dict[%s] = %s", keyName, val.String())
 	}

 	return &dict, nil
@ -772,40 +524,40 @@ func (this *ContentStreamParser) parseObject() (PdfObject, error, bool) {
 			return nil, err, false
 		}

-		common.Log.Debug("Peek string: %s", string(bb))
+		common.Log.Trace("Peek string: %s", string(bb))
 		// Determine type.
 		if bb[0] == '%' {
 			this.skipComments()
 			continue
 		} else if bb[0] == '/' {
 			name, err := this.parseName()
-			common.Log.Debug("->Name: '%s'", name)
+			common.Log.Trace("->Name: '%s'", name)
 			return &name, err, false
 		} else if bb[0] == '(' {
-			common.Log.Debug("->String!")
+			common.Log.Trace("->String!")
 			str, err := this.parseString()
 			return &str, err, false
 		} else if bb[0] == '<' && bb[1] != '<' {
-			common.Log.Debug("->Hex String!")
+			common.Log.Trace("->Hex String!")
 			str, err := this.parseHexString()
 			return &str, err, false
 		} else if bb[0] == '[' {
-			common.Log.Debug("->Array!")
+			common.Log.Trace("->Array!")
 			arr, err := this.parseArray()
 			return &arr, err, false
 		} else if IsDecimalDigit(bb[0]) || (bb[0] == '-' && IsDecimalDigit(bb[1])) {
-			common.Log.Debug("->Number!")
+			common.Log.Trace("->Number!")
 			number, err := this.parseNumber()
 			return number, err, false
 		} else if bb[0] == '<' && bb[1] == '<' {
 			dict, err := this.parseDict()
 			return dict, err, false
 		} else {
-			common.Log.Debug("->Operand or bool?")
+			common.Log.Trace("->Operand or bool?")
 			// Let's peek farther to find out.
 			bb, _ = this.reader.Peek(5)
 			peekStr := string(bb)
-			common.Log.Debug("Peek str: %s", peekStr)
+			common.Log.Trace("Peek str: %s", peekStr)

 			if (len(peekStr) > 3) && (peekStr[:4] == "null") {
 				null, err := this.parseNull()
--- a/pdf/core/stream.go
+++ b/pdf/core/stream.go
@ -20,7 +20,6 @@ func NewEncoderFromStream(streamObj *PdfObjectStream) (StreamEncoder, error) {
 	}

 	// The filter should be a name or an array with a list of filter names.
-	// Currently only supporting a single filter.
 	method, ok := filterObj.(*PdfObjectName)
 	if !ok {
 		array, ok := filterObj.(*PdfObjectArray)
--- a/pdf/model/annotations.go
+++ b/pdf/model/annotations.go
@ -12,17 +12,8 @@ import (
 	. "github.com/unidoc/unidoc/pdf/core"
 )

-/*
-
-markup annotations:
-T, popup, ..., ExData
-
-markup annotaitons are:
-Text, FreeText, Line, Square, Circle, Polygon, PolyLine, Highlight, Underline, Squiggly, StrikeOut, Stamp, Caret, Ink, FileAttachment,
-Sound, Redact
-
-
-*/
+// PdfAnnotation contains common attributes of an annotation.  The context object contains the subannotation,
+// which can be a markup annotation or other types.
 type PdfAnnotation struct {
 	context      PdfModel // Sub-annotation.
 	Rect         PdfObject
--- a/pdf/model/xobject.go
+++ b/pdf/model/xobject.go
@ -8,9 +8,164 @@ package model
 import (
 	"errors"

+	"github.com/unidoc/unidoc/common"
 	. "github.com/unidoc/unidoc/pdf/core"
 )

+// XObjectForm (Table 95 in 8.10.2).
+// Models a form XObject: the entries of its stream dictionary kept as PdfObject values,
+// together with the stream encoder, the stream data and the underlying stream primitive.
+type XObjectForm struct {
+	// Stream encoder.  Set from NewEncoderFromStream when loading from a stream object.
+	Filter StreamEncoder
+
+	// Dictionary entries.  ToPdfObject passes each one to SetIfNotNil under the same key.
+	FormType      PdfObject
+	BBox          PdfObject
+	Matrix        PdfObject
+	Resources     PdfObject
+	Group         PdfObject
+	Ref           PdfObject
+	MetaData      PdfObject
+	PieceInfo     PdfObject
+	LastModified  PdfObject
+	StructParent  PdfObject
+	StructParents PdfObject
+	OPI           PdfObject
+	OC            PdfObject
+	Name          PdfObject
+	// Stream data.  Taken from the stream object's Stream field when loading and
+	// assigned back to it by ToPdfObject.
+	Stream []byte
+	// Primitive: the underlying stream object, as returned by GetContainingPdfObject.
+	primitive *PdfObjectStream
+}
+
+// NewXObjectForm creates a brand new XObject Form, backed by a freshly created PDF object
+// stream primitive that carries an empty stream dictionary.
+func NewXObjectForm() *XObjectForm {
+	return &XObjectForm{
+		primitive: &PdfObjectStream{
+			PdfObjectDictionary: &PdfObjectDictionary{},
+		},
+	}
+}
+
+// NewXObjectFormFromStream builds the Form XObject from a stream object.
+//
+// The stream is kept as the form's underlying primitive, the encoder is obtained from
+// NewEncoderFromStream and every Table 95 entry present in the stream dictionary is copied
+// into the corresponding field.  An error is returned if the encoder cannot be created, or
+// if a Subtype entry is present but is not the name "Form".
+// XXX: Should this be exposed? Consider different access points.
+func NewXObjectFormFromStream(stream *PdfObjectStream) (*XObjectForm, error) {
+	form := &XObjectForm{}
+	form.primitive = stream
+
+	dict := *(stream.PdfObjectDictionary)
+
+	encoder, err := NewEncoderFromStream(stream)
+	if err != nil {
+		return nil, err
+	}
+	form.Filter = encoder
+
+	// Subtype is only validated when present; a missing Subtype is accepted.
+	if obj, isDefined := dict["Subtype"]; isDefined {
+		name, ok := obj.(*PdfObjectName)
+		if !ok {
+			return nil, errors.New("Type error")
+		}
+		if *name != "Form" {
+			common.Log.Debug("Invalid form subtype")
+			return nil, errors.New("Invalid form subtype")
+		}
+	}
+
+	if obj, isDefined := dict["FormType"]; isDefined {
+		form.FormType = obj
+	}
+	if obj, isDefined := dict["BBox"]; isDefined {
+		form.BBox = obj
+	}
+	if obj, isDefined := dict["Matrix"]; isDefined {
+		form.Matrix = obj
+	}
+	if obj, isDefined := dict["Resources"]; isDefined {
+		form.Resources = obj
+	}
+	if obj, isDefined := dict["Group"]; isDefined {
+		form.Group = obj
+	}
+	if obj, isDefined := dict["Ref"]; isDefined {
+		form.Ref = obj
+	}
+	// PDF names are case sensitive and the key defined by Table 95 is "Metadata".  The
+	// "MetaData" spelling is consulted only as a fallback, so that streams carrying that
+	// spelling keep loading as before.
+	if obj, isDefined := dict["Metadata"]; isDefined {
+		form.MetaData = obj
+	} else if obj, isDefined := dict["MetaData"]; isDefined {
+		form.MetaData = obj
+	}
+	if obj, isDefined := dict["PieceInfo"]; isDefined {
+		form.PieceInfo = obj
+	}
+	if obj, isDefined := dict["LastModified"]; isDefined {
+		form.LastModified = obj
+	}
+	if obj, isDefined := dict["StructParent"]; isDefined {
+		form.StructParent = obj
+	}
+	if obj, isDefined := dict["StructParents"]; isDefined {
+		form.StructParents = obj
+	}
+	if obj, isDefined := dict["OPI"]; isDefined {
+		form.OPI = obj
+	}
+	if obj, isDefined := dict["OC"]; isDefined {
+		form.OC = obj
+	}
+	if obj, isDefined := dict["Name"]; isDefined {
+		form.Name = obj
+	}
+
+	form.Stream = stream.Stream
+
+	return form, nil
+}
+
+// GetContainingPdfObject returns the underlying stream primitive of the form.
+func (xform *XObjectForm) GetContainingPdfObject() PdfObject {
+	return xform.primitive
+}
+
+// GetContentStream returns the form's content stream data, obtained by running DecodeStream
+// on the underlying stream primitive.  On a decoding failure only the error is returned.
+func (xform *XObjectForm) GetContentStream() ([]byte, error) {
+	content, decodeErr := DecodeStream(xform.primitive)
+	if decodeErr == nil {
+		return content, nil
+	}
+	return nil, decodeErr
+}
+
+// ToPdfObject returns the form as a stream object: the underlying stream primitive, updated
+// from the model fields.
+//
+// When Filter is set, the stream dictionary is replaced by the one produced by
+// Filter.MakeStreamDict(); otherwise the existing dictionary is updated in place.  Type and
+// Subtype are always written, the Table 95 entries are passed to SetIfNotNil, and Length is
+// set to the length of xform.Stream, which becomes the stream data.
+func (xform *XObjectForm) ToPdfObject() PdfObject {
+	stream := xform.primitive
+
+	dict := stream.PdfObjectDictionary
+	if xform.Filter != nil {
+		// Pre-populate the stream dictionary with the
+		// encoding related fields.
+		dict = xform.Filter.MakeStreamDict()
+		stream.PdfObjectDictionary = dict
+	}
+	dict.Set("Type", MakeName("XObject"))
+	dict.Set("Subtype", MakeName("Form"))
+
+	dict.SetIfNotNil("FormType", xform.FormType)
+	dict.SetIfNotNil("BBox", xform.BBox)
+	dict.SetIfNotNil("Matrix", xform.Matrix)
+	dict.SetIfNotNil("Resources", xform.Resources)
+	dict.SetIfNotNil("Group", xform.Group)
+	dict.SetIfNotNil("Ref", xform.Ref)
+	// PDF names are case sensitive: the key defined by Table 95 is "Metadata", so an entry
+	// written as "MetaData" would not be recognized by other readers.
+	dict.SetIfNotNil("Metadata", xform.MetaData)
+	dict.SetIfNotNil("PieceInfo", xform.PieceInfo)
+	dict.SetIfNotNil("LastModified", xform.LastModified)
+	dict.SetIfNotNil("StructParent", xform.StructParent)
+	dict.SetIfNotNil("StructParents", xform.StructParents)
+	dict.SetIfNotNil("OPI", xform.OPI)
+	dict.SetIfNotNil("OC", xform.OC)
+	dict.SetIfNotNil("Name", xform.Name)
+
+	dict.Set("Length", MakeInteger(int64(len(xform.Stream))))
+	stream.Stream = xform.Stream
+
+	return stream
+}
+
 // XObjectImage (Table 89 in 8.9.5.1).
 // Implements PdfModel interface.
 type XObjectImage struct {
@ -19,8 +174,8 @@ type XObjectImage struct {
 	Height           *int64
 	ColorSpace       PdfColorspace
 	BitsPerComponent *int64
-	//Filter           *PdfObjectName
-	Filter       StreamEncoder
+	Filter           StreamEncoder
+
 	Intent       PdfObject
 	ImageMask    PdfObject
 	Mask         PdfObject