unipdf/pdf/contentstream/processor.go

677 lines
20 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*
* This file is subject to the terms and conditions defined in
* file 'LICENSE.md', which is part of this source code package.
*/
package contentstream
import (
"errors"
"fmt"
"github.com/unidoc/unidoc/common"
. "github.com/unidoc/unidoc/pdf/core"
. "github.com/unidoc/unidoc/pdf/model"
)
// GraphicsState is a basic graphics state implementation.
// Initially only implementing and tracking a portion of the information specified. Easy to add more.
type GraphicsState struct {
// Color space currently selected for stroking operations (set by CS, G, RG, K).
ColorspaceStroking PdfColorspace
// Color space currently selected for non-stroking operations (set by cs, g, rg, k).
ColorspaceNonStroking PdfColorspace
// Current stroking color. It can be nil: selecting a Pattern color space stores the nil
// initial color returned by getInitialColor.
ColorStroking PdfColor
// Current non-stroking color. It can be nil for the same reason as ColorStroking.
ColorNonStroking PdfColor
// CTM is the current transformation matrix; Process resets it to the identity matrix.
CTM Matrix
}
// Orientation is a page orientation as guessed from a transformation matrix (see the
// pageOrientation method of Matrix).
type Orientation int
const (
// OrientationPortrait is the default guess, returned whenever the landscape test fails.
OrientationPortrait Orientation = iota
// OrientationLandscape is returned when the off-diagonal terms of the matrix dominate its
// diagonal terms.
OrientationLandscape
)
// GraphicStateStack is a last-in, first-out stack of graphics states. Process uses it to save
// the state on `q` and restore it on `Q`.
type GraphicStateStack []GraphicsState

// Push stores `gs` on top of the stack.
func (s *GraphicStateStack) Push(gs GraphicsState) {
	stack := *s
	*s = append(stack, gs)
}

// Pop removes the most recently pushed graphics state and returns it.
// The stack must not be empty: popping an empty stack panics (index out of range).
func (s *GraphicStateStack) Pop() GraphicsState {
	stack := *s
	last := len(stack) - 1
	top := stack[last]
	*s = stack[:last]
	return top
}
// Transform maps the point `x`,`y` through the current transformation matrix of `gs` and
// returns the resulting coordinates.
func (gs *GraphicsState) Transform(x, y float64) (float64, float64) {
	// Debugging aid:
	// fmt.Printf("Transform. %5.1f,%5.1f->%5.1f,%5.1f %+v\n", x, y, xp, yp, gs.CTM)
	xp, yp := gs.CTM.Transform(x, y)
	return xp, yp
}
// PageOrientation returns the likely page orientation, guessed from the current
// transformation matrix of `gs`.
func (gs *GraphicsState) PageOrientation() Orientation {
	orientation := gs.CTM.pageOrientation()
	return orientation
}
// ContentStreamProcessor defines a data structure and methods for processing a content stream, keeping track of the
// current graphics state, and allowing external handlers to define their own functions as a part of the processing,
// for example rendering or extracting certain information.
type ContentStreamProcessor struct {
// Graphics states saved by the `q` operator, waiting to be restored by `Q`.
graphicsStack GraphicStateStack
// The operations to process, as handed to NewContentStreamProcessor.
operations []*ContentStreamOperation
// The current graphics state; Process (re)initializes it before running the operations.
graphicsState GraphicsState
// External handlers registered through AddHandler, in registration order.
handlers []HandlerEntry
// Set to 0 by NewContentStreamProcessor; not read anywhere in the visible part of this file.
currentIndex int
}
// HandlerFunc is the signature of an external handler. Process calls it, after its own internal
// handling of the operation, with the operation, a copy of the current graphics state and the
// page resources Process was given. A non-nil error aborts processing and is returned by Process.
type HandlerFunc func(op *ContentStreamOperation, gs GraphicsState, resources *PdfPageResources) error
// HandlerEntry is one registered external handler together with the rule that decides for which
// operations it is invoked.
type HandlerEntry struct {
// Condition selects between "every operation" and "operations with a matching operand".
Condition HandlerConditionEnum
// Operand is compared with the operation's operand; it is only consulted when Condition is
// HandlerConditionEnumOperand.
Operand string
// Handler is the function to invoke.
Handler HandlerFunc
}
// HandlerConditionEnum selects the operations for which a registered handler is invoked.
type HandlerConditionEnum int

const (
	// HandlerConditionEnumOperand invokes the handler only for operations whose operand equals
	// the operand the handler was registered with.
	HandlerConditionEnumOperand HandlerConditionEnum = iota
	// HandlerConditionEnumAllOperands invokes the handler for every operation.
	HandlerConditionEnumAllOperands
)

// All reports whether the condition is HandlerConditionEnumAllOperands.
func (cond HandlerConditionEnum) All() bool {
	return cond == HandlerConditionEnumAllOperands
}

// Operand reports whether the condition is HandlerConditionEnumOperand.
func (cond HandlerConditionEnum) Operand() bool {
	return cond == HandlerConditionEnumOperand
}
// NewContentStreamProcessor returns a processor for the operations `ops`. The graphics stack
// and the handler list start out empty, and the graphics state holds its zero value until
// Process initializes it.
func NewContentStreamProcessor(ops []*ContentStreamOperation) *ContentStreamProcessor {
	return &ContentStreamProcessor{
		graphicsStack: GraphicStateStack{},
		operations:    ops,
		graphicsState: GraphicsState{},
		handlers:      []HandlerEntry{},
		currentIndex:  0,
	}
}
// AddHandler registers the external `handler`. With HandlerConditionEnumAllOperands it is
// invoked for every operation; with HandlerConditionEnumOperand only for operations whose
// operand equals `operand`. Handlers are invoked in registration order.
func (csp *ContentStreamProcessor) AddHandler(condition HandlerConditionEnum, operand string, handler HandlerFunc) {
	csp.handlers = append(csp.handlers, HandlerEntry{
		Condition: condition,
		Operand:   operand,
		Handler:   handler,
	})
}
// getColorspace resolves the colorspace called `name`.
//
// The lookup order is:
//  1. the device colorspaces and Pattern, which always map to the built-in spaces;
//  2. the colorspace dictionary of `resources`;
//  3. default CalGray, CalRGB and Lab colorspaces.
//
// `resources` may be nil, in which case step 2 is skipped. An error is returned when the name
// cannot be resolved.
func (csp *ContentStreamProcessor) getColorspace(name string, resources *PdfPageResources) (PdfColorspace, error) {
	switch name {
	case "DeviceGray":
		return NewPdfColorspaceDeviceGray(), nil
	case "DeviceRGB":
		return NewPdfColorspaceDeviceRGB(), nil
	case "DeviceCMYK":
		return NewPdfColorspaceDeviceCMYK(), nil
	case "Pattern":
		return NewPdfColorspaceSpecialPattern(), nil
	}

	// Next check the colorspace dictionary. Without resources there is nothing to look up, and
	// dereferencing a nil pointer here would panic.
	// NOTE(review): resources.ColorSpace is dereferenced unchecked; if it is a pointer that can
	// be nil for pages without a ColorSpace resource it needs the same guard - confirm.
	if resources != nil {
		if cs, has := resources.ColorSpace.Colorspaces[name]; has {
			return cs, nil
		}
	}

	// Lastly check other potential colormaps.
	switch name {
	case "CalGray":
		return NewPdfColorspaceCalGray(), nil
	case "CalRGB":
		return NewPdfColorspaceCalRGB(), nil
	case "Lab":
		return NewPdfColorspaceLab(), nil
	}

	// Otherwise unsupported.
	common.Log.Debug("Unknown colorspace requested: %s", name)
	return nil, errors.New("Unsupported colorspace")
}
// getInitialColor returns the initial color for the colorspace `cs`, i.e. the color that
// becomes current when the colorspace is selected with CS or cs.
//
// Device and Cal* spaces start at black. Lab starts at L*=0, a*=0, b*=0, with a* and b* moved
// to the nearest value allowed by the Range entry. ICCBased, Indexed, Separation and DeviceN
// defer to their alternate/base space (ICCBased falls back to a device space chosen from N when
// no alternate is defined). A Pattern space has no initial color: nil is returned without error.
// Any other colorspace yields an error.
func (csp *ContentStreamProcessor) getInitialColor(cs PdfColorspace) (PdfColor, error) {
	switch cs := cs.(type) {
	case *PdfColorspaceDeviceGray:
		return NewPdfColorDeviceGray(0.0), nil
	case *PdfColorspaceDeviceRGB:
		return NewPdfColorDeviceRGB(0.0, 0.0, 0.0), nil
	case *PdfColorspaceDeviceCMYK:
		return NewPdfColorDeviceCMYK(0.0, 0.0, 0.0, 1.0), nil
	case *PdfColorspaceCalGray:
		return NewPdfColorCalGray(0.0), nil
	case *PdfColorspaceCalRGB:
		return NewPdfColorCalRGB(0.0, 0.0, 0.0), nil
	case *PdfColorspaceLab:
		// The PDF Range entry of a Lab space is [amin amax bmin bmax]; it does not constrain L*,
		// which always admits 0. Each of a* and b* starts at 0 unless 0 lies outside its
		// interval, in which case the nearest valid value is substituted.
		// NOTE(review): assumes PdfColorspaceLab.Range follows the PDF layout - confirm.
		nearestToZero := func(lo, hi float64) float64 {
			switch {
			case lo > 0:
				return lo
			case hi < 0:
				return hi
			}
			return 0
		}
		a, b := 0.0, 0.0
		if len(cs.Range) >= 4 {
			a = nearestToZero(cs.Range[0], cs.Range[1])
			b = nearestToZero(cs.Range[2], cs.Range[3])
		}
		return NewPdfColorLab(0.0, a, b), nil
	case *PdfColorspaceICCBased:
		if cs.Alternate != nil {
			return csp.getInitialColor(cs.Alternate)
		}
		// Alternate not defined.
		// Try to fall back to DeviceGray, DeviceRGB or DeviceCMYK.
		common.Log.Trace("ICC Based not defined - attempting fall back (N = %d)", cs.N)
		switch cs.N {
		case 1:
			common.Log.Trace("Falling back to DeviceGray")
			return csp.getInitialColor(NewPdfColorspaceDeviceGray())
		case 3:
			common.Log.Trace("Falling back to DeviceRGB")
			return csp.getInitialColor(NewPdfColorspaceDeviceRGB())
		case 4:
			common.Log.Trace("Falling back to DeviceCMYK")
			return csp.getInitialColor(NewPdfColorspaceDeviceCMYK())
		}
		return nil, errors.New("Alternate space not defined for ICC")
	case *PdfColorspaceSpecialIndexed:
		if cs.Base == nil {
			return nil, errors.New("Indexed base not specified")
		}
		return csp.getInitialColor(cs.Base)
	case *PdfColorspaceSpecialSeparation:
		if cs.AlternateSpace == nil {
			return nil, errors.New("Alternate space not specified")
		}
		return csp.getInitialColor(cs.AlternateSpace)
	case *PdfColorspaceDeviceN:
		if cs.AlternateSpace == nil {
			return nil, errors.New("Alternate space not specified")
		}
		return csp.getInitialColor(cs.AlternateSpace)
	case *PdfColorspaceSpecialPattern:
		// FIXME/check: A pattern does not have an initial color...
		return nil, nil
	}

	common.Log.Debug("Unable to determine initial color for unknown colorspace: %T", cs)
	return nil, errors.New("Unsupported colorspace")
}
// Process runs through all operations of the content stream in order.
//
// The graphics state is first reset to its defaults (DeviceGray colorspaces, black colors,
// identity CTM). For each operation the processor then updates its own graphics state
// (q/Q, cm and the color operators) and afterwards invokes every registered handler whose
// condition matches, passing it a copy of the updated graphics state and `resources`.
//
// Processing stops at the first error, returned either by the internal handling or by an
// external handler, and that error is returned.
func (csp *ContentStreamProcessor) Process(resources *PdfPageResources) error {
	// Initialize graphics state
	csp.graphicsState.ColorspaceStroking = NewPdfColorspaceDeviceGray()
	csp.graphicsState.ColorspaceNonStroking = NewPdfColorspaceDeviceGray()
	csp.graphicsState.ColorStroking = NewPdfColorDeviceGray(0)
	csp.graphicsState.ColorNonStroking = NewPdfColorDeviceGray(0)
	csp.graphicsState.CTM = IdentityMatrix()

	for _, op := range csp.operations {
		var err error

		// Internal handling.
		switch op.Operand {
		case "q":
			csp.graphicsStack.Push(csp.graphicsState)
		case "Q":
			if len(csp.graphicsStack) == 0 {
				// Unbalanced Q (seen in malformed streams): popping would panic, so keep the
				// current state. External handlers below are still notified of the operation.
				common.Log.Debug("WARN: invalid `Q` operator. Graphics state stack is empty. Skipping.")
			} else {
				csp.graphicsState = csp.graphicsStack.Pop()
			}

		// Coordinate system: without this case the CTM would never leave the identity matrix.
		case "cm":
			err = csp.handleCommand_cm(op, resources)

		// Color operations (Table 74 p. 179)
		case "CS":
			err = csp.handleCommand_CS(op, resources)
		case "cs":
			err = csp.handleCommand_cs(op, resources)
		case "SC":
			err = csp.handleCommand_SC(op, resources)
		case "SCN":
			err = csp.handleCommand_SCN(op, resources)
		case "sc":
			err = csp.handleCommand_sc(op, resources)
		case "scn":
			err = csp.handleCommand_scn(op, resources)
		case "G":
			err = csp.handleCommand_G(op, resources)
		case "g":
			err = csp.handleCommand_g(op, resources)
		case "RG":
			err = csp.handleCommand_RG(op, resources)
		case "rg":
			err = csp.handleCommand_rg(op, resources)
		case "K":
			err = csp.handleCommand_K(op, resources)
		case "k":
			err = csp.handleCommand_k(op, resources)
		}
		if err != nil {
			common.Log.Debug("Processor handling error (%s): %v", op.Operand, err)
			common.Log.Debug("Operand: %#v", op.Operand)
			return err
		}

		// Check if have external handler also, and process if so.
		for _, entry := range csp.handlers {
			var err error
			if entry.Condition.All() {
				err = entry.Handler(op, csp.graphicsState, resources)
			} else if entry.Condition.Operand() && op.Operand == entry.Operand {
				err = entry.Handler(op, csp.graphicsState, resources)
			}
			if err != nil {
				common.Log.Debug("Processor handler error: %v", err)
				return err
			}
		}
	}

	return nil
}
// CS: Set the current color space for stroking operations.
// The operation must carry exactly one name operand. On success both the stroking colorspace
// and the stroking color (the initial color of that space) are updated; otherwise an error is
// returned.
func (csp *ContentStreamProcessor) handleCommand_CS(op *ContentStreamOperation, resources *PdfPageResources) error {
	if len(op.Params) < 1 {
		common.Log.Debug("Invalid CS command: too few parameters")
		return errors.New("Too few parameters")
	}
	if len(op.Params) > 1 {
		common.Log.Debug("Invalid CS command: too many parameters")
		return errors.New("Too many parameters")
	}
	name, ok := op.Params[0].(*PdfObjectName)
	if !ok {
		common.Log.Debug("ERROR: CS command with invalid parameter")
		return errors.New("Type check error")
	}

	// Set the current color space to use for stroking operations.
	// Either device based or referring to resource dict.
	cs, err := csp.getColorspace(string(*name), resources)
	if err != nil {
		return err
	}
	csp.graphicsState.ColorspaceStroking = cs

	// Set initial color.
	color, err := csp.getInitialColor(cs)
	if err != nil {
		return err
	}
	csp.graphicsState.ColorStroking = color

	return nil
}
// cs: Set the current color space for non-stroking operations.
// The operation must carry exactly one name operand. On success both the non-stroking
// colorspace and the non-stroking color (the initial color of that space) are updated;
// otherwise an error is returned.
func (csp *ContentStreamProcessor) handleCommand_cs(op *ContentStreamOperation, resources *PdfPageResources) error {
	if len(op.Params) < 1 {
		common.Log.Debug("Invalid cs command: too few parameters")
		return errors.New("Too few parameters")
	}
	if len(op.Params) > 1 {
		common.Log.Debug("Invalid cs command: too many parameters")
		return errors.New("Too many parameters")
	}
	name, ok := op.Params[0].(*PdfObjectName)
	if !ok {
		common.Log.Debug("ERROR: cs command with invalid parameter")
		return errors.New("Type check error")
	}

	// Set the current color space to use for non-stroking operations.
	// Either device based or referring to resource dict.
	cs, err := csp.getColorspace(string(*name), resources)
	if err != nil {
		return err
	}
	csp.graphicsState.ColorspaceNonStroking = cs

	// Set initial color.
	color, err := csp.getInitialColor(cs)
	if err != nil {
		return err
	}
	csp.graphicsState.ColorNonStroking = color

	return nil
}
// SC: Set the color to use for stroking operations in a device, CIE-based or Indexed colorspace. (not ICC based)
// The number of operands must equal the number of components of the current stroking
// colorspace, otherwise an error is returned and the color is left unchanged.
func (csp *ContentStreamProcessor) handleCommand_SC(op *ContentStreamOperation, resources *PdfPageResources) error {
	// For DeviceGray, CalGray, Indexed: one operand is required
	// For DeviceRGB, CalRGB, Lab: 3 operands required
	space := csp.graphicsState.ColorspaceStroking
	if want := space.GetNumComponents(); len(op.Params) != want {
		common.Log.Debug("Invalid number of parameters for SC")
		common.Log.Debug("Number %d not matching colorspace %T", len(op.Params), space)
		return errors.New("Invalid number of parameters")
	}

	strokeColor, err := space.ColorFromPdfObjects(op.Params)
	if err != nil {
		return err
	}

	csp.graphicsState.ColorStroking = strokeColor
	return nil
}
// isPatternCS reports whether `cs` is a Pattern colorspace (*PdfColorspaceSpecialPattern).
func isPatternCS(cs PdfColorspace) bool {
	switch cs.(type) {
	case *PdfColorspaceSpecialPattern:
		return true
	}
	return false
}
// SCN: Same as SC but also supports Pattern, Separation, DeviceN and ICCBased color spaces.
// Unless the current stroking colorspace is a Pattern space, the number of operands must equal
// the number of components of that colorspace. On success the stroking color is updated.
func (csp *ContentStreamProcessor) handleCommand_SCN(op *ContentStreamOperation, resources *PdfPageResources) error {
	cs := csp.graphicsState.ColorspaceStroking

	// Pattern spaces take a variable number of operands, so the count is only checked for the
	// other spaces.
	if !isPatternCS(cs) && len(op.Params) != cs.GetNumComponents() {
		common.Log.Debug("Invalid number of parameters for SCN")
		common.Log.Debug("Number %d not matching colorspace %T", len(op.Params), cs)
		return errors.New("Invalid number of parameters")
	}

	color, err := cs.ColorFromPdfObjects(op.Params)
	if err != nil {
		return err
	}

	csp.graphicsState.ColorStroking = color
	return nil
}
// sc: Same as SC except used for non-stroking operations.
// Unlike SC, the operand count is not checked when the current non-stroking colorspace is a
// Pattern space. On success the non-stroking color is updated.
func (csp *ContentStreamProcessor) handleCommand_sc(op *ContentStreamOperation, resources *PdfPageResources) error {
	cs := csp.graphicsState.ColorspaceNonStroking

	if !isPatternCS(cs) && len(op.Params) != cs.GetNumComponents() {
		common.Log.Debug("Invalid number of parameters for sc")
		common.Log.Debug("Number %d not matching colorspace %T", len(op.Params), cs)
		return errors.New("Invalid number of parameters")
	}

	color, err := cs.ColorFromPdfObjects(op.Params)
	if err != nil {
		return err
	}

	csp.graphicsState.ColorNonStroking = color
	return nil
}
// scn: Same as SCN except used for non-stroking operations.
// Unless the current non-stroking colorspace is a Pattern space, the number of operands must
// equal the number of components of that colorspace. On success the non-stroking color is
// updated.
func (csp *ContentStreamProcessor) handleCommand_scn(op *ContentStreamOperation, resources *PdfPageResources) error {
	cs := csp.graphicsState.ColorspaceNonStroking

	if !isPatternCS(cs) && len(op.Params) != cs.GetNumComponents() {
		common.Log.Debug("Invalid number of parameters for scn")
		common.Log.Debug("Number %d not matching colorspace %T", len(op.Params), cs)
		return errors.New("Invalid number of parameters")
	}

	color, err := cs.ColorFromPdfObjects(op.Params)
	if err != nil {
		common.Log.Debug("ERROR: Fail to get color from params: %+v (CS is %+v)", op.Params, cs)
		return err
	}

	csp.graphicsState.ColorNonStroking = color
	return nil
}
// G: Set the stroking colorspace to DeviceGray, and the color to the specified graylevel (range [0-1]).
// gray G
// The operand count must match the component count of DeviceGray; the graphics state is only
// modified when the operands convert to a color without error.
func (csp *ContentStreamProcessor) handleCommand_G(op *ContentStreamOperation, resources *PdfPageResources) error {
	cs := NewPdfColorspaceDeviceGray()
	if len(op.Params) != cs.GetNumComponents() {
		common.Log.Debug("Invalid number of parameters for G")
		common.Log.Debug("Number %d not matching colorspace %T", len(op.Params), cs)
		return errors.New("Invalid number of parameters")
	}

	color, err := cs.ColorFromPdfObjects(op.Params)
	if err != nil {
		return err
	}

	csp.graphicsState.ColorspaceStroking = cs
	csp.graphicsState.ColorStroking = color
	return nil
}
// g: Same as G, but for non-stroking colorspace and color (range [0-1]).
// gray g
// The graphics state is only modified when the operands convert to a color without error.
func (csp *ContentStreamProcessor) handleCommand_g(op *ContentStreamOperation, resources *PdfPageResources) error {
	gray := NewPdfColorspaceDeviceGray()
	if n := len(op.Params); n != gray.GetNumComponents() {
		common.Log.Debug("Invalid number of parameters for g")
		common.Log.Debug("Number %d not matching colorspace %T", n, gray)
		return errors.New("Invalid number of parameters")
	}

	fill, err := gray.ColorFromPdfObjects(op.Params)
	if err != nil {
		common.Log.Debug("ERROR: handleCommand_g Invalid params. cs=%T op=%s err=%v", gray, op, err)
		return err
	}

	state := &csp.graphicsState
	state.ColorspaceNonStroking = gray
	state.ColorNonStroking = fill
	return nil
}
// RG: Sets the stroking colorspace to DeviceRGB and the stroking color to r,g,b. [0-1] ranges.
// r g b RG
// The graphics state is only modified when the operands convert to a color without error.
func (csp *ContentStreamProcessor) handleCommand_RG(op *ContentStreamOperation, resources *PdfPageResources) error {
	rgb := NewPdfColorspaceDeviceRGB()
	if n := len(op.Params); n != rgb.GetNumComponents() {
		common.Log.Debug("Invalid number of parameters for RG")
		common.Log.Debug("Number %d not matching colorspace %T", n, rgb)
		return errors.New("Invalid number of parameters")
	}

	stroke, err := rgb.ColorFromPdfObjects(op.Params)
	if err != nil {
		return err
	}

	state := &csp.graphicsState
	state.ColorspaceStroking = rgb
	state.ColorStroking = stroke
	return nil
}
// rg: Same as RG but for non-stroking colorspace, color.
// r g b rg
// The operand count must match the component count of DeviceRGB; the graphics state is only
// modified when the operands convert to a color without error.
func (csp *ContentStreamProcessor) handleCommand_rg(op *ContentStreamOperation, resources *PdfPageResources) error {
	cs := NewPdfColorspaceDeviceRGB()
	if len(op.Params) != cs.GetNumComponents() {
		common.Log.Debug("Invalid number of parameters for rg")
		common.Log.Debug("Number %d not matching colorspace %T", len(op.Params), cs)
		return errors.New("Invalid number of parameters")
	}

	color, err := cs.ColorFromPdfObjects(op.Params)
	if err != nil {
		return err
	}

	csp.graphicsState.ColorspaceNonStroking = cs
	csp.graphicsState.ColorNonStroking = color
	return nil
}
// K: Sets the stroking colorspace to DeviceCMYK and the stroking color to c,m,y,k. [0-1] ranges.
// c m y k K
// The operand count must match the component count of DeviceCMYK; the graphics state is only
// modified when the operands convert to a color without error.
func (csp *ContentStreamProcessor) handleCommand_K(op *ContentStreamOperation, resources *PdfPageResources) error {
	cs := NewPdfColorspaceDeviceCMYK()
	if len(op.Params) != cs.GetNumComponents() {
		common.Log.Debug("Invalid number of parameters for K")
		common.Log.Debug("Number %d not matching colorspace %T", len(op.Params), cs)
		return errors.New("Invalid number of parameters")
	}

	color, err := cs.ColorFromPdfObjects(op.Params)
	if err != nil {
		return err
	}

	csp.graphicsState.ColorspaceStroking = cs
	csp.graphicsState.ColorStroking = color
	return nil
}
// k: Same as K but for non-stroking colorspace, color.
// c m y k k
// The operand count must match the component count of DeviceCMYK; the graphics state is only
// modified when the operands convert to a color without error.
func (csp *ContentStreamProcessor) handleCommand_k(op *ContentStreamOperation, resources *PdfPageResources) error {
	cs := NewPdfColorspaceDeviceCMYK()
	if len(op.Params) != cs.GetNumComponents() {
		common.Log.Debug("Invalid number of parameters for k")
		common.Log.Debug("Number %d not matching colorspace %T", len(op.Params), cs)
		return errors.New("Invalid number of parameters")
	}

	color, err := cs.ColorFromPdfObjects(op.Params)
	if err != nil {
		return err
	}

	csp.graphicsState.ColorspaceNonStroking = cs
	csp.graphicsState.ColorNonStroking = color
	return nil
}
// cm: concatenates an affine transform to the CTM
// a b c d e f cm
// The six numeric operands form the matrix M. Following the PDF rule for cm, the new CTM is
// M × CTM, so that M is applied to coordinates first and the previous CTM afterwards.
// An error is returned when there are not exactly six operands or they are not numbers.
func (csp *ContentStreamProcessor) handleCommand_cm(op *ContentStreamOperation,
	resources *PdfPageResources) error {
	if len(op.Params) != 6 {
		common.Log.Debug("Invalid number of parameters for cm: %d", len(op.Params))
		return errors.New("Invalid number of parameters")
	}

	f, err := GetNumbersAsFloat(op.Params)
	if err != nil {
		return err
	}

	// Concat computes receiver × argument, so start from M and append the current CTM.
	// (CTM.Concat(M) would give CTM × M, which applies the transforms in the wrong order.)
	m := NewMatrix(f[0], f[1], f[2], f[3], f[4], f[5])
	m.Concat(csp.graphicsState.CTM)
	csp.graphicsState.CTM = m

	return nil
}
// Matrix is an affine transform stored row by row as a 3×3 matrix in homogeneous coordinates.
// PDF coordinate transforms are always affine so we only need 6 of these. See NewMatrix.
type Matrix [9]float64

// IdentityMatrix returns the identity transform.
func IdentityMatrix() Matrix {
	return NewMatrix(1, 0, 0, 1, 0, 0)
}

// NewMatrix returns an affine transform matrix laid out in homogeneous coordinates as
//
//	a  b  0
//	c  d  0
//	tx ty 1
//
// Points are row vectors [x y 1] multiplied from the left. Elements are clamped by fixup.
func NewMatrix(a, b, c, d, tx, ty float64) Matrix {
	m := Matrix{
		a, b, 0,
		c, d, 0,
		tx, ty, 1,
	}
	m.fixup()
	return m
}

// String returns a string describing `m` as "a,b,c,d: tx,ty".
func (m Matrix) String() string {
	a, b, c, d, tx, ty := m[0], m[1], m[3], m[4], m[6], m[7]
	return fmt.Sprintf("%5.1f,%5.1f,%5.1f,%5.1f: %5.1f,%5.1f", a, b, c, d, tx, ty)
}

// Set sets `m` to affine transform a,b,c,d,tx,ty.
// The homogeneous column is reset to 0,0,1 as well, so Set also works on a zero-value Matrix.
func (m *Matrix) Set(a, b, c, d, tx, ty float64) {
	*m = NewMatrix(a, b, c, d, tx, ty)
}

// Concat sets `m` to `m` × `b`, i.e. the transform that applies `m` first and `b` second.
// `b` needs to be created by NewMatrix. i.e. It must be an affine transform.
func (m *Matrix) Concat(b Matrix) {
	*m = Matrix{
		m[0]*b[0] + m[1]*b[3], m[0]*b[1] + m[1]*b[4], 0,
		m[3]*b[0] + m[4]*b[3], m[3]*b[1] + m[4]*b[4], 0,
		m[6]*b[0] + m[7]*b[3] + b[6], m[6]*b[1] + m[7]*b[4] + b[7], 1,
	}
	m.fixup()
}

// Translate appends a translation of `dx`,`dy` to `m`.
// m.Translate(dx, dy) is equivalent to m.Concat(NewMatrix(1, 0, 0, 1, dx, dy)).
func (m *Matrix) Translate(dx, dy float64) {
	m[6] += dx
	m[7] += dy
	m.fixup()
}

// Transform returns coordinates `x`,`y` transformed by `m`:
//
//	x' = a*x + c*y + tx
//	y' = b*x + d*y + ty
//
// This is the row vector [x y 1] times `m`, the same convention Concat uses, so transforming
// by `m` and then by `b` equals transforming by the result of m.Concat(b).
func (m *Matrix) Transform(x, y float64) (float64, float64) {
	xp := x*m[0] + y*m[3] + m[6]
	yp := x*m[1] + y*m[4] + m[7]
	return xp, yp
}

// pageOrientation returns a guess at the pdf page orientation when text is printed with CTM `m`.
// Landscape is guessed when the off-diagonal terms (b, c) outweigh the diagonal terms (a, d).
// XXX: Use pageRotate flag instead !@#$
func (m *Matrix) pageOrientation() Orientation {
	offDiagonal := m[1]*m[1] + m[3]*m[3]
	diagonal := m[0]*m[0] + m[4]*m[4]
	if offDiagonal > diagonal {
		return OrientationLandscape
	}
	return OrientationPortrait
}

// fixup forces `m` to have reasonable values. It is a guard against crazy values in corrupt PDF
// files.
// Currently it clamps elements to [-maxAbsNumber, maxAbsNumber] to avoid floating point
// exceptions. NaN elements compare false against both bounds and are left as they are.
func (m *Matrix) fixup() {
	for i, x := range m {
		if x > maxAbsNumber {
			m[i] = maxAbsNumber
		} else if x < -maxAbsNumber {
			m[i] = -maxAbsNumber
		}
	}
}

// largest numbers needed in PDF transforms. Is this correct?
const maxAbsNumber = 1e9