unipdf/pdf/contentstream/contentstream.go

152 lines
3.3 KiB
Go
Raw Normal View History

/*
* This file is subject to the terms and conditions defined in
* file 'LICENSE.md', which is part of this source code package.
*/
package contentstream
import (
"bytes"
"fmt"
. "github.com/unidoc/unidoc/pdf/core"
)
type ContentStreamOperation struct {
Params []PdfObject
Operand string
}
type ContentStreamOperations []*ContentStreamOperation
// Check if the content stream operations are fully wrapped (within q ... Q)
func (this *ContentStreamOperations) isWrapped() bool {
if len(*this) < 2 {
return false
}
depth := 0
for _, op := range *this {
if op.Operand == "q" {
depth++
} else if op.Operand == "Q" {
depth--
} else {
if depth < 1 {
return false
}
}
}
// Should end at depth == 0
return depth == 0
}
// Wrap entire contents within q ... Q. If unbalanced, then adds extra Qs at the end.
// Only does if needed. Ensures that when adding new content, one start with all states
// in the default condition.
func (this *ContentStreamOperations) WrapIfNeeded() *ContentStreamOperations {
if len(*this) == 0 {
// No need to wrap if empty.
return this
}
if this.isWrapped() {
return this
}
*this = append([]*ContentStreamOperation{&ContentStreamOperation{Operand: "q"}}, *this...)
depth := 0
for _, op := range *this {
if op.Operand == "q" {
depth++
} else if op.Operand == "Q" {
depth--
}
}
for depth > 0 {
*this = append(*this, &ContentStreamOperation{Operand: "Q"})
depth--
}
return this
}
// Convert a set of content stream operations to a content stream byte presentation, i.e. the kind that can be
// stored as a PDF stream or string format.
func (this *ContentStreamOperations) Bytes() []byte {
var buf bytes.Buffer
for _, op := range *this {
if op == nil {
continue
}
if op.Operand == "BI" {
// Inline image requires special handling.
buf.WriteString(op.Operand + "\n")
buf.WriteString(op.Params[0].DefaultWriteString())
} else {
// Default handler.
for _, param := range op.Params {
buf.WriteString(param.DefaultWriteString())
buf.WriteString(" ")
}
buf.WriteString(op.Operand + "\n")
}
}
return buf.Bytes()
}
// Parses and extracts all text data in content streams and returns as a string.
// Does not take into account Encoding table, the output is simply the character codes.
func (this *ContentStreamParser) ExtractText() (string, error) {
operations, err := this.Parse()
if err != nil {
return "", err
}
inText := false
txt := ""
for _, op := range *operations {
if op.Operand == "BT" {
inText = true
} else if op.Operand == "ET" {
inText = false
}
if op.Operand == "Td" || op.Operand == "TD" || op.Operand == "T*" {
// Move to next line...
txt += "\n"
}
if inText && op.Operand == "TJ" {
if len(op.Params) < 1 {
continue
}
paramList, ok := op.Params[0].(*PdfObjectArray)
if !ok {
return "", fmt.Errorf("Invalid parameter type, no array (%T)", op.Params[0])
}
for _, obj := range *paramList {
if strObj, ok := obj.(*PdfObjectString); ok {
txt += string(*strObj)
}
}
} else if inText && op.Operand == "Tj" {
if len(op.Params) < 1 {
continue
}
param, ok := op.Params[0].(*PdfObjectString)
if !ok {
return "", fmt.Errorf("Invalid parameter type, not string (%T)", op.Params[0])
}
txt += string(*param)
}
}
return txt, nil
}