2017-03-01 16:02:53 +00:00
|
|
|
/*
|
|
|
|
* This file is subject to the terms and conditions defined in
|
|
|
|
* file 'LICENSE.md', which is part of this source code package.
|
|
|
|
*/
|
|
|
|
|
|
|
|
package contentstream
|
|
|
|
|
|
|
|
import (
|
2017-03-14 13:04:51 +00:00
|
|
|
"bytes"
|
2017-03-01 16:02:53 +00:00
|
|
|
"fmt"
|
|
|
|
|
2019-05-16 23:44:51 +03:00
|
|
|
"github.com/unidoc/unipdf/v3/core"
|
2017-03-01 16:02:53 +00:00
|
|
|
)
|
|
|
|
|
2018-08-03 10:17:06 +00:00
|
|
|
// ContentStreamOperation represents an operation in PDF contentstream which consists of
|
|
|
|
// an operand and parameters.
|
2017-03-01 16:02:53 +00:00
|
|
|
type ContentStreamOperation struct {
|
2018-08-03 10:17:06 +00:00
|
|
|
Params []core.PdfObject
|
2017-03-01 16:02:53 +00:00
|
|
|
Operand string
|
|
|
|
}
|
|
|
|
|
2018-08-03 10:17:06 +00:00
|
|
|
// ContentStreamOperations is a slice of ContentStreamOperations.
|
2017-03-14 13:04:51 +00:00
|
|
|
type ContentStreamOperations []*ContentStreamOperation
|
|
|
|
|
2017-06-28 15:13:37 +00:00
|
|
|
// Check if the content stream operations are fully wrapped (within q ... Q)
|
2018-08-03 10:17:06 +00:00
|
|
|
func (ops *ContentStreamOperations) isWrapped() bool {
|
|
|
|
if len(*ops) < 2 {
|
2017-06-28 15:13:37 +00:00
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
|
|
|
depth := 0
|
2018-08-03 10:17:06 +00:00
|
|
|
for _, op := range *ops {
|
2017-06-28 15:13:37 +00:00
|
|
|
if op.Operand == "q" {
|
|
|
|
depth++
|
|
|
|
} else if op.Operand == "Q" {
|
|
|
|
depth--
|
|
|
|
} else {
|
|
|
|
if depth < 1 {
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Should end at depth == 0
|
|
|
|
return depth == 0
|
|
|
|
}
|
|
|
|
|
2018-08-03 10:17:06 +00:00
|
|
|
// WrapIfNeeded wraps the entire contents within q ... Q. If unbalanced, then adds extra Qs at the end.
|
2017-06-28 15:13:37 +00:00
|
|
|
// Only does if needed. Ensures that when adding new content, one start with all states
|
|
|
|
// in the default condition.
|
2018-08-03 10:17:06 +00:00
|
|
|
func (ops *ContentStreamOperations) WrapIfNeeded() *ContentStreamOperations {
|
|
|
|
if len(*ops) == 0 {
|
2017-07-01 21:57:31 +00:00
|
|
|
// No need to wrap if empty.
|
2018-08-03 10:17:06 +00:00
|
|
|
return ops
|
2017-07-01 21:57:31 +00:00
|
|
|
}
|
2018-08-03 10:17:06 +00:00
|
|
|
if ops.isWrapped() {
|
|
|
|
return ops
|
2017-06-28 15:13:37 +00:00
|
|
|
}
|
|
|
|
|
2018-08-03 10:17:06 +00:00
|
|
|
*ops = append([]*ContentStreamOperation{{Operand: "q"}}, *ops...)
|
2017-06-28 15:13:37 +00:00
|
|
|
|
|
|
|
depth := 0
|
2018-08-03 10:17:06 +00:00
|
|
|
for _, op := range *ops {
|
2017-06-28 15:13:37 +00:00
|
|
|
if op.Operand == "q" {
|
|
|
|
depth++
|
|
|
|
} else if op.Operand == "Q" {
|
|
|
|
depth--
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
for depth > 0 {
|
2018-08-03 10:17:06 +00:00
|
|
|
*ops = append(*ops, &ContentStreamOperation{Operand: "Q"})
|
2017-06-28 15:13:37 +00:00
|
|
|
depth--
|
|
|
|
}
|
|
|
|
|
2018-08-03 10:17:06 +00:00
|
|
|
return ops
|
2017-06-28 15:13:37 +00:00
|
|
|
}
|
|
|
|
|
2018-08-03 10:17:06 +00:00
|
|
|
// Bytes converts a set of content stream operations to a content stream byte presentation,
|
|
|
|
// i.e. the kind that can be stored as a PDF stream or string format.
|
|
|
|
func (ops *ContentStreamOperations) Bytes() []byte {
|
2017-03-14 13:04:51 +00:00
|
|
|
var buf bytes.Buffer
|
|
|
|
|
2018-08-03 10:17:06 +00:00
|
|
|
for _, op := range *ops {
|
2017-03-14 13:04:51 +00:00
|
|
|
if op == nil {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
if op.Operand == "BI" {
|
|
|
|
// Inline image requires special handling.
|
|
|
|
buf.WriteString(op.Operand + "\n")
|
2018-12-11 04:37:00 +02:00
|
|
|
buf.WriteString(op.Params[0].WriteString())
|
2017-03-14 13:04:51 +00:00
|
|
|
|
|
|
|
} else {
|
|
|
|
// Default handler.
|
|
|
|
for _, param := range op.Params {
|
2018-12-11 04:37:00 +02:00
|
|
|
buf.WriteString(param.WriteString())
|
2017-03-14 13:04:51 +00:00
|
|
|
buf.WriteString(" ")
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
buf.WriteString(op.Operand + "\n")
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return buf.Bytes()
|
|
|
|
}
|
|
|
|
|
2018-10-05 01:59:19 +00:00
|
|
|
// String returns `ops.Bytes()` as a string.
|
|
|
|
func (ops *ContentStreamOperations) String() string {
|
|
|
|
return string(ops.Bytes())
|
|
|
|
}
|
|
|
|
|
2018-03-22 13:01:04 +00:00
|
|
|
// ExtractText parses and extracts all text data in content streams and returns as a string.
|
2017-03-01 16:02:53 +00:00
|
|
|
// Does not take into account Encoding table, the output is simply the character codes.
|
2018-03-22 13:01:04 +00:00
|
|
|
//
|
|
|
|
// Deprecated: More advanced text extraction is offered in package extractor with character encoding support.
|
2018-10-15 10:13:50 +00:00
|
|
|
func (csp *ContentStreamParser) ExtractText() (string, error) {
|
|
|
|
operations, err := csp.Parse()
|
2017-03-01 16:02:53 +00:00
|
|
|
if err != nil {
|
|
|
|
return "", err
|
|
|
|
}
|
|
|
|
inText := false
|
2018-03-17 12:16:59 -07:00
|
|
|
xPos, yPos := float64(-1), float64(-1)
|
2017-03-01 16:02:53 +00:00
|
|
|
txt := ""
|
2017-06-28 15:13:37 +00:00
|
|
|
for _, op := range *operations {
|
2017-03-01 16:02:53 +00:00
|
|
|
if op.Operand == "BT" {
|
|
|
|
inText = true
|
|
|
|
} else if op.Operand == "ET" {
|
|
|
|
inText = false
|
|
|
|
}
|
|
|
|
if op.Operand == "Td" || op.Operand == "TD" || op.Operand == "T*" {
|
|
|
|
// Move to next line...
|
|
|
|
txt += "\n"
|
|
|
|
}
|
2018-03-17 12:16:59 -07:00
|
|
|
if op.Operand == "Tm" {
|
|
|
|
if len(op.Params) != 6 {
|
|
|
|
continue
|
|
|
|
}
|
2018-08-03 10:17:06 +00:00
|
|
|
xfloat, ok := op.Params[4].(*core.PdfObjectFloat)
|
2018-03-17 12:16:59 -07:00
|
|
|
if !ok {
|
2018-08-03 10:17:06 +00:00
|
|
|
xint, ok := op.Params[4].(*core.PdfObjectInteger)
|
2018-03-17 12:16:59 -07:00
|
|
|
if !ok {
|
|
|
|
continue
|
|
|
|
}
|
2018-08-03 10:17:06 +00:00
|
|
|
xfloat = core.MakeFloat(float64(*xint))
|
2018-03-17 12:16:59 -07:00
|
|
|
}
|
2018-08-03 10:17:06 +00:00
|
|
|
yfloat, ok := op.Params[5].(*core.PdfObjectFloat)
|
2018-03-17 12:16:59 -07:00
|
|
|
if !ok {
|
2018-08-03 10:17:06 +00:00
|
|
|
yint, ok := op.Params[5].(*core.PdfObjectInteger)
|
2018-03-17 12:16:59 -07:00
|
|
|
if !ok {
|
|
|
|
continue
|
|
|
|
}
|
2018-08-03 10:17:06 +00:00
|
|
|
yfloat = core.MakeFloat(float64(*yint))
|
2018-03-17 12:16:59 -07:00
|
|
|
}
|
|
|
|
if yPos == -1 {
|
|
|
|
yPos = float64(*yfloat)
|
|
|
|
} else if yPos > float64(*yfloat) {
|
|
|
|
txt += "\n"
|
|
|
|
xPos = float64(*xfloat)
|
|
|
|
yPos = float64(*yfloat)
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
if xPos == -1 {
|
|
|
|
xPos = float64(*xfloat)
|
|
|
|
} else if xPos < float64(*xfloat) {
|
|
|
|
txt += "\t"
|
|
|
|
xPos = float64(*xfloat)
|
|
|
|
}
|
|
|
|
}
|
2017-03-01 16:02:53 +00:00
|
|
|
if inText && op.Operand == "TJ" {
|
|
|
|
if len(op.Params) < 1 {
|
|
|
|
continue
|
|
|
|
}
|
2018-08-03 10:17:06 +00:00
|
|
|
paramList, ok := op.Params[0].(*core.PdfObjectArray)
|
2017-03-01 16:02:53 +00:00
|
|
|
if !ok {
|
2018-12-08 19:16:52 +02:00
|
|
|
return "", fmt.Errorf("invalid parameter type, no array (%T)", op.Params[0])
|
2017-03-01 16:02:53 +00:00
|
|
|
}
|
2018-07-15 17:52:53 +00:00
|
|
|
for _, obj := range paramList.Elements() {
|
2017-10-29 10:34:40 -07:00
|
|
|
switch v := obj.(type) {
|
2018-08-03 10:17:06 +00:00
|
|
|
case *core.PdfObjectString:
|
2018-07-14 02:25:29 +00:00
|
|
|
txt += v.Str()
|
2018-08-03 10:17:06 +00:00
|
|
|
case *core.PdfObjectFloat:
|
2017-10-29 10:34:40 -07:00
|
|
|
if *v < -100 {
|
|
|
|
txt += " "
|
|
|
|
}
|
2018-08-03 10:17:06 +00:00
|
|
|
case *core.PdfObjectInteger:
|
2017-10-29 10:34:40 -07:00
|
|
|
if *v < -100 {
|
|
|
|
txt += " "
|
|
|
|
}
|
2017-03-01 16:02:53 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
} else if inText && op.Operand == "Tj" {
|
|
|
|
if len(op.Params) < 1 {
|
|
|
|
continue
|
|
|
|
}
|
2018-08-03 10:17:06 +00:00
|
|
|
param, ok := op.Params[0].(*core.PdfObjectString)
|
2017-03-01 16:02:53 +00:00
|
|
|
if !ok {
|
2018-12-08 19:16:52 +02:00
|
|
|
return "", fmt.Errorf("invalid parameter type, not string (%T)", op.Params[0])
|
2017-03-01 16:02:53 +00:00
|
|
|
}
|
2018-07-14 02:25:29 +00:00
|
|
|
txt += param.Str()
|
2017-03-01 16:02:53 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return txt, nil
|
|
|
|
}
|