mirror of
https://github.com/unidoc/unipdf.git
synced 2025-05-01 22:17:29 +08:00

* Track runes in IdentityEncoder (for subsetting), track decoded runes * Working with the identity encoder in font_composite.go * Add GetFilterArray to multi encoder. Add comments. * Add NewFromContents constructor to extractor only requiring contents and resources * golint fixes * Optimizer compress streams - improved detection of raw streams * Optimize - CleanContentStream optimizer that removes redundant operands * WIP Optimize - clean fonts Will support both font file reduction and subsetting. (WIP) * Optimize - image processing - try combined DCT and Flate * Update options.go * Update optimizer.go * Create utils.go for optimize with common methods needed for optimization * Optimizer - add font subsetting method Covers XObject Forms, annotations etc. Uses extractor package to extract text marks covering what fonts and glyphs are used. Package truetype used for subsetting. * Add some comments * Fix cmap parsing rune conversion * Error checking for extractor. Add some comments. * Update Jenkinsfile * Update modules
140 lines
3.9 KiB
Go
140 lines
3.9 KiB
Go
/*
|
|
* This file is subject to the terms and conditions defined in
|
|
* file 'LICENSE.md', which is part of this source code package.
|
|
*/
|
|
|
|
package optimize
|
|
|
|
import (
|
|
"github.com/unidoc/unipdf/v3/contentstream"
|
|
"github.com/unidoc/unipdf/v3/core"
|
|
)
|
|
|
|
// CleanContentstream cleans up redundant operands in content streams, including Page and XObject Form
// contents. This process includes:
// 1. Marked content operators are removed.
// 2. Some operands are simplified (shorter form).
// TODO: Add more reduction methods and improve the methods for identifying unnecessary operands.
type CleanContentstream struct{}
|
|
|
|
// filterOps cleans up the content stream in `ops`:
|
|
// 1. Marked content operators are cleaned.
|
|
// 2. Tm with 1 0 0 1 params are converted to Td (slightly shorter for same transformation).
|
|
// TODO: Add operations that track the state and remove unnecessary operands, such as duplicates
|
|
// or ones setting default values, or ones not drawing anything.
|
|
func filterOps(ops *contentstream.ContentStreamOperations) *contentstream.ContentStreamOperations {
|
|
if ops == nil {
|
|
return nil
|
|
}
|
|
|
|
filtered := contentstream.ContentStreamOperations{}
|
|
for _, op := range *ops {
|
|
switch op.Operand {
|
|
case "BDC", "BMC", "EMC":
|
|
continue
|
|
case "Tm":
|
|
if len(op.Params) == 6 {
|
|
if nums, err := core.GetNumbersAsFloat(op.Params); err == nil {
|
|
if nums[0] == 1 && nums[1] == 0 && nums[2] == 0 && nums[3] == 1 {
|
|
op = &contentstream.ContentStreamOperation{
|
|
Params: []core.PdfObject{
|
|
op.Params[4],
|
|
op.Params[5],
|
|
},
|
|
Operand: "Td",
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
filtered = append(filtered, op)
|
|
}
|
|
return &filtered
|
|
}
|
|
|
|
// reduceContent performs content stream optimization of contents in `cstream` which can either be
|
|
// from Page Contents or XObject Form.
|
|
// NOTE: If from a Contents array, the operations may be unbalanced.
|
|
func reduceContent(cstream *core.PdfObjectStream) error {
|
|
decoded, err := core.DecodeStream(cstream)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
csp := contentstream.NewContentStreamParser(string(decoded))
|
|
ops, err := csp.Parse()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
ops = filterOps(ops)
|
|
cleaned := ops.Bytes()
|
|
if len(cleaned) >= len(decoded) {
|
|
// No need to replace if no improvement.
|
|
return nil
|
|
}
|
|
|
|
newstream, err := core.MakeStream(ops.Bytes(), core.NewFlateEncoder())
|
|
if err != nil {
|
|
return err
|
|
}
|
|
cstream.Stream = newstream.Stream
|
|
cstream.Merge(newstream.PdfObjectDictionary)
|
|
return nil
|
|
}
|
|
|
|
// Optimize optimizes PDF objects to decrease PDF size.
|
|
func (c *CleanContentstream) Optimize(objects []core.PdfObject) (optimizedObjects []core.PdfObject, err error) {
|
|
// Track which content streams to process.
|
|
queuedMap := map[*core.PdfObjectStream]struct{}{}
|
|
var queued []*core.PdfObjectStream
|
|
appendQueue := func(stream *core.PdfObjectStream) {
|
|
if _, has := queuedMap[stream]; !has {
|
|
queuedMap[stream] = struct{}{}
|
|
queued = append(queued, stream)
|
|
}
|
|
}
|
|
|
|
// Collect objects to process: XObject Form and Page Content streams.
|
|
for _, obj := range objects {
|
|
switch t := obj.(type) {
|
|
case *core.PdfIndirectObject:
|
|
switch ti := t.PdfObject.(type) {
|
|
case *core.PdfObjectDictionary:
|
|
if name, ok := core.GetName(ti.Get("Type")); !ok || name.String() != "Page" {
|
|
continue
|
|
}
|
|
|
|
if stream, ok := core.GetStream(ti.Get("Contents")); ok {
|
|
appendQueue(stream)
|
|
} else if array, ok := core.GetArray(ti.Get("Contents")); ok {
|
|
for _, el := range array.Elements() {
|
|
if stream, ok := core.GetStream(el); ok {
|
|
appendQueue(stream)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
case *core.PdfObjectStream:
|
|
if name, ok := core.GetName(t.Get("Type")); !ok || name.String() != "XObject" {
|
|
continue
|
|
}
|
|
if name, ok := core.GetName(t.Get("Subtype")); !ok || name.String() != "Form" {
|
|
continue
|
|
}
|
|
appendQueue(t)
|
|
}
|
|
}
|
|
|
|
// Process the queued content streams.
|
|
for _, stream := range queued {
|
|
err = reduceContent(stream)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
|
|
return objects, nil
|
|
}
|