Merge branch 'development' of https://github.com/unidoc/unipdf into development

This commit is contained in:
Gunnsteinn Hall 2020-06-16 21:19:49 +00:00
commit dbd2364470
20 changed files with 851 additions and 95 deletions

26
Jenkinsfile vendored
View File

@ -1,13 +1,10 @@
node {
// Install the desired Go version
def root = tool name: 'go 1.11.5', type: 'go'
def root = tool name: 'go 1.14.3', type: 'go'
env.GOROOT="${root}"
env.GOPATH="${WORKSPACE}/gopath"
// Hack for 1.11.5 testing work.
env.CGO_ENABLED="0"
env.PATH="${root}/bin:${env.GOPATH}/bin:${env.PATH}"
env.GOCACHE="off"
env.GOBIN="${WORKSPACE}/bin"
env.PATH="${root}/bin:${env.GOBIN}:${env.PATH}"
env.UNIDOC_EXTRACT_FORCETEST="1"
env.UNIDOC_E2E_FORCE_TESTS="1"
env.UNIDOC_EXTRACT_TESTDATA="/home/jenkins/corpus/unidoc-extractor-testdata"
@ -19,13 +16,13 @@ node {
env.UNIDOC_JBIG2_TESTDATA="/home/jenkins/corpus/jbig2-testdata"
env.UNIDOC_FDFMERGE_TESTDATA="/home/jenkins/corpus/fdfmerge-testdata"
env.UNIDOC_GS_BIN_PATH="/usr/bin/gs"
// Hack for 1.11.5 testing work.
env.CGO_ENABLED="0"
env.TMPDIR="${WORKSPACE}/temp"
sh "mkdir -p ${env.GOBIN}"
sh "mkdir -p ${env.TMPDIR}"
dir("${GOPATH}/src/github.com/unidoc/unipdf") {
dir("${WORKSPACE}/unipdf") {
sh 'go version'
stage('Checkout') {
@ -35,11 +32,9 @@ node {
stage('Prepare') {
// Get linter and other build tools.
sh 'go get -u golang.org/x/lint/golint'
sh 'go get golang.org/x/lint/golint'
sh 'go get github.com/tebeka/go2xunit'
sh 'go get github.com/t-yuki/gocover-cobertura'
// Get all dependencies (for tests also).
sh 'go get -t ./...'
}
stage('Linting') {
@ -53,7 +48,7 @@ node {
stage('Testing') {
// Go test - No tolerance.
sh "rm -f ${env.TMPDIR}/*.pdf"
sh '2>&1 go test -v ./... | tee gotest.txt'
sh '2>&1 go test -count=1 -v ./... | tee gotest.txt'
}
stage('Check generated PDFs') {
@ -62,7 +57,7 @@ node {
}
stage('Test coverage') {
sh 'go test -coverprofile=coverage.out -covermode=atomic -coverpkg=./... ./...'
sh 'go test -count=1 -coverprofile=coverage.out -covermode=atomic -coverpkg=./... ./...'
sh '/home/jenkins/codecov.sh'
sh 'gocover-cobertura < coverage.out > coverage.xml'
step([$class: 'CoberturaPublisher', coberturaReportFile: 'coverage.xml'])
@ -80,7 +75,7 @@ node {
}
}
dir("${GOPATH}/src/github.com/unidoc/unipdf-examples") {
dir("${WORKSPACE}/unipdf-examples") {
stage('Build examples') {
// Output environment variables (useful for debugging).
sh("printenv")
@ -97,6 +92,9 @@ node {
echo "Pulling unipdf-examples on branch ${examplesBranch}"
git url: 'https://github.com/unidoc/unidoc-examples.git', branch: examplesBranch
// Use replace directive to use disk version of unipdf.
sh 'echo "replace github.com/unidoc/unipdf/v3 => ../unipdf" >>go.mod'
// Dependencies for examples.
sh './build_examples.sh'

View File

@ -948,7 +948,6 @@ func newDCTEncoderFromStream(streamObj *PdfObjectStream, multiEnc *MultiEncoder)
return nil, err
}
encoded = e
}
bufReader := bytes.NewReader(encoded)
@ -2158,6 +2157,9 @@ func newMultiEncoderFromStream(streamObj *PdfObjectStream) (*MultiEncoder, error
// GetFilterName returns the names of the underlying encoding filters,
// separated by spaces.
// Note: This is just a string, should not be used in /Filter dictionary entry. Use GetFilterArray for that.
// TODO(v4): Refactor to GetFilter() which can be used for /Filter (either Name or Array), this can be
// renamed to String() as a pretty string to use in debugging etc.
func (enc *MultiEncoder) GetFilterName() string {
name := ""
for idx, encoder := range enc.encoders {
@ -2169,6 +2171,16 @@ func (enc *MultiEncoder) GetFilterName() string {
return name
}
// GetFilterArray returns the names of the underlying encoding filters in an array that
// can be used as /Filter entry.
func (enc *MultiEncoder) GetFilterArray() *PdfObjectArray {
	var filterNames []PdfObject
	for _, encoder := range enc.encoders {
		filterNames = append(filterNames, MakeName(encoder.GetFilterName()))
	}
	return MakeArray(filterNames...)
}
// MakeDecodeParams makes a new instance of an encoding dictionary based on
// the current encoder settings.
func (enc *MultiEncoder) MakeDecodeParams() PdfObject {
@ -2201,12 +2213,7 @@ func (enc *MultiEncoder) AddEncoder(encoder StreamEncoder) {
// MakeStreamDict makes a new instance of an encoding dictionary for a stream object.
func (enc *MultiEncoder) MakeStreamDict() *PdfObjectDictionary {
dict := MakeDict()
names := make([]PdfObject, len(enc.encoders))
for i, e := range enc.encoders {
names[i] = MakeName(e.GetFilterName())
}
dict.Set("Filter", MakeArray(names...))
dict.Set("Filter", enc.GetFilterArray())
// Pass all values from children, except Filter and DecodeParms.
for _, encoder := range enc.encoders {

View File

@ -42,9 +42,14 @@ func New(page *model.PdfPage) (*Extractor, error) {
// fmt.Printf("%s\n", contents)
// fmt.Println("========================= ::: =========================")
return NewFromContents(contents, page.Resources)
}
// NewFromContents creates a new extractor from contents and page resources.
func NewFromContents(contents string, resources *model.PdfPageResources) (*Extractor, error) {
e := &Extractor{
contents: contents,
resources: page.Resources,
resources: resources,
fontCache: map[string]fontEntry{},
formResults: map[string]textResult{},
}

View File

@ -439,7 +439,11 @@ func (to *textObject) showTextAdjusted(args *core.PdfObjectArray) error {
common.Log.Trace("showTextAdjusted: Bad string arg. o=%s args=%+v", o, args)
return core.ErrTypeError
}
to.renderText(charcodes)
err := to.renderText(charcodes)
if err != nil {
common.Log.Debug("Render text error: %v", err)
return err
}
default:
common.Log.Debug("ERROR: showTextAdjusted. Unexpected type (%T) args=%+v", o, args)
return core.ErrTypeError
@ -736,6 +740,7 @@ func (to *textObject) renderText(data []byte) error {
continue
}
// TODO(gunnsth): Assuming 1:1 charcode[i] <-> rune[i] mapping.
code := charcodes[i]
// The location of the text on the page in device coordinates is given by trm, the text
// rendering matrix.
@ -785,6 +790,8 @@ func (to *textObject) renderText(data []byte) error {
} else if font.Encoder() == nil {
common.Log.Debug("ERROR: No encoding. font=%s", font)
} else {
// TODO: This lookup seems confusing. Went from bytes <-> charcodes already.
// NOTE: This is needed to register runes by the font encoder - for subsetting (optimization).
original, ok := font.Encoder().CharcodeToRune(code)
if ok {
mark.original = string(original)

View File

@ -51,9 +51,7 @@ var doStress bool
func init() {
flag.BoolVar(&doStress, "extractor-stresstest", false, "Run text extractor stress tests.")
common.SetLogger(common.NewConsoleLogger(common.LogLevelInfo))
if flag.Lookup("test.v") != nil {
isTesting = true
}
isTesting = true
}
// TestTextExtractionFragments tests text extraction on the PDF fragments in `fragmentTests`.

5
go.mod
View File

@ -5,12 +5,15 @@ go 1.11
require (
github.com/adrg/sysfont v0.1.0
github.com/boombuler/barcode v1.0.0
github.com/davecgh/go-spew v1.1.1
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0
github.com/sirupsen/logrus v1.6.0
github.com/stretchr/testify v1.4.0
github.com/unidoc/pkcs7 v0.0.0-20200411230602-d883fd70d1df
github.com/unidoc/timestamp v0.0.0-20200412005513-91597fd3793a
github.com/unidoc/unitype v0.1.0
github.com/unidoc/unitype v0.2.0
golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5
golang.org/x/image v0.0.0-20181116024801-cd38e8056d9b
golang.org/x/sys v0.0.0-20200615200032-f1bc736245b1 // indirect
golang.org/x/text v0.3.2
)

10
go.sum
View File

@ -15,6 +15,8 @@ github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGw
github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
github.com/konsorten/go-windows-terminal-sequences v1.0.2 h1:DB17ag19krx9CFsz4o3enTrPXyIXCl+2iCXH/aMAp9s=
github.com/konsorten/go-windows-terminal-sequences v1.0.2/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
github.com/konsorten/go-windows-terminal-sequences v1.0.3 h1:CE8S1cTafDpPvMhIxNJKvHsGVBgn1xWYf1NbHQhywc8=
github.com/konsorten/go-windows-terminal-sequences v1.0.3/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI=
github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
@ -24,6 +26,8 @@ github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZb
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/sirupsen/logrus v1.5.0 h1:1N5EYkVAPEywqZRJd7cwnRtCb6xJx7NH3T3WUTF980Q=
github.com/sirupsen/logrus v1.5.0/go.mod h1:+F7Ogzej0PZc/94MaYx/nvG9jOFMD2osvC3s+Squfpo=
github.com/sirupsen/logrus v1.6.0 h1:UBcNElsrwanuuMsnGSlYmtmgbb23qDR5dG+6X6Oo89I=
github.com/sirupsen/logrus v1.6.0/go.mod h1:7uNnSEd1DgxDLC74fIahvMZmmYsHGZGEOFrfsX/uA88=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
github.com/stretchr/testify v1.4.0 h1:2E4SXV/wtOkTonXsotYi4li6zVWxYlZuYNCXe9XRJyk=
@ -34,6 +38,10 @@ github.com/unidoc/timestamp v0.0.0-20200412005513-91597fd3793a h1:RLtvUhe4DsUDl6
github.com/unidoc/timestamp v0.0.0-20200412005513-91597fd3793a/go.mod h1:j+qMWZVpZFTvDey3zxUkSgPJZEX33tDgU/QIA0IzCUw=
github.com/unidoc/unitype v0.1.0 h1:6zJYMl8XdwFBD45Cmg8Ge13WyE92jwLuK1tk2IsRb9s=
github.com/unidoc/unitype v0.1.0/go.mod h1:mafyug7zYmDOusqa7G0dJV45qp4b6TDAN+pHN7ZUIBU=
github.com/unidoc/unitype v0.1.1-0.20200524232639-77d42b645b02 h1:zVMJh0ehLc0amGBcqIh7HWikIGXGBGpmW+Lvz1YVYH8=
github.com/unidoc/unitype v0.1.1-0.20200524232639-77d42b645b02/go.mod h1:mafyug7zYmDOusqa7G0dJV45qp4b6TDAN+pHN7ZUIBU=
github.com/unidoc/unitype v0.2.0 h1:N+ZKjwz8UDU0qa1IYzstDLffvQEctFo+bo6b6ZqW+9M=
github.com/unidoc/unitype v0.2.0/go.mod h1:mafyug7zYmDOusqa7G0dJV45qp4b6TDAN+pHN7ZUIBU=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5 h1:58fnuSXlxZmFdJyvtTFVmVhcMLU6v5fEb/ok4wyqtNU=
golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
@ -45,6 +53,8 @@ golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7w
golang.org/x/sys v0.0.0-20190422165155-953cdadca894/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200413165638-669c56c373c4 h1:opSr2sbRXk5X5/givKrrKj9HXxFpW2sdCiP8MJSKLQY=
golang.org/x/sys v0.0.0-20200413165638-669c56c373c4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200615200032-f1bc736245b1 h1:ogLJMz+qpzav7lGMh10LMvAkM/fAoGlaiiHYiFYdm80=
golang.org/x/sys v0.0.0-20200615200032-f1bc736245b1/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/text v0.3.0 h1:g61tztE5qeGQ89tm6NTjjM9VPIm088od1l6aSorWRWg=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.2 h1:tW2bmiBqwgJj/UpqtC8EpXEZVYOwU0yG4iWbprSVAcs=

View File

@ -13,47 +13,81 @@ import (
"github.com/unidoc/unipdf/v3/core"
)
// IdentityEncoder represents an 2-byte identity encoding
// IdentityEncoder represents an 2-byte identity encoding.
// NOTE: In many cases this is just used to encode/decode to glyph index and does not have a unicode
// meaning, except via the ToUnicode maps.
// TODO: The use of runes as indicators for glyph indices and not-utf8 runes is not good and confusing.
// Might be better to combine the Identity encoder with a ToUnicode map and keep track of the actual
// runes and character codes, CMaps together.
type IdentityEncoder struct {
baseName string
// runes registered by encoder for tracking what runes are used for subsetting.
registeredMap map[rune]struct{}
}
// NewIdentityTextEncoder returns a new IdentityEncoder based on predefined
// encoding `baseName` and difference map `differences`.
func NewIdentityTextEncoder(baseName string) IdentityEncoder {
return IdentityEncoder{baseName}
func NewIdentityTextEncoder(baseName string) *IdentityEncoder {
return &IdentityEncoder{
baseName: baseName,
}
}
// RegisteredRunes returns the slice of runes that have been registered as used by the encoder.
func (enc *IdentityEncoder) RegisteredRunes() []rune {
	runes := make([]rune, 0, len(enc.registeredMap))
	for r := range enc.registeredMap {
		runes = append(runes, r)
	}
	return runes
}
// String returns a string that describes `enc`.
func (enc IdentityEncoder) String() string {
func (enc *IdentityEncoder) String() string {
return enc.baseName
}
// Encode converts the Go unicode string to a PDF encoded string.
func (enc IdentityEncoder) Encode(str string) []byte {
func (enc *IdentityEncoder) Encode(str string) []byte {
return encodeString16bit(enc, str)
}
// Decode converts PDF encoded string to a Go unicode string.
func (enc IdentityEncoder) Decode(raw []byte) string {
func (enc *IdentityEncoder) Decode(raw []byte) string {
return decodeString16bit(enc, raw)
}
// RuneToCharcode converts rune `r` to a PDF character code.
// The bool return flag is true if there was a match, and false otherwise.
func (enc IdentityEncoder) RuneToCharcode(r rune) (CharCode, bool) {
// TODO: Here the `r` is an actual rune.
func (enc *IdentityEncoder) RuneToCharcode(r rune) (CharCode, bool) {
if enc.registeredMap == nil {
enc.registeredMap = map[rune]struct{}{}
}
enc.registeredMap[r] = struct{}{} // Register use (subsetting).
return CharCode(r), true
}
// CharcodeToRune converts PDF character code `code` to a rune.
// The bool return flag is true if there was a match, and false otherwise.
func (enc IdentityEncoder) CharcodeToRune(code CharCode) (rune, bool) {
// TODO: Here the `r` is not necessarily an actual rune but a glyph index (unless both).
func (enc *IdentityEncoder) CharcodeToRune(code CharCode) (rune, bool) {
if enc.registeredMap == nil {
enc.registeredMap = map[rune]struct{}{}
}
// TODO: The rune(code) is confusing and is not an actual utf8 rune.
enc.registeredMap[rune(code)] = struct{}{}
return rune(code), true
}
// RuneToGlyph returns the glyph name for rune `r`.
// The bool return flag is true if there was a match, and false otherwise.
func (enc IdentityEncoder) RuneToGlyph(r rune) (GlyphName, bool) {
func (enc *IdentityEncoder) RuneToGlyph(r rune) (GlyphName, bool) {
if r == ' ' {
return "space", true
}
@ -63,7 +97,7 @@ func (enc IdentityEncoder) RuneToGlyph(r rune) (GlyphName, bool) {
// GlyphToRune returns the rune corresponding to glyph name `glyph`.
// The bool return flag is true if there was a match, and false otherwise.
func (enc IdentityEncoder) GlyphToRune(glyph GlyphName) (rune, bool) {
func (enc *IdentityEncoder) GlyphToRune(glyph GlyphName) (rune, bool) {
// String with "uniXXXX" format where XXXX is the hexcode.
if glyph == "space" {
return ' ', true
@ -78,7 +112,7 @@ func (enc IdentityEncoder) GlyphToRune(glyph GlyphName) (rune, bool) {
}
// ToPdfObject returns a nil as it is not truly a PDF object and should not be attempted to store in file.
func (enc IdentityEncoder) ToPdfObject() core.PdfObject {
func (enc *IdentityEncoder) ToPdfObject() core.PdfObject {
if enc.baseName != "" {
return core.MakeName(enc.baseName)
}

View File

@ -103,6 +103,9 @@ type simpleEncoding struct {
// one byte encoding: CharCode <-> byte
encode map[rune]byte
decode map[byte]rune
// runes registered by encoder for tracking what runes are used for subsetting.
registeredMap map[rune]struct{}
}
// Encode converts the Go unicode string to a PDF encoded string.
@ -213,6 +216,10 @@ func (enc *simpleEncoding) Charcodes() []CharCode {
func (enc *simpleEncoding) RuneToCharcode(r rune) (CharCode, bool) {
b, ok := enc.encode[r]
if enc.registeredMap == nil {
enc.registeredMap = map[rune]struct{}{}
}
enc.registeredMap[r] = struct{}{} // Register use (subsetting).
return CharCode(b), ok
}
@ -222,6 +229,10 @@ func (enc *simpleEncoding) CharcodeToRune(code CharCode) (rune, bool) {
}
b := byte(code)
r, ok := enc.decode[b]
if enc.registeredMap == nil {
enc.registeredMap = map[rune]struct{}{}
}
enc.registeredMap[r] = struct{}{} // Register use (subsetting).
return r, ok
}

View File

@ -103,31 +103,31 @@ type PdfAnnotationLink struct {
}
// GetAction returns the PDF action for the annotation link.
func (a *PdfAnnotationLink) GetAction() (*PdfAction, error) {
if a.action != nil {
return a.action, nil
func (link *PdfAnnotationLink) GetAction() (*PdfAction, error) {
if link.action != nil {
return link.action, nil
}
if a.A == nil {
if link.A == nil {
return nil, nil
}
if a.reader == nil {
if link.reader == nil {
return nil, nil
}
action, err := a.reader.loadAction(a.A)
action, err := link.reader.loadAction(link.A)
if err != nil {
return nil, err
}
a.action = action
link.action = action
return a.action, nil
return link.action, nil
}
// SetAction sets the PDF action for the annotation link.
func (a *PdfAnnotationLink) SetAction(action *PdfAction) {
a.action = action
func (link *PdfAnnotationLink) SetAction(action *PdfAction) {
link.action = action
if action == nil {
a.A = nil
link.A = nil
}
}

View File

@ -50,6 +50,7 @@ func (font *PdfFont) SubsetRegistered() error {
case *pdfFontType0:
err := t.subsetRegistered()
if err != nil {
common.Log.Debug("Subset error: %v", err)
return err
}
if t.container != nil {
@ -401,6 +402,7 @@ func (font *PdfFont) BytesToCharcodes(data []byte) []textencoding.CharCode {
charcodes := make([]textencoding.CharCode, 0, len(data)+len(data)%2)
if font.baseFields().isCIDFont() {
// Identity only?
if len(data) == 1 {
data = []byte{0, data[0]}
}
@ -413,6 +415,7 @@ func (font *PdfFont) BytesToCharcodes(data []byte) []textencoding.CharCode {
charcodes = append(charcodes, textencoding.CharCode(b))
}
} else {
// Simple font: byte -> charcode.
for _, b := range data {
charcodes = append(charcodes, textencoding.CharCode(b))
}
@ -755,8 +758,7 @@ func (base fontCommon) isCIDFont() bool {
// newFontBaseFieldsFromPdfObject returns `fontObj` as a dictionary the common fields from that
// dictionary in the fontCommon return. If there is a problem an error is returned.
// The fontCommon is the group of fields common to all PDF fonts.
func newFontBaseFieldsFromPdfObject(fontObj core.PdfObject) (*core.PdfObjectDictionary, *fontCommon,
error) {
func newFontBaseFieldsFromPdfObject(fontObj core.PdfObject) (*core.PdfObjectDictionary, *fontCommon, error) {
font := &fontCommon{}
if obj, ok := fontObj.(*core.PdfIndirectObject); ok {

View File

@ -127,6 +127,9 @@ func (font *pdfFontType0) baseFields() *fontCommon {
}
func (font *pdfFontType0) getFontDescriptor() *PdfFontDescriptor {
if font.fontDescriptor == nil && font.DescendantFont != nil {
return font.DescendantFont.FontDescriptor()
}
return font.fontDescriptor
}
@ -210,14 +213,19 @@ func (font *pdfFontType0) subsetRegistered() error {
common.Log.Debug("Missing font descriptor")
return nil
}
if font.encoder == nil {
common.Log.Debug("No encoder - subsetting ignored")
return nil
}
stream, ok := core.GetStream(cidfnt.fontDescriptor.FontFile2)
if !ok {
common.Log.Debug("Embedded font object not found -- ABORT subsseting")
common.Log.Debug("Embedded font object not found -- ABORT subsetting")
return errors.New("fontfile2 not found")
}
decoded, err := core.DecodeStream(stream)
if err != nil {
common.Log.Debug("Decode error: %v", err)
return err
}
@ -227,21 +235,52 @@ func (font *pdfFontType0) subsetRegistered() error {
return err
}
tenc, ok := font.encoder.(*textencoding.TrueTypeFontEncoder)
if !ok {
return fmt.Errorf("unsupported encoder for subsetting: %T", cidfnt.encoder)
var runes []rune
var subset *unitype.Font
switch tenc := font.encoder.(type) {
case *textencoding.TrueTypeFontEncoder:
// Means the font has been loaded from TTF file.
runes = tenc.RegisteredRunes()
subset, err = fnt.SubsetKeepRunes(runes)
if err != nil {
common.Log.Debug("ERROR: %v", err)
return err
}
// Reduce the encoder also.
tenc.SubsetRegistered()
case *textencoding.IdentityEncoder:
// IdentityEncoder typically means font was parsed from PDF file.
// TODO: These are not actual runes... but glyph ids ? Very confusing.
runes = tenc.RegisteredRunes()
indices := make([]unitype.GlyphIndex, len(runes))
for i, r := range runes {
indices[i] = unitype.GlyphIndex(r)
}
subset, err = fnt.SubsetKeepIndices(indices)
if err != nil {
common.Log.Debug("ERROR: %v", err)
return err
}
case textencoding.SimpleEncoder:
// Simple encoding, bytes are 0-255
charcodes := tenc.Charcodes()
for _, c := range charcodes {
r, ok := tenc.CharcodeToRune(c)
if !ok {
common.Log.Debug("ERROR: unable convert charcode to rune: %d", c)
continue
}
runes = append(runes, r)
}
default:
return fmt.Errorf("unsupported encoder for subsetting: %T", font.encoder)
}
runes := tenc.RegisteredRunes()
subset, err := fnt.SubsetKeepRunes(runes)
if err != nil {
return err
}
// Reduce the encoder also.
tenc.SubsetRegistered()
var buf bytes.Buffer
err = subset.Write(&buf)
if err != nil {
common.Log.Debug("ERROR: %v", err)
return err
}
@ -249,7 +288,7 @@ func (font *pdfFontType0) subsetRegistered() error {
if font.toUnicodeCmap != nil {
codeToUnicode := make(map[cmap.CharCode]rune, len(runes))
for _, r := range runes {
cc, ok := tenc.RuneToCharcode(r)
cc, ok := font.encoder.RuneToCharcode(r)
if !ok {
continue
}
@ -260,9 +299,16 @@ func (font *pdfFontType0) subsetRegistered() error {
stream, err = core.MakeStream(buf.Bytes(), core.NewFlateEncoder())
if err != nil {
common.Log.Debug("ERROR: %v", err)
return err
}
cidfnt.fontDescriptor.FontFile2 = stream
stream.Set("Length1", core.MakeInteger(int64(buf.Len())))
if curstr, ok := core.GetStream(cidfnt.fontDescriptor.FontFile2); ok {
// Replace the current stream (keep same object).
*curstr = *stream
} else {
cidfnt.fontDescriptor.FontFile2 = stream
}
// Set subset name.
tag := genSubsetTag()
@ -334,6 +380,7 @@ func newPdfFontType0FromPdfObject(d *core.PdfObjectDictionary, base *fontCommon)
encoderName, ok := core.GetNameVal(d.Get("Encoding"))
if ok {
// TODO: Identity-H maps 16-bit character codes straight to glyph index (don't need actual runes).
if encoderName == "Identity-H" || encoderName == "Identity-V" {
font.encoder = textencoding.NewIdentityTextEncoder(encoderName)
} else if cmap.IsPredefinedCMap(encoderName) {

View File

@ -0,0 +1,139 @@
/*
* This file is subject to the terms and conditions defined in
* file 'LICENSE.md', which is part of this source code package.
*/
package optimize
import (
"github.com/unidoc/unipdf/v3/contentstream"
"github.com/unidoc/unipdf/v3/core"
)
// CleanContentstream cleans up redundant operands in content streams, including Page and XObject Form
// contents. This process includes:
// 1. Marked content operators are removed.
// 2. Some operands are simplified (shorter form).
// TODO: Add more reduction methods and improving the methods for identifying unnecessary operands.
type CleanContentstream struct {
}
// filterOps cleans up the content stream in `ops`:
// 1. Marked content operators are cleaned.
// 2. Tm with 1 0 0 1 params are converted to Td (slightly shorter for same transformation).
// Returns a new operations slice; a nil input yields nil.
// TODO: Add operations that track the state and remove unnecessary operands, such as duplicates
// or ones setting default values, or ones not drawing anything.
func filterOps(ops *contentstream.ContentStreamOperations) *contentstream.ContentStreamOperations {
if ops == nil {
return nil
}
filtered := contentstream.ContentStreamOperations{}
for _, op := range *ops {
switch op.Operand {
// Marked content operators are dropped entirely (see note above).
case "BDC", "BMC", "EMC":
continue
case "Tm":
// Tm takes 6 numeric operands [a b c d e f]; when a,b,c,d form the
// identity matrix, the operation reduces to a translation by (e, f),
// which Td expresses with only two operands.
if len(op.Params) == 6 {
if nums, err := core.GetNumbersAsFloat(op.Params); err == nil {
if nums[0] == 1 && nums[1] == 0 && nums[2] == 0 && nums[3] == 1 {
op = &contentstream.ContentStreamOperation{
Params: []core.PdfObject{
op.Params[4],
op.Params[5],
},
Operand: "Td",
}
}
}
}
}
// Non-filtered (or rewritten) operations are kept.
filtered = append(filtered, op)
}
return &filtered
}
// reduceContent performs content stream optimization of contents in `cstream` which can either be
// from Page Contents or XObject Form.
// The stream is decoded, parsed, filtered via filterOps and re-encoded in place, but only if
// the cleaned content is actually smaller than the original.
// NOTE: If from a Contents array, the operations may be unbalanced.
func reduceContent(cstream *core.PdfObjectStream) error {
decoded, err := core.DecodeStream(cstream)
if err != nil {
return err
}
csp := contentstream.NewContentStreamParser(string(decoded))
ops, err := csp.Parse()
if err != nil {
return err
}
ops = filterOps(ops)
cleaned := ops.Bytes()
if len(cleaned) >= len(decoded) {
// No need to replace if no improvement.
return nil
}
// Reuse the already-serialized `cleaned` bytes rather than calling ops.Bytes()
// a second time (the original serialized the operations twice).
newstream, err := core.MakeStream(cleaned, core.NewFlateEncoder())
if err != nil {
return err
}
// Replace contents and merge in the new stream dictionary entries (e.g. Filter, Length).
cstream.Stream = newstream.Stream
cstream.Merge(newstream.PdfObjectDictionary)
return nil
}
// Optimize optimizes PDF objects to decrease PDF size.
// It scans `objects` for Page content streams and XObject Form streams, queues each unique
// stream once, and rewrites the queued streams in place via reduceContent.
// The input slice is returned unchanged on success.
func (c *CleanContentstream) Optimize(objects []core.PdfObject) (optimizedObjects []core.PdfObject, err error) {
// Track which content streams to process.
// queuedMap deduplicates; queued preserves discovery order for deterministic processing.
queuedMap := map[*core.PdfObjectStream]struct{}{}
var queued []*core.PdfObjectStream
appendQueue := func(stream *core.PdfObjectStream) {
if _, has := queuedMap[stream]; !has {
queuedMap[stream] = struct{}{}
queued = append(queued, stream)
}
}
// Collect objects to process: XObject Form and Page Content streams.
for _, obj := range objects {
switch t := obj.(type) {
case *core.PdfIndirectObject:
switch ti := t.PdfObject.(type) {
case *core.PdfObjectDictionary:
// Only Page dictionaries are of interest here.
if name, ok := core.GetName(ti.Get("Type")); !ok || name.String() != "Page" {
continue
}
// Page Contents may be a single stream or an array of streams.
if stream, ok := core.GetStream(ti.Get("Contents")); ok {
appendQueue(stream)
} else if array, ok := core.GetArray(ti.Get("Contents")); ok {
for _, el := range array.Elements() {
if stream, ok := core.GetStream(el); ok {
appendQueue(stream)
}
}
}
}
case *core.PdfObjectStream:
// Direct stream objects: only XObject Forms are queued.
if name, ok := core.GetName(t.Get("Type")); !ok || name.String() != "XObject" {
continue
}
if name, ok := core.GetName(t.Get("Subtype")); !ok || name.String() != "Form" {
continue
}
appendQueue(t)
}
}
// Process the queued content streams.
for _, stream := range queued {
err = reduceContent(stream)
if err != nil {
return nil, err
}
}
return objects, nil
}

View File

@ -0,0 +1,353 @@
/*
* This file is subject to the terms and conditions defined in
* file 'LICENSE.md', which is part of this source code package.
*/
package optimize
import (
"bytes"
"errors"
"github.com/unidoc/unitype"
"github.com/unidoc/unipdf/v3/common"
"github.com/unidoc/unipdf/v3/core"
"github.com/unidoc/unipdf/v3/extractor"
"github.com/unidoc/unipdf/v3/internal/textencoding"
"github.com/unidoc/unipdf/v3/model"
)
// CleanFonts cleans up embedded fonts, reducing font sizes.
type CleanFonts struct {
// Subset embedded fonts if encountered (if true).
// Otherwise attempts to reduce the font program.
Subset bool
}
// optimizeFontsWithSubsetting subsets the embedded TrueType font programs (FontFile2) of all
// fonts that are actually used in page, form and annotation content streams, keeping only the
// glyphs referenced by the extracted text marks. Returns the set of font-program streams that
// were processed so the caller can skip them in later passes.
func optimizeFontsWithSubsetting(objects []core.PdfObject) (processed map[*core.PdfObjectStream]struct{}, err error) {
// 1. Identify all fonts.
// 2. Identify content streams and their Resources dictionaries (both via page, forms and annotations).
// 3. Process content streams.
processed = map[*core.PdfObjectStream]struct{}{}
fontMap := map[*model.PdfFont]struct{}{}
objstr := getObjectStructure(objects)
for _, p := range objstr.pages {
pdict, ok := core.GetDict(p.PdfObject)
if !ok {
continue
}
resourcesDict, ok := core.GetDict(pdict.Get("Resources"))
if !ok {
continue
}
contents, _ := getPageContents(pdict.Get("Contents"))
resources, err := model.NewPdfPageResourcesFromDict(resourcesDict)
if err != nil {
return nil, err
}
// Page contents plus any annotation appearance-stream contents for this page.
allContents := []content{
{
content: contents,
resources: resources,
},
}
annotContents := getAnnotationContents(pdict.Get("Annots"))
if annotContents != nil {
allContents = append(allContents, annotContents...)
}
// Run text extraction over each content stream to discover which fonts are used.
// NOTE(review): extraction also registers used runes/charcodes with the font encoders,
// which the subsetting below relies on.
for _, cont := range allContents {
e, err := extractor.NewFromContents(cont.content, cont.resources)
if err != nil {
return nil, err
}
pt, _, _, err := e.ExtractPageText()
if err != nil {
return nil, err
}
for _, el := range pt.Marks().Elements() {
if el.Font == nil {
continue
}
if _, has := fontMap[el.Font]; !has {
fontMap[el.Font] = struct{}{}
}
}
}
}
// Map of font program stream to font. Multiple fonts can use the same font program.
fontFileMap := map[*core.PdfObjectStream][]*model.PdfFont{}
for font := range fontMap {
fontDesc := font.FontDescriptor()
if fontDesc == nil || fontDesc.FontFile2 == nil {
// Only embedded TrueType programs (FontFile2) are handled here.
continue
}
stream, ok := core.GetStream(fontDesc.FontFile2)
if !ok {
continue
}
fontFileMap[stream] = append(fontFileMap[stream], font)
}
// For each font program, gather the used runes/glyph indices across all fonts sharing it.
for stream := range fontFileMap {
var allRunes []rune
var allIndices []unitype.GlyphIndex
for _, font := range fontFileMap[stream] {
switch t := font.Encoder().(type) {
case *textencoding.IdentityEncoder:
// TODO: This terminology is wrong as those are not runes, just charcodes cast as runes.
// Identity encoder maps via 2-byte encoding directly from 2byte charcode to glyph index.
runes := t.RegisteredRunes()
indices := make([]unitype.GlyphIndex, len(runes))
for i, r := range runes {
indices[i] = unitype.GlyphIndex(r)
}
allIndices = append(allIndices, indices...)
case *textencoding.TrueTypeFontEncoder:
runes := t.RegisteredRunes()
allRunes = append(allRunes, runes...)
case textencoding.SimpleEncoder:
// Simple (one-byte) encodings: convert each charcode to its rune.
charcodes := t.Charcodes()
for _, c := range charcodes {
r, ok := t.CharcodeToRune(c)
if !ok {
common.Log.Debug("Charcode<->rune not found: %d", c)
continue
}
allRunes = append(allRunes, r)
}
}
}
err = subsetFontStream(stream, allRunes, allIndices)
if err != nil {
common.Log.Debug("ERROR subsetting font stream: %v", err)
return nil, err
}
processed[stream] = struct{}{}
}
return processed, nil
}
// Subsets the font program in `stream` with the subset based on the `runes` and glyph `indices`.
// The stream is rewritten in place (same object) with a Flate-encoded subset font and an updated
// Length1 entry. If the subset would be larger than the original, the stream is left unchanged.
func subsetFontStream(stream *core.PdfObjectStream, runes []rune, indices []unitype.GlyphIndex) error {
stream, ok := core.GetStream(stream)
if !ok {
common.Log.Debug("Embedded font object not found -- ABORT subsetting")
return errors.New("fontfile2 not found")
}
decoded, err := core.DecodeStream(stream)
if err != nil {
common.Log.Debug("Decode error: %v", err)
return err
}
fnt, err := unitype.Parse(bytes.NewReader(decoded))
if err != nil {
common.Log.Debug("Error parsing %d byte font", len(stream.Stream))
return err
}
// Resolve the requested runes to glyph indices and merge with the explicit indices.
allIndices := indices
if len(runes) > 0 {
indices := fnt.LookupRunes(runes)
allIndices = append(allIndices, indices...)
}
fnt, err = fnt.SubsetKeepIndices(allIndices)
if err != nil {
common.Log.Debug("ERROR subsetting font: %v", err)
return err
}
var buf bytes.Buffer
err = fnt.Write(&buf)
if err != nil {
common.Log.Debug("ERROR Writing font: %v", err)
return err
}
if buf.Len() > len(decoded) {
// Subsetting did not help - keep the original stream.
common.Log.Debug("Re-written font is larger than original - skip")
return nil
}
newstream, err := core.MakeStream(buf.Bytes(), core.NewFlateEncoder())
if err != nil {
common.Log.Debug("ERROR Writing font: %v", err)
return err
}
// Overwrite.
// Copying into the existing stream object keeps all indirect references valid.
*stream = *newstream
stream.Set("Length1", core.MakeInteger(int64(buf.Len())))
return nil
}
// Optimize optimizes PDF objects to decrease PDF size.
// When c.Subset is set, used-glyph subsetting is applied first; afterwards every remaining
// TrueType font program stream is re-written with only its minimal needed tables.
// Non-font streams and unsupported formats are skipped; per-stream failures are logged and
// ignored so one bad font does not abort the whole optimization.
func (c *CleanFonts) Optimize(objects []core.PdfObject) (optimizedObjects []core.PdfObject, err error) {
var processed map[*core.PdfObjectStreams]struct{}
if c.Subset {
var err error
processed, err = optimizeFontsWithSubsetting(objects)
if err != nil {
return nil, err
}
}
// Clean font streams by loading and rewriting with minimal needed tables.
for _, obj := range objects {
stream, isStreamObj := core.GetStream(obj)
if !isStreamObj {
continue
}
if _, has := processed[stream]; has {
// Skip - has been processed.
continue
}
encoder, err := core.NewEncoderFromStream(stream)
if err != nil {
common.Log.Debug("ERROR getting encoder: %v - ignoring", err)
continue
}
decoded, err := encoder.DecodeStream(stream)
if err != nil {
common.Log.Debug("Decoding error : %v - ignoring", err)
continue
}
if len(decoded) < 4 {
continue
}
// The first 4 bytes of an sfnt font identify its flavor.
version := string(decoded[:4])
if version == "OTTO" {
// Fonts based on PostScript outlines not supported yet.
// See https://docs.microsoft.com/en-us/typography/opentype/spec/otff
continue
}
if version != "\x00\x01\x00\x00" && version != "true" {
// Not a TrueType-flavored font - leave untouched.
continue
}
fnt, err := unitype.Parse(bytes.NewReader(decoded))
if err != nil {
common.Log.Debug("ERROR Parsing font: %v - ignoring", err)
continue
}
err = fnt.Optimize()
if err != nil {
continue
}
var buf bytes.Buffer
err = fnt.Write(&buf)
if err != nil {
common.Log.Debug("ERROR Writing font: %v - ignoring", err)
continue
}
if buf.Len() > len(decoded) {
common.Log.Debug("Re-written font is larger than original - skip")
continue
}
newstream, err := core.MakeStream(buf.Bytes(), core.NewFlateEncoder())
if err != nil {
continue
}
// Overwrite.
// In-place copy keeps indirect references to this stream object valid.
*stream = *newstream
stream.Set("Length1", core.MakeInteger(int64(buf.Len())))
}
return objects, nil
}
// content describes page or font contents which is a content stream along with resources.
type content struct {
	content string // Content stream data as a string.
	resources *model.PdfPageResources // Resources the content stream refers to.
}
// Best effort to get annotation contents.
func getAnnotationContents(annotsObj core.PdfObject) []content {
	if annotsObj == nil {
		return nil
	}
	annots, ok := core.GetArray(annotsObj)
	if !ok {
		common.Log.Debug("Annots not an array")
		return nil
	}

	var results []content
	for _, element := range annots.Elements() {
		dict, isDict := core.GetDict(element)
		if !isDict {
			// Ignore any non dict elements.
			common.Log.Debug("Ignoring non-dict element in Annots")
			continue
		}

		// Appearance.
		ap, hasAP := core.GetDict(dict.Get("AP"))
		if !hasAP {
			common.Log.Debug("No AP entry - skipping")
			continue
		}
		normal := core.TraceToDirectObject(ap.Get("N"))
		if normal == nil {
			common.Log.Debug("No N entry - skipping")
			continue
		}

		// Resolve the normal appearance to a form XObject stream. A dictionary
		// holds one stream per appearance state, selected via /AS.
		var appearance *core.PdfObjectStream
		switch v := normal.(type) {
		case *core.PdfObjectStream:
			appearance = v
		case *core.PdfObjectDictionary:
			state, hasAS := core.GetName(dict.Get("AS"))
			if !hasAS {
				common.Log.Debug("No AS entry - skipping")
				continue
			}
			form, found := core.GetStream(v.Get(*state))
			if !found {
				common.Log.Debug("Form not found - skipping")
				continue
			}
			appearance = form
		}
		if appearance == nil {
			common.Log.Debug("Form not found (nil) - skipping")
			continue
		}

		xform, err := model.NewXObjectFormFromStream(appearance)
		if err != nil {
			common.Log.Debug("Error loading form: %v - ignoring", err)
			continue
		}
		cs, err := xform.GetContentStream()
		if err != nil {
			common.Log.Debug("Error decoding contents: %v", err)
			continue
		}
		results = append(results, content{
			content:   string(cs),
			resources: xform.Resources,
		})
	}
	return results
}

View File

@ -23,9 +23,17 @@ func (c *CompressStreams) Optimize(objects []core.PdfObject) (optimizedObjects [
if !isStreamObj {
continue
}
if _, found := core.GetName(stream.PdfObjectDictionary.Get("Filter")); found {
continue
// Skip objects that are already encoded.
// TODO: Try filter combinations, and ignoring inefficient filters.
if obj := stream.Get("Filter"); obj != nil {
if _, skip := core.GetName(obj); skip {
continue
}
if arr, ok := core.GetArray(obj); ok && arr.Len() > 0 {
continue
}
}
encoder := core.NewFlateEncoder() // Most mainstream compressor and probably most robust.
var data []byte
data, err = encoder.EncodeBytes(stream.Stream)

View File

@ -110,28 +110,51 @@ func (i *Image) Optimize(objects []core.PdfObject) (optimizedObjects []core.PdfO
common.Log.Warning("Error decode the image stream %s")
continue
}
encoder := core.NewDCTEncoder()
encoder.ColorComponents = img.ColorComponents
encoder.Quality = i.ImageQuality
encoder.BitsPerComponent = img.BitsPerComponent
encoder.Width = img.Width
encoder.Height = img.Height
streamData, err := encoder.EncodeBytes(data)
dctenc := core.NewDCTEncoder()
dctenc.ColorComponents = img.ColorComponents
dctenc.Quality = i.ImageQuality
dctenc.BitsPerComponent = img.BitsPerComponent
dctenc.Width = img.Width
dctenc.Height = img.Height
streamData, err := dctenc.EncodeBytes(data)
if err != nil {
common.Log.Debug("ERROR: %v", err)
return nil, err
}
var filter core.StreamEncoder
filter = dctenc
// Check if combining with FlateEncoding improves things further.
{
flate := core.NewFlateEncoder()
multienc := core.NewMultiEncoder()
multienc.AddEncoder(flate)
multienc.AddEncoder(dctenc)
encoded, err := multienc.EncodeBytes(data)
if err != nil {
return nil, err
}
if len(encoded) < len(streamData) {
common.Log.Debug("Multi enc improves: %d to %d (orig %d)",
len(streamData), len(encoded), len(stream.Stream))
streamData = encoded
filter = multienc
}
}
originalSize := len(stream.Stream)
if originalSize < len(streamData) {
// Worse - ignoring.
continue
}
newStream := &core.PdfObjectStream{Stream: streamData}
newStream.PdfObjectReference = stream.PdfObjectReference
newStream.PdfObjectDictionary = core.MakeDict()
newStream.PdfObjectDictionary.Merge(stream.PdfObjectDictionary)
fn := core.PdfObjectName(encoder.GetFilterName())
newStream.PdfObjectDictionary.Set(core.PdfObjectName("Filter"), &fn)
ln := core.PdfObjectInteger(int64(len(streamData)))
newStream.PdfObjectDictionary.Set(core.PdfObjectName("Length"), &ln)
newStream.Merge(stream.PdfObjectDictionary)
newStream.Merge(filter.MakeStreamDict())
newStream.Set("Length", core.MakeInteger(int64(len(streamData))))
replaceTable[stream] = newStream
images[index].Stream = newStream
}

View File

@ -12,6 +12,12 @@ import (
// New creates a optimizers chain from options.
func New(options Options) *Chain {
chain := new(Chain)
if options.CleanFonts || options.SubsetFonts {
chain.Append(&CleanFonts{Subset: options.SubsetFonts})
}
if options.CleanContentstream {
chain.Append(new(CleanContentstream))
}
if options.ImageUpperPPI > 0 {
imageOptimizer := new(ImagePPI)
imageOptimizer.ImageUpperPPI = options.ImageUpperPPI

View File

@ -14,4 +14,7 @@ type Options struct {
UseObjectStreams bool
CombineIdenticalIndirectObjects bool
CompressStreams bool
CleanFonts bool
SubsetFonts bool
CleanContentstream bool
}

102
model/optimize/utils.go Normal file
View File

@ -0,0 +1,102 @@
/*
* This file is subject to the terms and conditions defined in
* file 'LICENSE.md', which is part of this source code package.
*/
package optimize
import (
"bytes"
"github.com/unidoc/unipdf/v3/core"
)
// objectStructure collects the key document-structure objects found by
// getObjectStructure: the catalog, the root pages dictionary, and the pages.
type objectStructure struct {
	catalogDict *core.PdfObjectDictionary // Catalog dictionary (/Type /Catalog).
	pagesDict *core.PdfObjectDictionary // Dictionary from the catalog's /Pages entry.
	pages []*core.PdfIndirectObject // Indirect page objects from /Kids.
}
// getObjectStructure identifies the Catalog and Pages dictionary and finds a list of pages.
func getObjectStructure(objects []core.PdfObject) objectStructure {
	var structure objectStructure

	// Locate the catalog dictionary among the indirect objects.
	for _, obj := range objects {
		ind, isIndirect := obj.(*core.PdfIndirectObject)
		if !isIndirect {
			continue
		}
		dict, hasDict := core.GetDict(ind)
		if !hasDict {
			continue
		}
		typeName, hasType := core.GetName(dict.Get("Type"))
		if !hasType {
			continue
		}
		if typeName.String() == "Catalog" {
			structure.catalogDict = dict
			break
		}
	}
	if structure.catalogDict == nil {
		// No catalog found - return what we have (nothing).
		return structure
	}

	pagesDict, hasPages := core.GetDict(structure.catalogDict.Get("Pages"))
	if !hasPages {
		return structure
	}
	structure.pagesDict = pagesDict

	kids, hasKids := core.GetArray(pagesDict.Get("Kids"))
	if !hasKids {
		return structure
	}
	for _, kid := range kids.Elements() {
		page, isIndirect := core.GetIndirect(kid)
		if !isIndirect {
			// Kids are expected to be indirect objects; stop collecting otherwise.
			break
		}
		structure.pages = append(structure.pages, page)
	}
	return structure
}
// getPageContents loads the page content stream as a string from a /Contents entry.
// Either a single stream, or an array of streams. Returns the list of objects that
// can be used if need to replace.
func getPageContents(contentsObj core.PdfObject) (contents string, objs []core.PdfObject) {
	var combined bytes.Buffer

	// Unwrap an indirect reference, remembering it for potential replacement.
	if ind, isIndirect := contentsObj.(*core.PdfIndirectObject); isIndirect {
		objs = append(objs, ind)
		contentsObj = ind.PdfObject
	}

	// appendStream decodes a single content stream and accumulates its data;
	// streams that fail to decode are silently skipped (best effort).
	appendStream := func(s *core.PdfObjectStream) {
		if decoded, err := core.DecodeStream(s); err == nil {
			combined.Write(decoded)
			objs = append(objs, s)
		}
	}

	switch v := contentsObj.(type) {
	case *core.PdfObjectStream:
		appendStream(v)
	case *core.PdfObjectArray:
		for _, element := range v.Elements() {
			if s, isStream := element.(*core.PdfObjectStream); isStream {
				appendStream(s)
			}
		}
	}
	return combined.String(), objs
}

View File

@ -281,42 +281,42 @@ func (r *PdfReader) newPdfPageFromDict(p *core.PdfObjectDictionary) (*PdfPage, e
// GetAnnotations returns the list of page annotations for `page`. If not loaded attempts to load the
// annotations, otherwise returns the loaded list.
func (page *PdfPage) GetAnnotations() ([]*PdfAnnotation, error) {
if page.annotations != nil {
return page.annotations, nil
func (p *PdfPage) GetAnnotations() ([]*PdfAnnotation, error) {
if p.annotations != nil {
return p.annotations, nil
}
if page.Annots == nil {
page.annotations = []*PdfAnnotation{}
if p.Annots == nil {
p.annotations = []*PdfAnnotation{}
return nil, nil
}
if page.reader == nil {
page.annotations = []*PdfAnnotation{}
if p.reader == nil {
p.annotations = []*PdfAnnotation{}
return nil, nil
}
annots, err := page.reader.loadAnnotations(page.Annots)
annots, err := p.reader.loadAnnotations(p.Annots)
if err != nil {
return nil, err
}
if annots == nil {
page.annotations = []*PdfAnnotation{}
p.annotations = []*PdfAnnotation{}
}
page.annotations = annots
return page.annotations, nil
p.annotations = annots
return p.annotations, nil
}
// AddAnnotation appends `annot` to the list of page annotations.
func (page *PdfPage) AddAnnotation(annot *PdfAnnotation) {
if page.annotations == nil {
page.GetAnnotations() // Ensure has been loaded.
func (p *PdfPage) AddAnnotation(annot *PdfAnnotation) {
if p.annotations == nil {
p.GetAnnotations() // Ensure has been loaded.
}
page.annotations = append(page.annotations, annot)
p.annotations = append(p.annotations, annot)
}
// SetAnnotations sets the annotations list.
func (page *PdfPage) SetAnnotations(annotations []*PdfAnnotation) {
page.annotations = annotations
func (p *PdfPage) SetAnnotations(annotations []*PdfAnnotation) {
p.annotations = annotations
}
// loadAnnotations loads and returns the PDF annotations from the input annotations object (array).