unipdf/model/optimize/utils.go
Gunnsteinn Hall 11f692bc3a
Font subsetting and font optimization improvements (#362)
* Track runes in IdentityEncoder (for subsetting), track decoded runes

* Working with the identity encoder in font_composite.go

* Add GetFilterArray to multi encoder.  Add comments.

* Add NewFromContents constructor to extractor only requiring contents and resources

* golint fixes

* Optimizer compress streams - improved detection of raw streams

* Optimize - CleanContentStream optimizer that removes redundant operands

* WIP Optimize - clean fonts

Will support both font file reduction and subsetting. (WIP)

* Optimize - image processing - try combined DCT and Flate

* Update options.go

* Update optimizer.go

* Create utils.go for optimize with common methods needed for optimization

* Optimizer - add font subsetting method

Covers XObject Forms, annotations etc.  Uses extractor package to extract text marks covering what fonts and glyphs are used.  Package truetype used for subsetting.

* Add some comments

* Fix cmap parsing rune conversion

* Error checking for extractor.  Add some comments.

* Update Jenkinsfile

* Update modules
2020-06-16 21:19:10 +00:00

103 lines
2.2 KiB
Go

/*
* This file is subject to the terms and conditions defined in
* file 'LICENSE.md', which is part of this source code package.
*/
package optimize
import (
"bytes"
"github.com/unidoc/unipdf/v3/core"
)
// objectStructure holds the document-level objects located by getObjectStructure.
// Fields that could not be resolved are left at their zero value.
type objectStructure struct {
// catalogDict is the dictionary whose Type entry is the name "Catalog".
catalogDict *core.PdfObjectDictionary
// pagesDict is the dictionary resolved from the catalog's "Pages" entry.
pagesDict *core.PdfObjectDictionary
// pages lists the indirect objects found in the "Kids" array of pagesDict.
pages []*core.PdfIndirectObject
}
// getObjectStructure scans objects for the document Catalog and, starting from it,
// resolves the Pages dictionary and the indirect objects listed in its Kids array.
//
// Only the first Catalog encountered is used. The result is filled in as far as the
// lookup gets; anything that could not be resolved keeps its zero value.
func getObjectStructure(objects []core.PdfObject) objectStructure {
	var structure objectStructure

	// Find the first indirect object holding a dictionary whose Type is "Catalog".
	hasCatalog := false
	for _, object := range objects {
		indirect, isIndirect := object.(*core.PdfIndirectObject)
		if !isIndirect {
			continue
		}
		dict, isDict := core.GetDict(indirect)
		if !isDict {
			continue
		}
		typeName, isName := core.GetName(dict.Get("Type"))
		if !isName {
			continue
		}
		if typeName.String() == "Catalog" {
			structure.catalogDict = dict
			hasCatalog = true
			break
		}
	}
	if !hasCatalog {
		return structure
	}

	pagesDict, isDict := core.GetDict(structure.catalogDict.Get("Pages"))
	if !isDict {
		return structure
	}
	structure.pagesDict = pagesDict

	kidsArray, isArray := core.GetArray(pagesDict.Get("Kids"))
	if !isArray {
		return structure
	}

	// Collection stops at the first Kids entry that is not an indirect object.
	for _, kid := range kidsArray.Elements() {
		page, isIndirect := core.GetIndirect(kid)
		if !isIndirect {
			break
		}
		structure.pages = append(structure.pages, page)
	}
	return structure
}
// getPageContents loads the decoded page content as a single string from a /Contents
// entry. The entry may be a single stream or an array of streams, either of which may
// be wrapped in an indirect object.
//
// When more than one stream contributes data, a newline is written between the decoded
// pieces. The PDF specification only permits the division between content streams to
// fall on token boundaries, so joining them without whitespace could fuse the last
// token of one stream with the first token of the next.
//
// Decoding is best effort: a stream that fails to decode contributes nothing to
// contents and is left out of objs.
//
// Returns the combined content and the objects it was read from (the wrapping indirect
// object, if any, followed by every stream that decoded successfully). Callers can use
// that list if the content needs to be replaced.
func getPageContents(contentsObj core.PdfObject) (contents string, objs []core.PdfObject) {
	var buf bytes.Buffer

	// Unwrap an indirect object, keeping it in the list of source objects.
	if indirect, ok := contentsObj.(*core.PdfIndirectObject); ok {
		objs = append(objs, indirect)
		contentsObj = indirect.PdfObject
	}

	// appendStream decodes one stream and adds its data and object to the results.
	appendStream := func(stream *core.PdfObjectStream) {
		decoded, err := core.DecodeStream(stream)
		if err != nil {
			// Deliberately skipped: there is no error return, and the remaining
			// streams may still be usable.
			return
		}
		if buf.Len() > 0 {
			// Keep tokens of adjacent streams apart.
			buf.WriteByte('\n')
		}
		buf.Write(decoded)
		objs = append(objs, stream)
	}

	switch t := contentsObj.(type) {
	case *core.PdfObjectStream:
		appendStream(t)
	case *core.PdfObjectArray:
		for _, el := range t.Elements() {
			if stream, ok := el.(*core.PdfObjectStream); ok {
				appendStream(stream)
			}
		}
	}
	return buf.String(), objs
}