mirror of
https://github.com/unidoc/unipdf.git
synced 2025-04-26 13:48:55 +08:00

* Track runes in IdentityEncoder (for subsetting), track decoded runes * Working with the identity encoder in font_composite.go * Add GetFilterArray to multi encoder. Add comments. * Add NewFromContents constructor to extractor only requiring contents and resources * golint fixes * Optimizer compress streams - improved detection of raw streams * Optimize - CleanContentStream optimizer that removes redundant operands * WIP Optimize - clean fonts Will support both font file reduction and subsetting. (WIP) * Optimize - image processing - try combined DCT and Flate * Update options.go * Update optimizer.go * Create utils.go for optimize with common methods needed for optimization * Optimizer - add font subsetting method Covers XObject Forms, annotations etc. Uses extractor package to extract text marks covering what fonts and glyphs are used. Package truetype used for subsetting. * Add some comments * Fix cmap parsing rune conversion * Error checking for extractor. Add some comments. * Update Jenkinsfile * Update modules
103 lines
2.2 KiB
Go
103 lines
2.2 KiB
Go
/*
|
|
* This file is subject to the terms and conditions defined in
|
|
* file 'LICENSE.md', which is part of this source code package.
|
|
*/
|
|
|
|
package optimize
|
|
|
|
import (
|
|
"bytes"
|
|
|
|
"github.com/unidoc/unipdf/v3/core"
|
|
)
|
|
|
|
type objectStructure struct {
|
|
catalogDict *core.PdfObjectDictionary
|
|
pagesDict *core.PdfObjectDictionary
|
|
pages []*core.PdfIndirectObject
|
|
}
|
|
|
|
// getObjectStructure identifies the Catalog and Pages dictionary and finds a list of pages.
|
|
func getObjectStructure(objects []core.PdfObject) objectStructure {
|
|
objstr := objectStructure{}
|
|
found := false
|
|
for _, obj := range objects {
|
|
switch t := obj.(type) {
|
|
case *core.PdfIndirectObject:
|
|
dict, is := core.GetDict(t)
|
|
if !is {
|
|
continue
|
|
}
|
|
kind, is := core.GetName(dict.Get("Type"))
|
|
if !is {
|
|
continue
|
|
}
|
|
|
|
switch kind.String() {
|
|
case "Catalog":
|
|
objstr.catalogDict = dict
|
|
found = true
|
|
}
|
|
}
|
|
if found {
|
|
break
|
|
}
|
|
}
|
|
|
|
if !found {
|
|
return objstr
|
|
}
|
|
|
|
pagesDict, ok := core.GetDict(objstr.catalogDict.Get("Pages"))
|
|
if !ok {
|
|
return objstr
|
|
}
|
|
objstr.pagesDict = pagesDict
|
|
|
|
kids, ok := core.GetArray(pagesDict.Get("Kids"))
|
|
if !ok {
|
|
return objstr
|
|
}
|
|
for _, obj := range kids.Elements() {
|
|
pobj, ok := core.GetIndirect(obj)
|
|
if !ok {
|
|
break
|
|
}
|
|
objstr.pages = append(objstr.pages, pobj)
|
|
}
|
|
|
|
return objstr
|
|
}
|
|
|
|
// getPageContents loads the page content stream as a string from a /Contents entry.
|
|
// Either a single stream, or an array of streams. Returns the list of objects that
|
|
// can be used if need to replace.
|
|
func getPageContents(contentsObj core.PdfObject) (contents string, objs []core.PdfObject) {
|
|
var buf bytes.Buffer
|
|
|
|
switch t := contentsObj.(type) {
|
|
case *core.PdfIndirectObject:
|
|
objs = append(objs, t)
|
|
contentsObj = t.PdfObject
|
|
}
|
|
|
|
switch t := contentsObj.(type) {
|
|
case *core.PdfObjectStream:
|
|
if decoded, err := core.DecodeStream(t); err == nil {
|
|
buf.Write(decoded)
|
|
objs = append(objs, t)
|
|
}
|
|
case *core.PdfObjectArray:
|
|
for _, elobj := range t.Elements() {
|
|
switch el := elobj.(type) {
|
|
case *core.PdfObjectStream:
|
|
if decoded, err := core.DecodeStream(el); err == nil {
|
|
buf.Write(decoded)
|
|
objs = append(objs, el)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return buf.String(), objs
|
|
}
|