mirror of
https://github.com/unidoc/unipdf.git
synced 2025-04-27 13:48:51 +08:00

* Track runes in IdentityEncoder (for subsetting), track decoded runes * Working with the identity encoder in font_composite.go * Add GetFilterArray to multi encoder. Add comments. * Add NewFromContents constructor to extractor only requiring contents and resources * golint fixes * Optimizer compress streams - improved detection of raw streams * Optimize - CleanContentStream optimizer that removes redundant operands * WIP Optimize - clean fonts Will support both font file reduction and subsetting. (WIP) * Optimize - image processing - try combined DCT and Flate * Update options.go * Update optimizer.go * Create utils.go for optimize with common methods needed for optimization * Optimizer - add font subsetting method Covers XObject Forms, annotations etc. Uses extractor package to extract text marks covering what fonts and glyphs are used. Package truetype used for subsetting. * Add some comments * Fix cmap parsing rune conversion * Error checking for extractor. Add some comments. * Update Jenkinsfile * Update modules
354 lines
8.7 KiB
Go
354 lines
8.7 KiB
Go
/*
|
|
* This file is subject to the terms and conditions defined in
|
|
* file 'LICENSE.md', which is part of this source code package.
|
|
*/
|
|
|
|
package optimize
|
|
|
|
import (
|
|
"bytes"
|
|
"errors"
|
|
|
|
"github.com/unidoc/unitype"
|
|
|
|
"github.com/unidoc/unipdf/v3/common"
|
|
"github.com/unidoc/unipdf/v3/core"
|
|
"github.com/unidoc/unipdf/v3/extractor"
|
|
"github.com/unidoc/unipdf/v3/internal/textencoding"
|
|
"github.com/unidoc/unipdf/v3/model"
|
|
)
|
|
|
|
// CleanFonts is an optimizer that shrinks the font programs embedded in a
// document.
type CleanFonts struct {
	// Subset selects the strategy used for embedded fonts. When true, fonts
	// that are encountered get subset; font programs that were not subset
	// (and all of them when false) only go through an attempt to reduce the
	// font program.
	Subset bool
}
|
|
|
|
func optimizeFontsWithSubsetting(objects []core.PdfObject) (processed map[*core.PdfObjectStream]struct{}, err error) {
|
|
// 1. Identify all fonts.
|
|
// 2. Identify content streams and their Resources dictionaries (both via page, forms and annotations).
|
|
// 3. Process content streams.
|
|
processed = map[*core.PdfObjectStream]struct{}{}
|
|
|
|
fontMap := map[*model.PdfFont]struct{}{}
|
|
|
|
objstr := getObjectStructure(objects)
|
|
for _, p := range objstr.pages {
|
|
pdict, ok := core.GetDict(p.PdfObject)
|
|
if !ok {
|
|
continue
|
|
}
|
|
resourcesDict, ok := core.GetDict(pdict.Get("Resources"))
|
|
if !ok {
|
|
continue
|
|
}
|
|
contents, _ := getPageContents(pdict.Get("Contents"))
|
|
resources, err := model.NewPdfPageResourcesFromDict(resourcesDict)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
allContents := []content{
|
|
{
|
|
content: contents,
|
|
resources: resources,
|
|
},
|
|
}
|
|
|
|
annotContents := getAnnotationContents(pdict.Get("Annots"))
|
|
if annotContents != nil {
|
|
allContents = append(allContents, annotContents...)
|
|
}
|
|
|
|
for _, cont := range allContents {
|
|
e, err := extractor.NewFromContents(cont.content, cont.resources)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
pt, _, _, err := e.ExtractPageText()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
for _, el := range pt.Marks().Elements() {
|
|
if el.Font == nil {
|
|
continue
|
|
}
|
|
if _, has := fontMap[el.Font]; !has {
|
|
fontMap[el.Font] = struct{}{}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Map of font program stream to font. Multiple fonts can use the same font program.
|
|
fontFileMap := map[*core.PdfObjectStream][]*model.PdfFont{}
|
|
for font := range fontMap {
|
|
fontDesc := font.FontDescriptor()
|
|
if fontDesc == nil || fontDesc.FontFile2 == nil {
|
|
continue
|
|
}
|
|
stream, ok := core.GetStream(fontDesc.FontFile2)
|
|
if !ok {
|
|
continue
|
|
}
|
|
fontFileMap[stream] = append(fontFileMap[stream], font)
|
|
}
|
|
|
|
for stream := range fontFileMap {
|
|
var allRunes []rune
|
|
var allIndices []unitype.GlyphIndex
|
|
|
|
for _, font := range fontFileMap[stream] {
|
|
switch t := font.Encoder().(type) {
|
|
case *textencoding.IdentityEncoder:
|
|
// TODO: This terminology is wrong as those are not runes, just charcodes cast as runes.
|
|
// Identity encoder maps via 2-byte encoding directly from 2byte charcode to glyph index.
|
|
runes := t.RegisteredRunes()
|
|
indices := make([]unitype.GlyphIndex, len(runes))
|
|
for i, r := range runes {
|
|
indices[i] = unitype.GlyphIndex(r)
|
|
}
|
|
allIndices = append(allIndices, indices...)
|
|
case *textencoding.TrueTypeFontEncoder:
|
|
runes := t.RegisteredRunes()
|
|
allRunes = append(allRunes, runes...)
|
|
case textencoding.SimpleEncoder:
|
|
charcodes := t.Charcodes()
|
|
for _, c := range charcodes {
|
|
r, ok := t.CharcodeToRune(c)
|
|
if !ok {
|
|
common.Log.Debug("Charcode<->rune not found: %d", c)
|
|
continue
|
|
}
|
|
allRunes = append(allRunes, r)
|
|
}
|
|
}
|
|
}
|
|
|
|
err = subsetFontStream(stream, allRunes, allIndices)
|
|
if err != nil {
|
|
common.Log.Debug("ERROR subsetting font stream: %v", err)
|
|
return nil, err
|
|
}
|
|
processed[stream] = struct{}{}
|
|
}
|
|
return processed, nil
|
|
}
|
|
|
|
// Subsets the font program in `stream` with the subset based on the `runes` and glyph `indices`.
|
|
func subsetFontStream(stream *core.PdfObjectStream, runes []rune, indices []unitype.GlyphIndex) error {
|
|
stream, ok := core.GetStream(stream)
|
|
if !ok {
|
|
common.Log.Debug("Embedded font object not found -- ABORT subsetting")
|
|
return errors.New("fontfile2 not found")
|
|
}
|
|
decoded, err := core.DecodeStream(stream)
|
|
if err != nil {
|
|
common.Log.Debug("Decode error: %v", err)
|
|
return err
|
|
}
|
|
|
|
fnt, err := unitype.Parse(bytes.NewReader(decoded))
|
|
if err != nil {
|
|
common.Log.Debug("Error parsing %d byte font", len(stream.Stream))
|
|
return err
|
|
}
|
|
|
|
allIndices := indices
|
|
if len(runes) > 0 {
|
|
indices := fnt.LookupRunes(runes)
|
|
allIndices = append(allIndices, indices...)
|
|
}
|
|
|
|
fnt, err = fnt.SubsetKeepIndices(allIndices)
|
|
if err != nil {
|
|
common.Log.Debug("ERROR subsetting font: %v", err)
|
|
return err
|
|
}
|
|
|
|
var buf bytes.Buffer
|
|
err = fnt.Write(&buf)
|
|
if err != nil {
|
|
common.Log.Debug("ERROR Writing font: %v", err)
|
|
return err
|
|
}
|
|
if buf.Len() > len(decoded) {
|
|
common.Log.Debug("Re-written font is larger than original - skip")
|
|
return nil
|
|
}
|
|
|
|
newstream, err := core.MakeStream(buf.Bytes(), core.NewFlateEncoder())
|
|
if err != nil {
|
|
common.Log.Debug("ERROR Writing font: %v", err)
|
|
return err
|
|
}
|
|
// Overwrite.
|
|
*stream = *newstream
|
|
stream.Set("Length1", core.MakeInteger(int64(buf.Len())))
|
|
|
|
return nil
|
|
}
|
|
|
|
// Optimize optimizes PDF objects to decrease PDF size.
|
|
func (c *CleanFonts) Optimize(objects []core.PdfObject) (optimizedObjects []core.PdfObject, err error) {
|
|
var processed map[*core.PdfObjectStream]struct{}
|
|
if c.Subset {
|
|
var err error
|
|
processed, err = optimizeFontsWithSubsetting(objects)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
|
|
// Clean font streams by loading and rewriting with minimal needed tables.
|
|
for _, obj := range objects {
|
|
stream, isStreamObj := core.GetStream(obj)
|
|
if !isStreamObj {
|
|
continue
|
|
}
|
|
if _, has := processed[stream]; has {
|
|
// Skip - has been processed.
|
|
continue
|
|
}
|
|
|
|
encoder, err := core.NewEncoderFromStream(stream)
|
|
if err != nil {
|
|
common.Log.Debug("ERROR getting encoder: %v - ignoring", err)
|
|
continue
|
|
}
|
|
|
|
decoded, err := encoder.DecodeStream(stream)
|
|
if err != nil {
|
|
common.Log.Debug("Decoding error : %v - ignoring", err)
|
|
continue
|
|
}
|
|
if len(decoded) < 4 {
|
|
continue
|
|
}
|
|
|
|
version := string(decoded[:4])
|
|
if version == "OTTO" {
|
|
// Fonts based on PostScript outlines not supported yet.
|
|
// See https://docs.microsoft.com/en-us/typography/opentype/spec/otff
|
|
continue
|
|
}
|
|
if version != "\x00\x01\x00\x00" && version != "true" {
|
|
continue
|
|
}
|
|
|
|
fnt, err := unitype.Parse(bytes.NewReader(decoded))
|
|
if err != nil {
|
|
common.Log.Debug("ERROR Parsing font: %v - ignoring", err)
|
|
continue
|
|
}
|
|
err = fnt.Optimize()
|
|
if err != nil {
|
|
continue
|
|
}
|
|
|
|
var buf bytes.Buffer
|
|
err = fnt.Write(&buf)
|
|
if err != nil {
|
|
common.Log.Debug("ERROR Writing font: %v - ignoring", err)
|
|
continue
|
|
}
|
|
if buf.Len() > len(decoded) {
|
|
common.Log.Debug("Re-written font is larger than original - skip")
|
|
continue
|
|
}
|
|
|
|
newstream, err := core.MakeStream(buf.Bytes(), core.NewFlateEncoder())
|
|
if err != nil {
|
|
continue
|
|
}
|
|
// Overwrite.
|
|
*stream = *newstream
|
|
stream.Set("Length1", core.MakeInteger(int64(buf.Len())))
|
|
}
|
|
return objects, nil
|
|
}
|
|
|
|
// content describes page or font contents which is a content stream along with resources.
|
|
type content struct {
|
|
content string
|
|
resources *model.PdfPageResources
|
|
}
|
|
|
|
// Best effort to get annotation contents.
|
|
func getAnnotationContents(annotsObj core.PdfObject) []content {
|
|
if annotsObj == nil {
|
|
return nil
|
|
}
|
|
annotsArr, ok := core.GetArray(annotsObj)
|
|
if !ok {
|
|
common.Log.Debug("Annots not an array")
|
|
return nil
|
|
}
|
|
|
|
var annotContents []content
|
|
for _, obj := range annotsArr.Elements() {
|
|
annotDict, ok := core.GetDict(obj)
|
|
if !ok {
|
|
// Ignore any non dict elements.
|
|
common.Log.Debug("Ignoring non-dict element in Annots")
|
|
continue
|
|
}
|
|
|
|
// Appearance.
|
|
appDict, ok := core.GetDict(annotDict.Get("AP"))
|
|
if !ok {
|
|
common.Log.Debug("No AP entry - skipping")
|
|
continue
|
|
}
|
|
|
|
normal := core.TraceToDirectObject(appDict.Get("N"))
|
|
if normal == nil {
|
|
common.Log.Debug("No N entry - skipping")
|
|
continue
|
|
}
|
|
|
|
var stream *core.PdfObjectStream
|
|
switch t := normal.(type) {
|
|
case *core.PdfObjectDictionary:
|
|
appState, ok := core.GetName(annotDict.Get("AS"))
|
|
if !ok {
|
|
common.Log.Debug("No AS entry - skipping")
|
|
continue
|
|
}
|
|
stream, ok = core.GetStream(t.Get(*appState))
|
|
if !ok {
|
|
common.Log.Debug("Form not found - skipping")
|
|
continue
|
|
}
|
|
case *core.PdfObjectStream:
|
|
stream = t
|
|
}
|
|
if stream == nil {
|
|
common.Log.Debug("Form not found (nil) - skipping")
|
|
continue
|
|
}
|
|
|
|
xform, err := model.NewXObjectFormFromStream(stream)
|
|
if err != nil {
|
|
common.Log.Debug("Error loading form: %v - ignoring", err)
|
|
continue
|
|
}
|
|
|
|
contents, err := xform.GetContentStream()
|
|
if err != nil {
|
|
common.Log.Debug("Error decoding contents: %v", err)
|
|
continue
|
|
}
|
|
|
|
annotContents = append(annotContents, content{
|
|
content: string(contents),
|
|
resources: xform.Resources,
|
|
})
|
|
}
|
|
|
|
return annotContents
|
|
}
|