Implemented: PDF optimization

This commit is contained in:
Aleksei Pavliukov 2018-09-29 17:22:53 +03:00
parent 6a6a387faa
commit ea5dba8e0d
16 changed files with 1954 additions and 34 deletions

View File

@ -74,6 +74,13 @@ type PdfObjectStream struct {
Stream []byte
}
// PdfObjectStreams represents the primitive PDF object streams.
// 7.5.7 Object Streams (page 45).
// It embeds a PdfObjectReference, whose ObjectNumber is used when the value is written out or described,
// and holds the PdfObjects that were added to it.
type PdfObjectStreams struct {
PdfObjectReference
// vec holds the contained objects in the order they were appended.
vec []PdfObject
}
// MakeDict creates and returns an empty PdfObjectDictionary.
func MakeDict() *PdfObjectDictionary {
d := &PdfObjectDictionary{}
@ -203,6 +210,16 @@ func MakeStream(contents []byte, encoder StreamEncoder) (*PdfObjectStream, error
return stream, nil
}
// MakeObjectStreams creates a PdfObjectStreams holding the given PdfObjects in the order supplied.
// The element slice of the returned value is always non-nil, even when no objects are passed.
func MakeObjectStreams(objects ...PdfObject) *PdfObjectStreams {
	// A single variadic append onto a fresh slice copies the elements, so the result never aliases
	// the caller's argument slice.
	return &PdfObjectStreams{vec: append([]PdfObject{}, objects...)}
}
func (bool *PdfObjectBool) String() string {
if *bool {
return "true"
@ -848,3 +865,53 @@ func GetStream(obj PdfObject) (stream *PdfObjectStream, found bool) {
stream, found = obj.(*PdfObjectStream)
return stream, found
}
// GetObjectStreams type-asserts `obj` to *PdfObjectStreams. When `obj` holds any other type, found is false
// and objStream is nil.
func GetObjectStreams(obj PdfObject) (objStream *PdfObjectStreams, found bool) {
	if s, ok := obj.(*PdfObjectStreams); ok {
		return s, true
	}
	return nil, false
}
// Append appends PdfObject(s) to the streams, preserving the order in which they are given.
// Calling Append on a nil receiver logs a debug message and does nothing.
func (streams *PdfObjectStreams) Append(objects ...PdfObject) {
	if streams == nil {
		common.Log.Debug("Warn - Attempt to append to a nil streams")
		return
	}
	if streams.vec == nil {
		// Keep the element slice non-nil even when no objects are supplied.
		streams.vec = []PdfObject{}
	}
	streams.vec = append(streams.vec, objects...)
}
// Elements returns the PdfObject elements held by the streams, or nil when the receiver is nil.
// The slice returned is the internal one, not a copy.
// Prefer this accessor over touching the underlying slice directly, as its type may change in future major
// versions (v3).
func (streams *PdfObjectStreams) Elements() []PdfObject {
	var elems []PdfObject
	if streams != nil {
		elems = streams.vec
	}
	return elems
}
// String returns a short description of `streams` that includes its object number.
func (streams *PdfObjectStreams) String() string {
	objNum := streams.ObjectNumber
	return fmt.Sprintf("Object stream %d", objNum)
}
// Len reports how many elements the streams holds. A nil receiver has length 0.
func (streams *PdfObjectStreams) Len() int {
	if streams != nil {
		return len(streams.vec)
	}
	return 0
}
// DefaultWriteString outputs the object as it is to be written to file: a reference of the form
// "<object number> 0 R", where the generation part is always written as 0.
func (streams *PdfObjectStreams) DefaultWriteString() string {
	return fmt.Sprintf("%d 0 R", streams.ObjectNumber)
}

View File

@ -44,6 +44,8 @@ type Creator struct {
// Forms.
acroForm *model.PdfAcroForm
optimizer model.Optimizer
}
// SetForms adds an Acroform to a PDF file. Sets the specified form for writing.
@ -101,6 +103,16 @@ func New() *Creator {
return c
}
// SetOptimizer sets the optimizer used to optimize the PDF before writing.
// The optimizer is stored on the Creator and handed to the PdfWriter when Write is called.
func (c *Creator) SetOptimizer(optimizer model.Optimizer) {
c.optimizer = optimizer
}
// GetOptimizer returns the optimizer currently stored on the Creator, i.e. the value last passed to
// SetOptimizer (the zero value if none was set).
func (c *Creator) GetOptimizer() model.Optimizer {
return c.optimizer
}
// SetPageMargins sets the page margins: left, right, top, bottom.
// The default page margins are 10% of document width.
func (c *Creator) SetPageMargins(left, right, top, bottom float64) {
@ -459,13 +471,15 @@ func (c *Creator) Draw(d Drawable) error {
return nil
}
// Write output of creator to io.WriteSeeker interface.
func (c *Creator) Write(ws io.WriteSeeker) error {
// Write output of creator to io.Writer interface.
func (c *Creator) Write(ws io.Writer) error {
if !c.finalized {
c.finalize()
}
pdfWriter := model.NewPdfWriter()
pdfWriter.SetOptimizer(c.optimizer)
// Form fields.
if c.acroForm != nil {
err := pdfWriter.SetForms(c.acroForm)

View File

@ -14,6 +14,7 @@ import (
goimage "image"
"io/ioutil"
"math"
"os"
"testing"
"github.com/boombuler/barcode"
@ -22,6 +23,7 @@ import (
"github.com/unidoc/unidoc/pdf/contentstream/draw"
"github.com/unidoc/unidoc/pdf/core"
"github.com/unidoc/unidoc/pdf/model"
"github.com/unidoc/unidoc/pdf/model/optimize"
"github.com/unidoc/unidoc/pdf/model/textencoding"
)
@ -2133,3 +2135,822 @@ func TestEncrypting1(t *testing.T) {
return
}
}
// TestOptimizeCombineDuplicateStreams writes the same document with and without the CombineDuplicateStreams
// optimization and checks that the optimized file is smaller.
func TestOptimizeCombineDuplicateStreams(t *testing.T) {
	const (
		plainPath     = "/tmp/7_combine_duplicate_streams_not_optimized.pdf"
		optimizedPath = "/tmp/7_combine_duplicate_streams_optimized.pdf"
	)

	// Baseline: no optimizer set.
	plain := createPdf4Optimization(t)
	if err := plain.WriteToFile(plainPath); err != nil {
		t.Fatalf("Fail: %v\n", err)
	}

	// Same content, written with duplicate stream combining enabled.
	optimized := createPdf4Optimization(t)
	optimized.SetOptimizer(optimize.New(optimize.Options{CombineDuplicateStreams: true}))
	if err := optimized.WriteToFile(optimizedPath); err != nil {
		t.Fatalf("Fail: %v\n", err)
	}

	plainInfo, err := os.Stat(plainPath)
	if err != nil {
		t.Fatalf("Fail: %v\n", err)
	}
	optimizedInfo, err := os.Stat(optimizedPath)
	if err != nil {
		t.Fatalf("Fail: %v\n", err)
	}

	if optimizedInfo.Size() >= plainInfo.Size() {
		t.Errorf("Optimization failed: size not changed %d vs %d", plainInfo.Size(), optimizedInfo.Size())
	}
}
// TestOptimizeImageQuality writes a document containing one JPEG image twice from the same creator, first
// without an optimizer and then with ImageQuality set to 20, and checks that the second file is smaller.
func TestOptimizeImageQuality(t *testing.T) {
	const (
		plainPath     = "/tmp/8_image_quality_not_optimized.pdf"
		optimizedPath = "/tmp/8_image_quality_optimized.pdf"
	)

	c := New()

	imgDataJpeg, err := ioutil.ReadFile(testImageFile1)
	if err != nil {
		t.Fatalf("Fail: %v\n", err)
	}
	imgJpeg, err := NewImageFromData(imgDataJpeg)
	if err != nil {
		t.Fatalf("Fail: %v\n", err)
	}

	// JPEG encoder (DCT) with quality factor 100.
	encoder := core.NewDCTEncoder()
	encoder.Quality = 100
	encoder.Width = int(imgJpeg.Width())
	encoder.Height = int(imgJpeg.Height())
	imgJpeg.SetEncoder(encoder)

	imgJpeg.SetPos(250, 350)
	imgJpeg.Scale(0.25, 0.25)
	if err = c.Draw(imgJpeg); err != nil {
		t.Fatalf("Fail: %v\n", err)
	}

	// Baseline output, then the same creator again with the image quality optimizer attached.
	if err = c.WriteToFile(plainPath); err != nil {
		t.Fatalf("Fail: %v\n", err)
	}
	c.SetOptimizer(optimize.New(optimize.Options{ImageQuality: 20}))
	if err = c.WriteToFile(optimizedPath); err != nil {
		t.Fatalf("Fail: %v\n", err)
	}

	plainInfo, err := os.Stat(plainPath)
	if err != nil {
		t.Fatalf("Fail: %v\n", err)
	}
	optimizedInfo, err := os.Stat(optimizedPath)
	if err != nil {
		t.Fatalf("Fail: %v\n", err)
	}

	if optimizedInfo.Size() >= plainInfo.Size() {
		t.Errorf("Optimization failed: size not changed %d vs %d", plainInfo.Size(), optimizedInfo.Size())
	}
}
// createPdf4Optimization builds the two-page document shared by the optimization tests. Page one holds a
// Courier Bold paragraph and the test image drawn twice from the same data; page two holds a Helvetica
// paragraph and a third image created from a second read of the same file. Any setup error fails the
// test immediately.
func createPdf4Optimization(t *testing.T) *Creator {
	c := New()

	// Page 1 paragraph, set in Courier Bold.
	p := NewParagraph("Test text1")
	font, err := model.NewStandard14Font(model.CourierBold)
	if err != nil {
		t.Fatalf("Fail: %v\n", err)
	}
	p.SetFont(font)
	p.SetPos(15, 15)
	_ = c.Draw(p) // The paragraph draw error is discarded.

	imgData, err := ioutil.ReadFile(testImageFile1)
	if err != nil {
		t.Fatalf("Fail: %v\n", err)
	}

	// First copy of the image, full page width.
	img, err := NewImageFromData(imgData)
	if err != nil {
		t.Fatalf("Fail: %v\n", err)
	}
	img.SetPos(0, 100)
	img.ScaleToWidth(1.0 * c.Width())
	if err = c.Draw(img); err != nil {
		t.Fatalf("Fail: %v\n", err)
	}

	// Second copy, created from the same data buffer.
	img1, err := NewImageFromData(imgData)
	if err != nil {
		t.Fatalf("Fail: %v\n", err)
	}
	img1.SetPos(0, 200)
	img1.ScaleToWidth(1.0 * c.Width())
	if err = c.Draw(img1); err != nil {
		t.Fatalf("Fail: %v\n", err)
	}

	// Third copy, created from a fresh read of the same file; it is drawn on page 2 below.
	imgData2, err := ioutil.ReadFile(testImageFile1)
	if err != nil {
		t.Fatalf("Fail: %v\n", err)
	}
	img2, err := NewImageFromData(imgData2)
	if err != nil {
		t.Fatalf("Fail: %v\n", err)
	}
	img2.SetPos(0, 500)
	img2.ScaleToWidth(1.0 * c.Width())

	c.NewPage()

	// Page 2 paragraph, set in Helvetica.
	p = NewParagraph("Test text2")
	font, err = model.NewStandard14Font(model.Helvetica)
	if err != nil {
		t.Fatalf("Fail: %v\n", err)
	}
	p.SetFont(font)
	p.SetPos(15, 15)
	_ = c.Draw(p) // The paragraph draw error is discarded.

	if err = c.Draw(img2); err != nil {
		t.Fatalf("Fail: %v\n", err)
	}

	return c
}
// TestOptimizeUseObjectStreams writes the same document with and without the UseObjectStreams optimization
// and checks that the optimized file is smaller.
func TestOptimizeUseObjectStreams(t *testing.T) {
	const (
		plainPath     = "/tmp/9_use_object_streams_not_optimized.pdf"
		optimizedPath = "/tmp/9_use_object_streams_optimized.pdf"
	)

	// Baseline: no optimizer set.
	plain := createPdf4Optimization(t)
	if err := plain.WriteToFile(plainPath); err != nil {
		t.Fatalf("Fail: %v\n", err)
	}

	// Same content, written with object streams enabled.
	optimized := createPdf4Optimization(t)
	optimized.SetOptimizer(optimize.New(optimize.Options{UseObjectStreams: true}))
	if err := optimized.WriteToFile(optimizedPath); err != nil {
		t.Fatalf("Fail: %v\n", err)
	}

	plainInfo, err := os.Stat(plainPath)
	if err != nil {
		t.Fatalf("Fail: %v\n", err)
	}
	optimizedInfo, err := os.Stat(optimizedPath)
	if err != nil {
		t.Fatalf("Fail: %v\n", err)
	}

	if optimizedInfo.Size() >= plainInfo.Size() {
		t.Errorf("Optimization failed: size not changed %d vs %d", plainInfo.Size(), optimizedInfo.Size())
	}
}
// TestCombineDuplicateDirectObjects builds the same multi-chapter report twice, writes it once without an
// optimizer and once with CombineDuplicateDirectObjects enabled, and checks that the optimized file is smaller.
func TestCombineDuplicateDirectObjects(t *testing.T) {
// createDoc builds a report that adds one paragraph many times across four subchapters and 50 further
// chapters, and registers a front page, a table of contents and headers/footers.
createDoc := func() *Creator {
c := New()
ch1 := c.NewChapter("Introduction")
subchap1 := c.NewSubchapter(ch1, "The fundamentals")
subchap1.SetMargins(0, 0, 5, 0)
//subCh1 := NewSubchapter(ch1, "Workflow")
p := NewParagraph("Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt " +
"ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut " +
"aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore " +
"eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt " +
"mollit anim id est laborum.")
p.SetTextAlignment(TextAlignmentJustify)
p.SetMargins(0, 0, 5, 0)
// The same paragraph value is added repeatedly to every subchapter below.
for j := 0; j < 7; j++ {
subchap1.Add(p)
}
subchap2 := c.NewSubchapter(ch1, "Mechanism")
subchap2.SetMargins(0, 0, 5, 0)
for j := 0; j < 15; j++ {
subchap2.Add(p)
}
subchap3 := c.NewSubchapter(ch1, "Discussion")
subchap3.SetMargins(0, 0, 5, 0)
for j := 0; j < 19; j++ {
subchap3.Add(p)
}
subchap4 := c.NewSubchapter(ch1, "Conclusion")
subchap4.SetMargins(0, 0, 5, 0)
for j := 0; j < 23; j++ {
subchap4.Add(p)
}
// The error returned by Draw is not checked here.
c.Draw(ch1)
// 50 additional chapters, each holding 13 copies of the paragraph.
for i := 0; i < 50; i++ {
ch2 := c.NewChapter("References")
for j := 0; j < 13; j++ {
ch2.Add(p)
}
c.Draw(ch2)
}
// Set a function to create the front Page.
c.CreateFrontPage(func(args FrontpageFunctionArgs) {
p := NewParagraph("Example Report")
p.SetWidth(c.Width())
p.SetTextAlignment(TextAlignmentCenter)
p.SetFontSize(32)
p.SetPos(0, 300)
c.Draw(p)
// The same paragraph is reused for the subtitle with a smaller font and new text.
p.SetFontSize(22)
p.SetText("Example Report Data Results")
p.SetPos(0, 340)
c.Draw(p)
})
// Set a function to create the table of contents.
c.CreateTableOfContents(func(toc *TableOfContents) (*Chapter, error) {
ch := c.NewChapter("Table of contents")
ch.GetHeading().SetColor(ColorRGBFromArithmetic(0.5, 0.5, 0.5))
ch.GetHeading().SetFontSize(28)
ch.GetHeading().SetMargins(0, 0, 0, 30)
table := NewTable(2)
// Two columns: 90% of the width for the title, 10% for the page number.
table.SetColumnWidths(0.9, 0.1)
for _, entry := range toc.entries {
// Col 1. Chapter number, title.
var str string
if entry.Subchapter == 0 {
str = fmt.Sprintf("%d. %s", entry.Chapter, entry.Title)
} else {
str = fmt.Sprintf(" %d.%d. %s", entry.Chapter, entry.Subchapter, entry.Title)
}
p := NewParagraph(str)
p.SetFontSize(14)
cell := table.NewCell()
cell.SetContent(p)
// Set the paragraph width to the cell width.
p.SetWidth(cell.Width(c.Context()))
table.SetRowHeight(table.CurRow(), p.Height()*1.2)
// Col 2. Page number.
p = NewParagraph(fmt.Sprintf("%d", entry.PageNumber))
p.SetFontSize(14)
cell = table.NewCell()
cell.SetContent(p)
}
err := ch.Add(table)
if err != nil {
fmt.Printf("Error adding table: %v\n", err)
return nil, err
}
return ch, nil
})
addHeadersAndFooters(c)
return c
}
// Baseline: no optimizer set.
c := createDoc()
err := c.WriteToFile("/tmp/10_combine_duplicate_direct_objects_not_optimized.pdf")
if err != nil {
t.Errorf("Fail: %v\n", err)
return
}
// A freshly built document, written with the optimizer under test.
c = createDoc()
c.SetOptimizer(optimize.New(optimize.Options{CombineDuplicateDirectObjects: true}))
err = c.WriteToFile("/tmp/10_combine_duplicate_direct_objects_optimized.pdf")
if err != nil {
t.Errorf("Fail: %v\n", err)
return
}
fileInfo, err := os.Stat("/tmp/10_combine_duplicate_direct_objects_not_optimized.pdf")
if err != nil {
t.Errorf("Fail: %v\n", err)
return
}
fileInfoOptimized, err := os.Stat("/tmp/10_combine_duplicate_direct_objects_optimized.pdf")
if err != nil {
t.Errorf("Fail: %v\n", err)
return
}
// The optimized file must be strictly smaller than the baseline.
if fileInfoOptimized.Size() >= fileInfo.Size() {
t.Errorf("Optimization failed: size not changed %d vs %d", fileInfo.Size(), fileInfoOptimized.Size())
}
}
// TestOptimizeImagePPI writes a two-page document containing scaled-down images twice from the same creator,
// first without an optimizer and then with ImageUpperPPI set to 144, and checks that the second file is
// smaller.
func TestOptimizeImagePPI(t *testing.T) {
	const (
		plainPath     = "/tmp/11_image_ppi_not_optimized.pdf"
		optimizedPath = "/tmp/11_image_ppi_optimized.pdf"
	)

	c := New()

	imgDataJpeg, err := ioutil.ReadFile(testImageFile1)
	if err != nil {
		t.Fatalf("Fail: %v\n", err)
	}
	imgJpeg, err := NewImageFromData(imgDataJpeg)
	if err != nil {
		t.Fatalf("Fail: %v\n", err)
	}

	// JPEG encoder (DCT) with quality factor 100.
	encoder := core.NewDCTEncoder()
	encoder.Quality = 100
	encoder.Width = int(imgJpeg.Width())
	encoder.Height = int(imgJpeg.Height())
	imgJpeg.SetEncoder(encoder)

	// Page 1: the JPEG-encoded image at a quarter of its size.
	imgJpeg.SetPos(250, 350)
	imgJpeg.Scale(0.25, 0.25)
	if err = c.Draw(imgJpeg); err != nil {
		t.Fatalf("Fail: %v\n", err)
	}

	c.NewPage()

	// Page 2: a second image scaled to a tenth of the page width, followed by the JPEG image again.
	imgData, err := ioutil.ReadFile(testImageFile1)
	if err != nil {
		t.Fatalf("Fail: %v\n", err)
	}
	img, err := NewImageFromData(imgData)
	if err != nil {
		t.Fatalf("Fail: %v\n", err)
	}
	img.SetPos(0, 100)
	img.ScaleToWidth(0.1 * c.Width())
	if err = c.Draw(img); err != nil {
		t.Fatalf("Fail: %v\n", err)
	}
	if err = c.Draw(imgJpeg); err != nil {
		t.Fatalf("Fail: %v\n", err)
	}

	// Baseline output, then the same creator again with the PPI optimizer attached.
	if err = c.WriteToFile(plainPath); err != nil {
		t.Fatalf("Fail: %v\n", err)
	}
	c.SetOptimizer(optimize.New(optimize.Options{ImageUpperPPI: 144}))
	if err = c.WriteToFile(optimizedPath); err != nil {
		t.Fatalf("Fail: %v\n", err)
	}

	plainInfo, err := os.Stat(plainPath)
	if err != nil {
		t.Fatalf("Fail: %v\n", err)
	}
	optimizedInfo, err := os.Stat(optimizedPath)
	if err != nil {
		t.Fatalf("Fail: %v\n", err)
	}

	if optimizedInfo.Size() >= plainInfo.Size() {
		t.Errorf("Optimization failed: size not changed %d vs %d", plainInfo.Size(), optimizedInfo.Size())
	}
}
// TestCombineIdenticalIndirectObjects builds a multi-chapter report, writes it once without an optimizer and
// then again from the same creator with CombineIdenticalIndirectObjects enabled, and checks that the second
// file is smaller.
func TestCombineIdenticalIndirectObjects(t *testing.T) {
c := New()
ch1 := c.NewChapter("Introduction")
subchap1 := c.NewSubchapter(ch1, "The fundamentals")
subchap1.SetMargins(0, 0, 5, 0)
//subCh1 := NewSubchapter(ch1, "Workflow")
p := NewParagraph("Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt " +
"ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut " +
"aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore " +
"eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt " +
"mollit anim id est laborum.")
p.SetTextAlignment(TextAlignmentJustify)
p.SetMargins(0, 0, 5, 0)
// The same paragraph value is added repeatedly to every subchapter below.
for j := 0; j < 5; j++ {
subchap1.Add(p)
}
subchap2 := c.NewSubchapter(ch1, "Mechanism")
subchap2.SetMargins(0, 0, 5, 0)
for j := 0; j < 15; j++ {
subchap2.Add(p)
}
subchap3 := c.NewSubchapter(ch1, "Discussion")
subchap3.SetMargins(0, 0, 5, 0)
for j := 0; j < 19; j++ {
subchap3.Add(p)
}
subchap4 := c.NewSubchapter(ch1, "Conclusion")
subchap4.SetMargins(0, 0, 5, 0)
for j := 0; j < 23; j++ {
subchap4.Add(p)
}
// The error returned by Draw is not checked here.
c.Draw(ch1)
// 50 additional chapters, each holding 13 copies of the paragraph.
for i := 0; i < 50; i++ {
ch2 := c.NewChapter("References")
for j := 0; j < 13; j++ {
ch2.Add(p)
}
c.Draw(ch2)
}
// Set a function to create the front Page.
c.CreateFrontPage(func(args FrontpageFunctionArgs) {
p := NewParagraph("Example Report")
p.SetWidth(c.Width())
p.SetTextAlignment(TextAlignmentCenter)
p.SetFontSize(32)
p.SetPos(0, 300)
c.Draw(p)
// The same paragraph is reused for the subtitle with a smaller font and new text.
p.SetFontSize(22)
p.SetText("Example Report Data Results")
p.SetPos(0, 340)
c.Draw(p)
})
// Set a function to create the table of contents.
c.CreateTableOfContents(func(toc *TableOfContents) (*Chapter, error) {
ch := c.NewChapter("Table of contents")
ch.GetHeading().SetColor(ColorRGBFromArithmetic(0.5, 0.5, 0.5))
ch.GetHeading().SetFontSize(28)
ch.GetHeading().SetMargins(0, 0, 0, 30)
table := NewTable(2)
// Two columns: 90% of the width for the title, 10% for the page number.
table.SetColumnWidths(0.9, 0.1)
for _, entry := range toc.entries {
// Col 1. Chapter number, title.
var str string
if entry.Subchapter == 0 {
str = fmt.Sprintf("%d. %s", entry.Chapter, entry.Title)
} else {
str = fmt.Sprintf(" %d.%d. %s", entry.Chapter, entry.Subchapter, entry.Title)
}
p := NewParagraph(str)
p.SetFontSize(14)
cell := table.NewCell()
cell.SetContent(p)
// Set the paragraph width to the cell width.
p.SetWidth(cell.Width(c.Context()))
table.SetRowHeight(table.CurRow(), p.Height()*1.2)
// Col 2. Page number.
p = NewParagraph(fmt.Sprintf("%d", entry.PageNumber))
p.SetFontSize(14)
cell = table.NewCell()
cell.SetContent(p)
}
err := ch.Add(table)
if err != nil {
fmt.Printf("Error adding table: %v\n", err)
return nil, err
}
return ch, nil
})
addHeadersAndFooters(c)
// Baseline: no optimizer set.
err := c.WriteToFile("/tmp/12_identical_indirect_objects_not_optimized.pdf")
if err != nil {
t.Errorf("Fail: %v\n", err)
return
}
// The same creator is written a second time, now with the optimizer under test attached.
c.SetOptimizer(optimize.New(optimize.Options{CombineIdenticalIndirectObjects: true}))
err = c.WriteToFile("/tmp/12_identical_indirect_objects_optimized.pdf")
if err != nil {
t.Errorf("Fail: %v\n", err)
return
}
fileInfo, err := os.Stat("/tmp/12_identical_indirect_objects_not_optimized.pdf")
if err != nil {
t.Errorf("Fail: %v\n", err)
return
}
fileInfoOptimized, err := os.Stat("/tmp/12_identical_indirect_objects_optimized.pdf")
if err != nil {
t.Errorf("Fail: %v\n", err)
return
}
// The optimized file must be strictly smaller than the baseline.
if fileInfoOptimized.Size() >= fileInfo.Size() {
t.Errorf("Optimization failed: size not changed %d vs %d", fileInfo.Size(), fileInfoOptimized.Size())
}
}
// TestCompressStreams writes the same document with and without the CompressStreams optimization and checks
// that the optimized file is smaller.
func TestCompressStreams(t *testing.T) {
	const (
		plainPath     = "/tmp/13_compress_streams_not_optimized.pdf"
		optimizedPath = "/tmp/13_compress_streams_optimized.pdf"
	)

	// newDoc builds a document with one drawn paragraph and an extra hand-written content stream added
	// to its first page.
	newDoc := func() *Creator {
		doc := New()

		para := NewParagraph("Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt" +
			"ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut " +
			"aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore" +
			"eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt " +
			"mollit anim id est laborum.")
		para.SetMargins(0, 0, 5, 0)
		doc.Draw(para)

		firstPage := doc.pages[0]
		firstPage.AddContentStreamByString(`BT
/Arial 56 Tf
20 600 Td
(The multiline example text)Tj
/Arial 30 Tf
0 30 Td
60 TL
(example text)'
(example text)'
(example text)'
(example text)'
(example text)'
(example text)'
(example text)'
(example text)'
ET`)
		return doc
	}

	// Baseline: no optimizer set.
	plain := newDoc()
	if err := plain.WriteToFile(plainPath); err != nil {
		t.Fatalf("Fail: %v\n", err)
	}

	// A freshly built document, written with stream compression enabled.
	optimized := newDoc()
	optimized.SetOptimizer(optimize.New(optimize.Options{CompressStreams: true}))
	if err := optimized.WriteToFile(optimizedPath); err != nil {
		t.Fatalf("Fail: %v\n", err)
	}

	plainInfo, err := os.Stat(plainPath)
	if err != nil {
		t.Fatalf("Fail: %v\n", err)
	}
	optimizedInfo, err := os.Stat(optimizedPath)
	if err != nil {
		t.Fatalf("Fail: %v\n", err)
	}

	if optimizedInfo.Size() >= plainInfo.Size() {
		t.Errorf("Optimization failed: size not changed %d vs %d", plainInfo.Size(), optimizedInfo.Size())
	}
}
// TestAllOptimizations builds the same multi-chapter report twice, writes it once without an optimizer and
// once with every optimization option enabled, and checks that the optimized file is smaller.
func TestAllOptimizations(t *testing.T) {
// createDoc builds a report that adds one paragraph many times across four subchapters and 50 further
// chapters, and registers a front page, a table of contents and headers/footers.
createDoc := func() *Creator {
c := New()
ch1 := c.NewChapter("Introduction")
subchap1 := c.NewSubchapter(ch1, "The fundamentals")
subchap1.SetMargins(0, 0, 5, 0)
//subCh1 := NewSubchapter(ch1, "Workflow")
p := NewParagraph("Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt " +
"ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut " +
"aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore " +
"eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt " +
"mollit anim id est laborum.")
p.SetTextAlignment(TextAlignmentJustify)
p.SetMargins(0, 0, 5, 0)
// The same paragraph value is added repeatedly to every subchapter below.
for j := 0; j < 7; j++ {
subchap1.Add(p)
}
subchap2 := c.NewSubchapter(ch1, "Mechanism")
subchap2.SetMargins(0, 0, 5, 0)
for j := 0; j < 15; j++ {
subchap2.Add(p)
}
subchap3 := c.NewSubchapter(ch1, "Discussion")
subchap3.SetMargins(0, 0, 5, 0)
for j := 0; j < 19; j++ {
subchap3.Add(p)
}
subchap4 := c.NewSubchapter(ch1, "Conclusion")
subchap4.SetMargins(0, 0, 5, 0)
for j := 0; j < 23; j++ {
subchap4.Add(p)
}
// The error returned by Draw is not checked here.
c.Draw(ch1)
// 50 additional chapters, each holding 13 copies of the paragraph.
for i := 0; i < 50; i++ {
ch2 := c.NewChapter("References")
for j := 0; j < 13; j++ {
ch2.Add(p)
}
c.Draw(ch2)
}
// Set a function to create the front Page.
c.CreateFrontPage(func(args FrontpageFunctionArgs) {
p := NewParagraph("Example Report")
p.SetWidth(c.Width())
p.SetTextAlignment(TextAlignmentCenter)
p.SetFontSize(32)
p.SetPos(0, 300)
c.Draw(p)
// The same paragraph is reused for the subtitle with a smaller font and new text.
p.SetFontSize(22)
p.SetText("Example Report Data Results")
p.SetPos(0, 340)
c.Draw(p)
})
// Set a function to create the table of contents.
c.CreateTableOfContents(func(toc *TableOfContents) (*Chapter, error) {
ch := c.NewChapter("Table of contents")
ch.GetHeading().SetColor(ColorRGBFromArithmetic(0.5, 0.5, 0.5))
ch.GetHeading().SetFontSize(28)
ch.GetHeading().SetMargins(0, 0, 0, 30)
table := NewTable(2)
// Two columns: 90% of the width for the title, 10% for the page number.
table.SetColumnWidths(0.9, 0.1)
for _, entry := range toc.entries {
// Col 1. Chapter number, title.
var str string
if entry.Subchapter == 0 {
str = fmt.Sprintf("%d. %s", entry.Chapter, entry.Title)
} else {
str = fmt.Sprintf(" %d.%d. %s", entry.Chapter, entry.Subchapter, entry.Title)
}
p := NewParagraph(str)
p.SetFontSize(14)
cell := table.NewCell()
cell.SetContent(p)
// Set the paragraph width to the cell width.
p.SetWidth(cell.Width(c.Context()))
table.SetRowHeight(table.CurRow(), p.Height()*1.2)
// Col 2. Page number.
p = NewParagraph(fmt.Sprintf("%d", entry.PageNumber))
p.SetFontSize(14)
cell = table.NewCell()
cell.SetContent(p)
}
err := ch.Add(table)
if err != nil {
fmt.Printf("Error adding table: %v\n", err)
return nil, err
}
return ch, nil
})
addHeadersAndFooters(c)
return c
}
// Baseline: no optimizer set.
c := createDoc()
err := c.WriteToFile("/tmp/14_not_optimized.pdf")
if err != nil {
t.Errorf("Fail: %v\n", err)
return
}
// A freshly built document, written with every optimization option switched on.
c = createDoc()
c.SetOptimizer(optimize.New(optimize.Options{
CombineDuplicateDirectObjects: true,
CombineIdenticalIndirectObjects: true,
ImageUpperPPI: 50.0,
UseObjectStreams: true,
ImageQuality: 50,
CombineDuplicateStreams: true,
CompressStreams: true,
}))
err = c.WriteToFile("/tmp/14_optimized.pdf")
if err != nil {
t.Errorf("Fail: %v\n", err)
return
}
fileInfo, err := os.Stat("/tmp/14_not_optimized.pdf")
if err != nil {
t.Errorf("Fail: %v\n", err)
return
}
fileInfoOptimized, err := os.Stat("/tmp/14_optimized.pdf")
if err != nil {
t.Errorf("Fail: %v\n", err)
return
}
// The optimized file must be strictly smaller than the baseline.
if fileInfoOptimized.Size() >= fileInfo.Size() {
t.Errorf("Optimization failed: size not changed %d vs %d", fileInfo.Size(), fileInfoOptimized.Size())
}
}

View File

@ -151,9 +151,9 @@ func NewPdfColorspaceFromPdfObject(obj PdfObject) (PdfColorspace, error) {
return nil, errors.New("Type error")
}
// determineColorspaceNameFromPdfObject determines PDF colorspace from a PdfObject. Returns the colorspace name and
// DetermineColorspaceNameFromPdfObject determines PDF colorspace from a PdfObject. Returns the colorspace name and
// an error on failure. If the colorspace was not found, will return an empty string.
func determineColorspaceNameFromPdfObject(obj PdfObject) (PdfObjectName, error) {
func DetermineColorspaceNameFromPdfObject(obj PdfObject) (PdfObjectName, error) {
var csName *PdfObjectName
var csArray *PdfObjectArray
@ -2179,7 +2179,7 @@ func newPdfColorspaceSpecialIndexedFromPdfObject(obj PdfObject) (*PdfColorspaceS
obj = array.Get(1)
// Base cs cannot be another /Indexed or /Pattern space.
baseName, err := determineColorspaceNameFromPdfObject(obj)
baseName, err := DetermineColorspaceNameFromPdfObject(obj)
if baseName == "Indexed" || baseName == "Pattern" {
common.Log.Debug("Error: Indexed colorspace cannot have Indexed/Pattern CS as base (%v)", baseName)
return nil, ErrRangeError

View File

@ -0,0 +1,34 @@
/*
* This file is subject to the terms and conditions defined in
* file 'LICENSE.md', which is part of this source code package.
*/
package optimize
import (
"github.com/unidoc/unidoc/pdf/core"
"github.com/unidoc/unidoc/pdf/model"
)
// Chain runs a sequence of optimizers one after another.
// It implements interface model.Optimizer.
type Chain struct {
// optimizers holds the chained optimizers in the order they were appended.
optimizers []model.Optimizer
}
// Append appends optimizers to the end of the chain, keeping the order in which they are given.
func (c *Chain) Append(optimizers ...model.Optimizer) {
c.optimizers = append(c.optimizers, optimizers...)
}
// Optimize runs every optimizer of the chain in the order they were appended, feeding the output of one
// into the next. It stops at the first error and returns it together with the objects returned by the
// failing optimizer. With an empty chain the input slice is returned unchanged.
func (c *Chain) Optimize(objects []core.PdfObject) (optimizedObjects []core.PdfObject, err error) {
	optimizedObjects = objects
	for i := range c.optimizers {
		if optimizedObjects, err = c.optimizers[i].Optimize(optimizedObjects); err != nil {
			return optimizedObjects, err
		}
	}
	return optimizedObjects, nil
}

View File

@ -0,0 +1,71 @@
/*
* This file is subject to the terms and conditions defined in
* file 'LICENSE.md', which is part of this source code package.
*/
package optimize
import (
"crypto/md5"
"github.com/unidoc/unidoc/pdf/core"
)
// CombineDuplicateDirectObjects combines duplicated direct dictionaries, which are matched by the MD5 hash
// of their written form. The type has no fields.
// It implements interface model.Optimizer.
type CombineDuplicateDirectObjects struct {
}
// Optimize optimizes PDF objects to decrease PDF size.
//
// It walks the dictionaries nested directly (at any depth) inside the dictionaries of the indirect objects
// in `objects` and groups them by the MD5 hash of their written form. For every group with two or more
// members a new indirect object holding a copy of the first member is created, and every member of the
// group is mapped to it in a replacement table that is handed to replaceObjectsInPlace.
//
// The returned slice is a new slice: the newly created indirect objects first, in the order in which their
// content was first encountered, followed by the input objects in their original order. The error is
// always nil.
func (dup *CombineDuplicateDirectObjects) Optimize(objects []core.PdfObject) (optimizedObjects []core.PdfObject, err error) {
	dictsByHash := make(map[[md5.Size]byte][]*core.PdfObjectDictionary)
	// hashOrder records each digest the first time it is seen, so that the order of the generated
	// indirect objects does not depend on Go's randomized map iteration order.
	var hashOrder [][md5.Size]byte

	var processDict func(pDict *core.PdfObjectDictionary)
	processDict = func(pDict *core.PdfObjectDictionary) {
		for _, key := range pDict.Keys() {
			dict, isDictObj := pDict.Get(key).(*core.PdfObjectDictionary)
			if !isDictObj {
				continue
			}
			hash := md5.Sum([]byte(dict.DefaultWriteString()))
			if _, seen := dictsByHash[hash]; !seen {
				hashOrder = append(hashOrder, hash)
			}
			dictsByHash[hash] = append(dictsByHash[hash], dict)
			processDict(dict)
		}
	}

	// Only dictionaries held by indirect objects are used as starting points.
	for _, obj := range objects {
		ind, isIndirectObj := obj.(*core.PdfIndirectObject)
		if !isIndirectObj {
			continue
		}
		if dict, isDictObj := ind.PdfObject.(*core.PdfObjectDictionary); isDictObj {
			processDict(dict)
		}
	}

	indirects := make([]core.PdfObject, 0, len(dictsByHash))
	replaceTable := make(map[core.PdfObject]core.PdfObject)
	for _, hash := range hashOrder {
		dicts := dictsByHash[hash]
		if len(dicts) < 2 {
			continue
		}

		// The shared object gets its own copy of the first duplicate's entries.
		shared := core.MakeDict()
		shared.Merge(dicts[0])
		ind := core.MakeIndirectObject(shared)
		indirects = append(indirects, ind)
		for _, dict := range dicts {
			replaceTable[dict] = ind
		}
	}

	optimizedObjects = make([]core.PdfObject, 0, len(indirects)+len(objects))
	optimizedObjects = append(optimizedObjects, indirects...)
	optimizedObjects = append(optimizedObjects, objects...)
	replaceObjectsInPlace(optimizedObjects, replaceTable)
	return optimizedObjects, nil
}

View File

@ -0,0 +1,53 @@
/*
* This file is subject to the terms and conditions defined in
* file 'LICENSE.md', which is part of this source code package.
*/
package optimize
import (
"crypto/md5"
"github.com/unidoc/unidoc/pdf/core"
)
// CombineDuplicateStreams combines duplicated streams, which are matched by the MD5 hash of their data.
// The type has no fields.
// It implements interface model.Optimizer.
type CombineDuplicateStreams struct {
}
// Optimize optimizes PDF objects to decrease PDF size.
//
// Streams are grouped by a key built from the MD5 hash of their data and the MD5 hash of the written form
// of their dictionary. Within each group the first stream (in input order) is kept; every later one is
// dropped from the result and mapped to the kept stream in a replacement table that is handed to
// replaceObjectsInPlace.
//
// The dictionary is part of the key because equal bytes do not make two streams interchangeable: the
// dictionary determines how the data is decoded and interpreted.
//
// The returned slice is a new slice containing the remaining objects in their original order. The error is
// always nil.
func (dup *CombineDuplicateStreams) Optimize(objects []core.PdfObject) (optimizedObjects []core.PdfObject, err error) {
	replaceTable := make(map[core.PdfObject]core.PdfObject)
	toDelete := make(map[core.PdfObject]struct{})
	streamsByHash := make(map[string][]*core.PdfObjectStream)

	for _, obj := range objects {
		stream, isStreamObj := obj.(*core.PdfObjectStream)
		if !isStreamObj {
			continue
		}

		// Hash data and dictionary separately and join the two fixed-size digests, so the boundary
		// between the two parts is unambiguous.
		var dictString string
		if stream.PdfObjectDictionary != nil {
			dictString = stream.PdfObjectDictionary.DefaultWriteString()
		}
		dataSum := md5.Sum(stream.Stream)
		dictSum := md5.Sum([]byte(dictString))
		hash := string(dataSum[:]) + string(dictSum[:])

		streamsByHash[hash] = append(streamsByHash[hash], stream)
	}

	for _, streams := range streamsByHash {
		if len(streams) < 2 {
			continue
		}
		firstStream := streams[0]
		for _, stream := range streams[1:] {
			replaceTable[stream] = firstStream
			toDelete[stream] = struct{}{}
		}
	}

	optimizedObjects = make([]core.PdfObject, 0, len(objects)-len(toDelete))
	for _, obj := range objects {
		if _, found := toDelete[obj]; found {
			continue
		}
		optimizedObjects = append(optimizedObjects, obj)
	}
	replaceObjectsInPlace(optimizedObjects, replaceTable)
	return optimizedObjects, nil
}

View File

@ -0,0 +1,64 @@
/*
* This file is subject to the terms and conditions defined in
* file 'LICENSE.md', which is part of this source code package.
*/
package optimize
import (
"crypto/md5"
"github.com/unidoc/unidoc/pdf/core"
)
// CombineIdenticalIndirectObjects combines indirect objects whose dictionaries have an identical written
// form (compared by MD5 hash). The type has no fields.
// It implements interface model.Optimizer.
type CombineIdenticalIndirectObjects struct {
}
// Optimize optimizes PDF objects to decrease PDF size.
//
// Indirect objects that hold a dictionary are grouped by the MD5 hash of the dictionary's written form;
// dictionaries whose Type is "Page" are left out. In each group the first object (in input order) is kept,
// and every later one is dropped from the result and mapped to the kept object in a replacement table
// that is handed to replaceObjectsInPlace.
//
// The returned slice is a new slice containing the remaining objects in their original order. The error is
// always nil.
func (c *CombineIdenticalIndirectObjects) Optimize(objects []core.PdfObject) (optimizedObjects []core.PdfObject, err error) {
	groups := make(map[string][]*core.PdfIndirectObject)
	for _, obj := range objects {
		ind, isIndirectObj := obj.(*core.PdfIndirectObject)
		if !isIndirectObj {
			continue
		}
		dict, isDictObj := ind.PdfObject.(*core.PdfObjectDictionary)
		if !isDictObj {
			continue
		}
		// Page dictionaries are never combined.
		if name, isName := dict.Get("Type").(*core.PdfObjectName); isName && *name == "Page" {
			continue
		}

		sum := md5.Sum([]byte(dict.DefaultWriteString()))
		key := string(sum[:])
		groups[key] = append(groups[key], ind)
	}

	replaceTable := make(map[core.PdfObject]core.PdfObject)
	toDelete := make(map[core.PdfObject]struct{})
	for _, group := range groups {
		if len(group) < 2 {
			continue
		}
		keeper := group[0]
		for _, duplicate := range group[1:] {
			replaceTable[duplicate] = keeper
			toDelete[duplicate] = struct{}{}
		}
	}

	optimizedObjects = make([]core.PdfObject, 0, len(objects)-len(toDelete))
	for _, obj := range objects {
		if _, found := toDelete[obj]; !found {
			optimizedObjects = append(optimizedObjects, obj)
		}
	}
	replaceObjectsInPlace(optimizedObjects, replaceTable)
	return optimizedObjects, nil
}

View File

@ -0,0 +1,45 @@
/*
* This file is subject to the terms and conditions defined in
* file 'LICENSE.md', which is part of this source code package.
*/
package optimize
import (
"github.com/unidoc/unidoc/pdf/core"
)
// CompressStreams compresses streams that do not yet have a filter, using LZW encoding.
// The type has no fields.
// It implements interface model.Optimizer.
type CompressStreams struct {
}
// Optimize optimizes PDF objects to decrease PDF size.
//
// Every stream without a Filter entry is LZW-encoded. The encoded form replaces the original only when the
// encoded data plus the encoder's dictionary entries are smaller than the raw data; in that case the stream
// is modified in place and its Length entry is updated.
//
// The returned slice is a copy of the input slice holding the same objects. If encoding fails, that slice
// is returned together with the error; streams processed before the failure stay modified.
func (c *CompressStreams) Optimize(objects []core.PdfObject) (optimizedObjects []core.PdfObject, err error) {
	optimizedObjects = make([]core.PdfObject, len(objects))
	copy(optimizedObjects, objects)
	for _, obj := range objects {
		stream, isStreamObj := core.GetStream(obj)
		if !isStreamObj {
			continue
		}
		// A Filter entry can be a single name or an array of names. Whatever its form, its presence
		// means the data is already encoded, and re-encoding it here would overwrite the existing
		// filter chain and leave the stream undecodable.
		if stream.PdfObjectDictionary.Get("Filter") != nil {
			continue
		}

		encoder := core.NewLZWEncoder()
		encoder.EarlyChange = 0
		var data []byte
		data, err = encoder.EncodeBytes(stream.Stream)
		if err != nil {
			return optimizedObjects, err
		}
		dict := encoder.MakeStreamDict()
		// Keep the compressed form only if it is smaller once the extra dictionary entries are counted.
		if len(data)+len(dict.DefaultWriteString()) < len(stream.Stream) {
			stream.Stream = data
			stream.PdfObjectDictionary.Merge(dict)
			stream.PdfObjectDictionary.Set("Length", core.MakeInteger(int64(len(stream.Stream))))
		}
	}
	return optimizedObjects, nil
}

138
pdf/model/optimize/image.go Normal file
View File

@ -0,0 +1,138 @@
/*
* This file is subject to the terms and conditions defined in
* file 'LICENSE.md', which is part of this source code package.
*/
package optimize
import (
"github.com/unidoc/unidoc/common"
"github.com/unidoc/unidoc/pdf/core"
"github.com/unidoc/unidoc/pdf/model"
)
// Image optimizes images by re-encoding them as JPEG (DCT) with a quality equal to ImageQuality.
// An ImageQuality of zero or less disables the optimizer.
// TODO(a5i): Add support for inline images.
// It implements interface model.Optimizer.
type Image struct {
// ImageQuality is the quality passed to the DCT encoder.
ImageQuality int
}
// imageInfo is information about an image XObject stream collected by findImages.
type imageInfo struct {
// ColorSpace is the color space name determined from the stream's ColorSpace entry.
ColorSpace core.PdfObjectName
// BitsPerComponent is the stream's BitsPerComponent entry; 8 when the entry is missing.
BitsPerComponent int
// ColorComponents is 3 for DeviceRGB and 1 for DeviceGray.
ColorComponents int
// Width and Height are the stream's Width and Height entries; 0 when missing.
Width int
Height int
// Stream is the image stream itself.
Stream *core.PdfObjectStream
// PPI is the highest pixels-per-inch value found for the image by ImagePPI.Optimize;
// it stays 0 until that optimizer has seen the image being drawn.
PPI float64
}
// findImages collects the image XObject streams found in objects.
//
// Each stream is inspected at most once. A stream is reported when its Subtype
// is Image and its color space is DeviceRGB or DeviceGray; images in any other
// color space, or whose color space cannot be determined, are logged and left out.
func findImages(objects []core.PdfObject) []*imageInfo {
	subtypeKey := core.PdfObjectName("Subtype")
	seen := make(map[*core.PdfObjectStream]struct{})
	var result []*imageInfo
	for _, candidate := range objects {
		stream, isStream := core.GetStream(candidate)
		if !isStream {
			continue
		}
		if _, duplicate := seen[stream]; duplicate {
			continue
		}
		seen[stream] = struct{}{}

		dict := stream.PdfObjectDictionary
		subtype, isName := core.GetName(dict.Get(subtypeKey))
		if !isName || string(*subtype) != "Image" {
			continue
		}

		colorSpace, err := model.DetermineColorspaceNameFromPdfObject(dict.Get("ColorSpace"))
		if err != nil {
			common.Log.Error("Error determine color space %s", err)
			continue
		}

		// BitsPerComponent falls back to 8 when the entry is absent.
		info := &imageInfo{ColorSpace: colorSpace, BitsPerComponent: 8, Stream: stream}
		if bits, has := core.GetIntVal(dict.Get("BitsPerComponent")); has {
			info.BitsPerComponent = bits
		}
		if width, has := core.GetIntVal(dict.Get("Width")); has {
			info.Width = width
		}
		if height, has := core.GetIntVal(dict.Get("Height")); has {
			info.Height = height
		}

		switch info.ColorSpace {
		case "DeviceRGB":
			info.ColorComponents = 3
		case "DeviceGray":
			info.ColorComponents = 1
		default:
			common.Log.Warning("Optimization is not supported for color space %s", info.ColorSpace)
			continue
		}
		result = append(result, info)
	}
	return result
}
// Optimize optimizes PDF objects to decrease PDF size.
//
// Every supported image found in objects (see findImages) that is not used as
// a soft mask of another image is decoded and re-encoded with the DCT encoder
// at quality ImageQuality. The re-encoded image is stored in a new stream
// object which replaces the original everywhere in the returned object list.
//
// Images that cannot be decoded are logged and kept unchanged. A failure of
// the DCT encoder aborts the optimization and is returned as the error.
// When ImageQuality is not positive, or no images are found, objects is
// returned unchanged.
func (i *Image) Optimize(objects []core.PdfObject) (optimizedObjects []core.PdfObject, err error) {
	if i.ImageQuality <= 0 {
		return objects, nil
	}
	images := findImages(objects)
	if len(images) == 0 {
		return objects, nil
	}
	replaceTable := make(map[core.PdfObject]core.PdfObject)
	// Soft masks are left alone: lossy re-encoding of the mask data is not wanted.
	imageMasks := make(map[core.PdfObject]struct{})
	for _, img := range images {
		if mask := img.Stream.PdfObjectDictionary.Get(core.PdfObjectName("SMask")); mask != nil {
			imageMasks[mask] = struct{}{}
		}
	}
	for index, img := range images {
		stream := img.Stream
		if _, isMask := imageMasks[stream]; isMask {
			continue
		}
		streamEncoder, err := core.NewEncoderFromStream(stream)
		if err != nil {
			common.Log.Warning("Error get encoder for the image stream %s", err)
			continue
		}
		data, err := streamEncoder.DecodeStream(stream)
		if err != nil {
			common.Log.Warning("Error decode the image stream %s", err)
			continue
		}
		encoder := core.NewDCTEncoder()
		encoder.ColorComponents = img.ColorComponents
		encoder.Quality = i.ImageQuality
		encoder.BitsPerComponent = img.BitsPerComponent
		encoder.Width = img.Width
		encoder.Height = img.Height
		streamData, err := encoder.EncodeBytes(data)
		if err != nil {
			return nil, err
		}
		newStream := &core.PdfObjectStream{Stream: streamData}
		newStream.PdfObjectReference = stream.PdfObjectReference
		newStream.PdfObjectDictionary = core.MakeDict()
		for _, key := range stream.PdfObjectDictionary.Keys() {
			// DecodeParms describes the parameters of the original filter and
			// must not be carried over to the DCT-encoded data.
			if key == "DecodeParms" {
				continue
			}
			newStream.PdfObjectDictionary.Set(key, stream.PdfObjectDictionary.Get(key))
		}
		fn := core.PdfObjectName(encoder.GetFilterName())
		newStream.PdfObjectDictionary.Set(core.PdfObjectName("Filter"), &fn)
		ln := core.PdfObjectInteger(int64(len(streamData)))
		newStream.PdfObjectDictionary.Set(core.PdfObjectName("Length"), &ln)
		replaceTable[stream] = newStream
		images[index].Stream = newStream
	}
	optimizedObjects = make([]core.PdfObject, len(objects))
	copy(optimizedObjects, objects)
	replaceObjectsInPlace(optimizedObjects, replaceTable)
	return optimizedObjects, nil
}

View File

@ -0,0 +1,203 @@
/*
* This file is subject to the terms and conditions defined in
* file 'LICENSE.md', which is part of this source code package.
*/
package optimize
import (
"fmt"
"image"
"math"
"github.com/unidoc/unidoc/common"
"github.com/unidoc/unidoc/pdf/contentstream"
"github.com/unidoc/unidoc/pdf/core"
"github.com/unidoc/unidoc/pdf/model"
"golang.org/x/image/draw"
)
// ImagePPI optimizes images by scaling them down so that the PPI (pixels per inch) is never higher than ImageUpperPPI.
// An ImageUpperPPI of zero or less disables the optimizer.
// TODO(a5i): Add support for inline images.
// It implements interface model.Optimizer.
type ImagePPI struct {
// ImageUpperPPI is the maximum pixels-per-inch value an image may keep.
ImageUpperPPI float64
}
// scaleImage resamples the image held in stream by the factor scale using
// Catmull-Rom interpolation and stores the result back through the XObject
// image built from stream.
//
// Only DeviceRGB and DeviceGray images are supported; any other color space
// yields an error. Each target dimension is at least one pixel.
func scaleImage(stream *core.PdfObjectStream, scale float64) error {
	xImg, err := model.NewXObjectImageFromStream(stream)
	if err != nil {
		return err
	}
	i, err := xImg.ToImage()
	if err != nil {
		return err
	}
	goimg, err := i.ToGoImage()
	if err != nil {
		return err
	}
	newW := int(math.RoundToEven(float64(i.Width) * scale))
	newH := int(math.RoundToEven(float64(i.Height) * scale))
	// A very small scale can round a dimension down to zero, which would
	// produce an empty (invalid) image; keep at least one pixel.
	if newW < 1 {
		newW = 1
	}
	if newH < 1 {
		newH = 1
	}
	rect := image.Rect(0, 0, newW, newH)
	var newImage draw.Image
	switch xImg.ColorSpace.String() {
	case "DeviceRGB":
		newImage = image.NewRGBA(rect)
	case "DeviceGray":
		newImage = image.NewGray(rect)
	default:
		return fmt.Errorf("Optimization is not supported for color space %s", xImg.ColorSpace.String())
	}
	draw.CatmullRom.Scale(newImage, newImage.Bounds(), goimg, goimg.Bounds(), draw.Over, &draw.Options{})
	i, err = model.ImageHandling.NewImageFromGoImage(newImage)
	if err != nil {
		return err
	}
	xImg.SetImage(i, xImg.ColorSpace)
	// NOTE(review): the returned object is discarded; presumably ToPdfObject
	// writes the new image data back into the stream the XObject was created
	// from - confirm against model.XObjectImage.
	xImg.ToPdfObject()
	return nil
}
// Optimize optimizes PDF objects to decrease PDF size.
//
// The method walks the direct kids of the catalog's page tree, parses every
// page content stream and tracks the scale set by `cm` operators to estimate
// the PPI at which each image XObject is drawn (`Do`). The highest PPI seen for
// an image is recorded; images drawn above ImageUpperPPI are then scaled down,
// together with their soft mask, so that they match ImageUpperPPI.
//
// Limitations visible in the implementation: only pages whose Contents entry is
// an array are inspected, only the a and d components of the `cm` matrix are
// used, and `Q` resets the tracked scale to 1 instead of restoring the state
// saved by `q`.
//
// The objects are modified in place and the input slice is returned. An error
// is returned when a content stream cannot be decoded or parsed, or when
// scaling a soft mask fails; a failure to scale an image itself is only logged.
func (i *ImagePPI) Optimize(objects []core.PdfObject) (optimizedObjects []core.PdfObject, err error) {
	if i.ImageUpperPPI <= 0 {
		return objects, nil
	}
	images := findImages(objects)
	if len(images) == 0 {
		return objects, nil
	}
	imageMasks := make(map[core.PdfObject]struct{})
	for _, img := range images {
		obj := img.Stream.PdfObjectDictionary.Get(core.PdfObjectName("SMask"))
		imageMasks[obj] = struct{}{}
	}
	imageByStream := make(map[*core.PdfObjectStream]*imageInfo)
	for _, img := range images {
		imageByStream[img.Stream] = img
	}
	// Locate the first dictionary of type Catalog.
	var catalog *core.PdfObjectDictionary
	for _, obj := range objects {
		if dict, isDict := core.GetDict(obj); catalog == nil && isDict {
			if tp, ok := core.GetName(dict.Get(core.PdfObjectName("Type"))); ok && *tp == "Catalog" {
				catalog = dict
			}
		}
	}
	if catalog == nil {
		return objects, nil
	}
	pages, hasPages := core.GetDict(catalog.Get(core.PdfObjectName("Pages")))
	if !hasPages {
		return objects, nil
	}
	kids, hasKids := core.GetArray(pages.Get(core.PdfObjectName("Kids")))
	if !hasKids {
		return objects, nil
	}
	for _, pageObj := range kids.Elements() {
		page, ok := core.GetDict(pageObj)
		if !ok {
			continue
		}
		contents, hasContents := core.GetArray(page.Get("Contents"))
		if !hasContents {
			continue
		}
		resources, hasResources := core.GetDict(page.Get("Resources"))
		if !hasResources {
			continue
		}
		xObject, hasXObject := core.GetDict(resources.Get("XObject"))
		if !hasXObject {
			continue
		}
		// Resource names are scoped to the page, so the lookup table is rebuilt
		// for every page; a name left over from a previous page must not
		// resolve to that page's image.
		imageByName := make(map[string]*imageInfo)
		for _, key := range xObject.Keys() {
			if stream, isStream := core.GetStream(xObject.Get(key)); isStream {
				if img, found := imageByStream[stream]; found {
					imageByName[string(key)] = img
				}
			}
		}
		for _, obj := range contents.Elements() {
			stream, isStream := core.GetStream(obj)
			if !isStream {
				continue
			}
			streamEncoder, err := core.NewEncoderFromStream(stream)
			if err != nil {
				return nil, err
			}
			data, err := streamEncoder.DecodeStream(stream)
			if err != nil {
				return nil, err
			}
			p := contentstream.NewContentStreamParser(string(data))
			operations, err := p.Parse()
			if err != nil {
				return nil, err
			}
			scaleX, scaleY := 1.0, 1.0
			for _, operation := range *operations {
				if operation.Operand == "Q" {
					scaleX, scaleY = 1.0, 1.0
				}
				if operation.Operand == "cm" && len(operation.Params) == 6 {
					// The operands may be written as reals or as integers.
					if sx, ok := core.GetFloatVal(operation.Params[0]); ok {
						scaleX = scaleX * sx
					}
					if sy, ok := core.GetFloatVal(operation.Params[3]); ok {
						scaleY = scaleY * sy
					}
					if sx, ok := core.GetIntVal(operation.Params[0]); ok {
						scaleX = scaleX * float64(sx)
					}
					if sy, ok := core.GetIntVal(operation.Params[3]); ok {
						scaleY = scaleY * float64(sy)
					}
				}
				if operation.Operand == "Do" && len(operation.Params) == 1 {
					name, ok := core.GetName(operation.Params[0])
					if !ok {
						continue
					}
					img, found := imageByName[string(*name)]
					if !found {
						continue
					}
					// A negative scale mirrors the image but does not change
					// its printed size, so the magnitude is what matters.
					wInch, hInch := math.Abs(scaleX)/72.0, math.Abs(scaleY)/72.0
					xPPI, yPPI := 72.0, 72.0
					if wInch != 0 && hInch != 0 {
						xPPI, yPPI = float64(img.Width)/wInch, float64(img.Height)/hInch
					}
					img.PPI = math.Max(img.PPI, xPPI)
					img.PPI = math.Max(img.PPI, yPPI)
				}
			}
		}
	}
	for _, img := range images {
		if _, isMask := imageMasks[img.Stream]; isMask {
			continue
		}
		if img.PPI <= i.ImageUpperPPI {
			continue
		}
		scale := i.ImageUpperPPI / img.PPI
		if err := scaleImage(img.Stream, scale); err != nil {
			common.Log.Debug("Error scale image keep original image: %s", err)
			continue
		}
		// Keep the soft mask the same size as the image it belongs to.
		if mask, hasMask := core.GetStream(img.Stream.PdfObjectDictionary.Get(core.PdfObjectName("SMask"))); hasMask {
			if err := scaleImage(mask, scale); err != nil {
				return nil, err
			}
		}
	}
	return objects, nil
}

View File

@ -0,0 +1,40 @@
/*
* This file is subject to the terms and conditions defined in
* file 'LICENSE.md', which is part of this source code package.
*/
package optimize
import (
"github.com/unidoc/unidoc/pdf/core"
)
// ObjectStreams groups indirect PDF objects with generation number 0 into a single object stream.
// It implements interface model.Optimizer.
type ObjectStreams struct {
}
// Optimize optimizes PDF objects to decrease PDF size.
//
// Indirect objects with generation number 0 are gathered into one object
// stream; everything else is passed through. The result lists the object
// stream first (only when it holds more than one object), then the gathered
// objects, then the remaining objects in their original order.
func (o *ObjectStreams) Optimize(objects []core.PdfObject) (optimizedObjects []core.PdfObject, err error) {
	packed := &core.PdfObjectStreams{}
	remaining := make([]core.PdfObject, 0, len(objects))
	for _, candidate := range objects {
		indirect, ok := candidate.(*core.PdfIndirectObject)
		if !ok || indirect.GenerationNumber != 0 {
			remaining = append(remaining, candidate)
			continue
		}
		packed.Append(candidate)
	}

	packedCount := packed.Len()
	if packedCount == 0 {
		return remaining, nil
	}

	optimizedObjects = make([]core.PdfObject, 0, len(remaining)+packedCount+1)
	// A stream holding a single object is not worth emitting.
	if packedCount > 1 {
		optimizedObjects = append(optimizedObjects, packed)
	}
	optimizedObjects = append(optimizedObjects, packed.Elements()...)
	optimizedObjects = append(optimizedObjects, remaining...)
	return optimizedObjects, nil
}

View File

@ -0,0 +1,84 @@
/*
* This file is subject to the terms and conditions defined in
* file 'LICENSE.md', which is part of this source code package.
*/
package optimize
import (
"github.com/unidoc/unidoc/pdf/core"
)
// New builds an optimizer chain from options.
//
// Optimizers are appended in a fixed order: image PPI reduction, image
// re-encoding, duplicate direct objects, duplicate streams, identical indirect
// objects, object streams and finally stream compression. An optimizer is
// included only when its option is enabled (a positive value for the numeric
// options).
func New(options Options) *Chain {
	chain := new(Chain)
	if upperPPI := options.ImageUpperPPI; upperPPI > 0 {
		chain.Append(&ImagePPI{ImageUpperPPI: upperPPI})
	}
	if quality := options.ImageQuality; quality > 0 {
		chain.Append(&Image{ImageQuality: quality})
	}
	if options.CombineDuplicateDirectObjects {
		chain.Append(new(CombineDuplicateDirectObjects))
	}
	if options.CombineDuplicateStreams {
		chain.Append(new(CombineDuplicateStreams))
	}
	if options.CombineIdenticalIndirectObjects {
		chain.Append(new(CombineIdenticalIndirectObjects))
	}
	if options.UseObjectStreams {
		chain.Append(new(ObjectStreams))
	}
	if options.CompressStreams {
		chain.Append(new(CompressStreams))
	}
	return chain
}
// replaceObjectsInPlace substitutes objects according to objTo, both in the
// objects slice itself and, recursively, inside arrays, dictionaries, streams,
// object streams and indirect objects reachable from it.
//
// objTo is modified by the process: every visited object that has no
// replacement is recorded as mapping to itself, which also stops the recursion
// on cyclic references. Nothing is done when objTo is empty.
func replaceObjectsInPlace(objects []core.PdfObject, objTo map[core.PdfObject]core.PdfObject) {
	if len(objTo) == 0 {
		return
	}
	for idx := range objects {
		current := objects[idx]
		if replacement, known := objTo[current]; known {
			objects[idx] = replacement
			continue
		}
		// Mark as visited before descending into the object.
		objTo[current] = current

		switch node := current.(type) {
		case *core.PdfIndirectObject:
			wrapped := []core.PdfObject{node.PdfObject}
			replaceObjectsInPlace(wrapped, objTo)
			node.PdfObject = wrapped[0]
		case *core.PdfObjectDictionary:
			keys := node.Keys()
			entries := make([]core.PdfObject, len(keys))
			for k := range keys {
				entries[k] = node.Get(keys[k])
			}
			replaceObjectsInPlace(entries, objTo)
			for k := range keys {
				node.Set(keys[k], entries[k])
			}
		case *core.PdfObjectStream:
			wrapped := []core.PdfObject{node.PdfObjectDictionary}
			replaceObjectsInPlace(wrapped, objTo)
			node.PdfObjectDictionary = wrapped[0].(*core.PdfObjectDictionary)
		case *core.PdfObjectStreams:
			replaceObjectsInPlace(node.Elements(), objTo)
		case *core.PdfObjectArray:
			elems := make([]core.PdfObject, node.Len())
			copy(elems, node.Elements())
			replaceObjectsInPlace(elems, objTo)
			for pos, elem := range elems {
				node.Set(pos, elem)
			}
		}
	}
}

View File

@ -0,0 +1,17 @@
/*
* This file is subject to the terms and conditions defined in
* file 'LICENSE.md', which is part of this source code package.
*/
package optimize
// Options describes PDF optimization parameters.
// Each field switches on one optimizer of the chain built by New; the zero value
// of a field leaves the corresponding optimizer out.
type Options struct {
// CombineDuplicateStreams enables the CombineDuplicateStreams optimizer.
CombineDuplicateStreams bool
// CombineDuplicateDirectObjects enables the CombineDuplicateDirectObjects optimizer.
CombineDuplicateDirectObjects bool
// ImageUpperPPI, when positive, enables ImagePPI with this upper PPI limit.
ImageUpperPPI float64
// ImageQuality, when positive, enables Image with this encoder quality.
ImageQuality int
// UseObjectStreams enables the ObjectStreams optimizer.
UseObjectStreams bool
// CombineIdenticalIndirectObjects enables the CombineIdenticalIndirectObjects optimizer.
CombineIdenticalIndirectObjects bool
// CompressStreams enables the CompressStreams optimizer.
CompressStreams bool
}

18
pdf/model/optimizer.go Normal file
View File

@ -0,0 +1,18 @@
/*
* This file is subject to the terms and conditions defined in
* file 'LICENSE.md', which is part of this source code package.
*/
package model
import (
"github.com/unidoc/unidoc/pdf/core"
)
// Optimizer is the interface that performs optimization of the PDF object structure for output writing.
//
// Optimize receives a slice of input `objects`, performs the optimization (which may include removing
// and replacing objects) and returns the optimized slice of objects.
type Optimizer interface {
Optimize(objects []core.PdfObject) ([]core.PdfObject, error)
}

View File

@ -10,8 +10,10 @@ package model
import (
"bufio"
"bytes"
"crypto/md5"
"crypto/rand"
"encoding/binary"
"errors"
"fmt"
"io"
@ -25,6 +27,16 @@ import (
"github.com/unidoc/unidoc/pdf/model/fonts"
)
// crossReference is one entry of the cross-reference data collected while writing objects.
// Which fields are meaningful depends on Type.
type crossReference struct {
// Type is the entry type: 0 is written as a free entry, 1 as an object stored
// at a byte offset, 2 as an object stored inside an object stream.
Type int
// Type 1: byte offset of the object in the output.
Offset int64
// Type 1 and Type 0: generation number.
Generation int64 // and Type 0
// Type 2: number of the object stream that contains the object.
ObjectNumber int // and Type 0
// Type 2: index of the object within the object stream.
Index int
}
var pdfCreator = ""
func getPdfProducer() string {
@ -79,6 +91,9 @@ type PdfWriter struct {
// Forms.
acroForm *PdfAcroForm
optimizer Optimizer
crossReferenceMap map[int]crossReference
}
// NewPdfWriter initializes a new PdfWriter.
@ -132,6 +147,108 @@ func NewPdfWriter() PdfWriter {
return w
}
// copyObject returns a copy of obj in which every nested object is copied as well.
//
// objectToObjectCopyMap records, for each original object, the copy that was
// made of it. An object met a second time (many objects are shared, e.g. images
// referenced by several pages, and references may be cyclic) yields the same
// copy again, so the copied graph has the same shape as the original one.
// Stream data is not duplicated: a copied stream shares the byte slice of its
// original. Objects of an unhandled type are logged and returned as is.
func copyObject(obj PdfObject, objectToObjectCopyMap map[PdfObject]PdfObject) PdfObject {
	if existing, seen := objectToObjectCopyMap[obj]; seen {
		return existing
	}

	// remember registers dup as the copy of obj and hands it back. Containers
	// must be registered before their children are copied so that cyclic
	// references resolve to the copy under construction.
	remember := func(dup PdfObject) PdfObject {
		objectToObjectCopyMap[obj] = dup
		return dup
	}

	switch src := obj.(type) {
	case *PdfObjectArray:
		dup := &PdfObjectArray{}
		remember(dup)
		for _, elem := range src.Elements() {
			dup.Append(copyObject(elem, objectToObjectCopyMap))
		}
		return dup
	case *PdfObjectStreams:
		dup := &PdfObjectStreams{PdfObjectReference: src.PdfObjectReference}
		remember(dup)
		for _, elem := range src.Elements() {
			dup.Append(copyObject(elem, objectToObjectCopyMap))
		}
		return dup
	case *PdfObjectStream:
		dup := &PdfObjectStream{
			Stream:             src.Stream,
			PdfObjectReference: src.PdfObjectReference,
		}
		remember(dup)
		dup.PdfObjectDictionary = copyObject(src.PdfObjectDictionary, objectToObjectCopyMap).(*PdfObjectDictionary)
		return dup
	case *PdfObjectDictionary:
		dup := MakeDict()
		remember(dup)
		for _, key := range src.Keys() {
			dup.Set(key, copyObject(src.Get(key), objectToObjectCopyMap))
		}
		return dup
	case *PdfIndirectObject:
		dup := &PdfIndirectObject{
			PdfObjectReference: src.PdfObjectReference,
		}
		remember(dup)
		dup.PdfObject = copyObject(src.PdfObject, objectToObjectCopyMap)
		return dup
	case *PdfObjectString:
		dup := *src
		return remember(&dup)
	case *PdfObjectName:
		dup := *src
		return remember(&dup)
	case *PdfObjectNull:
		dup := PdfObjectNull{}
		return remember(&dup)
	case *PdfObjectInteger:
		dup := *src
		return remember(&dup)
	case *PdfObjectReference:
		dup := *src
		return remember(&dup)
	case *PdfObjectFloat:
		dup := *src
		return remember(&dup)
	case *PdfObjectBool:
		dup := *src
		return remember(&dup)
	default:
		common.Log.Info("TODO(a5i): implement copyObject for %+v", obj)
	}
	// return other objects as is
	return obj
}
// copyObjects replaces the writer's working set of objects with a copy, so that
// later processing (optimization, writing) does not modify the objects that
// were handed to the writer.
//
// All writer fields that are also written out as references from the trailer
// (info, root and, when set, the encryption dictionary) are re-pointed at their
// copies; otherwise they would refer to objects that are no longer part of the
// written object list.
func (this *PdfWriter) copyObjects() {
	objectToObjectCopyMap := make(map[PdfObject]PdfObject)
	objects := make([]PdfObject, len(this.objects))
	objectsMap := make(map[PdfObject]bool)
	for i, obj := range this.objects {
		newObject := copyObject(obj, objectToObjectCopyMap)
		objects[i] = newObject
		if this.objectsMap[obj] {
			objectsMap[newObject] = true
		}
	}

	this.objects = objects
	this.objectsMap = objectsMap
	this.infoObj = copyObject(this.infoObj, objectToObjectCopyMap).(*PdfIndirectObject)
	this.root = copyObject(this.root, objectToObjectCopyMap).(*PdfIndirectObject)
	// The encryption dictionary object is referenced from the trailer just like
	// info and root, so it has to follow its copy as well.
	if this.encryptObj != nil {
		if encryptCopy, ok := copyObject(this.encryptObj, objectToObjectCopyMap).(*PdfIndirectObject); ok {
			this.encryptObj = encryptCopy
		}
	}
}
// Set the PDF version of the output file.
func (this *PdfWriter) SetVersion(majorVersion, minorVersion int) {
this.majorVersion = majorVersion
@ -152,6 +269,16 @@ func (this *PdfWriter) SetOCProperties(ocProperties PdfObject) error {
return nil
}
// SetOptimizer sets the optimizer that Write applies to the objects before writing them.
// Passing nil disables optimization.
func (this *PdfWriter) SetOptimizer(optimizer Optimizer) {
this.optimizer = optimizer
}
// GetOptimizer returns the optimizer currently set on the writer, or nil when none is set.
func (this *PdfWriter) GetOptimizer() Optimizer {
return this.optimizer
}
func (this *PdfWriter) hasObject(obj PdfObject) bool {
// Check if already added.
for _, o := range this.objects {
@ -438,6 +565,7 @@ func (this *PdfWriter) writeObject(num int, obj PdfObject) {
common.Log.Trace("Write obj #%d\n", num)
if pobj, isIndirect := obj.(*PdfIndirectObject); isIndirect {
this.crossReferenceMap[num] = crossReference{Type: 1, Offset: this.writePos, Generation: pobj.GenerationNumber}
outStr := fmt.Sprintf("%d 0 obj\n", num)
outStr += pobj.PdfObject.DefaultWriteString()
outStr += "\nendobj\n"
@ -448,6 +576,7 @@ func (this *PdfWriter) writeObject(num int, obj PdfObject) {
// XXX/TODO: Add a default encoder if Filter not specified?
// Still need to make sure is encrypted.
if pobj, isStream := obj.(*PdfObjectStream); isStream {
this.crossReferenceMap[num] = crossReference{Type: 1, Offset: this.writePos, Generation: pobj.GenerationNumber}
outStr := fmt.Sprintf("%d 0 obj\n", num)
outStr += pobj.PdfObjectDictionary.DefaultWriteString()
outStr += "\nstream\n"
@ -457,6 +586,46 @@ func (this *PdfWriter) writeObject(num int, obj PdfObject) {
return
}
// Object stream: the contained indirect objects are serialized back to back,
// preceded by a table of "object-number offset" pairs, and the whole payload is
// written as one Flate-encoded stream of type ObjStm.
if ostreams, isObjStreams := obj.(*PdfObjectStreams); isObjStreams {
	this.crossReferenceMap[num] = crossReference{Type: 1, Offset: this.writePos, Generation: ostreams.GenerationNumber}
	outStr := fmt.Sprintf("%d 0 obj\n", num)
	var offsets []string
	var objData bytes.Buffer
	// written counts the objects actually stored; it is both the index of the
	// next object inside the stream and, at the end, the value of N.
	written := 0
	for _, elem := range ostreams.Elements() {
		ind, isIndirect := elem.(*PdfIndirectObject)
		if !isIndirect {
			// Only indirect objects can live in an object stream. Skip the
			// element instead of dereferencing a nil pointer.
			common.Log.Error("Object streams N %d contains non indirect pdf object %v", num, elem)
			continue
		}
		// The offset of an object is relative to the first object's data.
		offsets = append(offsets, fmt.Sprintf("%d %d", ind.ObjectNumber, objData.Len()))
		objData.WriteString(ind.PdfObject.DefaultWriteString())
		objData.WriteString(" ")
		this.crossReferenceMap[int(ind.ObjectNumber)] = crossReference{Type: 2, ObjectNumber: num, Index: written}
		written++
	}
	offsetsStr := strings.Join(offsets, " ") + " "
	encoder := NewFlateEncoder()
	dict := encoder.MakeStreamDict()
	dict.Set(PdfObjectName("Type"), MakeName("ObjStm"))
	dict.Set(PdfObjectName("N"), MakeInteger(int64(written)))
	// First is the byte offset of the first object, i.e. the size of the table.
	dict.Set(PdfObjectName("First"), MakeInteger(int64(len(offsetsStr))))

	data, err := encoder.EncodeBytes(append([]byte(offsetsStr), objData.Bytes()...))
	if err != nil {
		// No error can be returned from here; make the failure visible at least.
		common.Log.Error("Failed to encode object stream %d: %v", num, err)
	}
	dict.Set(PdfObjectName("Length"), MakeInteger(int64(len(data))))

	outStr += dict.DefaultWriteString()
	outStr += "\nstream\n"
	this.writeString(outStr)
	this.writeBytes(data)
	this.writeString("\nendstream\nendobj\n")
	return
}
this.writer.WriteString(obj.DefaultWriteString())
}
@ -472,6 +641,10 @@ func (this *PdfWriter) updateObjectNumbers() {
so.ObjectNumber = int64(idx + 1)
so.GenerationNumber = 0
}
if so, isObjectStreams := obj.(*PdfObjectStreams); isObjectStreams {
so.ObjectNumber = int64(idx + 1)
so.GenerationNumber = 0
}
}
}
@ -686,24 +859,53 @@ func (this *PdfWriter) Write(writer io.Writer) error {
}
// Set version in the catalog.
this.catalog.Set("Version", MakeName(fmt.Sprintf("%d.%d", this.majorVersion, this.minorVersion)))
this.copyObjects()
if this.optimizer != nil {
var err error
this.objects, err = this.optimizer.Optimize(this.objects)
if err != nil {
return err
}
}
w := bufio.NewWriter(writer)
this.writer = w
this.writePos = 0
useCrossReferenceStream := this.majorVersion > 1 || (this.majorVersion == 1 && this.minorVersion > 4)
objectsInObjectStreams := make(map[PdfObject]bool)
if !useCrossReferenceStream {
for _, obj := range this.objects {
if objStm, isObjectStreams := obj.(*PdfObjectStreams); isObjectStreams {
useCrossReferenceStream = true
for _, obj := range objStm.Elements() {
objectsInObjectStreams[obj] = true
if io, isIndirectObj := obj.(*PdfIndirectObject); isIndirectObj {
objectsInObjectStreams[io.PdfObject] = true
}
}
}
}
}
if useCrossReferenceStream && this.majorVersion == 1 && this.minorVersion < 5 {
this.minorVersion = 5
}
this.writeString(fmt.Sprintf("%%PDF-%d.%d\n", this.majorVersion, this.minorVersion))
this.writeString("%âãÏÓ\n")
this.updateObjectNumbers()
offsets := []int64{}
// Write objects
common.Log.Trace("Writing %d obj", len(this.objects))
this.crossReferenceMap = make(map[int]crossReference)
this.crossReferenceMap[0] = crossReference{Type: 0, ObjectNumber: 0, Generation: 0xFFFF}
for idx, obj := range this.objects {
if skip := objectsInObjectStreams[obj]; skip {
continue
}
common.Log.Trace("Writing %d", idx)
offset := this.writePos
offsets = append(offsets, offset)
// Encrypt prior to writing.
// Encrypt dictionary should not be encrypted.
@ -713,41 +915,90 @@ func (this *PdfWriter) Write(writer io.Writer) error {
common.Log.Debug("ERROR: Failed encrypting (%s)", err)
return err
}
}
this.writeObject(idx+1, obj)
}
xrefOffset := this.writePos
// Write xref table.
this.writeString("xref\r\n")
outStr := fmt.Sprintf("%d %d\r\n", 0, len(this.objects)+1)
this.writeString(outStr)
outStr = fmt.Sprintf("%.10d %.5d f\r\n", 0, 65535)
this.writeString(outStr)
for _, offset := range offsets {
outStr = fmt.Sprintf("%.10d %.5d n\r\n", offset, 0)
this.writeString(outStr)
}
if useCrossReferenceStream {
crossObjNumber := len(this.crossReferenceMap)
this.crossReferenceMap[crossObjNumber] = crossReference{Type: 1, ObjectNumber: crossObjNumber, Offset: xrefOffset}
crossReferenceData := bytes.NewBuffer(nil)
for idx := 0; idx < len(this.crossReferenceMap); idx++ {
ref := this.crossReferenceMap[idx]
switch ref.Type {
case 0:
binary.Write(crossReferenceData, binary.BigEndian, byte(0))
binary.Write(crossReferenceData, binary.BigEndian, uint32(0))
binary.Write(crossReferenceData, binary.BigEndian, uint16(0xFFFF))
case 1:
binary.Write(crossReferenceData, binary.BigEndian, byte(1))
binary.Write(crossReferenceData, binary.BigEndian, uint32(ref.Offset))
binary.Write(crossReferenceData, binary.BigEndian, uint16(ref.Generation))
case 2:
binary.Write(crossReferenceData, binary.BigEndian, byte(2))
binary.Write(crossReferenceData, binary.BigEndian, uint32(ref.ObjectNumber))
binary.Write(crossReferenceData, binary.BigEndian, uint16(ref.Index))
}
}
crossReferenceStream, err := MakeStream(crossReferenceData.Bytes(), NewFlateEncoder())
if err != nil {
return err
}
crossReferenceStream.ObjectNumber = int64(crossObjNumber)
crossReferenceStream.PdfObjectDictionary.Set("Type", MakeName("XRef"))
crossReferenceStream.PdfObjectDictionary.Set("W", MakeArray(MakeInteger(1), MakeInteger(4), MakeInteger(2)))
crossReferenceStream.PdfObjectDictionary.Set("Index", MakeArray(MakeInteger(0), MakeInteger(crossReferenceStream.ObjectNumber+1)))
crossReferenceStream.PdfObjectDictionary.Set("Size", MakeInteger(crossReferenceStream.ObjectNumber+1))
crossReferenceStream.PdfObjectDictionary.Set("Info", this.infoObj)
crossReferenceStream.PdfObjectDictionary.Set("Root", this.root)
// If encrypted!
if this.crypter != nil {
crossReferenceStream.Set("Encrypt", this.encryptObj)
crossReferenceStream.Set("ID", this.ids)
common.Log.Trace("Ids: %s", this.ids)
}
this.writeObject(int(crossReferenceStream.ObjectNumber), crossReferenceStream)
} else {
this.writeString("xref\r\n")
outStr := fmt.Sprintf("%d %d\r\n", 0, len(this.crossReferenceMap))
this.writeString(outStr)
for idx := 0; idx < len(this.crossReferenceMap); idx++ {
ref := this.crossReferenceMap[idx]
switch ref.Type {
case 0:
outStr = fmt.Sprintf("%.10d %.5d f\r\n", 0, 65535)
this.writeString(outStr)
case 1:
outStr = fmt.Sprintf("%.10d %.5d n\r\n", ref.Offset, 0)
this.writeString(outStr)
}
}
// Generate & write trailer
trailer := MakeDict()
trailer.Set("Info", this.infoObj)
trailer.Set("Root", this.root)
trailer.Set("Size", MakeInteger(int64(len(this.objects)+1)))
// If encrypted!
if this.crypter != nil {
trailer.Set("Encrypt", this.encryptObj)
trailer.Set("ID", this.ids)
common.Log.Trace("Ids: %s", this.ids)
}
this.writeString("trailer\n")
this.writeString(trailer.DefaultWriteString())
this.writeString("\n")
// Generate & write trailer
trailer := MakeDict()
trailer.Set("Info", this.infoObj)
trailer.Set("Root", this.root)
trailer.Set("Size", MakeInteger(int64(len(this.objects)+1)))
// If encrypted!
if this.crypter != nil {
trailer.Set("Encrypt", this.encryptObj)
trailer.Set("ID", this.ids)
common.Log.Trace("Ids: %s", this.ids)
}
this.writeString("trailer\n")
this.writeString(trailer.DefaultWriteString())
this.writeString("\n")
// Make offset reference.
outStr = fmt.Sprintf("startxref\n%d\n", xrefOffset)
outStr := fmt.Sprintf("startxref\n%d\n", xrefOffset)
this.writeString(outStr)
this.writeString("%%EOF\n")