From 7012c8b0972c5e332b41f78cc8d179731fec4d7f Mon Sep 17 00:00:00 2001 From: Adrian-George Bostan Date: Fri, 8 Mar 2019 18:59:23 +0200 Subject: [PATCH 1/4] Add image extractor options --- pdf/extractor/image.go | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/pdf/extractor/image.go b/pdf/extractor/image.go index d9550b3f..4adaf376 100644 --- a/pdf/extractor/image.go +++ b/pdf/extractor/image.go @@ -12,10 +12,21 @@ import ( "github.com/unidoc/unidoc/pdf/model" ) +// ExtractImagesOpts contains options for controlling image extraction from +// PDF pages. +type ExtractPageImagesOpts struct { + IncludeInlineStencilMasks bool +} + // ExtractPageImages returns the image contents of the page extractor, including data // and position, size information for each image. -func (e *Extractor) ExtractPageImages() (*PageImages, error) { - ctx := &imageExtractContext{} +// A set of options to control page image extraction can be passed in. The opts +// parameter can be nil for the default options. By default, inline stencil masks +// are not extracted. +func (e *Extractor) ExtractPageImages(opts *ExtractPageImagesOpts) (*PageImages, error) { + ctx := &imageExtractContext{ + opts: opts, + } err := ctx.extractContentStreamImages(e.contents, e.resources) if err != nil { @@ -59,6 +70,9 @@ type imageExtractContext struct { // Cache to avoid processing same image many times. cacheXObjectImages map[*core.PdfObjectStream]*cachedImage + + // Extract options. + opts *ExtractPageImagesOpts } type cachedImage struct { @@ -76,6 +90,9 @@ func (ctx *imageExtractContext) extractContentStreamImages(contents string, reso if ctx.cacheXObjectImages == nil { ctx.cacheXObjectImages = map[*core.PdfObjectStream]*cachedImage{} } + if ctx.opts == nil { + ctx.opts = &ExtractPageImagesOpts{} + } processor := contentstream.NewContentStreamProcessor(*operations) processor.AddHandler(contentstream.HandlerConditionEnumAllOperands, "", @@ -95,6 +112,12 @@ func (ctx *imageExtractContext) processOperand(op *contentstream.ContentStreamOp return nil } + if isImageMask, ok := core.GetBoolVal(iimg.ImageMask); ok { + if isImageMask && !ctx.opts.IncludeInlineStencilMasks { + return nil + } + } + return ctx.extractInlineImage(iimg, gs, resources) } else if op.Operand == "Do" && len(op.Params) == 1 { // Do: XObject. From 85acc739774b7bdac53820d26ffa59177a57a6ea Mon Sep 17 00:00:00 2001 From: Adrian-George Bostan Date: Fri, 8 Mar 2019 18:59:51 +0200 Subject: [PATCH 2/4] Adjust image extractor test --- pdf/extractor/image_test.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pdf/extractor/image_test.go b/pdf/extractor/image_test.go index 98bf8287..ca789f67 100644 --- a/pdf/extractor/image_test.go +++ b/pdf/extractor/image_test.go @@ -87,7 +87,7 @@ func TestImageExtractionBasic(t *testing.T) { pageExtractor, err := New(page) require.NoError(t, err) - pageImages, err := pageExtractor.ExtractPageImages() + pageImages, err := pageExtractor.ExtractPageImages(nil) require.NoError(t, err) assert.Equal(t, len(tcase.Expected), len(pageImages.Images)) @@ -177,7 +177,7 @@ func TestImageExtractionNestedCM(t *testing.T) { pageExtractor, err := New(page) require.NoError(t, err) - pageImages, err := pageExtractor.ExtractPageImages() + pageImages, err := pageExtractor.ExtractPageImages(nil) require.NoError(t, err) assert.Equal(t, len(tcase.Expected), len(pageImages.Images)) @@ -222,7 +222,7 @@ func TestImageExtractionMulti(t *testing.T) { pageExtractor, err := New(page) require.NoError(t, err) - pageImages, err := pageExtractor.ExtractPageImages() + pageImages, err := pageExtractor.ExtractPageImages(nil) require.NoError(t, err) assert.Equal(t, tcase.NumImages, len(pageImages.Images)) @@ -303,7 +303,7 @@ func TestImageExtractionRealWorld(t *testing.T) { pageExtractor, err := New(page) require.NoError(t, err) - pageImages, err := pageExtractor.ExtractPageImages() + pageImages, err := pageExtractor.ExtractPageImages(nil) require.NoError(t, err) if len(tcase.Expected) == 0 { @@ -328,7 +328,7 @@ func BenchmarkImageExtraction(b *testing.B) { pageExtractor, err := New(page) require.NoError(b, err) - pageImages, err := pageExtractor.ExtractPageImages() + pageImages, err := pageExtractor.ExtractPageImages(nil) require.NoError(b, err) cnt += len(pageImages.Images) From 08bdfb0117722eef483de87da2bb202e85a9b605 Mon Sep 17 00:00:00 2001 From: Adrian-George Bostan Date: Fri, 8 Mar 2019 19:03:37 +0200 Subject: [PATCH 3/4] Rename ExtractImagesOpts to ImageExtractOpts --- pdf/extractor/image.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pdf/extractor/image.go b/pdf/extractor/image.go index 4adaf376..c86a16ed 100644 --- a/pdf/extractor/image.go +++ b/pdf/extractor/image.go @@ -12,9 +12,9 @@ import ( "github.com/unidoc/unidoc/pdf/model" ) -// ExtractImagesOpts contains options for controlling image extraction from +// ImageExtractOpts contains options for controlling image extraction from // PDF pages. -type ExtractPageImagesOpts struct { +type ImageExtractOpts struct { IncludeInlineStencilMasks bool } @@ -23,7 +23,7 @@ type ExtractPageImagesOpts struct { // A set of options to control page image extraction can be passed in. The opts // parameter can be nil for the default options. By default, inline stencil masks // are not extracted. -func (e *Extractor) ExtractPageImages(opts *ExtractPageImagesOpts) (*PageImages, error) { +func (e *Extractor) ExtractPageImages(opts *ImageExtractOpts) (*PageImages, error) { ctx := &imageExtractContext{ opts: opts, } @@ -72,7 +72,7 @@ type imageExtractContext struct { cacheXObjectImages map[*core.PdfObjectStream]*cachedImage // Extract options. - opts *ExtractPageImagesOpts + opts *ImageExtractOpts } type cachedImage struct { @@ -91,7 +91,7 @@ func (ctx *imageExtractContext) extractContentStreamImages(contents string, reso ctx.cacheXObjectImages = map[*core.PdfObjectStream]*cachedImage{} } if ctx.opts == nil { - ctx.opts = &ExtractPageImagesOpts{} + ctx.opts = &ImageExtractOpts{} } processor := contentstream.NewContentStreamProcessor(*operations) From d478aeaf760462cfba2c66c0014c6286a7e696e0 Mon Sep 17 00:00:00 2001 From: Adrian-George Bostan Date: Fri, 8 Mar 2019 21:46:22 +0200 Subject: [PATCH 4/4] Rename ImageExtractOpts to ImageExtractOptions --- pdf/extractor/image.go | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/pdf/extractor/image.go b/pdf/extractor/image.go index c86a16ed..194e786c 100644 --- a/pdf/extractor/image.go +++ b/pdf/extractor/image.go @@ -12,20 +12,20 @@ import ( "github.com/unidoc/unidoc/pdf/model" ) -// ImageExtractOpts contains options for controlling image extraction from +// ImageExtractOptions contains options for controlling image extraction from // PDF pages. -type ImageExtractOpts struct { +type ImageExtractOptions struct { IncludeInlineStencilMasks bool } // ExtractPageImages returns the image contents of the page extractor, including data // and position, size information for each image. -// A set of options to control page image extraction can be passed in. The opts +// A set of options to control page image extraction can be passed in. The options // parameter can be nil for the default options. By default, inline stencil masks // are not extracted. -func (e *Extractor) ExtractPageImages(opts *ImageExtractOpts) (*PageImages, error) { +func (e *Extractor) ExtractPageImages(options *ImageExtractOptions) (*PageImages, error) { ctx := &imageExtractContext{ - opts: opts, + options: options, } err := ctx.extractContentStreamImages(e.contents, e.resources) @@ -72,7 +72,7 @@ type imageExtractContext struct { cacheXObjectImages map[*core.PdfObjectStream]*cachedImage // Extract options. - opts *ImageExtractOpts + options *ImageExtractOptions } type cachedImage struct { @@ -90,8 +90,8 @@ func (ctx *imageExtractContext) extractContentStreamImages(contents string, reso if ctx.cacheXObjectImages == nil { ctx.cacheXObjectImages = map[*core.PdfObjectStream]*cachedImage{} } - if ctx.opts == nil { - ctx.opts = &ImageExtractOpts{} + if ctx.options == nil { + ctx.options = &ImageExtractOptions{} } processor := contentstream.NewContentStreamProcessor(*operations) @@ -113,7 +113,7 @@ func (ctx *imageExtractContext) processOperand(op *contentstream.ContentStreamOp } if isImageMask, ok := core.GetBoolVal(iimg.ImageMask); ok { - if isImageMask && !ctx.opts.IncludeInlineStencilMasks { + if isImageMask && !ctx.options.IncludeInlineStencilMasks { return nil } }