mirror of
https://github.com/unidoc/unipdf.git
synced 2025-05-07 19:29:16 +08:00
Merge pull request #363 from adrg/image-extract-opts
Add image extraction options
This commit is contained in:
commit
c41dd7e028
@ -12,10 +12,21 @@ import (
|
|||||||
"github.com/unidoc/unidoc/pdf/model"
|
"github.com/unidoc/unidoc/pdf/model"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// ImageExtractOptions contains options for controlling image extraction from
|
||||||
|
// PDF pages.
|
||||||
|
type ImageExtractOptions struct {
|
||||||
|
IncludeInlineStencilMasks bool
|
||||||
|
}
|
||||||
|
|
||||||
// ExtractPageImages returns the image contents of the page extractor, including data
|
// ExtractPageImages returns the image contents of the page extractor, including data
|
||||||
// and position, size information for each image.
|
// and position, size information for each image.
|
||||||
func (e *Extractor) ExtractPageImages() (*PageImages, error) {
|
// A set of options to control page image extraction can be passed in. The options
|
||||||
ctx := &imageExtractContext{}
|
// parameter can be nil for the default options. By default, inline stencil masks
|
||||||
|
// are not extracted.
|
||||||
|
func (e *Extractor) ExtractPageImages(options *ImageExtractOptions) (*PageImages, error) {
|
||||||
|
ctx := &imageExtractContext{
|
||||||
|
options: options,
|
||||||
|
}
|
||||||
|
|
||||||
err := ctx.extractContentStreamImages(e.contents, e.resources)
|
err := ctx.extractContentStreamImages(e.contents, e.resources)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@ -59,6 +70,9 @@ type imageExtractContext struct {
|
|||||||
|
|
||||||
// Cache to avoid processing same image many times.
|
// Cache to avoid processing same image many times.
|
||||||
cacheXObjectImages map[*core.PdfObjectStream]*cachedImage
|
cacheXObjectImages map[*core.PdfObjectStream]*cachedImage
|
||||||
|
|
||||||
|
// Extract options.
|
||||||
|
options *ImageExtractOptions
|
||||||
}
|
}
|
||||||
|
|
||||||
type cachedImage struct {
|
type cachedImage struct {
|
||||||
@ -76,6 +90,9 @@ func (ctx *imageExtractContext) extractContentStreamImages(contents string, reso
|
|||||||
if ctx.cacheXObjectImages == nil {
|
if ctx.cacheXObjectImages == nil {
|
||||||
ctx.cacheXObjectImages = map[*core.PdfObjectStream]*cachedImage{}
|
ctx.cacheXObjectImages = map[*core.PdfObjectStream]*cachedImage{}
|
||||||
}
|
}
|
||||||
|
if ctx.options == nil {
|
||||||
|
ctx.options = &ImageExtractOptions{}
|
||||||
|
}
|
||||||
|
|
||||||
processor := contentstream.NewContentStreamProcessor(*operations)
|
processor := contentstream.NewContentStreamProcessor(*operations)
|
||||||
processor.AddHandler(contentstream.HandlerConditionEnumAllOperands, "",
|
processor.AddHandler(contentstream.HandlerConditionEnumAllOperands, "",
|
||||||
@ -95,6 +112,12 @@ func (ctx *imageExtractContext) processOperand(op *contentstream.ContentStreamOp
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if isImageMask, ok := core.GetBoolVal(iimg.ImageMask); ok {
|
||||||
|
if isImageMask && !ctx.options.IncludeInlineStencilMasks {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return ctx.extractInlineImage(iimg, gs, resources)
|
return ctx.extractInlineImage(iimg, gs, resources)
|
||||||
} else if op.Operand == "Do" && len(op.Params) == 1 {
|
} else if op.Operand == "Do" && len(op.Params) == 1 {
|
||||||
// Do: XObject.
|
// Do: XObject.
|
||||||
|
@ -87,7 +87,7 @@ func TestImageExtractionBasic(t *testing.T) {
|
|||||||
pageExtractor, err := New(page)
|
pageExtractor, err := New(page)
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
|
|
||||||
pageImages, err := pageExtractor.ExtractPageImages()
|
pageImages, err := pageExtractor.ExtractPageImages(nil)
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
|
|
||||||
assert.Equal(t, len(tcase.Expected), len(pageImages.Images))
|
assert.Equal(t, len(tcase.Expected), len(pageImages.Images))
|
||||||
@ -177,7 +177,7 @@ func TestImageExtractionNestedCM(t *testing.T) {
|
|||||||
pageExtractor, err := New(page)
|
pageExtractor, err := New(page)
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
|
|
||||||
pageImages, err := pageExtractor.ExtractPageImages()
|
pageImages, err := pageExtractor.ExtractPageImages(nil)
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
|
|
||||||
assert.Equal(t, len(tcase.Expected), len(pageImages.Images))
|
assert.Equal(t, len(tcase.Expected), len(pageImages.Images))
|
||||||
@ -222,7 +222,7 @@ func TestImageExtractionMulti(t *testing.T) {
|
|||||||
pageExtractor, err := New(page)
|
pageExtractor, err := New(page)
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
|
|
||||||
pageImages, err := pageExtractor.ExtractPageImages()
|
pageImages, err := pageExtractor.ExtractPageImages(nil)
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
|
|
||||||
assert.Equal(t, tcase.NumImages, len(pageImages.Images))
|
assert.Equal(t, tcase.NumImages, len(pageImages.Images))
|
||||||
@ -303,7 +303,7 @@ func TestImageExtractionRealWorld(t *testing.T) {
|
|||||||
pageExtractor, err := New(page)
|
pageExtractor, err := New(page)
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
|
|
||||||
pageImages, err := pageExtractor.ExtractPageImages()
|
pageImages, err := pageExtractor.ExtractPageImages(nil)
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
|
|
||||||
if len(tcase.Expected) == 0 {
|
if len(tcase.Expected) == 0 {
|
||||||
@ -328,7 +328,7 @@ func BenchmarkImageExtraction(b *testing.B) {
|
|||||||
pageExtractor, err := New(page)
|
pageExtractor, err := New(page)
|
||||||
require.NoError(b, err)
|
require.NoError(b, err)
|
||||||
|
|
||||||
pageImages, err := pageExtractor.ExtractPageImages()
|
pageImages, err := pageExtractor.ExtractPageImages(nil)
|
||||||
require.NoError(b, err)
|
require.NoError(b, err)
|
||||||
|
|
||||||
cnt += len(pageImages.Images)
|
cnt += len(pageImages.Images)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user