unipdf/extractor/image_test.go
2019-05-16 20:44:51 +00:00

360 lines
7.2 KiB
Go

/*
* This file is subject to the terms and conditions defined in
* file 'LICENSE.md', which is part of this source code package.
*/
package extractor
import (
"math"
"os"
"path/filepath"
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/unidoc/unipdf/v3/core"
"github.com/unidoc/unipdf/v3/model"
)
func TestImageExtractionBasic(t *testing.T) {
type expectedImage struct {
X float64
Y float64
Width float64
Height float64
Angle int
}
testcases := []struct {
Name string
PageNum int
Path string
Expected []ImageMark
}{
{
"basic xobject",
1,
"./testdata/basic_xobject.pdf",
[]ImageMark{
{
Image: nil,
X: 0,
Y: 294.865385,
Width: 612,
Height: 197.134615,
Angle: 0,
},
},
},
{
"inline image",
1,
"./testdata/inline.pdf",
[]ImageMark{
{
Image: nil,
X: 0,
Y: -0.000000358,
Width: 12,
Height: 12,
Angle: 0,
},
},
},
}
for _, tcase := range testcases {
f, err := os.Open(tcase.Path)
require.NoError(t, err)
defer f.Close()
reader, err := model.NewPdfReader(f)
require.NoError(t, err)
page, err := reader.GetPage(tcase.PageNum)
require.NoError(t, err)
pageExtractor, err := New(page)
require.NoError(t, err)
pageImages, err := pageExtractor.ExtractPageImages(nil)
require.NoError(t, err)
assert.Equal(t, len(tcase.Expected), len(pageImages.Images))
for i, img := range pageImages.Images {
img.Image = nil // Discard image data.
assert.Equalf(t, tcase.Expected[i], img, "i = %d", i)
}
}
}
// Test position extraction with nested transform matrices.
func TestImageExtractionNestedCM(t *testing.T) {
testcases := []struct {
Name string
PageNum int
Path string
PrependCS string
AppendCS string
Expected []ImageMark
}{
{
"basic xobject - translate (100,50)",
1,
"./testdata/basic_xobject.pdf",
"1 0 0 1 100.0 50.0 cm q",
"Q",
[]ImageMark{
{
Image: nil,
X: 0 + 100.0,
Y: 294.865385 + 50.0,
Width: 612,
Height: 197.134615,
Angle: 0,
},
},
},
{
"basic xobject - scale (1.5,2)X",
1,
"./testdata/basic_xobject.pdf",
"1.5 0 0 2.0 0 0 cm q",
"Q",
[]ImageMark{
{
Image: nil,
X: 0,
Y: 294.865385 * 2.0,
Width: 612 * 1.5,
Height: 197.134615 * 2.0,
Angle: 0,
},
},
},
{
"basic xobject - translate (100,50) scale (1.5,2)X",
1,
"./testdata/basic_xobject.pdf",
"1.5 0 0 2.0 0 0 cm q 1 0 0 1 100.0 50.0 cm q",
"Q Q",
[]ImageMark{
{
Image: nil,
X: 100.0 * 1.5,
Y: (294.865385 + 50.0) * 2.0,
Width: 612 * 1.5,
Height: 197.134615 * 2.0,
Angle: 0,
},
},
},
}
for _, tcase := range testcases {
f, err := os.Open(tcase.Path)
require.NoError(t, err)
defer f.Close()
reader, err := model.NewPdfReader(f)
require.NoError(t, err)
page, err := reader.GetPage(tcase.PageNum)
require.NoError(t, err)
contentstr, err := page.GetAllContentStreams()
require.NoError(t, err)
// Modify the contentstream to alter the position by way of nested transform matrices.
contentstr = tcase.PrependCS + " " + contentstr + " " + tcase.AppendCS
err = page.SetContentStreams([]string{contentstr}, core.NewFlateEncoder())
require.NoError(t, err)
pageExtractor, err := New(page)
require.NoError(t, err)
pageImages, err := pageExtractor.ExtractPageImages(nil)
require.NoError(t, err)
assert.Equal(t, len(tcase.Expected), len(pageImages.Images))
for i, img := range pageImages.Images {
img.Image = nil // Discard image data.
assert.Equalf(t, tcase.Expected[i], img, "i = %d", i)
}
}
}
// Test multiple copies of same image XObject with different scales.
func TestImageExtractionMulti(t *testing.T) {
testcases := []struct {
PageNum int
Path string
NumImages int
DimensionFunc func(i int) (dy float64, w float64, h float64)
NumSamples int
}{
{
1,
"./testdata/multi.pdf",
12,
func(i int) (dy float64, w float64, h float64) {
w = 100 + 10*float64(i+1)
ar := 35.432692 / 110.0
h = w * ar
dy = h
return dy, w, h
},
416 * 134 * 3,
},
}
for _, tcase := range testcases {
f, err := os.Open(tcase.Path)
require.NoError(t, err)
defer f.Close()
reader, err := model.NewPdfReader(f)
require.NoError(t, err)
page, err := reader.GetPage(tcase.PageNum)
require.NoError(t, err)
pageExtractor, err := New(page)
require.NoError(t, err)
pageImages, err := pageExtractor.ExtractPageImages(nil)
require.NoError(t, err)
assert.Equal(t, tcase.NumImages, len(pageImages.Images))
for i, img := range pageImages.Images {
dy, w, h := tcase.DimensionFunc(i)
assert.Equalf(t, tcase.NumSamples, len(img.Image.GetSamples()), "i = %d", i)
// Comparison with tolerance.
assert.Truef(t, math.Abs(w-img.Width) < 0.00001, "i = %d", i)
assert.Truef(t, math.Abs(h-img.Height) < 0.00001, "i = %d", i)
if i > 0 {
measDY := pageImages.Images[i-1].Y - pageImages.Images[i].Y
assert.Truef(t, math.Abs(dy-measDY) < 0.00001, "i = %d", i)
}
}
}
}
func TestImageExtractionRealWorld(t *testing.T) {
if len(corpusFolder) == 0 && !forceTest {
t.Log("Corpus folder not set - skipping")
return
}
testcases := []struct {
Name string
PageNum int
Path string
Expected []ImageMark
}{
{
"ICC color space",
3,
"icnp12-qinghua.pdf",
[]ImageMark{
{
Image: nil,
Width: 2.877,
Height: 22.344,
X: 236.508,
Y: 685.248,
Angle: 0.0,
},
{
Image: nil,
Width: 247.44,
Height: 0.48,
X: 313.788,
Y: 715.248,
Angle: 0.0,
},
{
Image: nil,
Width: 247.44,
Height: 0.48,
X: 313.788,
Y: 594.648,
Angle: 0.0,
},
},
},
{
"Indexed color space",
1,
"MondayAM.pdf",
[]ImageMark{},
},
}
for _, tcase := range testcases {
inputPath := filepath.Join(corpusFolder, tcase.Path)
f, err := os.Open(inputPath)
require.NoError(t, err)
defer f.Close()
reader, err := model.NewPdfReader(f)
require.NoError(t, err)
page, err := reader.GetPage(tcase.PageNum)
require.NoError(t, err)
pageExtractor, err := New(page)
require.NoError(t, err)
pageImages, err := pageExtractor.ExtractPageImages(nil)
require.NoError(t, err)
if len(tcase.Expected) == 0 {
// This is to test that images parse without error only.
continue
}
assert.Equal(t, len(tcase.Expected), len(pageImages.Images))
for i, img := range pageImages.Images {
img.Image = nil // Discard image data.
assert.Equalf(t, tcase.Expected[i], img, "i = %d", i)
}
}
}
func BenchmarkImageExtraction(b *testing.B) {
cnt := 0
for i := 0; i < b.N; i++ {
f, err := os.Open("./testdata/basic_xobject.pdf")
require.NoError(b, err)
defer f.Close()
reader, err := model.NewPdfReader(f)
require.NoError(b, err)
page, err := reader.GetPage(1)
require.NoError(b, err)
pageExtractor, err := New(page)
require.NoError(b, err)
pageImages, err := pageExtractor.ExtractPageImages(nil)
require.NoError(b, err)
cnt += len(pageImages.Images)
}
assert.Equal(b, b.N, cnt)
}