mirror of
https://github.com/unidoc/unipdf.git
synced 2025-04-29 13:48:54 +08:00
199 lines
4.2 KiB
Go
199 lines
4.2 KiB
Go
/*
|
|
* This file is subject to the terms and conditions defined in
|
|
* file 'LICENSE.md', which is part of this source code package.
|
|
*/
|
|
|
|
package extractor
|
|
|
|
import (
|
|
"os"
|
|
"testing"
|
|
|
|
"github.com/stretchr/testify/assert"
|
|
"github.com/stretchr/testify/require"
|
|
|
|
"github.com/unidoc/unidoc/pdf/core"
|
|
"github.com/unidoc/unidoc/pdf/model"
|
|
)
|
|
|
|
// TODO(gunnsth): Create more test cases:
|
|
// - Inline image extraction.
|
|
// - Caching when extracting multiple copies of same image with different scaling.
|
|
// - Images within XObject forms.
|
|
// - A more complex example with more images.
|
|
// - Check extracted image data.
|
|
|
|
func loadPageFromPDFFile(filePath string, pageNum int) (*model.PdfPage, error) {
|
|
f, err := os.Open(filePath)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer f.Close()
|
|
|
|
pdfReader, err := model.NewPdfReader(f)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return pdfReader.GetPage(pageNum)
|
|
}
|
|
|
|
func TestImageExtractionBasic(t *testing.T) {
|
|
type expectedImage struct {
|
|
X float64
|
|
Y float64
|
|
Width float64
|
|
Height float64
|
|
Angle int
|
|
}
|
|
|
|
testcases := []struct {
|
|
Name string
|
|
PageNum int
|
|
Path string
|
|
Expected []ImageMark
|
|
}{
|
|
{
|
|
"basic xobject",
|
|
1,
|
|
"./testdata/basic_xobject.pdf",
|
|
[]ImageMark{
|
|
{
|
|
Image: nil,
|
|
X: 0,
|
|
Y: 294.865385,
|
|
Width: 612,
|
|
Height: 197.134615,
|
|
Angle: 0,
|
|
},
|
|
},
|
|
},
|
|
}
|
|
|
|
for _, tcase := range testcases {
|
|
page, err := loadPageFromPDFFile(tcase.Path, tcase.PageNum)
|
|
require.NoError(t, err)
|
|
|
|
pageExtractor, err := New(page)
|
|
require.NoError(t, err)
|
|
|
|
pageImages, err := pageExtractor.ExtractPageImages()
|
|
require.NoError(t, err)
|
|
|
|
assert.Equal(t, len(tcase.Expected), len(pageImages.Images))
|
|
|
|
for i, img := range pageImages.Images {
|
|
img.Image = nil // Discard image data.
|
|
assert.Equalf(t, tcase.Expected[i], img, "i = %d", i)
|
|
}
|
|
}
|
|
}
|
|
|
|
// Test position extraction with nested transform matrices.
|
|
func TestImageExtractionNestedCM(t *testing.T) {
|
|
testcases := []struct {
|
|
Name string
|
|
PageNum int
|
|
Path string
|
|
PrependCS string
|
|
AppendCS string
|
|
Expected []ImageMark
|
|
}{
|
|
{
|
|
"basic xobject - translate (100,50)",
|
|
1,
|
|
"./testdata/basic_xobject.pdf",
|
|
"1 0 0 1 100.0 50.0 cm q",
|
|
"Q",
|
|
[]ImageMark{
|
|
{
|
|
Image: nil,
|
|
X: 0 + 100.0,
|
|
Y: 294.865385 + 50.0,
|
|
Width: 612,
|
|
Height: 197.134615,
|
|
Angle: 0,
|
|
},
|
|
},
|
|
},
|
|
{
|
|
"basic xobject - scale (1.5,2)X",
|
|
1,
|
|
"./testdata/basic_xobject.pdf",
|
|
"1.5 0 0 2.0 0 0 cm q",
|
|
"Q",
|
|
[]ImageMark{
|
|
{
|
|
Image: nil,
|
|
X: 0,
|
|
Y: 294.865385 * 2.0,
|
|
Width: 612 * 1.5,
|
|
Height: 197.134615 * 2.0,
|
|
Angle: 0,
|
|
},
|
|
},
|
|
},
|
|
{
|
|
"basic xobject - translate (100,50) scale (1.5,2)X",
|
|
1,
|
|
"./testdata/basic_xobject.pdf",
|
|
"1.5 0 0 2.0 0 0 cm q 1 0 0 1 100.0 50.0 cm q",
|
|
"Q Q",
|
|
[]ImageMark{
|
|
{
|
|
Image: nil,
|
|
X: 100.0 * 1.5,
|
|
Y: (294.865385 + 50.0) * 2.0,
|
|
Width: 612 * 1.5,
|
|
Height: 197.134615 * 2.0,
|
|
Angle: 0,
|
|
},
|
|
},
|
|
},
|
|
}
|
|
|
|
for _, tcase := range testcases {
|
|
page, err := loadPageFromPDFFile(tcase.Path, tcase.PageNum)
|
|
require.NoError(t, err)
|
|
|
|
contentstr, err := page.GetAllContentStreams()
|
|
require.NoError(t, err)
|
|
|
|
// Modify the contentstream to alter the position by way of nested transform matrices.
|
|
contentstr = tcase.PrependCS + " " + contentstr + " " + tcase.AppendCS
|
|
err = page.SetContentStreams([]string{contentstr}, core.NewFlateEncoder())
|
|
require.NoError(t, err)
|
|
|
|
pageExtractor, err := New(page)
|
|
require.NoError(t, err)
|
|
|
|
pageImages, err := pageExtractor.ExtractPageImages()
|
|
require.NoError(t, err)
|
|
|
|
assert.Equal(t, len(tcase.Expected), len(pageImages.Images))
|
|
|
|
for i, img := range pageImages.Images {
|
|
img.Image = nil // Discard image data.
|
|
assert.Equalf(t, tcase.Expected[i], img, "i = %d", i)
|
|
}
|
|
}
|
|
}
|
|
|
|
func BenchmarkImageExtraction(b *testing.B) {
|
|
cnt := 0
|
|
for i := 0; i < b.N; i++ {
|
|
page, err := loadPageFromPDFFile("./testdata/basic_xobject.pdf", 1)
|
|
require.NoError(b, err)
|
|
|
|
pageExtractor, err := New(page)
|
|
require.NoError(b, err)
|
|
|
|
pageImages, err := pageExtractor.ExtractPageImages()
|
|
require.NoError(b, err)
|
|
|
|
cnt += len(pageImages.Images)
|
|
}
|
|
|
|
assert.Equal(b, b.N, cnt)
|
|
}
|