mirror of
https://github.com/unidoc/unipdf.git
synced 2025-04-26 13:48:55 +08:00

* Add extract images test case, with memory profiling * Use TotalAlloc insted of Alloc for memory profiling * Remove calls to debug.FreeOSMemory from test cases
162 lines
4.6 KiB
Go
162 lines
4.6 KiB
Go
/*
|
|
* This file is subject to the terms and conditions defined in
|
|
* file 'LICENSE.md', which is part of this source code package.
|
|
*/
|
|
|
|
package e2etest
|
|
|
|
import (
|
|
"io/ioutil"
|
|
"os"
|
|
"path/filepath"
|
|
"testing"
|
|
|
|
"github.com/stretchr/testify/require"
|
|
"github.com/unidoc/unipdf/v3/common"
|
|
"github.com/unidoc/unipdf/v3/model"
|
|
)
|
|
|
|
// Split tests splits a single page from a PDF, writes out and performs a sanity check on the output with ghostscript.
|
|
// Also checks memory use.
|
|
// Set environment variables:
|
|
// UNIDOC_E2E_FORCE_TESTS to "1" to force the tests to execute.
|
|
// UNIDOC_SPLIT_TESTDATA to the path of the corpus folder.
|
|
// UNIDOC_GS_BIN_PATH to the path of the ghostscript binary (gs) for validation.
|
|
var (
|
|
splitCorpusFolder = os.Getenv("UNIDOC_SPLIT_TESTDATA")
|
|
)
|
|
|
|
// knownHashes defines a list of known output hashes to ensure that the output is constant.
|
|
// If there is a change in hash need to find out why and update only if the change is accepted.
|
|
var knownHashes = map[string]string{
|
|
"bf7c9d5dabc7e7ec2fc0cf9db2d9c8e7aa456fca.pdf": "fdd638603c6f655babbc90358de66107",
|
|
"371dce2c2720581a3eef3f123e5741dd3566ef87.pdf": "4c5356ac623a96004d80315f24613fff",
|
|
"e815311526b50036db6e89c54af2b9626edecf30.pdf": "97dcfdde59a2f3a6eb105d0c31ebd3fb",
|
|
"3bf64014e0c9e4a56f1a9363f1b34fd707bd9fa0.pdf": "6f310c9fdd44d49766d3cc32d3053b89",
|
|
"004feecd47e2da4f2ed5cdbbf4791a77dd59ce20.pdf": "309a072a97d0566aa3f85edae504bb53",
|
|
"30c0a5cff80870cd58c2738d622f5d63e37dc90c.pdf": "67d7c2fbf21dd9d65c8bb9ab29dfec60",
|
|
"8f8ce400b9d66656cd09260035aa0cc3f7e46c82.pdf": "679650c27697a7b83ee792692daaff18",
|
|
"a35d386af4828b7221591343761191e8f9a28bc0.pdf": "1955d6cf29715652bea999bcbadc818b",
|
|
"e815699a5234540fda89ea3a2ece055349a0d535.pdf": "5a1d97ee1aabc5dcacbbf3cd164b964d",
|
|
}
|
|
|
|
func TestSplitting(t *testing.T) {
|
|
if len(splitCorpusFolder) == 0 {
|
|
if forceTest {
|
|
t.Fatalf("uNIDOC_SPLIT_TESTDATA not set")
|
|
}
|
|
}
|
|
|
|
files, err := ioutil.ReadDir(splitCorpusFolder)
|
|
if err != nil {
|
|
if forceTest {
|
|
t.Fatalf("Error opening %s: %v", splitCorpusFolder, err)
|
|
}
|
|
t.Skipf("Skipping split bench - unable to open UNIDOC_SPLIT_TESTDATA (%s)", splitCorpusFolder)
|
|
return
|
|
}
|
|
|
|
// Make a temporary folder and clean up after.
|
|
tempdir, err := ioutil.TempDir("", "unidoc_split")
|
|
require.NoError(t, err)
|
|
defer os.RemoveAll(tempdir)
|
|
|
|
matchcount := 0
|
|
for _, file := range files {
|
|
t.Logf("%s", file.Name())
|
|
fpath := filepath.Join(splitCorpusFolder, file.Name())
|
|
params := splitParams{
|
|
inputPath: fpath,
|
|
outPath: filepath.Join(tempdir, "split_1_"+file.Name()),
|
|
gsValidation: len(ghostscriptBinPath) > 0,
|
|
}
|
|
splitSinglePdf(t, params)
|
|
|
|
hash, err := hashFile(params.outPath)
|
|
require.NoError(t, err)
|
|
|
|
knownHash, has := knownHashes[file.Name()]
|
|
if has {
|
|
require.Equal(t, knownHash, hash)
|
|
matchcount++
|
|
} else {
|
|
t.Logf("%s - hash: %s not in the list of known hashes", file.Name(), hash)
|
|
}
|
|
}
|
|
|
|
// Ensure all the defined hashes were found.
|
|
require.Equal(t, len(knownHashes), matchcount)
|
|
|
|
t.Logf("Split benchmark complete for %d files in %s", len(files), splitCorpusFolder)
|
|
}
|
|
|
|
type splitParams struct {
|
|
inputPath string
|
|
outPath string
|
|
gsValidation bool
|
|
}
|
|
|
|
func splitSinglePdf(t *testing.T, params splitParams) {
|
|
measure := startMemoryMeasurement()
|
|
|
|
file, err := os.Open(params.inputPath)
|
|
require.NoError(t, err)
|
|
defer file.Close()
|
|
|
|
reader, err := model.NewPdfReaderLazy(file)
|
|
require.NoError(t, err)
|
|
|
|
isEncrypted, err := reader.IsEncrypted()
|
|
require.NoError(t, err)
|
|
if isEncrypted {
|
|
auth, err := reader.Decrypt([]byte(""))
|
|
require.NoError(t, err)
|
|
require.True(t, auth)
|
|
}
|
|
|
|
numPages, err := reader.GetNumPages()
|
|
require.NoError(t, err)
|
|
|
|
if numPages < 1 {
|
|
common.Log.Debug("Empty pdf - nothing to be done!")
|
|
return
|
|
}
|
|
|
|
model.SetPdfProducer("UniDoc")
|
|
writer := model.NewPdfWriter()
|
|
|
|
// Split the first page.
|
|
page, err := reader.GetPage(1)
|
|
require.NoError(t, err)
|
|
|
|
err = writer.AddPage(page)
|
|
require.NoError(t, err)
|
|
|
|
of, err := os.Create(params.outPath)
|
|
require.NoError(t, err)
|
|
defer of.Close()
|
|
|
|
err = writer.Write(of)
|
|
require.NoError(t, err)
|
|
|
|
measure.Stop()
|
|
summary := measure.Summary()
|
|
t.Logf("%s - summary %s", params.inputPath, summary)
|
|
|
|
// GS validation of input, output pdfs.
|
|
if params.gsValidation {
|
|
common.Log.Debug("Validating input file")
|
|
inputWarnings, err := validatePdf(params.inputPath, "")
|
|
require.NoError(t, err)
|
|
|
|
common.Log.Debug("Validating output file")
|
|
|
|
warnings, err := validatePdf(params.outPath, "")
|
|
if err != nil && warnings > inputWarnings {
|
|
common.Log.Debug("Input warnings %d vs output %d", inputWarnings, warnings)
|
|
t.Fatalf("Invalid PDF input %d/ output %d warnings", inputWarnings, warnings)
|
|
}
|
|
common.Log.Debug("Valid PDF!")
|
|
}
|
|
}
|