mirror of
https://github.com/unidoc/unipdf.git
synced 2025-04-27 13:48:51 +08:00

* Add ColorAt method for images
* Avoid resample on image to Go image conversion
* Avoid resample when converting grayscale image to RGB
* Preserve old behavior of image to Go image conversion
* Add missing case in the ToGoImage method
* Fix grayscale to RGB image conversion
* Improve code documentation
* Fix color extraction for CMYK and 4 bit RGB
* Add test case for the ColorAt image method
* Avoid resampling when converting CMYK image to RGB
* Add notice comment for the GetSamples/SetSamples image methods
181 lines
5.7 KiB
Go
181 lines
5.7 KiB
Go
/*
|
|
* This file is subject to the terms and conditions defined in
|
|
* file 'LICENSE.md', which is part of this source code package.
|
|
*/
|
|
|
|
package e2etest
|
|
|
|
import (
|
|
"archive/zip"
|
|
"fmt"
|
|
"image/jpeg"
|
|
"io/ioutil"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
"testing"
|
|
|
|
"github.com/stretchr/testify/require"
|
|
"github.com/unidoc/unipdf/v3/common"
|
|
"github.com/unidoc/unipdf/v3/extractor"
|
|
"github.com/unidoc/unipdf/v3/model"
|
|
)
|
|
|
|
// The extract images test writes every image extracted from a corpus PDF into
// a zip archive and compares the hash of that archive with a known hash.
// A memory usage summary is logged for each processed file.
//
// Environment variables:
//   - UNIDOC_E2E_FORCE_TESTS: set to "1" to force the tests to execute.
//   - UNIDOC_EXTRACT_IMAGES_TESTDATA: path of the corpus folder.
//
// extractImagesCorpusFolder is the corpus folder path, read once from
// UNIDOC_EXTRACT_IMAGES_TESTDATA when the package is initialized.
var extractImagesCorpusFolder = os.Getenv("UNIDOC_EXTRACT_IMAGES_TESTDATA")
|
|
|
|
// knownExtrImgsHashes maps the file name of each corpus PDF to the expected
// hash of the zip archive holding its extracted images, to ensure that the
// output stays constant.
// If a hash changes, find out why and update the entry only if the change is
// accepted.
var knownExtrImgsHashes = map[string]string{
	"1ecec6aa4abed1855fb88916d7feb8c9692daaf5.pdf": "64899eb2c683f2e0b1ce0e35b5377aed",
	"7eee345c983461d44ae939b3f800a97642817c8d.pdf": "5a4cd00537f8a1f39fff2d4c6dd5cc53",
	"52ab322c1697aca9bad37288f7c502e37fa657af.pdf": "2bddee02dff89a38c08322c9d2c779a6",
	"0edf09fd438db2f18c1bb08fccc1f81a7b280bf2.pdf": "583f755b3fb1bd5697036616442687ab",
	"cafe55316a45435c3817f4c1b6a19c9cd52db825.pdf": "b199badff0abb0311a2cbe35c7fce580",
	"6773e6aa5d8a2d26362cf3fca2874b3a81025bae.pdf": "f052e3e333839508a8bdd8d1a3ba1973",
	"d11a3ca55664828b69d7c39d83d5c0a63fcea89d.pdf": "29287cd44f009dce5aa9c2a0dc9a3c83",
	"483933bf73cc4fcc264eb69214ff763ccf299e49.pdf": "627dcf88805786d03b2e76d367b42642",
	"da1c5c4c4fe36f676dbca6ea01673c9fdf77c7a9.pdf": "aa7980a7d50a4f20ff368f035f9f1c5a",
	"f856baf7ffcd96003b6bda800171cb0e5680f78e.pdf": "a9505d8c22f1fd063fbe0b05aa33a5fc",
	"201c20676fe8da14a8130852c91ed58b48cba8fb.pdf": "ffcb78d126c04be9ca2497bb43b6e964",
	"f0152456494aa09e5cf82c4afe9ecd2fdc2e8d72.pdf": "d0e68157aaa7f9f4406807512db3e676",
	"d95643acea1ec3f6215bda35e4cd89dbd8898c44.pdf": "1739aed3e1cbfa5e98f8d7fef17a614b",
	"110d793aeaa7accbe40b5ab9db249d5a103d3b50.pdf": "a57e347edddfd3f6032b85553b3537cd",
	"d15a0aa289524619a971188372dd05fb712f1b2c.pdf": "477a95136f40c9103a807c5023ab8459",
	"932e0dfa52c20ffe83b8178fb98296a0dab177d1.pdf": "b44d8b073f99ac3db28d7951e3c7d970",
	"60a8c28da5c23081834adac4170755904d8c4166.pdf": "9167f381d5eed7a2e5fd10eca567c519",
	"e51296be2615b9389482c9c16505286619b6cf36.pdf": "ec6e1f6297dd1cbda6ccba39e0c7d3d2",
}
|
|
|
|
func TestExtractImages(t *testing.T) {
|
|
if len(extractImagesCorpusFolder) == 0 {
|
|
if forceTest {
|
|
t.Fatalf("UNIDOC_EXTRACT_IMAGES_TESTDATA not set")
|
|
}
|
|
}
|
|
|
|
files, err := ioutil.ReadDir(extractImagesCorpusFolder)
|
|
if err != nil {
|
|
if forceTest {
|
|
t.Fatalf("Error opening %s: %v", extractImagesCorpusFolder, err)
|
|
}
|
|
t.Skipf("Skipping extract images bench - unable to open UNIDOC_EXTRACT_IMAGES_TESTDATA (%s)", extractImagesCorpusFolder)
|
|
return
|
|
}
|
|
|
|
// Make a temporary folder and clean up after.
|
|
tempdir, err := ioutil.TempDir("", "unidoc_extract_images")
|
|
require.NoError(t, err)
|
|
defer os.RemoveAll(tempdir)
|
|
|
|
matchcount := 0
|
|
for _, file := range files {
|
|
basename := filepath.Base(file.Name())
|
|
outName := strings.TrimSuffix(basename, filepath.Ext(basename)) + ".zip"
|
|
|
|
t.Logf("%s", file.Name())
|
|
fpath := filepath.Join(extractImagesCorpusFolder, file.Name())
|
|
params := extractImagesParams{
|
|
inputPath: fpath,
|
|
outPath: filepath.Join(tempdir, "extract_images_"+outName),
|
|
}
|
|
extractImagesSinglePdf(t, params)
|
|
|
|
hash, err := hashFile(params.outPath)
|
|
require.NoError(t, err)
|
|
|
|
knownHash, has := knownExtrImgsHashes[file.Name()]
|
|
if has {
|
|
require.Equal(t, knownHash, hash)
|
|
matchcount++
|
|
} else {
|
|
t.Logf("%s - hash: %s not in the list of known hashes", file.Name(), hash)
|
|
}
|
|
}
|
|
|
|
// Ensure all the defined hashes were found.
|
|
require.Equal(t, len(knownExtrImgsHashes), matchcount)
|
|
|
|
t.Logf("Extract images benchmark complete for %d files in %s", len(files), extractImagesCorpusFolder)
|
|
}
|
|
|
|
// extractImagesParams holds the file paths used by a single image extraction
// run.
type extractImagesParams struct {
	// inputPath is the path of the PDF file to read.
	inputPath string
	// outPath is the path of the zip file to create.
	outPath string
}
|
|
|
|
func extractImagesSinglePdf(t *testing.T, params extractImagesParams) {
|
|
measure := startMemoryMeasurement()
|
|
|
|
// Create PDF reader.
|
|
file, err := os.Open(params.inputPath)
|
|
require.NoError(t, err)
|
|
defer file.Close()
|
|
|
|
reader, err := model.NewPdfReaderLazy(file)
|
|
require.NoError(t, err)
|
|
|
|
// Decrypt file, if necessary.
|
|
isEncrypted, err := reader.IsEncrypted()
|
|
require.NoError(t, err)
|
|
if isEncrypted {
|
|
auth, err := reader.Decrypt([]byte(""))
|
|
require.NoError(t, err)
|
|
require.True(t, auth)
|
|
}
|
|
|
|
numPages, err := reader.GetNumPages()
|
|
require.NoError(t, err)
|
|
|
|
if numPages < 1 {
|
|
common.Log.Debug("Empty pdf - nothing to be done!")
|
|
return
|
|
}
|
|
|
|
// Create output zip file.
|
|
outFile, err := os.Create(params.outPath)
|
|
require.NoError(t, err)
|
|
defer outFile.Close()
|
|
|
|
zipWriter := zip.NewWriter(outFile)
|
|
for i := 0; i < numPages; i++ {
|
|
page, err := reader.GetPage(i + 1)
|
|
require.NoError(t, err)
|
|
|
|
// Extract page images.
|
|
imgExtractor, err := extractor.New(page)
|
|
require.NoError(t, err)
|
|
|
|
extracted, err := imgExtractor.ExtractPageImages(nil)
|
|
require.NoError(t, err)
|
|
|
|
for idx, img := range extracted.Images {
|
|
// Convert extracted image to Go image.
|
|
goImg, err := img.Image.ToGoImage()
|
|
require.NoError(t, err)
|
|
|
|
// Create zip file.
|
|
imgFile, err := zipWriter.Create(fmt.Sprintf("p%d_%d.jpg", i+1, idx))
|
|
require.NoError(t, err)
|
|
|
|
// Write zip file.
|
|
err = jpeg.Encode(imgFile, goImg, &jpeg.Options{Quality: 100})
|
|
require.NoError(t, err)
|
|
}
|
|
}
|
|
|
|
err = zipWriter.Close()
|
|
require.NoError(t, err)
|
|
|
|
measure.Stop()
|
|
summary := measure.Summary()
|
|
t.Logf("%s - summary %s", params.inputPath, summary)
|
|
}
|