mirror of
https://github.com/unidoc/unipdf.git
synced 2025-04-26 13:48:55 +08:00
JBIG2 decoder benchmark patch
This commit is contained in:
parent
e85616cec2
commit
4b1c345214
@ -859,6 +859,10 @@ func (t *TextRegion) getUserTable(tablePosition int) (huffman.Tabler, error) {
|
||||
|
||||
func (t *TextRegion) initSymbols() error {
|
||||
for _, segment := range t.Header.RTSegments {
|
||||
if segment == nil {
|
||||
return errors.New("jbig2 - internal error - nil segment provided for the text region symbols")
|
||||
}
|
||||
|
||||
if segment.Type == 0 {
|
||||
s, err := segment.GetSegmentData()
|
||||
if err != nil {
|
||||
|
@ -6,12 +6,8 @@
|
||||
package tests
|
||||
|
||||
import (
|
||||
"archive/zip"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/require"
|
||||
@ -19,48 +15,37 @@ import (
|
||||
"github.com/unidoc/unipdf/v3/internal/jbig2"
|
||||
)
|
||||
|
||||
// BenchmarkDecodeSingleJBIG2 benchmarks the jbig2 decoding.
|
||||
// In order to run the benchmark run the DecodeJBIG2Files with the UNIDOC_JBIG2_TESTDATA environmental variable.
|
||||
// Zipped files containing raw jbig2 streams shoud be created.
|
||||
func BenchmarkDecodeSingleJBIG2(b *testing.B) {
|
||||
// BenchmarkDecodeJBIG2Files benchmarks the decoding process of jbig2 encoded images stored within pdf files.
|
||||
// The function reads pdf files located in the directory provided as `UNIDOC_JBIG2_TESTDATA` environmental variable.
|
||||
// Then the function extracts the images and starts subBenchmarks for each image.
|
||||
func BenchmarkDecodeJBIG2Files(b *testing.B) {
|
||||
b.Helper()
|
||||
dirName := os.Getenv("UNIDOC_JBIG2_TESTDATA")
|
||||
dirName := os.Getenv(EnvDirectory)
|
||||
require.NotEmpty(b, dirName, "No Environment variable 'UNIDOC_JBIG2_TESTDATA' found")
|
||||
|
||||
jbig2Files, err := readJBIGZippedFiles(dirName)
|
||||
filenames, err := readFileNames(dirName)
|
||||
require.NoError(b, err)
|
||||
require.NotEmpty(b, filenames, "no files found within provided directory")
|
||||
|
||||
for _, file := range jbig2Files {
|
||||
zr, err := zip.OpenReader(filepath.Join(dirName, jbig2DecodedDirectory, file))
|
||||
require.NoError(b, err)
|
||||
for _, filename := range filenames {
|
||||
b.Run(rawFileName(filename), func(b *testing.B) {
|
||||
images, err := extractImages(dirName, filename)
|
||||
require.NoError(b, err)
|
||||
|
||||
defer zr.Close()
|
||||
for _, image := range images {
|
||||
b.Run(fmt.Sprintf("Page#%d/Image#%d-%d", image.pageNo, image.idx, len(image.jbig2Data)), func(b *testing.B) {
|
||||
for n := 0; n < b.N; n++ {
|
||||
d, err := jbig2.NewDocumentWithGlobals(image.jbig2Data, image.globals)
|
||||
require.NoError(b, err)
|
||||
|
||||
for _, zFile := range zr.File {
|
||||
if !strings.HasSuffix(zFile.Name, ".jbig2") {
|
||||
continue
|
||||
p, err := d.GetPage(1)
|
||||
require.NoError(b, err)
|
||||
|
||||
_, err = p.GetBitmap()
|
||||
require.NoError(b, err)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
sf, err := zFile.Open()
|
||||
require.NoError(b, err)
|
||||
|
||||
defer sf.Close()
|
||||
|
||||
data, err := ioutil.ReadAll(sf)
|
||||
require.NoError(b, err)
|
||||
|
||||
b.Run(fmt.Sprintf("%s/%d", rawFileName(zFile.Name), len(data)), func(b *testing.B) {
|
||||
for n := 0; n < b.N; n++ {
|
||||
d, err := jbig2.NewDocument(data)
|
||||
require.NoError(b, err)
|
||||
|
||||
p, err := d.GetPage(1)
|
||||
require.NoError(b, err)
|
||||
|
||||
_, err = p.GetBitmap()
|
||||
require.NoError(b, err)
|
||||
}
|
||||
})
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
@ -20,21 +20,59 @@ import (
|
||||
"github.com/unidoc/unipdf/v3/contentstream"
|
||||
"github.com/unidoc/unipdf/v3/core"
|
||||
"github.com/unidoc/unipdf/v3/model"
|
||||
|
||||
"github.com/unidoc/unipdf/v3/internal/jbig2"
|
||||
)
|
||||
|
||||
const jbig2DecodedDirectory string = "jbig2_decoded_images"
|
||||
|
||||
func extractImagesOnPage(filename string, page *model.PdfPage) ([]*extractedImage, error) {
|
||||
contents, err := page.GetAllContentStreams()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return extractImagesInContentStream(filename, contents, page.Resources)
|
||||
}
|
||||
|
||||
type extractedImage struct {
|
||||
jbig2Data []byte
|
||||
pdfImage *model.XObjectImage
|
||||
name string
|
||||
pageNo int
|
||||
idx int
|
||||
hash string
|
||||
globals jbig2.Globals
|
||||
}
|
||||
|
||||
func (e *extractedImage) fullName() string {
|
||||
return fmt.Sprintf("%s_%d_%d", e.name, e.pageNo, e.idx)
|
||||
}
|
||||
|
||||
func extractImages(dirName string, filename string) ([]*extractedImage, error) {
|
||||
f, err := getFile(dirName, filename)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
reader, err := readPDF(f)
|
||||
if err != nil && err.Error() != "EOF not found" {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var numPages int
|
||||
numPages, err = reader.GetNumPages()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var (
|
||||
page *model.PdfPage
|
||||
images, tempImages []*extractedImage
|
||||
)
|
||||
for pageNo := 1; pageNo <= numPages; pageNo++ {
|
||||
page, err = reader.GetPage(pageNo)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
tempImages, err = extractImagesOnPage(dirName, filename, page, pageNo)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
images = append(images, tempImages...)
|
||||
}
|
||||
return images, nil
|
||||
}
|
||||
|
||||
func extractImagesInContentStream(filename, contents string, resources *model.PdfPageResources) ([]*extractedImage, error) {
|
||||
@ -83,9 +121,15 @@ func extractImagesInContentStream(filename, contents string, resources *model.Pd
|
||||
return nil, err
|
||||
}
|
||||
|
||||
enc, ok := ximg.Filter.(*core.JBIG2Encoder)
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("Filter encoder should be a JBIG2Encoder but is: %T", ximg.Filter)
|
||||
}
|
||||
|
||||
extracted := &extractedImage{
|
||||
pdfImage: ximg,
|
||||
jbig2Data: xobj.Stream,
|
||||
globals: enc.Globals,
|
||||
}
|
||||
|
||||
extractedImages = append(extractedImages, extracted)
|
||||
@ -118,9 +162,24 @@ func extractImagesInContentStream(filename, contents string, resources *model.Pd
|
||||
return extractedImages, nil
|
||||
}
|
||||
|
||||
type fileHash struct {
|
||||
fileName string
|
||||
hash string
|
||||
func extractImagesOnPage(dirname, filename string, page *model.PdfPage, pageNo int) ([]*extractedImage, error) {
|
||||
contents, err := page.GetAllContentStreams()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
images, err := extractImagesInContentStream(filepath.Join(dirname, filename), contents, page.Resources)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
rawName := rawFileName(filename)
|
||||
for i, image := range images {
|
||||
image.name = rawName
|
||||
image.idx = i + 1
|
||||
image.pageNo = pageNo
|
||||
}
|
||||
return images, nil
|
||||
}
|
||||
|
||||
func getFile(dirName, filename string) (*os.File, error) {
|
||||
@ -189,27 +248,24 @@ func readPDF(f *os.File, password ...string) (*model.PdfReader, error) {
|
||||
return pdfReader, nil
|
||||
}
|
||||
|
||||
func writeExtractedImages(zw *zip.Writer, filename string, pageNo int, images ...*extractedImage) (hashes []fileHash, err error) {
|
||||
func writeExtractedImages(zw *zip.Writer, images ...*extractedImage) (err error) {
|
||||
h := md5.New()
|
||||
|
||||
// write images
|
||||
for idx, img := range images {
|
||||
fname := fmt.Sprintf("%s_%d_%d", rawFileName(filename), pageNo, idx)
|
||||
|
||||
common.Log.Trace("Writing file: '%s'", fname)
|
||||
f, err := zw.Create(fname + ".jpg")
|
||||
for _, img := range images {
|
||||
common.Log.Trace("Writing file: '%s'", img.fullName())
|
||||
f, err := zw.Create(img.fullName() + ".jpg")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
return err
|
||||
}
|
||||
|
||||
cimg, err := img.pdfImage.ToImage()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
return err
|
||||
}
|
||||
|
||||
gimg, err := cimg.ToGoImage()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
return err
|
||||
}
|
||||
|
||||
multiWriter := io.MultiWriter(f, h)
|
||||
@ -217,25 +273,11 @@ func writeExtractedImages(zw *zip.Writer, filename string, pageNo int, images ..
|
||||
// write to file
|
||||
q := &jpeg.Options{Quality: 100}
|
||||
if err = jpeg.Encode(multiWriter, gimg, q); err != nil {
|
||||
return nil, err
|
||||
return err
|
||||
}
|
||||
|
||||
fh := fileHash{fileName: fname, hash: hex.EncodeToString(h.Sum(nil))}
|
||||
hashes = append(hashes, fh)
|
||||
img.hash = hex.EncodeToString(h.Sum(nil))
|
||||
h.Reset()
|
||||
|
||||
if err = writeJBIG2Stream(zw, fname+".jbig2", img.jbig2Data); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
return hashes, nil
|
||||
}
|
||||
|
||||
func writeJBIG2Stream(zw *zip.Writer, filename string, data []byte) error {
|
||||
f, err := zw.Create(filename)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
_, err = f.Write(data)
|
||||
return err
|
||||
return nil
|
||||
}
|
||||
|
@ -15,7 +15,6 @@ import (
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/unidoc/unipdf/v3/common"
|
||||
"github.com/unidoc/unipdf/v3/model"
|
||||
)
|
||||
|
||||
// EnvDirectory is the environment variable that should contain directory path
|
||||
@ -27,7 +26,7 @@ var (
|
||||
// for each decoded testcase image should be updated.
|
||||
jbig2UpdateGoldens bool
|
||||
// keepImageFiles is the runtime flag that is used to keep the decoded jbig2 images
|
||||
// within the temporary directory: os.TempDir()/unipdf/jbig2
|
||||
// within the temporary directory: 'os.TempDir()/unipdf/jbig2'.
|
||||
keepImageFiles bool
|
||||
)
|
||||
|
||||
@ -40,7 +39,7 @@ func init() {
|
||||
// Requires environmental variable 'UNIDOC_JBIG2_TESTDATA' that contains the jbig2 testdata.
|
||||
// Decoded images are stored within zipped archive files - that has the same name as the pdf file.
|
||||
// In order to check the decoded images this function creates also the directory 'goldens'
|
||||
// which would have json files for each 'pdf' input, containing valid flags.
|
||||
// which would have json files for each 'pdf' input, containing valid image hashes.
|
||||
// If the 'jbig2-update-goldens' runtime flag is provided, the test function updates all the 'hashes'
|
||||
// for the decoded jbig2 images in related 'golden' files.
|
||||
// In order to check the decoded images use 'jbig2-store-images' flag, then the function would store them
|
||||
@ -72,31 +71,10 @@ func TestDecodeJBIG2Files(t *testing.T) {
|
||||
}
|
||||
}()
|
||||
|
||||
passwords := make(map[string]string)
|
||||
|
||||
for _, filename := range filenames {
|
||||
rawName := rawFileName(filename)
|
||||
t.Run(rawName, func(t *testing.T) {
|
||||
// get the file
|
||||
f, err := getFile(dirName, filename)
|
||||
require.NoError(t, err)
|
||||
defer f.Close()
|
||||
|
||||
var reader *model.PdfReader
|
||||
password, ok := passwords[filename]
|
||||
if ok {
|
||||
// read the pdf with the password
|
||||
reader, err = readPDF(f, password)
|
||||
} else {
|
||||
reader, err = readPDF(f)
|
||||
}
|
||||
if err != nil {
|
||||
if err.Error() != "EOF not found" {
|
||||
require.NoError(t, err)
|
||||
}
|
||||
}
|
||||
|
||||
numPages, err := reader.GetNumPages()
|
||||
images, err := extractImages(dirName, filename)
|
||||
require.NoError(t, err)
|
||||
|
||||
// create zipped file
|
||||
@ -109,21 +87,10 @@ func TestDecodeJBIG2Files(t *testing.T) {
|
||||
zw := zip.NewWriter(w)
|
||||
defer zw.Close()
|
||||
|
||||
var allHashes []fileHash
|
||||
err = writeExtractedImages(zw, images...)
|
||||
require.NoError(t, err)
|
||||
|
||||
for pageNo := 1; pageNo <= numPages; pageNo++ {
|
||||
page, err := reader.GetPage(pageNo)
|
||||
require.NoError(t, err)
|
||||
|
||||
images, err := extractImagesOnPage(filepath.Join(dirName, rawName), page)
|
||||
require.NoError(t, err)
|
||||
|
||||
hashes, err := writeExtractedImages(zw, rawName, pageNo, images...)
|
||||
require.NoError(t, err)
|
||||
|
||||
allHashes = append(allHashes, hashes...)
|
||||
}
|
||||
checkGoldenFiles(t, dirName, rawName, allHashes...)
|
||||
checkGoldenFiles(t, dirName, rawName, images...)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
@ -7,6 +7,7 @@ package tests
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"path/filepath"
|
||||
@ -18,17 +19,17 @@ import (
|
||||
|
||||
// Goldens is a model used to store the jbig2 testcase 'golden files'.
|
||||
// A golden file stores the md5 'hash' value for each 'filename' key.
|
||||
// It is used to check if the decoded jbig2 image had changed using the image md5 hash.
|
||||
// It is used to check if the decoded jbig2 image has changed using its md5 hash.
|
||||
type Goldens map[string]string
|
||||
|
||||
func checkGoldenFiles(t *testing.T, dirname, filename string, readHashes ...fileHash) {
|
||||
func checkGoldenFiles(t *testing.T, dirname, filename string, images ...*extractedImage) {
|
||||
goldens, err := readGoldenFile(dirname, filename)
|
||||
require.NoError(t, err)
|
||||
|
||||
if jbig2UpdateGoldens {
|
||||
// copy all the file hashes into Goldens map.
|
||||
for _, fh := range readHashes {
|
||||
goldens[fh.fileName] = fh.hash
|
||||
for _, img := range images {
|
||||
goldens[img.fullName()] = img.hash
|
||||
}
|
||||
|
||||
err = writeGoldenFile(dirname, filename, goldens)
|
||||
@ -36,13 +37,13 @@ func checkGoldenFiles(t *testing.T, dirname, filename string, readHashes ...file
|
||||
return
|
||||
}
|
||||
|
||||
for _, fh := range readHashes {
|
||||
t.Run(fh.fileName, func(t *testing.T) {
|
||||
single, exist := goldens[fh.fileName]
|
||||
for _, img := range images {
|
||||
t.Run(fmt.Sprintf("Page#%d/Image#%d", img.pageNo, img.idx), func(t *testing.T) {
|
||||
single, exist := goldens[img.fullName()]
|
||||
// check if the 'filename' key exists.
|
||||
if assert.True(t, exist, "hash doesn't exists") {
|
||||
// check if the md5 hash equals with the given fh.hash
|
||||
assert.Equal(t, fh.hash, single, "hash: '%s' doesn't match the golden stored hash: '%s'", fh.hash, single)
|
||||
assert.Equal(t, img.hash, single, "hash: '%s' doesn't match the golden stored hash: '%s'", img.hash, single)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user