JBIG2 decoder benchmark patch

This commit is contained in:
Jacek Kucharczyk 2019-07-16 17:40:22 +02:00 committed by Gunnsteinn Hall
parent e85616cec2
commit 4b1c345214
5 changed files with 125 additions and 126 deletions

View File

@ -859,6 +859,10 @@ func (t *TextRegion) getUserTable(tablePosition int) (huffman.Tabler, error) {
func (t *TextRegion) initSymbols() error {
for _, segment := range t.Header.RTSegments {
if segment == nil {
return errors.New("jbig2 - internal error - nil segment provided for the text region symbols")
}
if segment.Type == 0 {
s, err := segment.GetSegmentData()
if err != nil {

View File

@ -6,12 +6,8 @@
package tests
import (
"archive/zip"
"fmt"
"io/ioutil"
"os"
"path/filepath"
"strings"
"testing"
"github.com/stretchr/testify/require"
@ -19,48 +15,37 @@ import (
"github.com/unidoc/unipdf/v3/internal/jbig2"
)
// BenchmarkDecodeSingleJBIG2 benchmarks the jbig2 decoding.
// In order to run the benchmark run the DecodeJBIG2Files with the UNIDOC_JBIG2_TESTDATA environmental variable.
// Zipped files containing raw jbig2 streams should be created.
func BenchmarkDecodeSingleJBIG2(b *testing.B) {
// BenchmarkDecodeJBIG2Files benchmarks the decoding process of jbig2 encoded images stored within pdf files.
// The function reads pdf files located in the directory provided as `UNIDOC_JBIG2_TESTDATA` environmental variable.
// Then the function extracts the images and starts subBenchmarks for each image.
func BenchmarkDecodeJBIG2Files(b *testing.B) {
b.Helper()
dirName := os.Getenv("UNIDOC_JBIG2_TESTDATA")
dirName := os.Getenv(EnvDirectory)
require.NotEmpty(b, dirName, "No Environment variable 'UNIDOC_JBIG2_TESTDATA' found")
jbig2Files, err := readJBIGZippedFiles(dirName)
filenames, err := readFileNames(dirName)
require.NoError(b, err)
require.NotEmpty(b, filenames, "no files found within provided directory")
for _, file := range jbig2Files {
zr, err := zip.OpenReader(filepath.Join(dirName, jbig2DecodedDirectory, file))
require.NoError(b, err)
for _, filename := range filenames {
b.Run(rawFileName(filename), func(b *testing.B) {
images, err := extractImages(dirName, filename)
require.NoError(b, err)
defer zr.Close()
for _, image := range images {
b.Run(fmt.Sprintf("Page#%d/Image#%d-%d", image.pageNo, image.idx, len(image.jbig2Data)), func(b *testing.B) {
for n := 0; n < b.N; n++ {
d, err := jbig2.NewDocumentWithGlobals(image.jbig2Data, image.globals)
require.NoError(b, err)
for _, zFile := range zr.File {
if !strings.HasSuffix(zFile.Name, ".jbig2") {
continue
p, err := d.GetPage(1)
require.NoError(b, err)
_, err = p.GetBitmap()
require.NoError(b, err)
}
})
}
sf, err := zFile.Open()
require.NoError(b, err)
defer sf.Close()
data, err := ioutil.ReadAll(sf)
require.NoError(b, err)
b.Run(fmt.Sprintf("%s/%d", rawFileName(zFile.Name), len(data)), func(b *testing.B) {
for n := 0; n < b.N; n++ {
d, err := jbig2.NewDocument(data)
require.NoError(b, err)
p, err := d.GetPage(1)
require.NoError(b, err)
_, err = p.GetBitmap()
require.NoError(b, err)
}
})
}
})
}
}

View File

@ -20,21 +20,59 @@ import (
"github.com/unidoc/unipdf/v3/contentstream"
"github.com/unidoc/unipdf/v3/core"
"github.com/unidoc/unipdf/v3/model"
"github.com/unidoc/unipdf/v3/internal/jbig2"
)
const jbig2DecodedDirectory string = "jbig2_decoded_images"
// extractImagesOnPage reads all the content streams of the 'page' and returns the images
// that extractImagesInContentStream finds in them using the page resources.
// The 'filename' is forwarded unchanged to extractImagesInContentStream.
func extractImagesOnPage(filename string, page *model.PdfPage) ([]*extractedImage, error) {
	streams, err := page.GetAllContentStreams()
	if err == nil {
		return extractImagesInContentStream(filename, streams, page.Resources)
	}
	return nil, err
}
// extractedImage describes a single image extracted from a pdf content stream
// together with the metadata attached to it by the test helpers.
type extractedImage struct {
	// jbig2Data is the stream of the image XObject, used as the input for the jbig2 decoder.
	jbig2Data []byte
	// pdfImage is the image XObject the data was taken from.
	pdfImage *model.XObjectImage
	// name is the raw file name of the pdf file that contains the image.
	name string
	// pageNo is the number of the page the image was found on.
	pageNo int
	// idx is the 1-based position of the image within its page.
	idx int
	// hash is the hex encoded md5 sum of the image encoded as jpeg.
	hash string
	// globals are the jbig2 globals taken from the JBIG2Encoder filter of the image.
	globals jbig2.Globals
}
// fullName builds the identifier of the image in the form '<name>_<pageNo>_<idx>'.
func (e *extractedImage) fullName() string {
	const layout = "%s_%d_%d"
	full := fmt.Sprintf(layout, e.name, e.pageNo, e.idx)
	return full
}
// extractImages opens the pdf file 'filename' located in the 'dirName' directory and returns
// the images extracted from all of its pages, in page order.
// A reader error with the message "EOF not found" is tolerated; any other error
// coming from opening, reading or extracting is returned to the caller.
func extractImages(dirName string, filename string) ([]*extractedImage, error) {
	f, err := getFile(dirName, filename)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	reader, err := readPDF(f)
	if err != nil && err.Error() != "EOF not found" {
		return nil, err
	}
	// The "EOF not found" error is tolerated, but readPDF may hand back no reader in that case.
	// Fail with an error instead of panicking on a nil dereference below.
	if reader == nil {
		return nil, fmt.Errorf("reading pdf file '%s' returned no reader: %v", filename, err)
	}

	numPages, err := reader.GetNumPages()
	if err != nil {
		return nil, err
	}

	var images []*extractedImage
	// Page numbers are 1-based.
	for pageNo := 1; pageNo <= numPages; pageNo++ {
		page, err := reader.GetPage(pageNo)
		if err != nil {
			return nil, err
		}

		pageImages, err := extractImagesOnPage(dirName, filename, page, pageNo)
		if err != nil {
			return nil, err
		}
		images = append(images, pageImages...)
	}
	return images, nil
}
func extractImagesInContentStream(filename, contents string, resources *model.PdfPageResources) ([]*extractedImage, error) {
@ -83,9 +121,15 @@ func extractImagesInContentStream(filename, contents string, resources *model.Pd
return nil, err
}
enc, ok := ximg.Filter.(*core.JBIG2Encoder)
if !ok {
return nil, fmt.Errorf("Filter encoder should be a JBIG2Encoder but is: %T", ximg.Filter)
}
extracted := &extractedImage{
pdfImage: ximg,
jbig2Data: xobj.Stream,
globals: enc.Globals,
}
extractedImages = append(extractedImages, extracted)
@ -118,9 +162,24 @@ func extractImagesInContentStream(filename, contents string, resources *model.Pd
return extractedImages, nil
}
type fileHash struct {
fileName string
hash string
// extractImagesOnPage extracts the images from the content streams of the 'page' and labels
// each of them with the raw name of the 'filename', the provided 'pageNo' and its 1-based
// position within the page.
func extractImagesOnPage(dirname, filename string, page *model.PdfPage, pageNo int) ([]*extractedImage, error) {
	streams, err := page.GetAllContentStreams()
	if err != nil {
		return nil, err
	}

	fullPath := filepath.Join(dirname, filename)
	extracted, err := extractImagesInContentStream(fullPath, streams, page.Resources)
	if err != nil {
		return nil, err
	}

	base := rawFileName(filename)
	for pos := range extracted {
		img := extracted[pos]
		img.name, img.pageNo, img.idx = base, pageNo, pos+1
	}
	return extracted, nil
}
func getFile(dirName, filename string) (*os.File, error) {
@ -189,27 +248,24 @@ func readPDF(f *os.File, password ...string) (*model.PdfReader, error) {
return pdfReader, nil
}
func writeExtractedImages(zw *zip.Writer, filename string, pageNo int, images ...*extractedImage) (hashes []fileHash, err error) {
func writeExtractedImages(zw *zip.Writer, images ...*extractedImage) (err error) {
h := md5.New()
// write images
for idx, img := range images {
fname := fmt.Sprintf("%s_%d_%d", rawFileName(filename), pageNo, idx)
common.Log.Trace("Writing file: '%s'", fname)
f, err := zw.Create(fname + ".jpg")
for _, img := range images {
common.Log.Trace("Writing file: '%s'", img.fullName())
f, err := zw.Create(img.fullName() + ".jpg")
if err != nil {
return nil, err
return err
}
cimg, err := img.pdfImage.ToImage()
if err != nil {
return nil, err
return err
}
gimg, err := cimg.ToGoImage()
if err != nil {
return nil, err
return err
}
multiWriter := io.MultiWriter(f, h)
@ -217,25 +273,11 @@ func writeExtractedImages(zw *zip.Writer, filename string, pageNo int, images ..
// write to file
q := &jpeg.Options{Quality: 100}
if err = jpeg.Encode(multiWriter, gimg, q); err != nil {
return nil, err
return err
}
fh := fileHash{fileName: fname, hash: hex.EncodeToString(h.Sum(nil))}
hashes = append(hashes, fh)
img.hash = hex.EncodeToString(h.Sum(nil))
h.Reset()
if err = writeJBIG2Stream(zw, fname+".jbig2", img.jbig2Data); err != nil {
return nil, err
}
}
return hashes, nil
}
func writeJBIG2Stream(zw *zip.Writer, filename string, data []byte) error {
f, err := zw.Create(filename)
if err != nil {
return err
}
_, err = f.Write(data)
return err
return nil
}

View File

@ -15,7 +15,6 @@ import (
"github.com/stretchr/testify/require"
"github.com/unidoc/unipdf/v3/common"
"github.com/unidoc/unipdf/v3/model"
)
// EnvDirectory is the environment variable that should contain directory path
@ -27,7 +26,7 @@ var (
// for each decoded testcase image should be updated.
jbig2UpdateGoldens bool
// keepImageFiles is the runtime flag that is used to keep the decoded jbig2 images
// within the temporary directory: os.TempDir()/unipdf/jbig2
// within the temporary directory: 'os.TempDir()/unipdf/jbig2'.
keepImageFiles bool
)
@ -40,7 +39,7 @@ func init() {
// Requires environmental variable 'UNIDOC_JBIG2_TESTDATA' that contains the jbig2 testdata.
// Decoded images are stored within zipped archive files that have the same name as the pdf file.
// In order to check the decoded images this function also creates the directory 'goldens'
// which would have json files for each 'pdf' input, containing valid flags.
// which would have json files for each 'pdf' input, containing valid image hashes.
// If the 'jbig2-update-goldens' runtime flag is provided, the test function updates all the 'hashes'
// for the decoded jbig2 images in related 'golden' files.
// In order to check the decoded images use 'jbig2-store-images' flag, then the function would store them
@ -72,31 +71,10 @@ func TestDecodeJBIG2Files(t *testing.T) {
}
}()
passwords := make(map[string]string)
for _, filename := range filenames {
rawName := rawFileName(filename)
t.Run(rawName, func(t *testing.T) {
// get the file
f, err := getFile(dirName, filename)
require.NoError(t, err)
defer f.Close()
var reader *model.PdfReader
password, ok := passwords[filename]
if ok {
// read the pdf with the password
reader, err = readPDF(f, password)
} else {
reader, err = readPDF(f)
}
if err != nil {
if err.Error() != "EOF not found" {
require.NoError(t, err)
}
}
numPages, err := reader.GetNumPages()
images, err := extractImages(dirName, filename)
require.NoError(t, err)
// create zipped file
@ -109,21 +87,10 @@ func TestDecodeJBIG2Files(t *testing.T) {
zw := zip.NewWriter(w)
defer zw.Close()
var allHashes []fileHash
err = writeExtractedImages(zw, images...)
require.NoError(t, err)
for pageNo := 1; pageNo <= numPages; pageNo++ {
page, err := reader.GetPage(pageNo)
require.NoError(t, err)
images, err := extractImagesOnPage(filepath.Join(dirName, rawName), page)
require.NoError(t, err)
hashes, err := writeExtractedImages(zw, rawName, pageNo, images...)
require.NoError(t, err)
allHashes = append(allHashes, hashes...)
}
checkGoldenFiles(t, dirName, rawName, allHashes...)
checkGoldenFiles(t, dirName, rawName, images...)
})
}
}

View File

@ -7,6 +7,7 @@ package tests
import (
"encoding/json"
"fmt"
"io"
"os"
"path/filepath"
@ -18,17 +19,17 @@ import (
// Goldens is a model used to store the jbig2 testcase 'golden files'.
// The golden files store the md5 'hash' value for each 'filename' key.
// It is used to check if the decoded jbig2 image has changed using its md5 hash.
type Goldens map[string]string
func checkGoldenFiles(t *testing.T, dirname, filename string, readHashes ...fileHash) {
func checkGoldenFiles(t *testing.T, dirname, filename string, images ...*extractedImage) {
goldens, err := readGoldenFile(dirname, filename)
require.NoError(t, err)
if jbig2UpdateGoldens {
// copy all the file hashes into Goldens map.
for _, fh := range readHashes {
goldens[fh.fileName] = fh.hash
for _, img := range images {
goldens[img.fullName()] = img.hash
}
err = writeGoldenFile(dirname, filename, goldens)
@ -36,13 +37,13 @@ func checkGoldenFiles(t *testing.T, dirname, filename string, readHashes ...file
return
}
for _, fh := range readHashes {
t.Run(fh.fileName, func(t *testing.T) {
single, exist := goldens[fh.fileName]
for _, img := range images {
t.Run(fmt.Sprintf("Page#%d/Image#%d", img.pageNo, img.idx), func(t *testing.T) {
single, exist := goldens[img.fullName()]
// check if the 'filename' key exists.
if assert.True(t, exist, "hash doesn't exists") {
// check if the md5 hash equals with the given fh.hash
assert.Equal(t, fh.hash, single, "hash: '%s' doesn't match the golden stored hash: '%s'", fh.hash, single)
assert.Equal(t, img.hash, single, "hash: '%s' doesn't match the golden stored hash: '%s'", img.hash, single)
}
})
}