mirror of
https://github.com/unidoc/unipdf.git
synced 2025-04-27 13:48:51 +08:00
JBIG2 decoder benchmark patch
This commit is contained in:
parent
e85616cec2
commit
4b1c345214
@ -859,6 +859,10 @@ func (t *TextRegion) getUserTable(tablePosition int) (huffman.Tabler, error) {
|
|||||||
|
|
||||||
func (t *TextRegion) initSymbols() error {
|
func (t *TextRegion) initSymbols() error {
|
||||||
for _, segment := range t.Header.RTSegments {
|
for _, segment := range t.Header.RTSegments {
|
||||||
|
if segment == nil {
|
||||||
|
return errors.New("jbig2 - internal error - nil segment provided for the text region symbols")
|
||||||
|
}
|
||||||
|
|
||||||
if segment.Type == 0 {
|
if segment.Type == 0 {
|
||||||
s, err := segment.GetSegmentData()
|
s, err := segment.GetSegmentData()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
@ -6,12 +6,8 @@
|
|||||||
package tests
|
package tests
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"archive/zip"
|
|
||||||
"fmt"
|
"fmt"
|
||||||
"io/ioutil"
|
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
|
||||||
"strings"
|
|
||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
"github.com/stretchr/testify/require"
|
"github.com/stretchr/testify/require"
|
||||||
@ -19,39 +15,27 @@ import (
|
|||||||
"github.com/unidoc/unipdf/v3/internal/jbig2"
|
"github.com/unidoc/unipdf/v3/internal/jbig2"
|
||||||
)
|
)
|
||||||
|
|
||||||
// BenchmarkDecodeSingleJBIG2 benchmarks the jbig2 decoding.
|
// BenchmarkDecodeJBIG2Files benchmarks the decoding process of jbig2 encoded images stored within pdf files.
|
||||||
// In order to run the benchmark run the DecodeJBIG2Files with the UNIDOC_JBIG2_TESTDATA environmental variable.
|
// The function reads pdf files located in the directory named by the `UNIDOC_JBIG2_TESTDATA` environment variable.
|
||||||
// Zipped files containing raw jbig2 streams should be created.
|
// Then the function extracts the images and starts subBenchmarks for each image.
|
||||||
func BenchmarkDecodeSingleJBIG2(b *testing.B) {
|
func BenchmarkDecodeJBIG2Files(b *testing.B) {
|
||||||
b.Helper()
|
b.Helper()
|
||||||
dirName := os.Getenv("UNIDOC_JBIG2_TESTDATA")
|
dirName := os.Getenv(EnvDirectory)
|
||||||
require.NotEmpty(b, dirName, "No Environment variable 'UNIDOC_JBIG2_TESTDATA' found")
|
require.NotEmpty(b, dirName, "No Environment variable 'UNIDOC_JBIG2_TESTDATA' found")
|
||||||
|
|
||||||
jbig2Files, err := readJBIGZippedFiles(dirName)
|
filenames, err := readFileNames(dirName)
|
||||||
|
require.NoError(b, err)
|
||||||
|
require.NotEmpty(b, filenames, "no files found within provided directory")
|
||||||
|
|
||||||
|
for _, filename := range filenames {
|
||||||
|
b.Run(rawFileName(filename), func(b *testing.B) {
|
||||||
|
images, err := extractImages(dirName, filename)
|
||||||
require.NoError(b, err)
|
require.NoError(b, err)
|
||||||
|
|
||||||
for _, file := range jbig2Files {
|
for _, image := range images {
|
||||||
zr, err := zip.OpenReader(filepath.Join(dirName, jbig2DecodedDirectory, file))
|
b.Run(fmt.Sprintf("Page#%d/Image#%d-%d", image.pageNo, image.idx, len(image.jbig2Data)), func(b *testing.B) {
|
||||||
require.NoError(b, err)
|
|
||||||
|
|
||||||
defer zr.Close()
|
|
||||||
|
|
||||||
for _, zFile := range zr.File {
|
|
||||||
if !strings.HasSuffix(zFile.Name, ".jbig2") {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
sf, err := zFile.Open()
|
|
||||||
require.NoError(b, err)
|
|
||||||
|
|
||||||
defer sf.Close()
|
|
||||||
|
|
||||||
data, err := ioutil.ReadAll(sf)
|
|
||||||
require.NoError(b, err)
|
|
||||||
|
|
||||||
b.Run(fmt.Sprintf("%s/%d", rawFileName(zFile.Name), len(data)), func(b *testing.B) {
|
|
||||||
for n := 0; n < b.N; n++ {
|
for n := 0; n < b.N; n++ {
|
||||||
d, err := jbig2.NewDocument(data)
|
d, err := jbig2.NewDocumentWithGlobals(image.jbig2Data, image.globals)
|
||||||
require.NoError(b, err)
|
require.NoError(b, err)
|
||||||
|
|
||||||
p, err := d.GetPage(1)
|
p, err := d.GetPage(1)
|
||||||
@ -62,5 +46,6 @@ func BenchmarkDecodeSingleJBIG2(b *testing.B) {
|
|||||||
}
|
}
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -20,21 +20,59 @@ import (
|
|||||||
"github.com/unidoc/unipdf/v3/contentstream"
|
"github.com/unidoc/unipdf/v3/contentstream"
|
||||||
"github.com/unidoc/unipdf/v3/core"
|
"github.com/unidoc/unipdf/v3/core"
|
||||||
"github.com/unidoc/unipdf/v3/model"
|
"github.com/unidoc/unipdf/v3/model"
|
||||||
|
|
||||||
|
"github.com/unidoc/unipdf/v3/internal/jbig2"
|
||||||
)
|
)
|
||||||
|
|
||||||
const jbig2DecodedDirectory string = "jbig2_decoded_images"
|
|
||||||
|
|
||||||
func extractImagesOnPage(filename string, page *model.PdfPage) ([]*extractedImage, error) {
|
|
||||||
contents, err := page.GetAllContentStreams()
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
return extractImagesInContentStream(filename, contents, page.Resources)
|
|
||||||
}
|
|
||||||
|
|
||||||
type extractedImage struct {
|
type extractedImage struct {
|
||||||
jbig2Data []byte
|
jbig2Data []byte
|
||||||
pdfImage *model.XObjectImage
|
pdfImage *model.XObjectImage
|
||||||
|
name string
|
||||||
|
pageNo int
|
||||||
|
idx int
|
||||||
|
hash string
|
||||||
|
globals jbig2.Globals
|
||||||
|
}
|
||||||
|
|
||||||
|
func (e *extractedImage) fullName() string {
|
||||||
|
return fmt.Sprintf("%s_%d_%d", e.name, e.pageNo, e.idx)
|
||||||
|
}
|
||||||
|
|
||||||
|
func extractImages(dirName string, filename string) ([]*extractedImage, error) {
|
||||||
|
f, err := getFile(dirName, filename)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
defer f.Close()
|
||||||
|
|
||||||
|
reader, err := readPDF(f)
|
||||||
|
if err != nil && err.Error() != "EOF not found" {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
var numPages int
|
||||||
|
numPages, err = reader.GetNumPages()
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
var (
|
||||||
|
page *model.PdfPage
|
||||||
|
images, tempImages []*extractedImage
|
||||||
|
)
|
||||||
|
for pageNo := 1; pageNo <= numPages; pageNo++ {
|
||||||
|
page, err = reader.GetPage(pageNo)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
tempImages, err = extractImagesOnPage(dirName, filename, page, pageNo)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
images = append(images, tempImages...)
|
||||||
|
}
|
||||||
|
return images, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func extractImagesInContentStream(filename, contents string, resources *model.PdfPageResources) ([]*extractedImage, error) {
|
func extractImagesInContentStream(filename, contents string, resources *model.PdfPageResources) ([]*extractedImage, error) {
|
||||||
@ -83,9 +121,15 @@ func extractImagesInContentStream(filename, contents string, resources *model.Pd
|
|||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
enc, ok := ximg.Filter.(*core.JBIG2Encoder)
|
||||||
|
if !ok {
|
||||||
|
return nil, fmt.Errorf("Filter encoder should be a JBIG2Encoder but is: %T", ximg.Filter)
|
||||||
|
}
|
||||||
|
|
||||||
extracted := &extractedImage{
|
extracted := &extractedImage{
|
||||||
pdfImage: ximg,
|
pdfImage: ximg,
|
||||||
jbig2Data: xobj.Stream,
|
jbig2Data: xobj.Stream,
|
||||||
|
globals: enc.Globals,
|
||||||
}
|
}
|
||||||
|
|
||||||
extractedImages = append(extractedImages, extracted)
|
extractedImages = append(extractedImages, extracted)
|
||||||
@ -118,9 +162,24 @@ func extractImagesInContentStream(filename, contents string, resources *model.Pd
|
|||||||
return extractedImages, nil
|
return extractedImages, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
type fileHash struct {
|
func extractImagesOnPage(dirname, filename string, page *model.PdfPage, pageNo int) ([]*extractedImage, error) {
|
||||||
fileName string
|
contents, err := page.GetAllContentStreams()
|
||||||
hash string
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
images, err := extractImagesInContentStream(filepath.Join(dirname, filename), contents, page.Resources)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
rawName := rawFileName(filename)
|
||||||
|
for i, image := range images {
|
||||||
|
image.name = rawName
|
||||||
|
image.idx = i + 1
|
||||||
|
image.pageNo = pageNo
|
||||||
|
}
|
||||||
|
return images, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func getFile(dirName, filename string) (*os.File, error) {
|
func getFile(dirName, filename string) (*os.File, error) {
|
||||||
@ -189,27 +248,24 @@ func readPDF(f *os.File, password ...string) (*model.PdfReader, error) {
|
|||||||
return pdfReader, nil
|
return pdfReader, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func writeExtractedImages(zw *zip.Writer, filename string, pageNo int, images ...*extractedImage) (hashes []fileHash, err error) {
|
func writeExtractedImages(zw *zip.Writer, images ...*extractedImage) (err error) {
|
||||||
h := md5.New()
|
h := md5.New()
|
||||||
|
|
||||||
// write images
|
for _, img := range images {
|
||||||
for idx, img := range images {
|
common.Log.Trace("Writing file: '%s'", img.fullName())
|
||||||
fname := fmt.Sprintf("%s_%d_%d", rawFileName(filename), pageNo, idx)
|
f, err := zw.Create(img.fullName() + ".jpg")
|
||||||
|
|
||||||
common.Log.Trace("Writing file: '%s'", fname)
|
|
||||||
f, err := zw.Create(fname + ".jpg")
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
cimg, err := img.pdfImage.ToImage()
|
cimg, err := img.pdfImage.ToImage()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
gimg, err := cimg.ToGoImage()
|
gimg, err := cimg.ToGoImage()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
multiWriter := io.MultiWriter(f, h)
|
multiWriter := io.MultiWriter(f, h)
|
||||||
@ -217,25 +273,11 @@ func writeExtractedImages(zw *zip.Writer, filename string, pageNo int, images ..
|
|||||||
// write to file
|
// write to file
|
||||||
q := &jpeg.Options{Quality: 100}
|
q := &jpeg.Options{Quality: 100}
|
||||||
if err = jpeg.Encode(multiWriter, gimg, q); err != nil {
|
if err = jpeg.Encode(multiWriter, gimg, q); err != nil {
|
||||||
return nil, err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
fh := fileHash{fileName: fname, hash: hex.EncodeToString(h.Sum(nil))}
|
img.hash = hex.EncodeToString(h.Sum(nil))
|
||||||
hashes = append(hashes, fh)
|
|
||||||
h.Reset()
|
h.Reset()
|
||||||
|
|
||||||
if err = writeJBIG2Stream(zw, fname+".jbig2", img.jbig2Data); err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
}
|
||||||
}
|
return nil
|
||||||
return hashes, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func writeJBIG2Stream(zw *zip.Writer, filename string, data []byte) error {
|
|
||||||
f, err := zw.Create(filename)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
_, err = f.Write(data)
|
|
||||||
return err
|
|
||||||
}
|
}
|
||||||
|
@ -15,7 +15,6 @@ import (
|
|||||||
"github.com/stretchr/testify/require"
|
"github.com/stretchr/testify/require"
|
||||||
|
|
||||||
"github.com/unidoc/unipdf/v3/common"
|
"github.com/unidoc/unipdf/v3/common"
|
||||||
"github.com/unidoc/unipdf/v3/model"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
// EnvDirectory is the environment variable that should contain directory path
|
// EnvDirectory is the environment variable that should contain directory path
|
||||||
@ -27,7 +26,7 @@ var (
|
|||||||
// for each decoded testcase image should be updated.
|
// for each decoded testcase image should be updated.
|
||||||
jbig2UpdateGoldens bool
|
jbig2UpdateGoldens bool
|
||||||
// keepImageFiles is the runtime flag that is used to keep the decoded jbig2 images
|
// keepImageFiles is the runtime flag that is used to keep the decoded jbig2 images
|
||||||
// within the temporary directory: os.TempDir()/unipdf/jbig2
|
// within the temporary directory: 'os.TempDir()/unipdf/jbig2'.
|
||||||
keepImageFiles bool
|
keepImageFiles bool
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -40,7 +39,7 @@ func init() {
|
|||||||
// Requires environmental variable 'UNIDOC_JBIG2_TESTDATA' that contains the jbig2 testdata.
|
// Requires environmental variable 'UNIDOC_JBIG2_TESTDATA' that contains the jbig2 testdata.
|
||||||
// Decoded images are stored within zipped archive files - that has the same name as the pdf file.
|
// Decoded images are stored within zipped archive files that have the same name as the pdf file.
|
||||||
// In order to check the decoded images this function creates also the directory 'goldens'
|
// In order to check the decoded images this function also creates the directory 'goldens'
|
||||||
// which would have json files for each 'pdf' input, containing valid flags.
|
// which would have json files for each 'pdf' input, containing valid image hashes.
|
||||||
// If the 'jbig2-update-goldens' runtime flag is provided, the test function updates all the 'hashes'
|
// If the 'jbig2-update-goldens' runtime flag is provided, the test function updates all the 'hashes'
|
||||||
// for the decoded jbig2 images in related 'golden' files.
|
// for the decoded jbig2 images in related 'golden' files.
|
||||||
// In order to check the decoded images use 'jbig2-store-images' flag, then the function would store them
|
// In order to check the decoded images use 'jbig2-store-images' flag, then the function would store them
|
||||||
@ -72,31 +71,10 @@ func TestDecodeJBIG2Files(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
|
|
||||||
passwords := make(map[string]string)
|
|
||||||
|
|
||||||
for _, filename := range filenames {
|
for _, filename := range filenames {
|
||||||
rawName := rawFileName(filename)
|
rawName := rawFileName(filename)
|
||||||
t.Run(rawName, func(t *testing.T) {
|
t.Run(rawName, func(t *testing.T) {
|
||||||
// get the file
|
images, err := extractImages(dirName, filename)
|
||||||
f, err := getFile(dirName, filename)
|
|
||||||
require.NoError(t, err)
|
|
||||||
defer f.Close()
|
|
||||||
|
|
||||||
var reader *model.PdfReader
|
|
||||||
password, ok := passwords[filename]
|
|
||||||
if ok {
|
|
||||||
// read the pdf with the password
|
|
||||||
reader, err = readPDF(f, password)
|
|
||||||
} else {
|
|
||||||
reader, err = readPDF(f)
|
|
||||||
}
|
|
||||||
if err != nil {
|
|
||||||
if err.Error() != "EOF not found" {
|
|
||||||
require.NoError(t, err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
numPages, err := reader.GetNumPages()
|
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
|
|
||||||
// create zipped file
|
// create zipped file
|
||||||
@ -109,21 +87,10 @@ func TestDecodeJBIG2Files(t *testing.T) {
|
|||||||
zw := zip.NewWriter(w)
|
zw := zip.NewWriter(w)
|
||||||
defer zw.Close()
|
defer zw.Close()
|
||||||
|
|
||||||
var allHashes []fileHash
|
err = writeExtractedImages(zw, images...)
|
||||||
|
|
||||||
for pageNo := 1; pageNo <= numPages; pageNo++ {
|
|
||||||
page, err := reader.GetPage(pageNo)
|
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
|
|
||||||
images, err := extractImagesOnPage(filepath.Join(dirName, rawName), page)
|
checkGoldenFiles(t, dirName, rawName, images...)
|
||||||
require.NoError(t, err)
|
|
||||||
|
|
||||||
hashes, err := writeExtractedImages(zw, rawName, pageNo, images...)
|
|
||||||
require.NoError(t, err)
|
|
||||||
|
|
||||||
allHashes = append(allHashes, hashes...)
|
|
||||||
}
|
|
||||||
checkGoldenFiles(t, dirName, rawName, allHashes...)
|
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -7,6 +7,7 @@ package tests
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
@ -18,17 +19,17 @@ import (
|
|||||||
|
|
||||||
// Goldens is a model used to store the jbig2 testcase 'golden files'.
|
// Goldens is a model used to store the jbig2 testcase 'golden files'.
|
||||||
// The golden files stores the md5 'hash' value for each 'filename' key.
|
// The golden files store the md5 'hash' value for each 'filename' key.
|
||||||
// It is used to check if the decoded jbig2 image had changed using the image md5 hash.
|
// It is used to check whether the decoded jbig2 image has changed, using its md5 hash.
|
||||||
type Goldens map[string]string
|
type Goldens map[string]string
|
||||||
|
|
||||||
func checkGoldenFiles(t *testing.T, dirname, filename string, readHashes ...fileHash) {
|
func checkGoldenFiles(t *testing.T, dirname, filename string, images ...*extractedImage) {
|
||||||
goldens, err := readGoldenFile(dirname, filename)
|
goldens, err := readGoldenFile(dirname, filename)
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
|
|
||||||
if jbig2UpdateGoldens {
|
if jbig2UpdateGoldens {
|
||||||
// copy all the file hashes into Goldens map.
|
// copy all the file hashes into Goldens map.
|
||||||
for _, fh := range readHashes {
|
for _, img := range images {
|
||||||
goldens[fh.fileName] = fh.hash
|
goldens[img.fullName()] = img.hash
|
||||||
}
|
}
|
||||||
|
|
||||||
err = writeGoldenFile(dirname, filename, goldens)
|
err = writeGoldenFile(dirname, filename, goldens)
|
||||||
@ -36,13 +37,13 @@ func checkGoldenFiles(t *testing.T, dirname, filename string, readHashes ...file
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, fh := range readHashes {
|
for _, img := range images {
|
||||||
t.Run(fh.fileName, func(t *testing.T) {
|
t.Run(fmt.Sprintf("Page#%d/Image#%d", img.pageNo, img.idx), func(t *testing.T) {
|
||||||
single, exist := goldens[fh.fileName]
|
single, exist := goldens[img.fullName()]
|
||||||
// check if the 'filename' key exists.
|
// check if the 'filename' key exists.
|
||||||
if assert.True(t, exist, "hash doesn't exists") {
|
if assert.True(t, exist, "hash doesn't exists") {
|
||||||
// check if the md5 hash equals with the given fh.hash
|
// check if the md5 hash equals the given img.hash
|
||||||
assert.Equal(t, fh.hash, single, "hash: '%s' doesn't match the golden stored hash: '%s'", fh.hash, single)
|
assert.Equal(t, img.hash, single, "hash: '%s' doesn't match the golden stored hash: '%s'", img.hash, single)
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user