diff --git a/internal/jbig2/segments/text-region.go b/internal/jbig2/segments/text-region.go index cf6d408c..921f9397 100644 --- a/internal/jbig2/segments/text-region.go +++ b/internal/jbig2/segments/text-region.go @@ -859,6 +859,10 @@ func (t *TextRegion) getUserTable(tablePosition int) (huffman.Tabler, error) { func (t *TextRegion) initSymbols() error { for _, segment := range t.Header.RTSegments { + if segment == nil { + return errors.New("jbig2 - internal error - nil segment provided for the text region symbols") + } + if segment.Type == 0 { s, err := segment.GetSegmentData() if err != nil { diff --git a/internal/jbig2/tests/bench_test.go b/internal/jbig2/tests/bench_test.go index f37d4c26..0b3a7b04 100644 --- a/internal/jbig2/tests/bench_test.go +++ b/internal/jbig2/tests/bench_test.go @@ -6,12 +6,8 @@ package tests import ( - "archive/zip" "fmt" - "io/ioutil" "os" - "path/filepath" - "strings" "testing" "github.com/stretchr/testify/require" @@ -19,48 +15,37 @@ import ( "github.com/unidoc/unipdf/v3/internal/jbig2" ) -// BenchmarkDecodeSingleJBIG2 benchmarks the jbig2 decoding. -// In order to run the benchmark run the DecodeJBIG2Files with the UNIDOC_JBIG2_TESTDATA environmental variable. -// Zipped files containing raw jbig2 streams shoud be created. -func BenchmarkDecodeSingleJBIG2(b *testing.B) { +// BenchmarkDecodeJBIG2Files benchmarks the decoding process of jbig2 encoded images stored within pdf files. +// The function reads pdf files located in the directory named by the `UNIDOC_JBIG2_TESTDATA` environment variable. +// Then the function extracts the images and runs a sub-benchmark for each image. 
+func BenchmarkDecodeJBIG2Files(b *testing.B) { b.Helper() - dirName := os.Getenv("UNIDOC_JBIG2_TESTDATA") + dirName := os.Getenv(EnvDirectory) require.NotEmpty(b, dirName, "No Environment variable 'UNIDOC_JBIG2_TESTDATA' found") - jbig2Files, err := readJBIGZippedFiles(dirName) + filenames, err := readFileNames(dirName) require.NoError(b, err) + require.NotEmpty(b, filenames, "no files found within provided directory") - for _, file := range jbig2Files { - zr, err := zip.OpenReader(filepath.Join(dirName, jbig2DecodedDirectory, file)) - require.NoError(b, err) + for _, filename := range filenames { + b.Run(rawFileName(filename), func(b *testing.B) { + images, err := extractImages(dirName, filename) + require.NoError(b, err) - defer zr.Close() + for _, image := range images { + b.Run(fmt.Sprintf("Page#%d/Image#%d-%d", image.pageNo, image.idx, len(image.jbig2Data)), func(b *testing.B) { + for n := 0; n < b.N; n++ { + d, err := jbig2.NewDocumentWithGlobals(image.jbig2Data, image.globals) + require.NoError(b, err) - for _, zFile := range zr.File { - if !strings.HasSuffix(zFile.Name, ".jbig2") { - continue + p, err := d.GetPage(1) + require.NoError(b, err) + + _, err = p.GetBitmap() + require.NoError(b, err) + } + }) } - - sf, err := zFile.Open() - require.NoError(b, err) - - defer sf.Close() - - data, err := ioutil.ReadAll(sf) - require.NoError(b, err) - - b.Run(fmt.Sprintf("%s/%d", rawFileName(zFile.Name), len(data)), func(b *testing.B) { - for n := 0; n < b.N; n++ { - d, err := jbig2.NewDocument(data) - require.NoError(b, err) - - p, err := d.GetPage(1) - require.NoError(b, err) - - _, err = p.GetBitmap() - require.NoError(b, err) - } - }) - } + }) } } diff --git a/internal/jbig2/tests/common_test.go b/internal/jbig2/tests/common_test.go index 128c5358..0704ffbd 100644 --- a/internal/jbig2/tests/common_test.go +++ b/internal/jbig2/tests/common_test.go @@ -20,21 +20,59 @@ import ( "github.com/unidoc/unipdf/v3/contentstream" "github.com/unidoc/unipdf/v3/core" 
"github.com/unidoc/unipdf/v3/model" + + "github.com/unidoc/unipdf/v3/internal/jbig2" ) -const jbig2DecodedDirectory string = "jbig2_decoded_images" - -func extractImagesOnPage(filename string, page *model.PdfPage) ([]*extractedImage, error) { - contents, err := page.GetAllContentStreams() - if err != nil { - return nil, err - } - return extractImagesInContentStream(filename, contents, page.Resources) -} - type extractedImage struct { jbig2Data []byte pdfImage *model.XObjectImage + name string + pageNo int + idx int + hash string + globals jbig2.Globals +} + +func (e *extractedImage) fullName() string { + return fmt.Sprintf("%s_%d_%d", e.name, e.pageNo, e.idx) +} + +func extractImages(dirName string, filename string) ([]*extractedImage, error) { + f, err := getFile(dirName, filename) + if err != nil { + return nil, err + } + defer f.Close() + + reader, err := readPDF(f) + if err != nil && err.Error() != "EOF not found" { + return nil, err + } + + var numPages int + numPages, err = reader.GetNumPages() + if err != nil { + return nil, err + } + + var ( + page *model.PdfPage + images, tempImages []*extractedImage + ) + for pageNo := 1; pageNo <= numPages; pageNo++ { + page, err = reader.GetPage(pageNo) + if err != nil { + return nil, err + } + + tempImages, err = extractImagesOnPage(dirName, filename, page, pageNo) + if err != nil { + return nil, err + } + images = append(images, tempImages...) 
+ } + return images, nil } func extractImagesInContentStream(filename, contents string, resources *model.PdfPageResources) ([]*extractedImage, error) { @@ -83,9 +121,15 @@ func extractImagesInContentStream(filename, contents string, resources *model.Pd return nil, err } + enc, ok := ximg.Filter.(*core.JBIG2Encoder) + if !ok { + return nil, fmt.Errorf("Filter encoder should be a JBIG2Encoder but is: %T", ximg.Filter) + } + extracted := &extractedImage{ pdfImage: ximg, jbig2Data: xobj.Stream, + globals: enc.Globals, } extractedImages = append(extractedImages, extracted) @@ -118,9 +162,24 @@ func extractImagesInContentStream(filename, contents string, resources *model.Pd return extractedImages, nil } -type fileHash struct { - fileName string - hash string +func extractImagesOnPage(dirname, filename string, page *model.PdfPage, pageNo int) ([]*extractedImage, error) { + contents, err := page.GetAllContentStreams() + if err != nil { + return nil, err + } + + images, err := extractImagesInContentStream(filepath.Join(dirname, filename), contents, page.Resources) + if err != nil { + return nil, err + } + + rawName := rawFileName(filename) + for i, image := range images { + image.name = rawName + image.idx = i + 1 + image.pageNo = pageNo + } + return images, nil } func getFile(dirName, filename string) (*os.File, error) { @@ -189,27 +248,24 @@ func readPDF(f *os.File, password ...string) (*model.PdfReader, error) { return pdfReader, nil } -func writeExtractedImages(zw *zip.Writer, filename string, pageNo int, images ...*extractedImage) (hashes []fileHash, err error) { +func writeExtractedImages(zw *zip.Writer, images ...*extractedImage) (err error) { h := md5.New() - // write images - for idx, img := range images { - fname := fmt.Sprintf("%s_%d_%d", rawFileName(filename), pageNo, idx) - - common.Log.Trace("Writing file: '%s'", fname) - f, err := zw.Create(fname + ".jpg") + for _, img := range images { + common.Log.Trace("Writing file: '%s'", img.fullName()) + f, err := 
zw.Create(img.fullName() + ".jpg") if err != nil { - return nil, err + return err } cimg, err := img.pdfImage.ToImage() if err != nil { - return nil, err + return err } gimg, err := cimg.ToGoImage() if err != nil { - return nil, err + return err } multiWriter := io.MultiWriter(f, h) @@ -217,25 +273,11 @@ func writeExtractedImages(zw *zip.Writer, filename string, pageNo int, images .. // write to file q := &jpeg.Options{Quality: 100} if err = jpeg.Encode(multiWriter, gimg, q); err != nil { - return nil, err + return err } - fh := fileHash{fileName: fname, hash: hex.EncodeToString(h.Sum(nil))} - hashes = append(hashes, fh) + img.hash = hex.EncodeToString(h.Sum(nil)) h.Reset() - - if err = writeJBIG2Stream(zw, fname+".jbig2", img.jbig2Data); err != nil { - return nil, err - } } - return hashes, nil -} - -func writeJBIG2Stream(zw *zip.Writer, filename string, data []byte) error { - f, err := zw.Create(filename) - if err != nil { - return err - } - _, err = f.Write(data) - return err + return nil } diff --git a/internal/jbig2/tests/document_decode_test.go b/internal/jbig2/tests/document_decode_test.go index 53d16d3c..49764ec6 100644 --- a/internal/jbig2/tests/document_decode_test.go +++ b/internal/jbig2/tests/document_decode_test.go @@ -15,7 +15,6 @@ import ( "github.com/stretchr/testify/require" "github.com/unidoc/unipdf/v3/common" - "github.com/unidoc/unipdf/v3/model" ) // EnvDirectory is the environment variable that should contain directory path @@ -27,7 +26,7 @@ var ( // for each decoded testcase image should be updated. jbig2UpdateGoldens bool // keepImageFiles is the runtime flag that is used to keep the decoded jbig2 images - // within the temporary directory: os.TempDir()/unipdf/jbig2 + // within the temporary directory: 'os.TempDir()/unipdf/jbig2'. keepImageFiles bool ) @@ -40,7 +39,7 @@ func init() { // Requires environmental variable 'UNIDOC_JBIG2_TESTDATA' that contains the jbig2 testdata. 
// Decoded images are stored within zipped archive files - that has the same name as the pdf file. // In order to check the decoded images this function creates also the directory 'goldens' -// which would have json files for each 'pdf' input, containing valid flags. +// which would have json files for each 'pdf' input, containing valid image hashes. // If the 'jbig2-update-goldens' runtime flag is provided, the test function updates all the 'hashes' // for the decoded jbig2 images in related 'golden' files. // In order to check the decoded images use 'jbig2-store-images' flag, then the function would store them @@ -72,31 +71,10 @@ func TestDecodeJBIG2Files(t *testing.T) { } }() - passwords := make(map[string]string) - for _, filename := range filenames { rawName := rawFileName(filename) t.Run(rawName, func(t *testing.T) { - // get the file - f, err := getFile(dirName, filename) - require.NoError(t, err) - defer f.Close() - - var reader *model.PdfReader - password, ok := passwords[filename] - if ok { - // read the pdf with the password - reader, err = readPDF(f, password) - } else { - reader, err = readPDF(f) - } - if err != nil { - if err.Error() != "EOF not found" { - require.NoError(t, err) - } - } - - numPages, err := reader.GetNumPages() + images, err := extractImages(dirName, filename) require.NoError(t, err) // create zipped file @@ -109,21 +87,10 @@ func TestDecodeJBIG2Files(t *testing.T) { zw := zip.NewWriter(w) defer zw.Close() - var allHashes []fileHash + err = writeExtractedImages(zw, images...) + require.NoError(t, err) - for pageNo := 1; pageNo <= numPages; pageNo++ { - page, err := reader.GetPage(pageNo) - require.NoError(t, err) - - images, err := extractImagesOnPage(filepath.Join(dirName, rawName), page) - require.NoError(t, err) - - hashes, err := writeExtractedImages(zw, rawName, pageNo, images...) - require.NoError(t, err) - - allHashes = append(allHashes, hashes...) - } - checkGoldenFiles(t, dirName, rawName, allHashes...) 
+ checkGoldenFiles(t, dirName, rawName, images...) }) } } diff --git a/internal/jbig2/tests/goldens_test.go b/internal/jbig2/tests/goldens_test.go index 89ae34ad..d28f4ee2 100644 --- a/internal/jbig2/tests/goldens_test.go +++ b/internal/jbig2/tests/goldens_test.go @@ -7,6 +7,7 @@ package tests import ( "encoding/json" + "fmt" "io" "os" "path/filepath" @@ -18,17 +19,17 @@ import ( // Goldens is a model used to store the jbig2 testcase 'golden files'. // The golden files stores the md5 'hash' value for each 'filename' key. -// It is used to check if the decoded jbig2 image had changed using the image md5 hash. +// It is used to check whether the decoded jbig2 image has changed, using its md5 hash. type Goldens map[string]string -func checkGoldenFiles(t *testing.T, dirname, filename string, readHashes ...fileHash) { +func checkGoldenFiles(t *testing.T, dirname, filename string, images ...*extractedImage) { goldens, err := readGoldenFile(dirname, filename) require.NoError(t, err) if jbig2UpdateGoldens { // copy all the file hashes into Goldens map. - for _, fh := range readHashes { - goldens[fh.fileName] = fh.hash + for _, img := range images { + goldens[img.fullName()] = img.hash } err = writeGoldenFile(dirname, filename, goldens) @@ -36,13 +37,13 @@ func checkGoldenFiles(t *testing.T, dirname, filename string, readHashes ...file return } - for _, fh := range readHashes { - t.Run(fh.fileName, func(t *testing.T) { - single, exist := goldens[fh.fileName] + for _, img := range images { + t.Run(fmt.Sprintf("Page#%d/Image#%d", img.pageNo, img.idx), func(t *testing.T) { + single, exist := goldens[img.fullName()] // check if the 'filename' key exists. if assert.True(t, exist, "hash doesn't exists") { // check if the md5 hash equals with the given fh.hash - assert.Equal(t, fh.hash, single, "hash: '%s' doesn't match the golden stored hash: '%s'", fh.hash, single) + assert.Equal(t, img.hash, single, "hash: '%s' doesn't match the golden stored hash: '%s'", img.hash, single) } }) }