2019-07-14 23:18:40 +02:00
|
|
|
/*
|
|
|
|
* This file is subject to the terms and conditions defined in
|
|
|
|
* file 'LICENSE.md', which is part of this source code package.
|
|
|
|
*/
|
|
|
|
|
|
|
|
package tests
|
|
|
|
|
|
|
|
import (
|
|
|
|
"archive/zip"
|
|
|
|
"crypto/md5"
|
|
|
|
"encoding/hex"
|
2020-03-27 12:47:41 +01:00
|
|
|
"flag"
|
2019-07-14 23:18:40 +02:00
|
|
|
"fmt"
|
2020-03-27 12:47:41 +01:00
|
|
|
_ "image/gif"
|
2019-07-14 23:18:40 +02:00
|
|
|
"image/jpeg"
|
2020-03-27 12:47:41 +01:00
|
|
|
_ "image/jpeg"
|
|
|
|
_ "image/png"
|
2019-07-14 23:18:40 +02:00
|
|
|
"io"
|
|
|
|
"os"
|
|
|
|
"path/filepath"
|
|
|
|
"strings"
|
|
|
|
|
|
|
|
"github.com/unidoc/unipdf/v3/common"
|
|
|
|
"github.com/unidoc/unipdf/v3/contentstream"
|
|
|
|
"github.com/unidoc/unipdf/v3/core"
|
|
|
|
"github.com/unidoc/unipdf/v3/model"
|
2019-07-16 17:40:22 +02:00
|
|
|
|
2020-03-27 12:47:41 +01:00
|
|
|
"github.com/unidoc/unipdf/v3/internal/jbig2/document"
|
2019-07-14 23:18:40 +02:00
|
|
|
)
|
|
|
|
|
2020-03-27 12:47:41 +01:00
|
|
|
// register basic image drivers - gif, jpeg, png
|
|
|
|
|
|
|
|
const (
|
|
|
|
// EnvJBIG2Directory is the environment variable that should contain directory path
|
|
|
|
// to the jbig2 encoded test files.
|
|
|
|
EnvJBIG2Directory = "UNIDOC_JBIG2_TESTDATA"
|
|
|
|
// EnvImageDirectory is the environment variable that should contain directory path
|
|
|
|
// to the images used for the encoding processes.
|
|
|
|
EnvImageDirectory = "UNIDOC_JBIG2_TEST_IMAGES"
|
|
|
|
)
|
|
|
|
|
|
|
|
var (
|
|
|
|
// updateGoldens is the runtime flag that states that the md5 hashes
|
|
|
|
// for each decoded test case image should be updated.
|
|
|
|
updateGoldens bool
|
|
|
|
// keepImageFiles is the runtime flag that is used to keep the decoded jbig2 images
|
|
|
|
// within the temporary directory: 'os.TempDir()/unipdf/jbig2'.
|
|
|
|
keepImageFiles bool
|
|
|
|
// logToFile is the flag that allows to log the trace values to the file.
|
|
|
|
logToFile bool
|
|
|
|
// keepEncodedFile is the runtime flag that is used to keep the jbig2 encoded images
|
|
|
|
// within the temporary directory: 'os.TempDir()/unipdf/jbig2'
|
|
|
|
keepEncodedFile bool
|
|
|
|
)
|
|
|
|
|
|
|
|
func init() {
|
|
|
|
flag.BoolVar(&updateGoldens, "jbig2-update-goldens", false, "updates the golden file hashes on the run")
|
|
|
|
flag.BoolVar(&keepImageFiles, "jbig2-store-images", false, "stores the images in the temporary `os.TempDir`/unipdf/jbig2 directory")
|
|
|
|
flag.BoolVar(&keepEncodedFile, "jbig2-store-encoded", false, "stores the jbig2 encoded images in the temporary `os.TempDir`/unipdf/jbig2 directory")
|
|
|
|
flag.BoolVar(&logToFile, "jbig2-log-to-file", false, "logs trace messages into file localized at `os.TempDir`/unipdf/jbig2 directory")
|
|
|
|
}
|
|
|
|
|
2019-07-16 17:40:22 +02:00
|
|
|
type extractedImage struct {
|
|
|
|
jbig2Data []byte
|
|
|
|
pdfImage *model.XObjectImage
|
|
|
|
name string
|
|
|
|
pageNo int
|
|
|
|
idx int
|
|
|
|
hash string
|
2020-03-27 12:47:41 +01:00
|
|
|
globals *document.Globals
|
2019-07-16 17:40:22 +02:00
|
|
|
}
|
2019-07-14 23:18:40 +02:00
|
|
|
|
2019-07-16 17:40:22 +02:00
|
|
|
func (e *extractedImage) fullName() string {
|
|
|
|
return fmt.Sprintf("%s_%d_%d", e.name, e.pageNo, e.idx)
|
|
|
|
}
|
|
|
|
|
|
|
|
func extractImages(dirName string, filename string) ([]*extractedImage, error) {
|
|
|
|
f, err := getFile(dirName, filename)
|
2019-07-14 23:18:40 +02:00
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
2019-07-16 17:40:22 +02:00
|
|
|
defer f.Close()
|
2019-07-14 23:18:40 +02:00
|
|
|
|
2019-07-16 17:40:22 +02:00
|
|
|
reader, err := readPDF(f)
|
|
|
|
if err != nil && err.Error() != "EOF not found" {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
var numPages int
|
|
|
|
numPages, err = reader.GetNumPages()
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
var (
|
|
|
|
page *model.PdfPage
|
|
|
|
images, tempImages []*extractedImage
|
|
|
|
)
|
|
|
|
for pageNo := 1; pageNo <= numPages; pageNo++ {
|
|
|
|
page, err = reader.GetPage(pageNo)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
tempImages, err = extractImagesOnPage(dirName, filename, page, pageNo)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
images = append(images, tempImages...)
|
|
|
|
}
|
|
|
|
return images, nil
|
2019-07-14 23:18:40 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
func extractImagesInContentStream(filename, contents string, resources *model.PdfPageResources) ([]*extractedImage, error) {
|
|
|
|
extractedImages := []*extractedImage{}
|
|
|
|
processedXObjects := make(map[string]bool)
|
|
|
|
cstreamParser := contentstream.NewContentStreamParser(contents)
|
|
|
|
|
|
|
|
operations, err := cstreamParser.Parse()
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
// Range through all the content stream operations.
|
|
|
|
for _, op := range *operations {
|
|
|
|
if !(op.Operand == "Do" && len(op.Params) == 1) {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
// Do: XObject.
|
|
|
|
name := op.Params[0].(*core.PdfObjectName)
|
|
|
|
|
|
|
|
// Only process each one once.
|
|
|
|
_, has := processedXObjects[string(*name)]
|
|
|
|
if has {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
processedXObjects[string(*name)] = true
|
|
|
|
|
|
|
|
xobj, xtype := resources.GetXObjectByName(*name)
|
|
|
|
if xtype == model.XObjectTypeImage {
|
|
|
|
filterObj := core.TraceToDirectObject(xobj.Get("Filter"))
|
|
|
|
if filterObj == nil {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
method, ok := filterObj.(*core.PdfObjectName)
|
|
|
|
if !ok {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
if method.String() != core.StreamEncodingFilterNameJBIG2 {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
ximg, err := resources.GetXObjectImageByName(*name)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
2019-07-16 17:40:22 +02:00
|
|
|
enc, ok := ximg.Filter.(*core.JBIG2Encoder)
|
|
|
|
if !ok {
|
|
|
|
return nil, fmt.Errorf("Filter encoder should be a JBIG2Encoder but is: %T", ximg.Filter)
|
|
|
|
}
|
|
|
|
|
2019-07-14 23:18:40 +02:00
|
|
|
extracted := &extractedImage{
|
|
|
|
pdfImage: ximg,
|
|
|
|
jbig2Data: xobj.Stream,
|
2020-03-27 12:47:41 +01:00
|
|
|
globals: enc.Globals.ToDocumentGlobals(),
|
2019-07-14 23:18:40 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
extractedImages = append(extractedImages, extracted)
|
|
|
|
} else if xtype == model.XObjectTypeForm {
|
|
|
|
// Go through the XObject Form content stream.
|
|
|
|
xform, err := resources.GetXObjectFormByName(*name)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
formContent, err := xform.GetContentStream()
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
// Process the content stream in the Form object too:
|
|
|
|
formResources := xform.Resources
|
|
|
|
if formResources == nil {
|
|
|
|
formResources = resources
|
|
|
|
}
|
|
|
|
|
|
|
|
// Process the content stream in the Form object too:
|
|
|
|
images, err := extractImagesInContentStream(filename, string(formContent), formResources)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
extractedImages = append(extractedImages, images...)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return extractedImages, nil
|
|
|
|
}
|
|
|
|
|
2019-07-16 17:40:22 +02:00
|
|
|
func extractImagesOnPage(dirname, filename string, page *model.PdfPage, pageNo int) ([]*extractedImage, error) {
|
|
|
|
contents, err := page.GetAllContentStreams()
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
images, err := extractImagesInContentStream(filepath.Join(dirname, filename), contents, page.Resources)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
rawName := rawFileName(filename)
|
|
|
|
for i, image := range images {
|
|
|
|
image.name = rawName
|
|
|
|
image.idx = i + 1
|
|
|
|
image.pageNo = pageNo
|
|
|
|
}
|
|
|
|
return images, nil
|
2019-07-14 23:18:40 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
func getFile(dirName, filename string) (*os.File, error) {
|
|
|
|
return os.Open(filepath.Join(dirName, filename))
|
|
|
|
}
|
|
|
|
|
|
|
|
func rawFileName(filename string) string {
|
|
|
|
if i := strings.LastIndex(filename, "."); i != -1 {
|
|
|
|
return filename[:i]
|
|
|
|
}
|
|
|
|
return filename
|
|
|
|
}
|
|
|
|
|
2020-03-27 12:47:41 +01:00
|
|
|
func readFileNames(dirname, suffix string) ([]string, error) {
|
2019-07-14 23:18:40 +02:00
|
|
|
var files []string
|
|
|
|
err := filepath.Walk(dirname, func(path string, info os.FileInfo, err error) error {
|
2020-03-27 12:47:41 +01:00
|
|
|
if err != nil {
|
|
|
|
return err
|
2019-07-14 23:18:40 +02:00
|
|
|
}
|
2020-03-27 12:47:41 +01:00
|
|
|
if !info.IsDir() {
|
|
|
|
if suffix != "" && !strings.HasSuffix(strings.ToLower(info.Name()), suffix) {
|
|
|
|
return nil
|
|
|
|
}
|
2019-07-14 23:18:40 +02:00
|
|
|
files = append(files, info.Name())
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
})
|
|
|
|
return files, err
|
|
|
|
}
|
|
|
|
|
|
|
|
func readPDF(f *os.File, password ...string) (*model.PdfReader, error) {
|
|
|
|
pdfReader, err := model.NewPdfReader(f)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
// check if is encrypted
|
|
|
|
isEncrypted, err := pdfReader.IsEncrypted()
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
if isEncrypted {
|
|
|
|
auth, err := pdfReader.Decrypt([]byte(""))
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
if !auth {
|
|
|
|
if len(password) > 0 {
|
|
|
|
auth, err = pdfReader.Decrypt([]byte(password[0]))
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if !auth {
|
|
|
|
return nil, fmt.Errorf("reading the file: '%s' failed. Invalid password provided", f.Name())
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return pdfReader, nil
|
|
|
|
}
|
|
|
|
|
2019-07-16 17:40:22 +02:00
|
|
|
func writeExtractedImages(zw *zip.Writer, images ...*extractedImage) (err error) {
|
2019-07-14 23:18:40 +02:00
|
|
|
h := md5.New()
|
|
|
|
|
2019-07-16 17:40:22 +02:00
|
|
|
for _, img := range images {
|
|
|
|
common.Log.Trace("Writing file: '%s'", img.fullName())
|
|
|
|
f, err := zw.Create(img.fullName() + ".jpg")
|
2019-07-14 23:18:40 +02:00
|
|
|
if err != nil {
|
2019-07-16 17:40:22 +02:00
|
|
|
return err
|
2019-07-14 23:18:40 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
cimg, err := img.pdfImage.ToImage()
|
|
|
|
if err != nil {
|
2019-07-16 17:40:22 +02:00
|
|
|
return err
|
2019-07-14 23:18:40 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
gimg, err := cimg.ToGoImage()
|
|
|
|
if err != nil {
|
2019-07-16 17:40:22 +02:00
|
|
|
return err
|
2019-07-14 23:18:40 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
multiWriter := io.MultiWriter(f, h)
|
|
|
|
|
|
|
|
// write to file
|
|
|
|
q := &jpeg.Options{Quality: 100}
|
|
|
|
if err = jpeg.Encode(multiWriter, gimg, q); err != nil {
|
2019-07-16 17:40:22 +02:00
|
|
|
return err
|
2019-07-14 23:18:40 +02:00
|
|
|
}
|
|
|
|
|
2019-07-16 17:40:22 +02:00
|
|
|
img.hash = hex.EncodeToString(h.Sum(nil))
|
2019-07-14 23:18:40 +02:00
|
|
|
h.Reset()
|
|
|
|
}
|
2019-07-16 17:40:22 +02:00
|
|
|
return nil
|
2019-07-14 23:18:40 +02:00
|
|
|
}
|