Merge branch 'development' of https://github.com/unidoc/unipdf into development

This commit is contained in:
Gunnsteinn Hall 2020-06-16 21:19:49 +00:00
commit dbd2364470
20 changed files with 851 additions and 95 deletions

26
Jenkinsfile vendored
View File

@ -1,13 +1,10 @@
node {
// Install the desired Go version
def root = tool name: 'go 1.11.5', type: 'go'
def root = tool name: 'go 1.14.3', type: 'go'
env.GOROOT="${root}"
env.GOPATH="${WORKSPACE}/gopath"
// Hack for 1.11.5 testing work.
env.CGO_ENABLED="0"
env.PATH="${root}/bin:${env.GOPATH}/bin:${env.PATH}"
env.GOCACHE="off"
env.GOBIN="${WORKSPACE}/bin"
env.PATH="${root}/bin:${env.GOBIN}:${env.PATH}"
env.UNIDOC_EXTRACT_FORCETEST="1"
env.UNIDOC_E2E_FORCE_TESTS="1"
env.UNIDOC_EXTRACT_TESTDATA="/home/jenkins/corpus/unidoc-extractor-testdata"
@ -19,13 +16,13 @@ node {
env.UNIDOC_JBIG2_TESTDATA="/home/jenkins/corpus/jbig2-testdata"
env.UNIDOC_FDFMERGE_TESTDATA="/home/jenkins/corpus/fdfmerge-testdata"
env.UNIDOC_GS_BIN_PATH="/usr/bin/gs"
// Hack for 1.11.5 testing work.
env.CGO_ENABLED="0"
env.TMPDIR="${WORKSPACE}/temp"
sh "mkdir -p ${env.GOBIN}"
sh "mkdir -p ${env.TMPDIR}"
dir("${GOPATH}/src/github.com/unidoc/unipdf") {
dir("${WORKSPACE}/unipdf") {
sh 'go version'
stage('Checkout') {
@ -35,11 +32,9 @@ node {
stage('Prepare') {
// Get linter and other build tools.
sh 'go get -u golang.org/x/lint/golint'
sh 'go get golang.org/x/lint/golint'
sh 'go get github.com/tebeka/go2xunit'
sh 'go get github.com/t-yuki/gocover-cobertura'
// Get all dependencies (for tests also).
sh 'go get -t ./...'
}
stage('Linting') {
@ -53,7 +48,7 @@ node {
stage('Testing') {
// Go test - No tolerance.
sh "rm -f ${env.TMPDIR}/*.pdf"
sh '2>&1 go test -v ./... | tee gotest.txt'
sh '2>&1 go test -count=1 -v ./... | tee gotest.txt'
}
stage('Check generated PDFs') {
@ -62,7 +57,7 @@ node {
}
stage('Test coverage') {
sh 'go test -coverprofile=coverage.out -covermode=atomic -coverpkg=./... ./...'
sh 'go test -count=1 -coverprofile=coverage.out -covermode=atomic -coverpkg=./... ./...'
sh '/home/jenkins/codecov.sh'
sh 'gocover-cobertura < coverage.out > coverage.xml'
step([$class: 'CoberturaPublisher', coberturaReportFile: 'coverage.xml'])
@ -80,7 +75,7 @@ node {
}
}
dir("${GOPATH}/src/github.com/unidoc/unipdf-examples") {
dir("${WORKSPACE}/unipdf-examples") {
stage('Build examples') {
// Output environment variables (useful for debugging).
sh("printenv")
@ -97,6 +92,9 @@ node {
echo "Pulling unipdf-examples on branch ${examplesBranch}"
git url: 'https://github.com/unidoc/unidoc-examples.git', branch: examplesBranch
// Use replace directive to use disk version of unipdf.
sh 'echo "replace github.com/unidoc/unipdf/v3 => ../unipdf" >>go.mod'
// Dependencies for examples.
sh './build_examples.sh'

View File

@ -948,7 +948,6 @@ func newDCTEncoderFromStream(streamObj *PdfObjectStream, multiEnc *MultiEncoder)
return nil, err
}
encoded = e
}
bufReader := bytes.NewReader(encoded)
@ -2158,6 +2157,9 @@ func newMultiEncoderFromStream(streamObj *PdfObjectStream) (*MultiEncoder, error
// GetFilterName returns the names of the underlying encoding filters,
// separated by spaces.
// Note: This is just a string, should not be used in /Filter dictionary entry. Use GetFilterArray for that.
// TODO(v4): Refactor to GetFilter() which can be used for /Filter (either Name or Array), this can be
// renamed to String() as a pretty string to use in debugging etc.
func (enc *MultiEncoder) GetFilterName() string {
name := ""
for idx, encoder := range enc.encoders {
@ -2169,6 +2171,16 @@ func (enc *MultiEncoder) GetFilterName() string {
return name
}
// GetFilterArray returns the names of the underlying encoding filters in an array that
// can be used as /Filter entry.
func (enc *MultiEncoder) GetFilterArray() *PdfObjectArray {
	var filterNames []PdfObject
	for _, encoder := range enc.encoders {
		filterNames = append(filterNames, MakeName(encoder.GetFilterName()))
	}
	return MakeArray(filterNames...)
}
// MakeDecodeParams makes a new instance of an encoding dictionary based on
// the current encoder settings.
func (enc *MultiEncoder) MakeDecodeParams() PdfObject {
@ -2201,12 +2213,7 @@ func (enc *MultiEncoder) AddEncoder(encoder StreamEncoder) {
// MakeStreamDict makes a new instance of an encoding dictionary for a stream object.
func (enc *MultiEncoder) MakeStreamDict() *PdfObjectDictionary {
dict := MakeDict()
names := make([]PdfObject, len(enc.encoders))
for i, e := range enc.encoders {
names[i] = MakeName(e.GetFilterName())
}
dict.Set("Filter", MakeArray(names...))
dict.Set("Filter", enc.GetFilterArray())
// Pass all values from children, except Filter and DecodeParms.
for _, encoder := range enc.encoders {

View File

@ -42,9 +42,14 @@ func New(page *model.PdfPage) (*Extractor, error) {
// fmt.Printf("%s\n", contents)
// fmt.Println("========================= ::: =========================")
return NewFromContents(contents, page.Resources)
}
// NewFromContents creates a new extractor from contents and page resources.
func NewFromContents(contents string, resources *model.PdfPageResources) (*Extractor, error) {
e := &Extractor{
contents: contents,
resources: page.Resources,
resources: resources,
fontCache: map[string]fontEntry{},
formResults: map[string]textResult{},
}

View File

@ -439,7 +439,11 @@ func (to *textObject) showTextAdjusted(args *core.PdfObjectArray) error {
common.Log.Trace("showTextAdjusted: Bad string arg. o=%s args=%+v", o, args)
return core.ErrTypeError
}
to.renderText(charcodes)
err := to.renderText(charcodes)
if err != nil {
common.Log.Debug("Render text error: %v", err)
return err
}
default:
common.Log.Debug("ERROR: showTextAdjusted. Unexpected type (%T) args=%+v", o, args)
return core.ErrTypeError
@ -736,6 +740,7 @@ func (to *textObject) renderText(data []byte) error {
continue
}
// TODO(gunnsth): Assuming 1:1 charcode[i] <-> rune[i] mapping.
code := charcodes[i]
// The location of the text on the page in device coordinates is given by trm, the text
// rendering matrix.
@ -785,6 +790,8 @@ func (to *textObject) renderText(data []byte) error {
} else if font.Encoder() == nil {
common.Log.Debug("ERROR: No encoding. font=%s", font)
} else {
// TODO: This lookup seems confusing. Went from bytes <-> charcodes already.
// NOTE: This is needed to register runes by the font encoder - for subsetting (optimization).
original, ok := font.Encoder().CharcodeToRune(code)
if ok {
mark.original = string(original)

View File

@ -51,9 +51,7 @@ var doStress bool
func init() {
flag.BoolVar(&doStress, "extractor-stresstest", false, "Run text extractor stress tests.")
common.SetLogger(common.NewConsoleLogger(common.LogLevelInfo))
if flag.Lookup("test.v") != nil {
isTesting = true
}
isTesting = true
}
// TestTextExtractionFragments tests text extraction on the PDF fragments in `fragmentTests`.

5
go.mod
View File

@ -5,12 +5,15 @@ go 1.11
require (
github.com/adrg/sysfont v0.1.0
github.com/boombuler/barcode v1.0.0
github.com/davecgh/go-spew v1.1.1
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0
github.com/sirupsen/logrus v1.6.0
github.com/stretchr/testify v1.4.0
github.com/unidoc/pkcs7 v0.0.0-20200411230602-d883fd70d1df
github.com/unidoc/timestamp v0.0.0-20200412005513-91597fd3793a
github.com/unidoc/unitype v0.1.0
github.com/unidoc/unitype v0.2.0
golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5
golang.org/x/image v0.0.0-20181116024801-cd38e8056d9b
golang.org/x/sys v0.0.0-20200615200032-f1bc736245b1 // indirect
golang.org/x/text v0.3.2
)

10
go.sum
View File

@ -15,6 +15,8 @@ github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGw
github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
github.com/konsorten/go-windows-terminal-sequences v1.0.2 h1:DB17ag19krx9CFsz4o3enTrPXyIXCl+2iCXH/aMAp9s=
github.com/konsorten/go-windows-terminal-sequences v1.0.2/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
github.com/konsorten/go-windows-terminal-sequences v1.0.3 h1:CE8S1cTafDpPvMhIxNJKvHsGVBgn1xWYf1NbHQhywc8=
github.com/konsorten/go-windows-terminal-sequences v1.0.3/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI=
github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
@ -24,6 +26,8 @@ github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZb
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/sirupsen/logrus v1.5.0 h1:1N5EYkVAPEywqZRJd7cwnRtCb6xJx7NH3T3WUTF980Q=
github.com/sirupsen/logrus v1.5.0/go.mod h1:+F7Ogzej0PZc/94MaYx/nvG9jOFMD2osvC3s+Squfpo=
github.com/sirupsen/logrus v1.6.0 h1:UBcNElsrwanuuMsnGSlYmtmgbb23qDR5dG+6X6Oo89I=
github.com/sirupsen/logrus v1.6.0/go.mod h1:7uNnSEd1DgxDLC74fIahvMZmmYsHGZGEOFrfsX/uA88=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
github.com/stretchr/testify v1.4.0 h1:2E4SXV/wtOkTonXsotYi4li6zVWxYlZuYNCXe9XRJyk=
@ -34,6 +38,10 @@ github.com/unidoc/timestamp v0.0.0-20200412005513-91597fd3793a h1:RLtvUhe4DsUDl6
github.com/unidoc/timestamp v0.0.0-20200412005513-91597fd3793a/go.mod h1:j+qMWZVpZFTvDey3zxUkSgPJZEX33tDgU/QIA0IzCUw=
github.com/unidoc/unitype v0.1.0 h1:6zJYMl8XdwFBD45Cmg8Ge13WyE92jwLuK1tk2IsRb9s=
github.com/unidoc/unitype v0.1.0/go.mod h1:mafyug7zYmDOusqa7G0dJV45qp4b6TDAN+pHN7ZUIBU=
github.com/unidoc/unitype v0.1.1-0.20200524232639-77d42b645b02 h1:zVMJh0ehLc0amGBcqIh7HWikIGXGBGpmW+Lvz1YVYH8=
github.com/unidoc/unitype v0.1.1-0.20200524232639-77d42b645b02/go.mod h1:mafyug7zYmDOusqa7G0dJV45qp4b6TDAN+pHN7ZUIBU=
github.com/unidoc/unitype v0.2.0 h1:N+ZKjwz8UDU0qa1IYzstDLffvQEctFo+bo6b6ZqW+9M=
github.com/unidoc/unitype v0.2.0/go.mod h1:mafyug7zYmDOusqa7G0dJV45qp4b6TDAN+pHN7ZUIBU=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5 h1:58fnuSXlxZmFdJyvtTFVmVhcMLU6v5fEb/ok4wyqtNU=
golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
@ -45,6 +53,8 @@ golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7w
golang.org/x/sys v0.0.0-20190422165155-953cdadca894/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200413165638-669c56c373c4 h1:opSr2sbRXk5X5/givKrrKj9HXxFpW2sdCiP8MJSKLQY=
golang.org/x/sys v0.0.0-20200413165638-669c56c373c4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200615200032-f1bc736245b1 h1:ogLJMz+qpzav7lGMh10LMvAkM/fAoGlaiiHYiFYdm80=
golang.org/x/sys v0.0.0-20200615200032-f1bc736245b1/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/text v0.3.0 h1:g61tztE5qeGQ89tm6NTjjM9VPIm088od1l6aSorWRWg=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.2 h1:tW2bmiBqwgJj/UpqtC8EpXEZVYOwU0yG4iWbprSVAcs=

View File

@ -13,47 +13,81 @@ import (
"github.com/unidoc/unipdf/v3/core"
)
// IdentityEncoder represents an 2-byte identity encoding
// IdentityEncoder represents an 2-byte identity encoding.
// NOTE: In many cases this is just used to encode/decode to glyph index and does not have a unicode
// meaning, except via the ToUnicode maps.
// TODO: The use of runes as indicators for glyph indices and not-utf8 runes is not good and confusing.
// Might be better to combine the Identity encoder with a ToUnicode map and keep track of the actual
// runes and character codes, CMaps together.
type IdentityEncoder struct {
baseName string
// runes registered by encoder for tracking what runes are used for subsetting.
registeredMap map[rune]struct{}
}
// NewIdentityTextEncoder returns a new IdentityEncoder based on predefined
// encoding `baseName` and difference map `differences`.
func NewIdentityTextEncoder(baseName string) IdentityEncoder {
return IdentityEncoder{baseName}
func NewIdentityTextEncoder(baseName string) *IdentityEncoder {
return &IdentityEncoder{
baseName: baseName,
}
}
// RegisteredRunes returns the slice of runes that have been registered as used by the encoder.
func (enc *IdentityEncoder) RegisteredRunes() []rune {
	runes := make([]rune, 0, len(enc.registeredMap))
	for r := range enc.registeredMap {
		runes = append(runes, r)
	}
	return runes
}
// String returns a string that describes `enc`.
func (enc IdentityEncoder) String() string {
func (enc *IdentityEncoder) String() string {
return enc.baseName
}
// Encode converts the Go unicode string to a PDF encoded string.
func (enc IdentityEncoder) Encode(str string) []byte {
func (enc *IdentityEncoder) Encode(str string) []byte {
return encodeString16bit(enc, str)
}
// Decode converts PDF encoded string to a Go unicode string.
func (enc IdentityEncoder) Decode(raw []byte) string {
func (enc *IdentityEncoder) Decode(raw []byte) string {
return decodeString16bit(enc, raw)
}
// RuneToCharcode converts rune `r` to a PDF character code.
// The bool return flag is true if there was a match, and false otherwise.
func (enc IdentityEncoder) RuneToCharcode(r rune) (CharCode, bool) {
// TODO: Here the `r` is an actual rune.
func (enc *IdentityEncoder) RuneToCharcode(r rune) (CharCode, bool) {
if enc.registeredMap == nil {
enc.registeredMap = map[rune]struct{}{}
}
enc.registeredMap[r] = struct{}{} // Register use (subsetting).
return CharCode(r), true
}
// CharcodeToRune converts PDF character code `code` to a rune.
// The bool return flag is true if there was a match, and false otherwise.
func (enc IdentityEncoder) CharcodeToRune(code CharCode) (rune, bool) {
// TODO: Here the `r` is not necessarily an actual rune but a glyph index (unless both).
func (enc *IdentityEncoder) CharcodeToRune(code CharCode) (rune, bool) {
if enc.registeredMap == nil {
enc.registeredMap = map[rune]struct{}{}
}
// TODO: The rune(code) is confusing and is not an actual utf8 rune.
enc.registeredMap[rune(code)] = struct{}{}
return rune(code), true
}
// RuneToGlyph returns the glyph name for rune `r`.
// The bool return flag is true if there was a match, and false otherwise.
func (enc IdentityEncoder) RuneToGlyph(r rune) (GlyphName, bool) {
func (enc *IdentityEncoder) RuneToGlyph(r rune) (GlyphName, bool) {
if r == ' ' {
return "space", true
}
@ -63,7 +97,7 @@ func (enc IdentityEncoder) RuneToGlyph(r rune) (GlyphName, bool) {
// GlyphToRune returns the rune corresponding to glyph name `glyph`.
// The bool return flag is true if there was a match, and false otherwise.
func (enc IdentityEncoder) GlyphToRune(glyph GlyphName) (rune, bool) {
func (enc *IdentityEncoder) GlyphToRune(glyph GlyphName) (rune, bool) {
// String with "uniXXXX" format where XXXX is the hexcode.
if glyph == "space" {
return ' ', true
@ -78,7 +112,7 @@ func (enc IdentityEncoder) GlyphToRune(glyph GlyphName) (rune, bool) {
}
// ToPdfObject returns a nil as it is not truly a PDF object and should not be attempted to store in file.
func (enc IdentityEncoder) ToPdfObject() core.PdfObject {
func (enc *IdentityEncoder) ToPdfObject() core.PdfObject {
if enc.baseName != "" {
return core.MakeName(enc.baseName)
}

View File

@ -103,6 +103,9 @@ type simpleEncoding struct {
// one byte encoding: CharCode <-> byte
encode map[rune]byte
decode map[byte]rune
// runes registered by encoder for tracking what runes are used for subsetting.
registeredMap map[rune]struct{}
}
// Encode converts the Go unicode string to a PDF encoded string.
@ -213,6 +216,10 @@ func (enc *simpleEncoding) Charcodes() []CharCode {
func (enc *simpleEncoding) RuneToCharcode(r rune) (CharCode, bool) {
b, ok := enc.encode[r]
if enc.registeredMap == nil {
enc.registeredMap = map[rune]struct{}{}
}
enc.registeredMap[r] = struct{}{} // Register use (subsetting).
return CharCode(b), ok
}
@ -222,6 +229,10 @@ func (enc *simpleEncoding) CharcodeToRune(code CharCode) (rune, bool) {
}
b := byte(code)
r, ok := enc.decode[b]
if enc.registeredMap == nil {
enc.registeredMap = map[rune]struct{}{}
}
enc.registeredMap[r] = struct{}{} // Register use (subsetting).
return r, ok
}

View File

@ -103,31 +103,31 @@ type PdfAnnotationLink struct {
}
// GetAction returns the PDF action for the annotation link.
func (a *PdfAnnotationLink) GetAction() (*PdfAction, error) {
if a.action != nil {
return a.action, nil
func (link *PdfAnnotationLink) GetAction() (*PdfAction, error) {
if link.action != nil {
return link.action, nil
}
if a.A == nil {
if link.A == nil {
return nil, nil
}
if a.reader == nil {
if link.reader == nil {
return nil, nil
}
action, err := a.reader.loadAction(a.A)
action, err := link.reader.loadAction(link.A)
if err != nil {
return nil, err
}
a.action = action
link.action = action
return a.action, nil
return link.action, nil
}
// SetAction sets the PDF action for the annotation link.
func (a *PdfAnnotationLink) SetAction(action *PdfAction) {
a.action = action
func (link *PdfAnnotationLink) SetAction(action *PdfAction) {
link.action = action
if action == nil {
a.A = nil
link.A = nil
}
}

View File

@ -50,6 +50,7 @@ func (font *PdfFont) SubsetRegistered() error {
case *pdfFontType0:
err := t.subsetRegistered()
if err != nil {
common.Log.Debug("Subset error: %v", err)
return err
}
if t.container != nil {
@ -401,6 +402,7 @@ func (font *PdfFont) BytesToCharcodes(data []byte) []textencoding.CharCode {
charcodes := make([]textencoding.CharCode, 0, len(data)+len(data)%2)
if font.baseFields().isCIDFont() {
// Identity only?
if len(data) == 1 {
data = []byte{0, data[0]}
}
@ -413,6 +415,7 @@ func (font *PdfFont) BytesToCharcodes(data []byte) []textencoding.CharCode {
charcodes = append(charcodes, textencoding.CharCode(b))
}
} else {
// Simple font: byte -> charcode.
for _, b := range data {
charcodes = append(charcodes, textencoding.CharCode(b))
}
@ -755,8 +758,7 @@ func (base fontCommon) isCIDFont() bool {
// newFontBaseFieldsFromPdfObject returns `fontObj` as a dictionary the common fields from that
// dictionary in the fontCommon return. If there is a problem an error is returned.
// The fontCommon is the group of fields common to all PDF fonts.
func newFontBaseFieldsFromPdfObject(fontObj core.PdfObject) (*core.PdfObjectDictionary, *fontCommon,
error) {
func newFontBaseFieldsFromPdfObject(fontObj core.PdfObject) (*core.PdfObjectDictionary, *fontCommon, error) {
font := &fontCommon{}
if obj, ok := fontObj.(*core.PdfIndirectObject); ok {

View File

@ -127,6 +127,9 @@ func (font *pdfFontType0) baseFields() *fontCommon {
}
func (font *pdfFontType0) getFontDescriptor() *PdfFontDescriptor {
if font.fontDescriptor == nil && font.DescendantFont != nil {
return font.DescendantFont.FontDescriptor()
}
return font.fontDescriptor
}
@ -210,14 +213,19 @@ func (font *pdfFontType0) subsetRegistered() error {
common.Log.Debug("Missing font descriptor")
return nil
}
if font.encoder == nil {
common.Log.Debug("No encoder - subsetting ignored")
return nil
}
stream, ok := core.GetStream(cidfnt.fontDescriptor.FontFile2)
if !ok {
common.Log.Debug("Embedded font object not found -- ABORT subsseting")
common.Log.Debug("Embedded font object not found -- ABORT subsetting")
return errors.New("fontfile2 not found")
}
decoded, err := core.DecodeStream(stream)
if err != nil {
common.Log.Debug("Decode error: %v", err)
return err
}
@ -227,21 +235,52 @@ func (font *pdfFontType0) subsetRegistered() error {
return err
}
tenc, ok := font.encoder.(*textencoding.TrueTypeFontEncoder)
if !ok {
return fmt.Errorf("unsupported encoder for subsetting: %T", cidfnt.encoder)
var runes []rune
var subset *unitype.Font
switch tenc := font.encoder.(type) {
case *textencoding.TrueTypeFontEncoder:
// Means the font has been loaded from TTF file.
runes = tenc.RegisteredRunes()
subset, err = fnt.SubsetKeepRunes(runes)
if err != nil {
common.Log.Debug("ERROR: %v", err)
return err
}
// Reduce the encoder also.
tenc.SubsetRegistered()
case *textencoding.IdentityEncoder:
// IdentityEncoder typically means font was parsed from PDF file.
// TODO: These are not actual runes... but glyph ids ? Very confusing.
runes = tenc.RegisteredRunes()
indices := make([]unitype.GlyphIndex, len(runes))
for i, r := range runes {
indices[i] = unitype.GlyphIndex(r)
}
subset, err = fnt.SubsetKeepIndices(indices)
if err != nil {
common.Log.Debug("ERROR: %v", err)
return err
}
case textencoding.SimpleEncoder:
// Simple encoding, bytes are 0-255
charcodes := tenc.Charcodes()
for _, c := range charcodes {
r, ok := tenc.CharcodeToRune(c)
if !ok {
common.Log.Debug("ERROR: unable convert charcode to rune: %d", c)
continue
}
runes = append(runes, r)
}
default:
return fmt.Errorf("unsupported encoder for subsetting: %T", font.encoder)
}
runes := tenc.RegisteredRunes()
subset, err := fnt.SubsetKeepRunes(runes)
if err != nil {
return err
}
// Reduce the encoder also.
tenc.SubsetRegistered()
var buf bytes.Buffer
err = subset.Write(&buf)
if err != nil {
common.Log.Debug("ERROR: %v", err)
return err
}
@ -249,7 +288,7 @@ func (font *pdfFontType0) subsetRegistered() error {
if font.toUnicodeCmap != nil {
codeToUnicode := make(map[cmap.CharCode]rune, len(runes))
for _, r := range runes {
cc, ok := tenc.RuneToCharcode(r)
cc, ok := font.encoder.RuneToCharcode(r)
if !ok {
continue
}
@ -260,9 +299,16 @@ func (font *pdfFontType0) subsetRegistered() error {
stream, err = core.MakeStream(buf.Bytes(), core.NewFlateEncoder())
if err != nil {
common.Log.Debug("ERROR: %v", err)
return err
}
cidfnt.fontDescriptor.FontFile2 = stream
stream.Set("Length1", core.MakeInteger(int64(buf.Len())))
if curstr, ok := core.GetStream(cidfnt.fontDescriptor.FontFile2); ok {
// Replace the current stream (keep same object).
*curstr = *stream
} else {
cidfnt.fontDescriptor.FontFile2 = stream
}
// Set subset name.
tag := genSubsetTag()
@ -334,6 +380,7 @@ func newPdfFontType0FromPdfObject(d *core.PdfObjectDictionary, base *fontCommon)
encoderName, ok := core.GetNameVal(d.Get("Encoding"))
if ok {
// TODO: Identity-H maps 16-bit character codes straight to glyph index (don't need actual runes).
if encoderName == "Identity-H" || encoderName == "Identity-V" {
font.encoder = textencoding.NewIdentityTextEncoder(encoderName)
} else if cmap.IsPredefinedCMap(encoderName) {

View File

@ -0,0 +1,139 @@
/*
* This file is subject to the terms and conditions defined in
* file 'LICENSE.md', which is part of this source code package.
*/
package optimize
import (
"github.com/unidoc/unipdf/v3/contentstream"
"github.com/unidoc/unipdf/v3/core"
)
// CleanContentstream cleans up redundant operands in content streams, including Page and XObject Form
// contents. This process includes:
// 1. Marked content operators are removed.
// 2. Some operands are simplified (shorter form).
// TODO: Add more reduction methods and improving the methods for identifying unnecessary operands.
type CleanContentstream struct {
}
// filterOps cleans up the content stream in `ops`:
// 1. Marked content operators are cleaned.
// 2. Tm with 1 0 0 1 params are converted to Td (slightly shorter for same transformation).
// Returns a new operations slice; a nil input yields nil.
// TODO: Add operations that track the state and remove unnecessary operands, such as duplicates
// or ones setting default values, or ones not drawing anything.
func filterOps(ops *contentstream.ContentStreamOperations) *contentstream.ContentStreamOperations {
if ops == nil {
return nil
}
filtered := contentstream.ContentStreamOperations{}
for _, op := range *ops {
switch op.Operand {
// Marked content operators are dropped entirely (see note above).
case "BDC", "BMC", "EMC":
continue
case "Tm":
// Tm takes 6 numeric operands [a b c d e f]; when a,b,c,d form the
// identity matrix, the operation reduces to a translation by (e, f),
// which Td expresses with only two operands.
if len(op.Params) == 6 {
if nums, err := core.GetNumbersAsFloat(op.Params); err == nil {
if nums[0] == 1 && nums[1] == 0 && nums[2] == 0 && nums[3] == 1 {
op = &contentstream.ContentStreamOperation{
Params: []core.PdfObject{
op.Params[4],
op.Params[5],
},
Operand: "Td",
}
}
}
}
}
// Non-filtered (or rewritten) operations are kept.
filtered = append(filtered, op)
}
return &filtered
}
// reduceContent performs content stream optimization of contents in `cstream` which can either be
// from Page Contents or XObject Form.
// The stream is decoded, parsed, filtered via filterOps and re-encoded in place, but only if
// the cleaned content is actually smaller than the original.
// NOTE: If from a Contents array, the operations may be unbalanced.
func reduceContent(cstream *core.PdfObjectStream) error {
decoded, err := core.DecodeStream(cstream)
if err != nil {
return err
}
csp := contentstream.NewContentStreamParser(string(decoded))
ops, err := csp.Parse()
if err != nil {
return err
}
ops = filterOps(ops)
cleaned := ops.Bytes()
if len(cleaned) >= len(decoded) {
// No need to replace if no improvement.
return nil
}
// Reuse the already-serialized `cleaned` bytes rather than calling ops.Bytes()
// a second time (the original serialized the operations twice).
newstream, err := core.MakeStream(cleaned, core.NewFlateEncoder())
if err != nil {
return err
}
// Replace contents and merge in the new stream dictionary entries (e.g. Filter, Length).
cstream.Stream = newstream.Stream
cstream.Merge(newstream.PdfObjectDictionary)
return nil
}
// Optimize optimizes PDF objects to decrease PDF size.
// It scans `objects` for Page content streams and XObject Form streams, queues each unique
// stream once, and rewrites the queued streams in place via reduceContent.
// The input slice is returned unchanged on success.
func (c *CleanContentstream) Optimize(objects []core.PdfObject) (optimizedObjects []core.PdfObject, err error) {
// Track which content streams to process.
// queuedMap deduplicates; queued preserves discovery order for deterministic processing.
queuedMap := map[*core.PdfObjectStream]struct{}{}
var queued []*core.PdfObjectStream
appendQueue := func(stream *core.PdfObjectStream) {
if _, has := queuedMap[stream]; !has {
queuedMap[stream] = struct{}{}
queued = append(queued, stream)
}
}
// Collect objects to process: XObject Form and Page Content streams.
for _, obj := range objects {
switch t := obj.(type) {
case *core.PdfIndirectObject:
switch ti := t.PdfObject.(type) {
case *core.PdfObjectDictionary:
// Only Page dictionaries are of interest here.
if name, ok := core.GetName(ti.Get("Type")); !ok || name.String() != "Page" {
continue
}
// Page Contents may be a single stream or an array of streams.
if stream, ok := core.GetStream(ti.Get("Contents")); ok {
appendQueue(stream)
} else if array, ok := core.GetArray(ti.Get("Contents")); ok {
for _, el := range array.Elements() {
if stream, ok := core.GetStream(el); ok {
appendQueue(stream)
}
}
}
}
case *core.PdfObjectStream:
// Direct stream objects: only XObject Forms are queued.
if name, ok := core.GetName(t.Get("Type")); !ok || name.String() != "XObject" {
continue
}
if name, ok := core.GetName(t.Get("Subtype")); !ok || name.String() != "Form" {
continue
}
appendQueue(t)
}
}
// Process the queued content streams.
for _, stream := range queued {
err = reduceContent(stream)
if err != nil {
return nil, err
}
}
return objects, nil
}

View File

@ -0,0 +1,353 @@
/*
* This file is subject to the terms and conditions defined in
* file 'LICENSE.md', which is part of this source code package.
*/
package optimize
import (
"bytes"
"errors"
"github.com/unidoc/unitype"
"github.com/unidoc/unipdf/v3/common"
"github.com/unidoc/unipdf/v3/core"
"github.com/unidoc/unipdf/v3/extractor"
"github.com/unidoc/unipdf/v3/internal/textencoding"
"github.com/unidoc/unipdf/v3/model"
)
// CleanFonts cleans up embedded fonts, reducing font sizes.
type CleanFonts struct {
// Subset embedded fonts if encountered (if true).
// Otherwise attempts to reduce the font program.
Subset bool
}
// optimizeFontsWithSubsetting subsets the embedded TrueType font programs (FontFile2) of all
// fonts that are actually used in page, form and annotation content streams, keeping only the
// glyphs referenced by the extracted text marks. Returns the set of font-program streams that
// were processed so the caller can skip them in later passes.
func optimizeFontsWithSubsetting(objects []core.PdfObject) (processed map[*core.PdfObjectStream]struct{}, err error) {
// 1. Identify all fonts.
// 2. Identify content streams and their Resources dictionaries (both via page, forms and annotations).
// 3. Process content streams.
processed = map[*core.PdfObjectStream]struct{}{}
fontMap := map[*model.PdfFont]struct{}{}
objstr := getObjectStructure(objects)
for _, p := range objstr.pages {
pdict, ok := core.GetDict(p.PdfObject)
if !ok {
continue
}
resourcesDict, ok := core.GetDict(pdict.Get("Resources"))
if !ok {
continue
}
contents, _ := getPageContents(pdict.Get("Contents"))
resources, err := model.NewPdfPageResourcesFromDict(resourcesDict)
if err != nil {
return nil, err
}
// Page contents plus any annotation appearance-stream contents for this page.
allContents := []content{
{
content: contents,
resources: resources,
},
}
annotContents := getAnnotationContents(pdict.Get("Annots"))
if annotContents != nil {
allContents = append(allContents, annotContents...)
}
// Run text extraction over each content stream to discover which fonts are used.
// NOTE(review): extraction also registers used runes/charcodes with the font encoders,
// which the subsetting below relies on.
for _, cont := range allContents {
e, err := extractor.NewFromContents(cont.content, cont.resources)
if err != nil {
return nil, err
}
pt, _, _, err := e.ExtractPageText()
if err != nil {
return nil, err
}
for _, el := range pt.Marks().Elements() {
if el.Font == nil {
continue
}
if _, has := fontMap[el.Font]; !has {
fontMap[el.Font] = struct{}{}
}
}
}
}
// Map of font program stream to font. Multiple fonts can use the same font program.
fontFileMap := map[*core.PdfObjectStream][]*model.PdfFont{}
for font := range fontMap {
fontDesc := font.FontDescriptor()
if fontDesc == nil || fontDesc.FontFile2 == nil {
// Only embedded TrueType programs (FontFile2) are handled here.
continue
}
stream, ok := core.GetStream(fontDesc.FontFile2)
if !ok {
continue
}
fontFileMap[stream] = append(fontFileMap[stream], font)
}
// For each font program, gather the used runes/glyph indices across all fonts sharing it.
for stream := range fontFileMap {
var allRunes []rune
var allIndices []unitype.GlyphIndex
for _, font := range fontFileMap[stream] {
switch t := font.Encoder().(type) {
case *textencoding.IdentityEncoder:
// TODO: This terminology is wrong as those are not runes, just charcodes cast as runes.
// Identity encoder maps via 2-byte encoding directly from 2byte charcode to glyph index.
runes := t.RegisteredRunes()
indices := make([]unitype.GlyphIndex, len(runes))
for i, r := range runes {
indices[i] = unitype.GlyphIndex(r)
}
allIndices = append(allIndices, indices...)
case *textencoding.TrueTypeFontEncoder:
runes := t.RegisteredRunes()
allRunes = append(allRunes, runes...)
case textencoding.SimpleEncoder:
// Simple (one-byte) encodings: convert each charcode to its rune.
charcodes := t.Charcodes()
for _, c := range charcodes {
r, ok := t.CharcodeToRune(c)
if !ok {
common.Log.Debug("Charcode<->rune not found: %d", c)
continue
}
allRunes = append(allRunes, r)
}
}
}
err = subsetFontStream(stream, allRunes, allIndices)
if err != nil {
common.Log.Debug("ERROR subsetting font stream: %v", err)
return nil, err
}
processed[stream] = struct{}{}
}
return processed, nil
}
// Subsets the font program in `stream` with the subset based on the `runes` and glyph `indices`.
// The stream is rewritten in place (same object) with a Flate-encoded subset font and an updated
// Length1 entry. If the subset would be larger than the original, the stream is left unchanged.
func subsetFontStream(stream *core.PdfObjectStream, runes []rune, indices []unitype.GlyphIndex) error {
stream, ok := core.GetStream(stream)
if !ok {
common.Log.Debug("Embedded font object not found -- ABORT subsetting")
return errors.New("fontfile2 not found")
}
decoded, err := core.DecodeStream(stream)
if err != nil {
common.Log.Debug("Decode error: %v", err)
return err
}
fnt, err := unitype.Parse(bytes.NewReader(decoded))
if err != nil {
common.Log.Debug("Error parsing %d byte font", len(stream.Stream))
return err
}
// Resolve the requested runes to glyph indices and merge with the explicit indices.
allIndices := indices
if len(runes) > 0 {
indices := fnt.LookupRunes(runes)
allIndices = append(allIndices, indices...)
}
fnt, err = fnt.SubsetKeepIndices(allIndices)
if err != nil {
common.Log.Debug("ERROR subsetting font: %v", err)
return err
}
var buf bytes.Buffer
err = fnt.Write(&buf)
if err != nil {
common.Log.Debug("ERROR Writing font: %v", err)
return err
}
if buf.Len() > len(decoded) {
// Subsetting did not help - keep the original stream.
common.Log.Debug("Re-written font is larger than original - skip")
return nil
}
newstream, err := core.MakeStream(buf.Bytes(), core.NewFlateEncoder())
if err != nil {
common.Log.Debug("ERROR Writing font: %v", err)
return err
}
// Overwrite.
// Copying into the existing stream object keeps all indirect references valid.
*stream = *newstream
stream.Set("Length1", core.MakeInteger(int64(buf.Len())))
return nil
}
// Optimize optimizes PDF objects to decrease PDF size.
// When c.Subset is set, used-glyph subsetting is applied first; afterwards every remaining
// TrueType font program stream is re-written with only its minimal needed tables.
// Non-font streams and unsupported formats are skipped; per-stream failures are logged and
// ignored so one bad font does not abort the whole optimization.
func (c *CleanFonts) Optimize(objects []core.PdfObject) (optimizedObjects []core.PdfObject, err error) {
var processed map[*core.PdfObjectStreams]struct{}
if c.Subset {
var err error
processed, err = optimizeFontsWithSubsetting(objects)
if err != nil {
return nil, err
}
}
// Clean font streams by loading and rewriting with minimal needed tables.
for _, obj := range objects {
stream, isStreamObj := core.GetStream(obj)
if !isStreamObj {
continue
}
if _, has := processed[stream]; has {
// Skip - has been processed.
continue
}
encoder, err := core.NewEncoderFromStream(stream)
if err != nil {
common.Log.Debug("ERROR getting encoder: %v - ignoring", err)
continue
}
decoded, err := encoder.DecodeStream(stream)
if err != nil {
common.Log.Debug("Decoding error : %v - ignoring", err)
continue
}
if len(decoded) < 4 {
continue
}
// The first 4 bytes of an sfnt font identify its flavor.
version := string(decoded[:4])
if version == "OTTO" {
// Fonts based on PostScript outlines not supported yet.
// See https://docs.microsoft.com/en-us/typography/opentype/spec/otff
continue
}
if version != "\x00\x01\x00\x00" && version != "true" {
// Not a TrueType-flavored font - leave untouched.
continue
}
fnt, err := unitype.Parse(bytes.NewReader(decoded))
if err != nil {
common.Log.Debug("ERROR Parsing font: %v - ignoring", err)
continue
}
err = fnt.Optimize()
if err != nil {
continue
}
var buf bytes.Buffer
err = fnt.Write(&buf)
if err != nil {
common.Log.Debug("ERROR Writing font: %v - ignoring", err)
continue
}
if buf.Len() > len(decoded) {
common.Log.Debug("Re-written font is larger than original - skip")
continue
}
newstream, err := core.MakeStream(buf.Bytes(), core.NewFlateEncoder())
if err != nil {
continue
}
// Overwrite.
// In-place copy keeps indirect references to this stream object valid.
*stream = *newstream
stream.Set("Length1", core.MakeInteger(int64(buf.Len())))
}
return objects, nil
}
// content describes page or font contents which is a content stream along with resources.
type content struct {
	content string // Content stream data as a string.
	resources *model.PdfPageResources // Resources the content stream refers to.
}
// Best effort to get annotation contents.
func getAnnotationContents(annotsObj core.PdfObject) []content {
	if annotsObj == nil {
		return nil
	}
	annots, ok := core.GetArray(annotsObj)
	if !ok {
		common.Log.Debug("Annots not an array")
		return nil
	}

	var results []content
	for _, element := range annots.Elements() {
		dict, isDict := core.GetDict(element)
		if !isDict {
			// Ignore any non dict elements.
			common.Log.Debug("Ignoring non-dict element in Annots")
			continue
		}

		// Appearance.
		ap, hasAP := core.GetDict(dict.Get("AP"))
		if !hasAP {
			common.Log.Debug("No AP entry - skipping")
			continue
		}
		normal := core.TraceToDirectObject(ap.Get("N"))
		if normal == nil {
			common.Log.Debug("No N entry - skipping")
			continue
		}

		// Resolve the normal appearance to a form XObject stream. A dictionary
		// holds one stream per appearance state, selected via /AS.
		var appearance *core.PdfObjectStream
		switch v := normal.(type) {
		case *core.PdfObjectStream:
			appearance = v
		case *core.PdfObjectDictionary:
			state, hasAS := core.GetName(dict.Get("AS"))
			if !hasAS {
				common.Log.Debug("No AS entry - skipping")
				continue
			}
			form, found := core.GetStream(v.Get(*state))
			if !found {
				common.Log.Debug("Form not found - skipping")
				continue
			}
			appearance = form
		}
		if appearance == nil {
			common.Log.Debug("Form not found (nil) - skipping")
			continue
		}

		xform, err := model.NewXObjectFormFromStream(appearance)
		if err != nil {
			common.Log.Debug("Error loading form: %v - ignoring", err)
			continue
		}
		cs, err := xform.GetContentStream()
		if err != nil {
			common.Log.Debug("Error decoding contents: %v", err)
			continue
		}
		results = append(results, content{
			content:   string(cs),
			resources: xform.Resources,
		})
	}
	return results
}

View File

@ -23,9 +23,17 @@ func (c *CompressStreams) Optimize(objects []core.PdfObject) (optimizedObjects [
if !isStreamObj {
continue
}
if _, found := core.GetName(stream.PdfObjectDictionary.Get("Filter")); found {
continue
// Skip objects that are already encoded.
// TODO: Try filter combinations, and ignoring inefficient filters.
if obj := stream.Get("Filter"); obj != nil {
if _, skip := core.GetName(obj); skip {
continue
}
if arr, ok := core.GetArray(obj); ok && arr.Len() > 0 {
continue
}
}
encoder := core.NewFlateEncoder() // Most mainstream compressor and probably most robust.
var data []byte
data, err = encoder.EncodeBytes(stream.Stream)

View File

@ -110,28 +110,51 @@ func (i *Image) Optimize(objects []core.PdfObject) (optimizedObjects []core.PdfO
common.Log.Warning("Error decode the image stream %s")
continue
}
encoder := core.NewDCTEncoder()
encoder.ColorComponents = img.ColorComponents
encoder.Quality = i.ImageQuality
encoder.BitsPerComponent = img.BitsPerComponent
encoder.Width = img.Width
encoder.Height = img.Height
streamData, err := encoder.EncodeBytes(data)
dctenc := core.NewDCTEncoder()
dctenc.ColorComponents = img.ColorComponents
dctenc.Quality = i.ImageQuality
dctenc.BitsPerComponent = img.BitsPerComponent
dctenc.Width = img.Width
dctenc.Height = img.Height
streamData, err := dctenc.EncodeBytes(data)
if err != nil {
common.Log.Debug("ERROR: %v", err)
return nil, err
}
var filter core.StreamEncoder
filter = dctenc
// Check if combining with FlateEncoding improves things further.
{
flate := core.NewFlateEncoder()
multienc := core.NewMultiEncoder()
multienc.AddEncoder(flate)
multienc.AddEncoder(dctenc)
encoded, err := multienc.EncodeBytes(data)
if err != nil {
return nil, err
}
if len(encoded) < len(streamData) {
common.Log.Debug("Multi enc improves: %d to %d (orig %d)",
len(streamData), len(encoded), len(stream.Stream))
streamData = encoded
filter = multienc
}
}
originalSize := len(stream.Stream)
if originalSize < len(streamData) {
// Worse - ignoring.
continue
}
newStream := &core.PdfObjectStream{Stream: streamData}
newStream.PdfObjectReference = stream.PdfObjectReference
newStream.PdfObjectDictionary = core.MakeDict()
newStream.PdfObjectDictionary.Merge(stream.PdfObjectDictionary)
fn := core.PdfObjectName(encoder.GetFilterName())
newStream.PdfObjectDictionary.Set(core.PdfObjectName("Filter"), &fn)
ln := core.PdfObjectInteger(int64(len(streamData)))
newStream.PdfObjectDictionary.Set(core.PdfObjectName("Length"), &ln)
newStream.Merge(stream.PdfObjectDictionary)
newStream.Merge(filter.MakeStreamDict())
newStream.Set("Length", core.MakeInteger(int64(len(streamData))))
replaceTable[stream] = newStream
images[index].Stream = newStream
}

View File

@ -12,6 +12,12 @@ import (
// New creates a optimizers chain from options.
func New(options Options) *Chain {
chain := new(Chain)
if options.CleanFonts || options.SubsetFonts {
chain.Append(&CleanFonts{Subset: options.SubsetFonts})
}
if options.CleanContentstream {
chain.Append(new(CleanContentstream))
}
if options.ImageUpperPPI > 0 {
imageOptimizer := new(ImagePPI)
imageOptimizer.ImageUpperPPI = options.ImageUpperPPI

View File

@ -14,4 +14,7 @@ type Options struct {
UseObjectStreams bool
CombineIdenticalIndirectObjects bool
CompressStreams bool
CleanFonts bool
SubsetFonts bool
CleanContentstream bool
}

102
model/optimize/utils.go Normal file
View File

@ -0,0 +1,102 @@
/*
* This file is subject to the terms and conditions defined in
* file 'LICENSE.md', which is part of this source code package.
*/
package optimize
import (
"bytes"
"github.com/unidoc/unipdf/v3/core"
)
// objectStructure collects the key document-structure objects found by
// getObjectStructure: the catalog, the root pages dictionary, and the pages.
type objectStructure struct {
	catalogDict *core.PdfObjectDictionary // Catalog dictionary (/Type /Catalog).
	pagesDict *core.PdfObjectDictionary // Dictionary from the catalog's /Pages entry.
	pages []*core.PdfIndirectObject // Indirect page objects from /Kids.
}
// getObjectStructure identifies the Catalog and Pages dictionary and finds a list of pages.
func getObjectStructure(objects []core.PdfObject) objectStructure {
	var structure objectStructure

	// Locate the catalog dictionary among the indirect objects.
	for _, obj := range objects {
		ind, isIndirect := obj.(*core.PdfIndirectObject)
		if !isIndirect {
			continue
		}
		dict, hasDict := core.GetDict(ind)
		if !hasDict {
			continue
		}
		typeName, hasType := core.GetName(dict.Get("Type"))
		if !hasType {
			continue
		}
		if typeName.String() == "Catalog" {
			structure.catalogDict = dict
			break
		}
	}
	if structure.catalogDict == nil {
		// No catalog found - return what we have (nothing).
		return structure
	}

	pagesDict, hasPages := core.GetDict(structure.catalogDict.Get("Pages"))
	if !hasPages {
		return structure
	}
	structure.pagesDict = pagesDict

	kids, hasKids := core.GetArray(pagesDict.Get("Kids"))
	if !hasKids {
		return structure
	}
	for _, kid := range kids.Elements() {
		page, isIndirect := core.GetIndirect(kid)
		if !isIndirect {
			// Kids are expected to be indirect objects; stop collecting otherwise.
			break
		}
		structure.pages = append(structure.pages, page)
	}
	return structure
}
// getPageContents loads the page content stream as a string from a /Contents entry.
// Either a single stream, or an array of streams. Returns the list of objects that
// can be used if need to replace.
func getPageContents(contentsObj core.PdfObject) (contents string, objs []core.PdfObject) {
	var combined bytes.Buffer

	// Unwrap an indirect reference, remembering it for potential replacement.
	if ind, isIndirect := contentsObj.(*core.PdfIndirectObject); isIndirect {
		objs = append(objs, ind)
		contentsObj = ind.PdfObject
	}

	// appendStream decodes a single content stream and accumulates its data;
	// streams that fail to decode are silently skipped (best effort).
	appendStream := func(s *core.PdfObjectStream) {
		if decoded, err := core.DecodeStream(s); err == nil {
			combined.Write(decoded)
			objs = append(objs, s)
		}
	}

	switch v := contentsObj.(type) {
	case *core.PdfObjectStream:
		appendStream(v)
	case *core.PdfObjectArray:
		for _, element := range v.Elements() {
			if s, isStream := element.(*core.PdfObjectStream); isStream {
				appendStream(s)
			}
		}
	}
	return combined.String(), objs
}

View File

@ -281,42 +281,42 @@ func (r *PdfReader) newPdfPageFromDict(p *core.PdfObjectDictionary) (*PdfPage, e
// GetAnnotations returns the list of page annotations for `page`. If not loaded attempts to load the
// annotations, otherwise returns the loaded list.
func (page *PdfPage) GetAnnotations() ([]*PdfAnnotation, error) {
if page.annotations != nil {
return page.annotations, nil
func (p *PdfPage) GetAnnotations() ([]*PdfAnnotation, error) {
if p.annotations != nil {
return p.annotations, nil
}
if page.Annots == nil {
page.annotations = []*PdfAnnotation{}
if p.Annots == nil {
p.annotations = []*PdfAnnotation{}
return nil, nil
}
if page.reader == nil {
page.annotations = []*PdfAnnotation{}
if p.reader == nil {
p.annotations = []*PdfAnnotation{}
return nil, nil
}
annots, err := page.reader.loadAnnotations(page.Annots)
annots, err := p.reader.loadAnnotations(p.Annots)
if err != nil {
return nil, err
}
if annots == nil {
page.annotations = []*PdfAnnotation{}
p.annotations = []*PdfAnnotation{}
}
page.annotations = annots
return page.annotations, nil
p.annotations = annots
return p.annotations, nil
}
// AddAnnotation appends `annot` to the list of page annotations.
func (page *PdfPage) AddAnnotation(annot *PdfAnnotation) {
if page.annotations == nil {
page.GetAnnotations() // Ensure has been loaded.
func (p *PdfPage) AddAnnotation(annot *PdfAnnotation) {
if p.annotations == nil {
p.GetAnnotations() // Ensure has been loaded.
}
page.annotations = append(page.annotations, annot)
p.annotations = append(p.annotations, annot)
}
// SetAnnotations sets the annotations list.
func (page *PdfPage) SetAnnotations(annotations []*PdfAnnotation) {
page.annotations = annotations
func (p *PdfPage) SetAnnotations(annotations []*PdfAnnotation) {
p.annotations = annotations
}
// loadAnnotations loads and returns the PDF annotations from the input annotations object (array).