diff --git a/Jenkinsfile b/Jenkinsfile index f30b0aa0..9b08e69f 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -1,13 +1,10 @@ node { // Install the desired Go version - def root = tool name: 'go 1.11.5', type: 'go' + def root = tool name: 'go 1.14.3', type: 'go' env.GOROOT="${root}" - env.GOPATH="${WORKSPACE}/gopath" - // Hack for 1.11.5 testing work. - env.CGO_ENABLED="0" - env.PATH="${root}/bin:${env.GOPATH}/bin:${env.PATH}" - env.GOCACHE="off" + env.GOBIN="${WORKSPACE}/bin" + env.PATH="${root}/bin:${env.GOBIN}:${env.PATH}" env.UNIDOC_EXTRACT_FORCETEST="1" env.UNIDOC_E2E_FORCE_TESTS="1" env.UNIDOC_EXTRACT_TESTDATA="/home/jenkins/corpus/unidoc-extractor-testdata" @@ -19,13 +16,13 @@ node { env.UNIDOC_JBIG2_TESTDATA="/home/jenkins/corpus/jbig2-testdata" env.UNIDOC_FDFMERGE_TESTDATA="/home/jenkins/corpus/fdfmerge-testdata" env.UNIDOC_GS_BIN_PATH="/usr/bin/gs" - // Hack for 1.11.5 testing work. env.CGO_ENABLED="0" env.TMPDIR="${WORKSPACE}/temp" + sh "mkdir -p ${env.GOBIN}" sh "mkdir -p ${env.TMPDIR}" - dir("${GOPATH}/src/github.com/unidoc/unipdf") { + dir("${WORKSPACE}/unipdf") { sh 'go version' stage('Checkout') { @@ -35,11 +32,9 @@ node { stage('Prepare') { // Get linter and other build tools. - sh 'go get -u golang.org/x/lint/golint' + sh 'go get golang.org/x/lint/golint' sh 'go get github.com/tebeka/go2xunit' sh 'go get github.com/t-yuki/gocover-cobertura' - // Get all dependencies (for tests also). - sh 'go get -t ./...' } stage('Linting') { @@ -53,7 +48,7 @@ node { stage('Testing') { // Go test - No tolerance. sh "rm -f ${env.TMPDIR}/*.pdf" - sh '2>&1 go test -v ./... | tee gotest.txt' + sh '2>&1 go test -count=1 -v ./... | tee gotest.txt' } stage('Check generated PDFs') { @@ -62,7 +57,7 @@ node { } stage('Test coverage') { - sh 'go test -coverprofile=coverage.out -covermode=atomic -coverpkg=./... ./...' + sh 'go test -count=1 -coverprofile=coverage.out -covermode=atomic -coverpkg=./... ./...' sh '/home/jenkins/codecov.sh' sh 'gocover-cobertura < coverage.out > coverage.xml' step([$class: 'CoberturaPublisher', coberturaReportFile: 'coverage.xml']) @@ -80,7 +75,7 @@ node { } } - dir("${GOPATH}/src/github.com/unidoc/unipdf-examples") { + dir("${WORKSPACE}/unipdf-examples") { stage('Build examples') { // Output environment variables (useful for debugging). sh("printenv") @@ -97,6 +92,9 @@ node { echo "Pulling unipdf-examples on branch ${examplesBranch}" git url: 'https://github.com/unidoc/unidoc-examples.git', branch: examplesBranch + + // Use replace directive to use disk version of unipdf. + sh 'echo "replace github.com/unidoc/unipdf/v3 => ../unipdf" >>go.mod' // Dependencies for examples. sh './build_examples.sh' diff --git a/core/encoding.go b/core/encoding.go index 474fc7b2..cc5baf5c 100644 --- a/core/encoding.go +++ b/core/encoding.go @@ -948,7 +948,6 @@ func newDCTEncoderFromStream(streamObj *PdfObjectStream, multiEnc *MultiEncoder) return nil, err } encoded = e - } bufReader := bytes.NewReader(encoded) @@ -2158,6 +2157,9 @@ func newMultiEncoderFromStream(streamObj *PdfObjectStream) (*MultiEncoder, error // GetFilterName returns the names of the underlying encoding filters, // separated by spaces. +// Note: This is just a string, should not be used in /Filter dictionary entry. Use GetFilterArray for that. +// TODO(v4): Refactor to GetFilter() which can be used for /Filter (either Name or Array), this can be +// renamed to String() as a pretty string to use in debugging etc. func (enc *MultiEncoder) GetFilterName() string { name := "" for idx, encoder := range enc.encoders { @@ -2169,6 +2171,16 @@ func (enc *MultiEncoder) GetFilterName() string { return name } +// GetFilterArray returns the names of the underlying encoding filters in an array that +// can be used as /Filter entry. +func (enc *MultiEncoder) GetFilterArray() *PdfObjectArray { + names := make([]PdfObject, len(enc.encoders)) + for i, e := range enc.encoders { + names[i] = MakeName(e.GetFilterName()) + } + return MakeArray(names...) +} + // MakeDecodeParams makes a new instance of an encoding dictionary based on // the current encoder settings. func (enc *MultiEncoder) MakeDecodeParams() PdfObject { @@ -2201,12 +2213,7 @@ func (enc *MultiEncoder) AddEncoder(encoder StreamEncoder) { // MakeStreamDict makes a new instance of an encoding dictionary for a stream object. func (enc *MultiEncoder) MakeStreamDict() *PdfObjectDictionary { dict := MakeDict() - - names := make([]PdfObject, len(enc.encoders)) - for i, e := range enc.encoders { - names[i] = MakeName(e.GetFilterName()) - } - dict.Set("Filter", MakeArray(names...)) + dict.Set("Filter", enc.GetFilterArray()) // Pass all values from children, except Filter and DecodeParms. for _, encoder := range enc.encoders { diff --git a/extractor/extractor.go b/extractor/extractor.go index 152d834e..0441ce58 100644 --- a/extractor/extractor.go +++ b/extractor/extractor.go @@ -42,9 +42,14 @@ func New(page *model.PdfPage) (*Extractor, error) { // fmt.Printf("%s\n", contents) // fmt.Println("========================= ::: =========================") + return NewFromContents(contents, page.Resources) +} + +// NewFromContents creates a new extractor from contents and page resources. +func NewFromContents(contents string, resources *model.PdfPageResources) (*Extractor, error) { e := &Extractor{ contents: contents, - resources: page.Resources, + resources: resources, fontCache: map[string]fontEntry{}, formResults: map[string]textResult{}, } diff --git a/extractor/text.go b/extractor/text.go index 659b3051..e38e4874 100644 --- a/extractor/text.go +++ b/extractor/text.go @@ -439,7 +439,11 @@ func (to *textObject) showTextAdjusted(args *core.PdfObjectArray) error { common.Log.Trace("showTextAdjusted: Bad string arg. o=%s args=%+v", o, args) return core.ErrTypeError } - to.renderText(charcodes) + err := to.renderText(charcodes) + if err != nil { + common.Log.Debug("Render text error: %v", err) + return err + } default: common.Log.Debug("ERROR: showTextAdjusted. Unexpected type (%T) args=%+v", o, args) return core.ErrTypeError @@ -736,6 +740,7 @@ func (to *textObject) renderText(data []byte) error { continue } + // TODO(gunnsth): Assuming 1:1 charcode[i] <-> rune[i] mapping. code := charcodes[i] // The location of the text on the page in device coordinates is given by trm, the text // rendering matrix. @@ -785,6 +790,8 @@ func (to *textObject) renderText(data []byte) error { } else if font.Encoder() == nil { common.Log.Debug("ERROR: No encoding. font=%s", font) } else { + // TODO: This lookup seems confusing. Went from bytes <-> charcodes already. + // NOTE: This is needed to register runes by the font encoder - for subsetting (optimization). original, ok := font.Encoder().CharcodeToRune(code) if ok { mark.original = string(original) diff --git a/extractor/text_test.go b/extractor/text_test.go index cdfe47a9..89b920f3 100644 --- a/extractor/text_test.go +++ b/extractor/text_test.go @@ -51,9 +51,7 @@ var doStress bool func init() { flag.BoolVar(&doStress, "extractor-stresstest", false, "Run text extractor stress tests.") common.SetLogger(common.NewConsoleLogger(common.LogLevelInfo)) - if flag.Lookup("test.v") != nil { - isTesting = true - } + isTesting = true } // TestTextExtractionFragments tests text extraction on the PDF fragments in `fragmentTests`. diff --git a/go.mod b/go.mod index de6add69..753d2d89 100644 --- a/go.mod +++ b/go.mod @@ -5,12 +5,15 @@ go 1.11 require ( github.com/adrg/sysfont v0.1.0 github.com/boombuler/barcode v1.0.0 + github.com/davecgh/go-spew v1.1.1 github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 + github.com/sirupsen/logrus v1.6.0 github.com/stretchr/testify v1.4.0 github.com/unidoc/pkcs7 v0.0.0-20200411230602-d883fd70d1df github.com/unidoc/timestamp v0.0.0-20200412005513-91597fd3793a - github.com/unidoc/unitype v0.1.0 + github.com/unidoc/unitype v0.2.0 golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5 golang.org/x/image v0.0.0-20181116024801-cd38e8056d9b + golang.org/x/sys v0.0.0-20200615200032-f1bc736245b1 // indirect golang.org/x/text v0.3.2 ) diff --git a/go.sum b/go.sum index 4552b823..2910c898 100644 --- a/go.sum +++ b/go.sum @@ -15,6 +15,8 @@ github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGw github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= github.com/konsorten/go-windows-terminal-sequences v1.0.2 h1:DB17ag19krx9CFsz4o3enTrPXyIXCl+2iCXH/aMAp9s= github.com/konsorten/go-windows-terminal-sequences v1.0.2/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= +github.com/konsorten/go-windows-terminal-sequences v1.0.3 h1:CE8S1cTafDpPvMhIxNJKvHsGVBgn1xWYf1NbHQhywc8= +github.com/konsorten/go-windows-terminal-sequences v1.0.3/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI= github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= @@ -24,6 +26,8 @@ github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZb github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/sirupsen/logrus v1.5.0 h1:1N5EYkVAPEywqZRJd7cwnRtCb6xJx7NH3T3WUTF980Q= github.com/sirupsen/logrus v1.5.0/go.mod h1:+F7Ogzej0PZc/94MaYx/nvG9jOFMD2osvC3s+Squfpo= +github.com/sirupsen/logrus v1.6.0 h1:UBcNElsrwanuuMsnGSlYmtmgbb23qDR5dG+6X6Oo89I= +github.com/sirupsen/logrus v1.6.0/go.mod h1:7uNnSEd1DgxDLC74fIahvMZmmYsHGZGEOFrfsX/uA88= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= github.com/stretchr/testify v1.4.0 h1:2E4SXV/wtOkTonXsotYi4li6zVWxYlZuYNCXe9XRJyk= @@ -34,6 +38,10 @@ github.com/unidoc/timestamp v0.0.0-20200412005513-91597fd3793a h1:RLtvUhe4DsUDl6 github.com/unidoc/timestamp v0.0.0-20200412005513-91597fd3793a/go.mod h1:j+qMWZVpZFTvDey3zxUkSgPJZEX33tDgU/QIA0IzCUw= github.com/unidoc/unitype v0.1.0 h1:6zJYMl8XdwFBD45Cmg8Ge13WyE92jwLuK1tk2IsRb9s= github.com/unidoc/unitype v0.1.0/go.mod h1:mafyug7zYmDOusqa7G0dJV45qp4b6TDAN+pHN7ZUIBU= +github.com/unidoc/unitype v0.1.1-0.20200524232639-77d42b645b02 h1:zVMJh0ehLc0amGBcqIh7HWikIGXGBGpmW+Lvz1YVYH8= +github.com/unidoc/unitype v0.1.1-0.20200524232639-77d42b645b02/go.mod h1:mafyug7zYmDOusqa7G0dJV45qp4b6TDAN+pHN7ZUIBU= +github.com/unidoc/unitype v0.2.0 h1:N+ZKjwz8UDU0qa1IYzstDLffvQEctFo+bo6b6ZqW+9M= +github.com/unidoc/unitype v0.2.0/go.mod h1:mafyug7zYmDOusqa7G0dJV45qp4b6TDAN+pHN7ZUIBU= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5 h1:58fnuSXlxZmFdJyvtTFVmVhcMLU6v5fEb/ok4wyqtNU= golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= @@ -45,6 +53,8 @@ golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20190422165155-953cdadca894/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200413165638-669c56c373c4 h1:opSr2sbRXk5X5/givKrrKj9HXxFpW2sdCiP8MJSKLQY= golang.org/x/sys v0.0.0-20200413165638-669c56c373c4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200615200032-f1bc736245b1 h1:ogLJMz+qpzav7lGMh10LMvAkM/fAoGlaiiHYiFYdm80= +golang.org/x/sys v0.0.0-20200615200032-f1bc736245b1/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/text v0.3.0 h1:g61tztE5qeGQ89tm6NTjjM9VPIm088od1l6aSorWRWg= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.2 h1:tW2bmiBqwgJj/UpqtC8EpXEZVYOwU0yG4iWbprSVAcs= diff --git a/internal/textencoding/identity.go b/internal/textencoding/identity.go index 10877558..ccfdfeea 100644 --- a/internal/textencoding/identity.go +++ b/internal/textencoding/identity.go @@ -13,47 +13,81 @@ import ( "github.com/unidoc/unipdf/v3/core" ) -// IdentityEncoder represents an 2-byte identity encoding +// IdentityEncoder represents an 2-byte identity encoding. +// NOTE: In many cases this is just used to encode/decode to glyph index and does not have a unicode +// meaning, except via the ToUnicode maps. +// TODO: The use of runes as indicators for glyph indices and not-utf8 runes is not good and confusing. +// Might be better to combine the Identity encoder with a ToUnicode map and keep track of the actual +// runes and character codes, CMaps together. type IdentityEncoder struct { baseName string + + // runes registered by encoder for tracking what runes are used for subsetting. + registeredMap map[rune]struct{} } // NewIdentityTextEncoder returns a new IdentityEncoder based on predefined // encoding `baseName` and difference map `differences`. -func NewIdentityTextEncoder(baseName string) IdentityEncoder { - return IdentityEncoder{baseName} +func NewIdentityTextEncoder(baseName string) *IdentityEncoder { + return &IdentityEncoder{ + baseName: baseName, + } +} + +// RegisteredRunes returns the slice of runes that have been registered as used by the encoder. +func (enc *IdentityEncoder) RegisteredRunes() []rune { + runes := make([]rune, len(enc.registeredMap)) + i := 0 + for r := range enc.registeredMap { + runes[i] = r + i++ + } + return runes } // String returns a string that describes `enc`. -func (enc IdentityEncoder) String() string { +func (enc *IdentityEncoder) String() string { return enc.baseName } // Encode converts the Go unicode string to a PDF encoded string. -func (enc IdentityEncoder) Encode(str string) []byte { +func (enc *IdentityEncoder) Encode(str string) []byte { return encodeString16bit(enc, str) } // Decode converts PDF encoded string to a Go unicode string. -func (enc IdentityEncoder) Decode(raw []byte) string { +func (enc *IdentityEncoder) Decode(raw []byte) string { return decodeString16bit(enc, raw) } // RuneToCharcode converts rune `r` to a PDF character code. // The bool return flag is true if there was a match, and false otherwise. -func (enc IdentityEncoder) RuneToCharcode(r rune) (CharCode, bool) { +// TODO: Here the `r` is an actual rune. +func (enc *IdentityEncoder) RuneToCharcode(r rune) (CharCode, bool) { + if enc.registeredMap == nil { + enc.registeredMap = map[rune]struct{}{} + } + enc.registeredMap[r] = struct{}{} // Register use (subsetting). + return CharCode(r), true } // CharcodeToRune converts PDF character code `code` to a rune. // The bool return flag is true if there was a match, and false otherwise. -func (enc IdentityEncoder) CharcodeToRune(code CharCode) (rune, bool) { +// TODO: Here the `r` is not necessarily an actual rune but a glyph index (unless both). +func (enc *IdentityEncoder) CharcodeToRune(code CharCode) (rune, bool) { + if enc.registeredMap == nil { + enc.registeredMap = map[rune]struct{}{} + } + + // TODO: The rune(code) is confusing and is not an actual utf8 rune. + enc.registeredMap[rune(code)] = struct{}{} return rune(code), true } // RuneToGlyph returns the glyph name for rune `r`. // The bool return flag is true if there was a match, and false otherwise. -func (enc IdentityEncoder) RuneToGlyph(r rune) (GlyphName, bool) { +func (enc *IdentityEncoder) RuneToGlyph(r rune) (GlyphName, bool) { if r == ' ' { return "space", true } @@ -63,7 +97,7 @@ func (enc IdentityEncoder) RuneToGlyph(r rune) (GlyphName, bool) { // GlyphToRune returns the rune corresponding to glyph name `glyph`. // The bool return flag is true if there was a match, and false otherwise. -func (enc IdentityEncoder) GlyphToRune(glyph GlyphName) (rune, bool) { +func (enc *IdentityEncoder) GlyphToRune(glyph GlyphName) (rune, bool) { // String with "uniXXXX" format where XXXX is the hexcode. if glyph == "space" { return ' ', true @@ -78,7 +112,7 @@ func (enc IdentityEncoder) GlyphToRune(glyph GlyphName) (rune, bool) { } // ToPdfObject returns a nil as it is not truly a PDF object and should not be attempted to store in file. -func (enc IdentityEncoder) ToPdfObject() core.PdfObject { +func (enc *IdentityEncoder) ToPdfObject() core.PdfObject { if enc.baseName != "" { return core.MakeName(enc.baseName) } diff --git a/internal/textencoding/simple.go b/internal/textencoding/simple.go index cd2f1061..2ddd385c 100644 --- a/internal/textencoding/simple.go +++ b/internal/textencoding/simple.go @@ -103,6 +103,9 @@ type simpleEncoding struct { // one byte encoding: CharCode <-> byte encode map[rune]byte decode map[byte]rune + + // runes registered by encoder for tracking what runes are used for subsetting. + registeredMap map[rune]struct{} } // Encode converts the Go unicode string to a PDF encoded string. @@ -213,6 +216,10 @@ func (enc *simpleEncoding) Charcodes() []CharCode { func (enc *simpleEncoding) RuneToCharcode(r rune) (CharCode, bool) { b, ok := enc.encode[r] + if enc.registeredMap == nil { + enc.registeredMap = map[rune]struct{}{} + } + enc.registeredMap[r] = struct{}{} // Register use (subsetting). return CharCode(b), ok } @@ -222,6 +229,10 @@ func (enc *simpleEncoding) CharcodeToRune(code CharCode) (rune, bool) { } b := byte(code) r, ok := enc.decode[b] + if enc.registeredMap == nil { + enc.registeredMap = map[rune]struct{}{} + } + enc.registeredMap[r] = struct{}{} // Register use (subsetting). return r, ok } diff --git a/model/annotations.go b/model/annotations.go index 49450818..dc3766fe 100644 --- a/model/annotations.go +++ b/model/annotations.go @@ -103,31 +103,31 @@ type PdfAnnotationLink struct { } // GetAction returns the PDF action for the annotation link. -func (a *PdfAnnotationLink) GetAction() (*PdfAction, error) { - if a.action != nil { - return a.action, nil +func (link *PdfAnnotationLink) GetAction() (*PdfAction, error) { + if link.action != nil { + return link.action, nil } - if a.A == nil { + if link.A == nil { return nil, nil } - if a.reader == nil { + if link.reader == nil { return nil, nil } - action, err := a.reader.loadAction(a.A) + action, err := link.reader.loadAction(link.A) if err != nil { return nil, err } - a.action = action + link.action = action - return a.action, nil + return link.action, nil } // SetAction sets the PDF action for the annotation link. -func (a *PdfAnnotationLink) SetAction(action *PdfAction) { - a.action = action +func (link *PdfAnnotationLink) SetAction(action *PdfAction) { + link.action = action if action == nil { - a.A = nil + link.A = nil } } diff --git a/model/font.go b/model/font.go index 5a8cb2fb..dcf2e44c 100644 --- a/model/font.go +++ b/model/font.go @@ -50,6 +50,7 @@ func (font *PdfFont) SubsetRegistered() error { case *pdfFontType0: err := t.subsetRegistered() if err != nil { + common.Log.Debug("Subset error: %v", err) return err } if t.container != nil { @@ -401,6 +402,7 @@ func (font *PdfFont) BytesToCharcodes(data []byte) []textencoding.CharCode { charcodes := make([]textencoding.CharCode, 0, len(data)+len(data)%2) if font.baseFields().isCIDFont() { + // Identity only? if len(data) == 1 { data = []byte{0, data[0]} } @@ -413,6 +415,7 @@ func (font *PdfFont) BytesToCharcodes(data []byte) []textencoding.CharCode { charcodes = append(charcodes, textencoding.CharCode(b)) } } else { + // Simple font: byte -> charcode. for _, b := range data { charcodes = append(charcodes, textencoding.CharCode(b)) } @@ -755,8 +758,7 @@ func (base fontCommon) isCIDFont() bool { // newFontBaseFieldsFromPdfObject returns `fontObj` as a dictionary the common fields from that // dictionary in the fontCommon return. If there is a problem an error is returned. // The fontCommon is the group of fields common to all PDF fonts. -func newFontBaseFieldsFromPdfObject(fontObj core.PdfObject) (*core.PdfObjectDictionary, *fontCommon, - error) { +func newFontBaseFieldsFromPdfObject(fontObj core.PdfObject) (*core.PdfObjectDictionary, *fontCommon, error) { font := &fontCommon{} if obj, ok := fontObj.(*core.PdfIndirectObject); ok { diff --git a/model/font_composite.go b/model/font_composite.go index 23d69df9..829d2036 100644 --- a/model/font_composite.go +++ b/model/font_composite.go @@ -127,6 +127,9 @@ func (font *pdfFontType0) baseFields() *fontCommon { } func (font *pdfFontType0) getFontDescriptor() *PdfFontDescriptor { + if font.fontDescriptor == nil && font.DescendantFont != nil { + return font.DescendantFont.FontDescriptor() + } return font.fontDescriptor } @@ -210,14 +213,19 @@ func (font *pdfFontType0) subsetRegistered() error { common.Log.Debug("Missing font descriptor") return nil } + if font.encoder == nil { + common.Log.Debug("No encoder - subsetting ignored") + return nil + } stream, ok := core.GetStream(cidfnt.fontDescriptor.FontFile2) if !ok { - common.Log.Debug("Embedded font object not found -- ABORT subsseting") + common.Log.Debug("Embedded font object not found -- ABORT subsetting") return errors.New("fontfile2 not found") } decoded, err := core.DecodeStream(stream) if err != nil { + common.Log.Debug("Decode error: %v", err) return err } @@ -227,21 +235,52 @@ func (font *pdfFontType0) subsetRegistered() error { return err } - tenc, ok := font.encoder.(*textencoding.TrueTypeFontEncoder) - if !ok { - return fmt.Errorf("unsupported encoder for subsetting: %T", cidfnt.encoder) + var runes []rune + var subset *unitype.Font + switch tenc := font.encoder.(type) { + case *textencoding.TrueTypeFontEncoder: + // Means the font has been loaded from TTF file. + runes = tenc.RegisteredRunes() + subset, err = fnt.SubsetKeepRunes(runes) + if err != nil { + common.Log.Debug("ERROR: %v", err) + return err + } + // Reduce the encoder also. + tenc.SubsetRegistered() + case *textencoding.IdentityEncoder: + // IdentityEncoder typically means font was parsed from PDF file. + // TODO: These are not actual runes... but glyph ids ? Very confusing. + runes = tenc.RegisteredRunes() + indices := make([]unitype.GlyphIndex, len(runes)) + for i, r := range runes { + indices[i] = unitype.GlyphIndex(r) + } + + subset, err = fnt.SubsetKeepIndices(indices) + if err != nil { + common.Log.Debug("ERROR: %v", err) + return err + } + case textencoding.SimpleEncoder: + // Simple encoding, bytes are 0-255 + charcodes := tenc.Charcodes() + for _, c := range charcodes { + r, ok := tenc.CharcodeToRune(c) + if !ok { + common.Log.Debug("ERROR: unable convert charcode to rune: %d", c) + continue + } + runes = append(runes, r) + } + default: + return fmt.Errorf("unsupported encoder for subsetting: %T", font.encoder) } - runes := tenc.RegisteredRunes() - subset, err := fnt.SubsetKeepRunes(runes) - if err != nil { - return err - } - // Reduce the encoder also. - tenc.SubsetRegistered() var buf bytes.Buffer err = subset.Write(&buf) if err != nil { + common.Log.Debug("ERROR: %v", err) return err } @@ -249,7 +288,7 @@ func (font *pdfFontType0) subsetRegistered() error { if font.toUnicodeCmap != nil { codeToUnicode := make(map[cmap.CharCode]rune, len(runes)) for _, r := range runes { - cc, ok := tenc.RuneToCharcode(r) + cc, ok := font.encoder.RuneToCharcode(r) if !ok { continue } @@ -260,9 +299,16 @@ func (font *pdfFontType0) subsetRegistered() error { stream, err = core.MakeStream(buf.Bytes(), core.NewFlateEncoder()) if err != nil { + common.Log.Debug("ERROR: %v", err) return err } - cidfnt.fontDescriptor.FontFile2 = stream + stream.Set("Length1", core.MakeInteger(int64(buf.Len()))) + if curstr, ok := core.GetStream(cidfnt.fontDescriptor.FontFile2); ok { + // Replace the current stream (keep same object). + *curstr = *stream + } else { + cidfnt.fontDescriptor.FontFile2 = stream + } // Set subset name. tag := genSubsetTag() @@ -334,6 +380,7 @@ func newPdfFontType0FromPdfObject(d *core.PdfObjectDictionary, base *fontCommon) encoderName, ok := core.GetNameVal(d.Get("Encoding")) if ok { + // TODO: Identity-H maps 16-bit character codes straight to glyph index (don't need actual runes). if encoderName == "Identity-H" || encoderName == "Identity-V" { font.encoder = textencoding.NewIdentityTextEncoder(encoderName) } else if cmap.IsPredefinedCMap(encoderName) { diff --git a/model/optimize/clean_contentstream.go b/model/optimize/clean_contentstream.go new file mode 100644 index 00000000..34cff469 --- /dev/null +++ b/model/optimize/clean_contentstream.go @@ -0,0 +1,139 @@ +/* + * This file is subject to the terms and conditions defined in + * file 'LICENSE.md', which is part of this source code package. + */ + +package optimize + +import ( + "github.com/unidoc/unipdf/v3/contentstream" + "github.com/unidoc/unipdf/v3/core" +) + +// CleanContentstream cleans up redundant operands in content streams, including Page and XObject Form +// contents. This process includes: +// 1. Marked content operators are removed. +// 2. Some operands are simplified (shorter form). +// TODO: Add more reduction methods and improving the methods for identifying unnecessary operands. +type CleanContentstream struct { +} + +// filterOps cleans up the content stream in `ops`: +// 1. Marked content operators are cleaned. +// 2. Tm with 1 0 0 1 params are converted to Td (slightly shorter for same transformation). +// TODO: Add operations that track the state and remove unnecessary operands, such as duplicates +// or ones setting default values, or ones not drawing anything. +func filterOps(ops *contentstream.ContentStreamOperations) *contentstream.ContentStreamOperations { + if ops == nil { + return nil + } + + filtered := contentstream.ContentStreamOperations{} + for _, op := range *ops { + switch op.Operand { + case "BDC", "BMC", "EMC": + continue + case "Tm": + if len(op.Params) == 6 { + if nums, err := core.GetNumbersAsFloat(op.Params); err == nil { + if nums[0] == 1 && nums[1] == 0 && nums[2] == 0 && nums[3] == 1 { + op = &contentstream.ContentStreamOperation{ + Params: []core.PdfObject{ + op.Params[4], + op.Params[5], + }, + Operand: "Td", + } + } + } + } + } + filtered = append(filtered, op) + } + return &filtered +} + +// reduceContent performs content stream optimization of contents in `cstream` which can either be +// from Page Contents or XObject Form. +// NOTE: If from a Contents array, the operations may be unbalanced. +func reduceContent(cstream *core.PdfObjectStream) error { + decoded, err := core.DecodeStream(cstream) + if err != nil { + return err + } + + csp := contentstream.NewContentStreamParser(string(decoded)) + ops, err := csp.Parse() + if err != nil { + return err + } + + ops = filterOps(ops) + cleaned := ops.Bytes() + if len(cleaned) >= len(decoded) { + // No need to replace if no improvement. + return nil + } + + newstream, err := core.MakeStream(ops.Bytes(), core.NewFlateEncoder()) + if err != nil { + return err + } + cstream.Stream = newstream.Stream + cstream.Merge(newstream.PdfObjectDictionary) + return nil +} + +// Optimize optimizes PDF objects to decrease PDF size. +func (c *CleanContentstream) Optimize(objects []core.PdfObject) (optimizedObjects []core.PdfObject, err error) { + // Track which content streams to process. + queuedMap := map[*core.PdfObjectStream]struct{}{} + var queued []*core.PdfObjectStream + appendQueue := func(stream *core.PdfObjectStream) { + if _, has := queuedMap[stream]; !has { + queuedMap[stream] = struct{}{} + queued = append(queued, stream) + } + } + + // Collect objects to process: XObject Form and Page Content streams. + for _, obj := range objects { + switch t := obj.(type) { + case *core.PdfIndirectObject: + switch ti := t.PdfObject.(type) { + case *core.PdfObjectDictionary: + if name, ok := core.GetName(ti.Get("Type")); !ok || name.String() != "Page" { + continue + } + + if stream, ok := core.GetStream(ti.Get("Contents")); ok { + appendQueue(stream) + } else if array, ok := core.GetArray(ti.Get("Contents")); ok { + for _, el := range array.Elements() { + if stream, ok := core.GetStream(el); ok { + appendQueue(stream) + } + } + } + } + case *core.PdfObjectStream: + if name, ok := core.GetName(t.Get("Type")); !ok || name.String() != "XObject" { + continue + } + if name, ok := core.GetName(t.Get("Subtype")); !ok || name.String() != "Form" { + continue + } + appendQueue(t) + } + } + + // Process the queued content streams. + for _, stream := range queued { + err = reduceContent(stream) + if err != nil { + return nil, err + } + } + + return objects, nil +} diff --git a/model/optimize/clean_fonts.go b/model/optimize/clean_fonts.go new file mode 100644 index 00000000..d0bfdccc --- /dev/null +++ b/model/optimize/clean_fonts.go @@ -0,0 +1,353 @@ +/* + * This file is subject to the terms and conditions defined in + * file 'LICENSE.md', which is part of this source code package. + */ + +package optimize + +import ( + "bytes" + "errors" + + "github.com/unidoc/unitype" + + "github.com/unidoc/unipdf/v3/common" + "github.com/unidoc/unipdf/v3/core" + "github.com/unidoc/unipdf/v3/extractor" + "github.com/unidoc/unipdf/v3/internal/textencoding" + "github.com/unidoc/unipdf/v3/model" +) + +// CleanFonts cleans up embedded fonts, reducing font sizes. +type CleanFonts struct { + // Subset embedded fonts if encountered (if true). + // Otherwise attempts to reduce the font program. + Subset bool +} + +func optimizeFontsWithSubsetting(objects []core.PdfObject) (processed map[*core.PdfObjectStream]struct{}, err error) { + // 1. Identify all fonts. + // 2. Identify content streams and their Resources dictionaries (both via page, forms and annotations). + // 3. Process content streams. + processed = map[*core.PdfObjectStream]struct{}{} + + fontMap := map[*model.PdfFont]struct{}{} + + objstr := getObjectStructure(objects) + for _, p := range objstr.pages { + pdict, ok := core.GetDict(p.PdfObject) + if !ok { + continue + } + resourcesDict, ok := core.GetDict(pdict.Get("Resources")) + if !ok { + continue + } + contents, _ := getPageContents(pdict.Get("Contents")) + resources, err := model.NewPdfPageResourcesFromDict(resourcesDict) + if err != nil { + return nil, err + } + + allContents := []content{ + { + content: contents, + resources: resources, + }, + } + + annotContents := getAnnotationContents(pdict.Get("Annots")) + if annotContents != nil { + allContents = append(allContents, annotContents...) + } + + for _, cont := range allContents { + e, err := extractor.NewFromContents(cont.content, cont.resources) + if err != nil { + return nil, err + } + + pt, _, _, err := e.ExtractPageText() + if err != nil { + return nil, err + } + + for _, el := range pt.Marks().Elements() { + if el.Font == nil { + continue + } + if _, has := fontMap[el.Font]; !has { + fontMap[el.Font] = struct{}{} + } + } + } + } + + // Map of font program stream to font. Multiple fonts can use the same font program. + fontFileMap := map[*core.PdfObjectStream][]*model.PdfFont{} + for font := range fontMap { + fontDesc := font.FontDescriptor() + if fontDesc == nil || fontDesc.FontFile2 == nil { + continue + } + stream, ok := core.GetStream(fontDesc.FontFile2) + if !ok { + continue + } + fontFileMap[stream] = append(fontFileMap[stream], font) + } + + for stream := range fontFileMap { + var allRunes []rune + var allIndices []unitype.GlyphIndex + + for _, font := range fontFileMap[stream] { + switch t := font.Encoder().(type) { + case *textencoding.IdentityEncoder: + // TODO: This terminology is wrong as those are not runes, just charcodes cast as runes. + // Identity encoder maps via 2-byte encoding directly from 2byte charcode to glyph index. + runes := t.RegisteredRunes() + indices := make([]unitype.GlyphIndex, len(runes)) + for i, r := range runes { + indices[i] = unitype.GlyphIndex(r) + } + allIndices = append(allIndices, indices...) + case *textencoding.TrueTypeFontEncoder: + runes := t.RegisteredRunes() + allRunes = append(allRunes, runes...) + case textencoding.SimpleEncoder: + charcodes := t.Charcodes() + for _, c := range charcodes { + r, ok := t.CharcodeToRune(c) + if !ok { + common.Log.Debug("Charcode<->rune not found: %d", c) + continue + } + allRunes = append(allRunes, r) + } + } + } + + err = subsetFontStream(stream, allRunes, allIndices) + if err != nil { + common.Log.Debug("ERROR subsetting font stream: %v", err) + return nil, err + } + processed[stream] = struct{}{} + } + return processed, nil +} + +// Subsets the font program in `stream` with the subset based on the `runes` and glyph `indices`. +func subsetFontStream(stream *core.PdfObjectStream, runes []rune, indices []unitype.GlyphIndex) error { + stream, ok := core.GetStream(stream) + if !ok { + common.Log.Debug("Embedded font object not found -- ABORT subsetting") + return errors.New("fontfile2 not found") + } + decoded, err := core.DecodeStream(stream) + if err != nil { + common.Log.Debug("Decode error: %v", err) + return err + } + + fnt, err := unitype.Parse(bytes.NewReader(decoded)) + if err != nil { + common.Log.Debug("Error parsing %d byte font", len(stream.Stream)) + return err + } + + allIndices := indices + if len(runes) > 0 { + indices := fnt.LookupRunes(runes) + allIndices = append(allIndices, indices...) + } + + fnt, err = fnt.SubsetKeepIndices(allIndices) + if err != nil { + common.Log.Debug("ERROR subsetting font: %v", err) + return err + } + + var buf bytes.Buffer + err = fnt.Write(&buf) + if err != nil { + common.Log.Debug("ERROR Writing font: %v", err) + return err + } + if buf.Len() > len(decoded) { + common.Log.Debug("Re-written font is larger than original - skip") + return nil + } + + newstream, err := core.MakeStream(buf.Bytes(), core.NewFlateEncoder()) + if err != nil { + common.Log.Debug("ERROR Writing font: %v", err) + return err + } + // Overwrite. + *stream = *newstream + stream.Set("Length1", core.MakeInteger(int64(buf.Len()))) + + return nil +} + +// Optimize optimizes PDF objects to decrease PDF size. +func (c *CleanFonts) Optimize(objects []core.PdfObject) (optimizedObjects []core.PdfObject, err error) { + var processed map[*core.PdfObjectStream]struct{} + if c.Subset { + var err error + processed, err = optimizeFontsWithSubsetting(objects) + if err != nil { + return nil, err + } + } + + // Clean font streams by loading and rewriting with minimal needed tables. + for _, obj := range objects { + stream, isStreamObj := core.GetStream(obj) + if !isStreamObj { + continue + } + if _, has := processed[stream]; has { + // Skip - has been processed. + continue + } + + encoder, err := core.NewEncoderFromStream(stream) + if err != nil { + common.Log.Debug("ERROR getting encoder: %v - ignoring", err) + continue + } + + decoded, err := encoder.DecodeStream(stream) + if err != nil { + common.Log.Debug("Decoding error : %v - ignoring", err) + continue + } + if len(decoded) < 4 { + continue + } + + version := string(decoded[:4]) + if version == "OTTO" { + // Fonts based on PostScript outlines not supported yet. + // See https://docs.microsoft.com/en-us/typography/opentype/spec/otff + continue + } + if version != "\x00\x01\x00\x00" && version != "true" { + continue + } + + fnt, err := unitype.Parse(bytes.NewReader(decoded)) + if err != nil { + common.Log.Debug("ERROR Parsing font: %v - ignoring", err) + continue + } + err = fnt.Optimize() + if err != nil { + continue + } + + var buf bytes.Buffer + err = fnt.Write(&buf) + if err != nil { + common.Log.Debug("ERROR Writing font: %v - ignoring", err) + continue + } + if buf.Len() > len(decoded) { + common.Log.Debug("Re-written font is larger than original - skip") + continue + } + + newstream, err := core.MakeStream(buf.Bytes(), core.NewFlateEncoder()) + if err != nil { + continue + } + // Overwrite. + *stream = *newstream + stream.Set("Length1", core.MakeInteger(int64(buf.Len()))) + } + return objects, nil +} + +// content describes page or font contents which is a content stream along with resources. +type content struct { + content string + resources *model.PdfPageResources +} + +// Best effort to get annotation contents. +func getAnnotationContents(annotsObj core.PdfObject) []content { + if annotsObj == nil { + return nil + } + annotsArr, ok := core.GetArray(annotsObj) + if !ok { + common.Log.Debug("Annots not an array") + return nil + } + + var annotContents []content + for _, obj := range annotsArr.Elements() { + annotDict, ok := core.GetDict(obj) + if !ok { + // Ignore any non dict elements. + common.Log.Debug("Ignoring non-dict element in Annots") + continue + } + + // Appearance. + appDict, ok := core.GetDict(annotDict.Get("AP")) + if !ok { + common.Log.Debug("No AP entry - skipping") + continue + } + + normal := core.TraceToDirectObject(appDict.Get("N")) + if normal == nil { + common.Log.Debug("No N entry - skipping") + continue + } + + var stream *core.PdfObjectStream + switch t := normal.(type) { + case *core.PdfObjectDictionary: + appState, ok := core.GetName(annotDict.Get("AS")) + if !ok { + common.Log.Debug("No AS entry - skipping") + continue + } + stream, ok = core.GetStream(t.Get(*appState)) + if !ok { + common.Log.Debug("Form not found - skipping") + continue + } + case *core.PdfObjectStream: + stream = t + } + if stream == nil { + common.Log.Debug("Form not found (nil) - skipping") + continue + } + + xform, err := model.NewXObjectFormFromStream(stream) + if err != nil { + common.Log.Debug("Error loading form: %v - ignoring", err) + continue + } + + contents, err := xform.GetContentStream() + if err != nil { + common.Log.Debug("Error decoding contents: %v", err) + continue + } + + annotContents = append(annotContents, content{ + content: string(contents), + resources: xform.Resources, + }) + } + + return annotContents +} diff --git a/model/optimize/compress_streams.go b/model/optimize/compress_streams.go index e6cd7f45..0beb521e 100644 --- a/model/optimize/compress_streams.go +++ b/model/optimize/compress_streams.go @@ -23,9 +23,17 @@ func (c *CompressStreams) Optimize(objects []core.PdfObject) (optimizedObjects [ if !isStreamObj { continue } - if _, found := core.GetName(stream.PdfObjectDictionary.Get("Filter")); found { - continue + // Skip objects that are already encoded. + // TODO: Try filter combinations, and ignoring inefficient filters. + if obj := stream.Get("Filter"); obj != nil { + if _, skip := core.GetName(obj); skip { + continue + } + if arr, ok := core.GetArray(obj); ok && arr.Len() > 0 { + continue + } } + encoder := core.NewFlateEncoder() // Most mainstream compressor and probably most robust. var data []byte data, err = encoder.EncodeBytes(stream.Stream) diff --git a/model/optimize/image.go b/model/optimize/image.go index 6917c100..3feeefa7 100644 --- a/model/optimize/image.go +++ b/model/optimize/image.go @@ -110,28 +110,51 @@ func (i *Image) Optimize(objects []core.PdfObject) (optimizedObjects []core.PdfO common.Log.Warning("Error decode the image stream %s") continue } - encoder := core.NewDCTEncoder() - encoder.ColorComponents = img.ColorComponents - encoder.Quality = i.ImageQuality - encoder.BitsPerComponent = img.BitsPerComponent - encoder.Width = img.Width - encoder.Height = img.Height - streamData, err := encoder.EncodeBytes(data) + dctenc := core.NewDCTEncoder() + dctenc.ColorComponents = img.ColorComponents + dctenc.Quality = i.ImageQuality + dctenc.BitsPerComponent = img.BitsPerComponent + dctenc.Width = img.Width + dctenc.Height = img.Height + streamData, err := dctenc.EncodeBytes(data) if err != nil { + common.Log.Debug("ERROR: %v", err) return nil, err } + + var filter core.StreamEncoder + filter = dctenc + + // Check if combining with FlateEncoding improves things further. + { + flate := core.NewFlateEncoder() + multienc := core.NewMultiEncoder() + multienc.AddEncoder(flate) + multienc.AddEncoder(dctenc) + + encoded, err := multienc.EncodeBytes(data) + if err != nil { + return nil, err + } + if len(encoded) < len(streamData) { + common.Log.Debug("Multi enc improves: %d to %d (orig %d)", + len(streamData), len(encoded), len(stream.Stream)) + streamData = encoded + filter = multienc + } + } + originalSize := len(stream.Stream) if originalSize < len(streamData) { + // Worse - ignoring. continue } newStream := &core.PdfObjectStream{Stream: streamData} newStream.PdfObjectReference = stream.PdfObjectReference newStream.PdfObjectDictionary = core.MakeDict() - newStream.PdfObjectDictionary.Merge(stream.PdfObjectDictionary) - fn := core.PdfObjectName(encoder.GetFilterName()) - newStream.PdfObjectDictionary.Set(core.PdfObjectName("Filter"), &fn) - ln := core.PdfObjectInteger(int64(len(streamData))) - newStream.PdfObjectDictionary.Set(core.PdfObjectName("Length"), &ln) + newStream.Merge(stream.PdfObjectDictionary) + newStream.Merge(filter.MakeStreamDict()) + newStream.Set("Length", core.MakeInteger(int64(len(streamData)))) replaceTable[stream] = newStream images[index].Stream = newStream } diff --git a/model/optimize/optimizer.go b/model/optimize/optimizer.go index a7b23d28..4ba1814e 100644 --- a/model/optimize/optimizer.go +++ b/model/optimize/optimizer.go @@ -12,6 +12,12 @@ import ( // New creates a optimizers chain from options. func New(options Options) *Chain { chain := new(Chain) + if options.CleanFonts || options.SubsetFonts { + chain.Append(&CleanFonts{Subset: options.SubsetFonts}) + } + if options.CleanContentstream { + chain.Append(new(CleanContentstream)) + } if options.ImageUpperPPI > 0 { imageOptimizer := new(ImagePPI) imageOptimizer.ImageUpperPPI = options.ImageUpperPPI diff --git a/model/optimize/options.go b/model/optimize/options.go index db024510..584de061 100644 --- a/model/optimize/options.go +++ b/model/optimize/options.go @@ -14,4 +14,7 @@ type Options struct { UseObjectStreams bool CombineIdenticalIndirectObjects bool CompressStreams bool + CleanFonts bool + SubsetFonts bool + CleanContentstream bool } diff --git a/model/optimize/utils.go b/model/optimize/utils.go new file mode 100644 index 00000000..f2389984 --- /dev/null +++ b/model/optimize/utils.go @@ -0,0 +1,102 @@ +/* + * This file is subject to the terms and conditions defined in + * file 'LICENSE.md', which is part of this source code package. + */ + +package optimize + +import ( + "bytes" + + "github.com/unidoc/unipdf/v3/core" +) + +type objectStructure struct { + catalogDict *core.PdfObjectDictionary + pagesDict *core.PdfObjectDictionary + pages []*core.PdfIndirectObject +} + +// getObjectStructure identifies the Catalog and Pages dictionary and finds a list of pages. +func getObjectStructure(objects []core.PdfObject) objectStructure { + objstr := objectStructure{} + found := false + for _, obj := range objects { + switch t := obj.(type) { + case *core.PdfIndirectObject: + dict, is := core.GetDict(t) + if !is { + continue + } + kind, is := core.GetName(dict.Get("Type")) + if !is { + continue + } + + switch kind.String() { + case "Catalog": + objstr.catalogDict = dict + found = true + } + } + if found { + break + } + } + + if !found { + return objstr + } + + pagesDict, ok := core.GetDict(objstr.catalogDict.Get("Pages")) + if !ok { + return objstr + } + objstr.pagesDict = pagesDict + + kids, ok := core.GetArray(pagesDict.Get("Kids")) + if !ok { + return objstr + } + for _, obj := range kids.Elements() { + pobj, ok := core.GetIndirect(obj) + if !ok { + break + } + objstr.pages = append(objstr.pages, pobj) + } + + return objstr +} + +// getPageContents loads the page content stream as a string from a /Contents entry. +// Either a single stream, or an array of streams. Returns the list of objects that +// can be used if need to replace. +func getPageContents(contentsObj core.PdfObject) (contents string, objs []core.PdfObject) { + var buf bytes.Buffer + + switch t := contentsObj.(type) { + case *core.PdfIndirectObject: + objs = append(objs, t) + contentsObj = t.PdfObject + } + + switch t := contentsObj.(type) { + case *core.PdfObjectStream: + if decoded, err := core.DecodeStream(t); err == nil { + buf.Write(decoded) + objs = append(objs, t) + } + case *core.PdfObjectArray: + for _, elobj := range t.Elements() { + switch el := elobj.(type) { + case *core.PdfObjectStream: + if decoded, err := core.DecodeStream(el); err == nil { + buf.Write(decoded) + objs = append(objs, el) + } + } + } + } + return buf.String(), objs +} diff --git a/model/page.go b/model/page.go index a541784e..d10e4016 100644 --- a/model/page.go +++ b/model/page.go @@ -281,42 +281,42 @@ func (r *PdfReader) newPdfPageFromDict(p *core.PdfObjectDictionary) (*PdfPage, e // GetAnnotations returns the list of page annotations for `page`. If not loaded attempts to load the // annotations, otherwise returns the loaded list. -func (page *PdfPage) GetAnnotations() ([]*PdfAnnotation, error) { - if page.annotations != nil { - return page.annotations, nil +func (p *PdfPage) GetAnnotations() ([]*PdfAnnotation, error) { + if p.annotations != nil { + return p.annotations, nil } - if page.Annots == nil { - page.annotations = []*PdfAnnotation{} + if p.Annots == nil { + p.annotations = []*PdfAnnotation{} return nil, nil } - if page.reader == nil { - page.annotations = []*PdfAnnotation{} + if p.reader == nil { + p.annotations = []*PdfAnnotation{} return nil, nil } - annots, err := page.reader.loadAnnotations(page.Annots) + annots, err := p.reader.loadAnnotations(p.Annots) if err != nil { return nil, err } if annots == nil { - page.annotations = []*PdfAnnotation{} + p.annotations = []*PdfAnnotation{} } - page.annotations = annots - return page.annotations, nil + p.annotations = annots + return p.annotations, nil } // AddAnnotation appends `annot` to the list of page annotations. -func (page *PdfPage) AddAnnotation(annot *PdfAnnotation) { - if page.annotations == nil { - page.GetAnnotations() // Ensure has been loaded. +func (p *PdfPage) AddAnnotation(annot *PdfAnnotation) { + if p.annotations == nil { + p.GetAnnotations() // Ensure has been loaded. } - page.annotations = append(page.annotations, annot) + p.annotations = append(p.annotations, annot) } // SetAnnotations sets the annotations list. -func (page *PdfPage) SetAnnotations(annotations []*PdfAnnotation) { - page.annotations = annotations +func (p *PdfPage) SetAnnotations(annotations []*PdfAnnotation) { + p.annotations = annotations } // loadAnnotations loads and returns the PDF annotations from the input annotations object (array).