mirror of
https://github.com/unidoc/unipdf.git
synced 2025-04-26 13:48:55 +08:00
Merge branch 'development' of https://github.com/unidoc/unipdf into development
This commit is contained in:
commit
dbd2364470
26
Jenkinsfile
vendored
26
Jenkinsfile
vendored
@ -1,13 +1,10 @@
|
||||
node {
|
||||
// Install the desired Go version
|
||||
def root = tool name: 'go 1.11.5', type: 'go'
|
||||
def root = tool name: 'go 1.14.3', type: 'go'
|
||||
|
||||
env.GOROOT="${root}"
|
||||
env.GOPATH="${WORKSPACE}/gopath"
|
||||
// Hack for 1.11.5 testing work.
|
||||
env.CGO_ENABLED="0"
|
||||
env.PATH="${root}/bin:${env.GOPATH}/bin:${env.PATH}"
|
||||
env.GOCACHE="off"
|
||||
env.GOBIN="${WORKSPACE}/bin"
|
||||
env.PATH="${root}/bin:${env.GOBIN}:${env.PATH}"
|
||||
env.UNIDOC_EXTRACT_FORCETEST="1"
|
||||
env.UNIDOC_E2E_FORCE_TESTS="1"
|
||||
env.UNIDOC_EXTRACT_TESTDATA="/home/jenkins/corpus/unidoc-extractor-testdata"
|
||||
@ -19,13 +16,13 @@ node {
|
||||
env.UNIDOC_JBIG2_TESTDATA="/home/jenkins/corpus/jbig2-testdata"
|
||||
env.UNIDOC_FDFMERGE_TESTDATA="/home/jenkins/corpus/fdfmerge-testdata"
|
||||
env.UNIDOC_GS_BIN_PATH="/usr/bin/gs"
|
||||
// Hack for 1.11.5 testing work.
|
||||
env.CGO_ENABLED="0"
|
||||
|
||||
env.TMPDIR="${WORKSPACE}/temp"
|
||||
sh "mkdir -p ${env.GOBIN}"
|
||||
sh "mkdir -p ${env.TMPDIR}"
|
||||
|
||||
dir("${GOPATH}/src/github.com/unidoc/unipdf") {
|
||||
dir("${WORKSPACE}/unipdf") {
|
||||
sh 'go version'
|
||||
|
||||
stage('Checkout') {
|
||||
@ -35,11 +32,9 @@ node {
|
||||
|
||||
stage('Prepare') {
|
||||
// Get linter and other build tools.
|
||||
sh 'go get -u golang.org/x/lint/golint'
|
||||
sh 'go get golang.org/x/lint/golint'
|
||||
sh 'go get github.com/tebeka/go2xunit'
|
||||
sh 'go get github.com/t-yuki/gocover-cobertura'
|
||||
// Get all dependencies (for tests also).
|
||||
sh 'go get -t ./...'
|
||||
}
|
||||
|
||||
stage('Linting') {
|
||||
@ -53,7 +48,7 @@ node {
|
||||
stage('Testing') {
|
||||
// Go test - No tolerance.
|
||||
sh "rm -f ${env.TMPDIR}/*.pdf"
|
||||
sh '2>&1 go test -v ./... | tee gotest.txt'
|
||||
sh '2>&1 go test -count=1 -v ./... | tee gotest.txt'
|
||||
}
|
||||
|
||||
stage('Check generated PDFs') {
|
||||
@ -62,7 +57,7 @@ node {
|
||||
}
|
||||
|
||||
stage('Test coverage') {
|
||||
sh 'go test -coverprofile=coverage.out -covermode=atomic -coverpkg=./... ./...'
|
||||
sh 'go test -count=1 -coverprofile=coverage.out -covermode=atomic -coverpkg=./... ./...'
|
||||
sh '/home/jenkins/codecov.sh'
|
||||
sh 'gocover-cobertura < coverage.out > coverage.xml'
|
||||
step([$class: 'CoberturaPublisher', coberturaReportFile: 'coverage.xml'])
|
||||
@ -80,7 +75,7 @@ node {
|
||||
}
|
||||
}
|
||||
|
||||
dir("${GOPATH}/src/github.com/unidoc/unipdf-examples") {
|
||||
dir("${WORKSPACE}/unipdf-examples") {
|
||||
stage('Build examples') {
|
||||
// Output environment variables (useful for debugging).
|
||||
sh("printenv")
|
||||
@ -97,6 +92,9 @@ node {
|
||||
|
||||
echo "Pulling unipdf-examples on branch ${examplesBranch}"
|
||||
git url: 'https://github.com/unidoc/unidoc-examples.git', branch: examplesBranch
|
||||
|
||||
// Use replace directive to use disk version of unipdf.
|
||||
sh 'echo "replace github.com/unidoc/unipdf/v3 => ../unipdf" >>go.mod'
|
||||
|
||||
// Dependencies for examples.
|
||||
sh './build_examples.sh'
|
||||
|
@ -948,7 +948,6 @@ func newDCTEncoderFromStream(streamObj *PdfObjectStream, multiEnc *MultiEncoder)
|
||||
return nil, err
|
||||
}
|
||||
encoded = e
|
||||
|
||||
}
|
||||
|
||||
bufReader := bytes.NewReader(encoded)
|
||||
@ -2158,6 +2157,9 @@ func newMultiEncoderFromStream(streamObj *PdfObjectStream) (*MultiEncoder, error
|
||||
|
||||
// GetFilterName returns the names of the underlying encoding filters,
|
||||
// separated by spaces.
|
||||
// Note: This is just a string, should not be used in /Filter dictionary entry. Use GetFilterArray for that.
|
||||
// TODO(v4): Refactor to GetFilter() which can be used for /Filter (either Name or Array), this can be
|
||||
// renamed to String() as a pretty string to use in debugging etc.
|
||||
func (enc *MultiEncoder) GetFilterName() string {
|
||||
name := ""
|
||||
for idx, encoder := range enc.encoders {
|
||||
@ -2169,6 +2171,16 @@ func (enc *MultiEncoder) GetFilterName() string {
|
||||
return name
|
||||
}
|
||||
|
||||
// GetFilterArray returns the names of the underlying encoding filters in an array that
|
||||
// can be used as /Filter entry.
|
||||
func (enc *MultiEncoder) GetFilterArray() *PdfObjectArray {
|
||||
names := make([]PdfObject, len(enc.encoders))
|
||||
for i, e := range enc.encoders {
|
||||
names[i] = MakeName(e.GetFilterName())
|
||||
}
|
||||
return MakeArray(names...)
|
||||
}
|
||||
|
||||
// MakeDecodeParams makes a new instance of an encoding dictionary based on
|
||||
// the current encoder settings.
|
||||
func (enc *MultiEncoder) MakeDecodeParams() PdfObject {
|
||||
@ -2201,12 +2213,7 @@ func (enc *MultiEncoder) AddEncoder(encoder StreamEncoder) {
|
||||
// MakeStreamDict makes a new instance of an encoding dictionary for a stream object.
|
||||
func (enc *MultiEncoder) MakeStreamDict() *PdfObjectDictionary {
|
||||
dict := MakeDict()
|
||||
|
||||
names := make([]PdfObject, len(enc.encoders))
|
||||
for i, e := range enc.encoders {
|
||||
names[i] = MakeName(e.GetFilterName())
|
||||
}
|
||||
dict.Set("Filter", MakeArray(names...))
|
||||
dict.Set("Filter", enc.GetFilterArray())
|
||||
|
||||
// Pass all values from children, except Filter and DecodeParms.
|
||||
for _, encoder := range enc.encoders {
|
||||
|
@ -42,9 +42,14 @@ func New(page *model.PdfPage) (*Extractor, error) {
|
||||
// fmt.Printf("%s\n", contents)
|
||||
// fmt.Println("========================= ::: =========================")
|
||||
|
||||
return NewFromContents(contents, page.Resources)
|
||||
}
|
||||
|
||||
// NewFromContents creates a new extractor from contents and page resources.
|
||||
func NewFromContents(contents string, resources *model.PdfPageResources) (*Extractor, error) {
|
||||
e := &Extractor{
|
||||
contents: contents,
|
||||
resources: page.Resources,
|
||||
resources: resources,
|
||||
fontCache: map[string]fontEntry{},
|
||||
formResults: map[string]textResult{},
|
||||
}
|
||||
|
@ -439,7 +439,11 @@ func (to *textObject) showTextAdjusted(args *core.PdfObjectArray) error {
|
||||
common.Log.Trace("showTextAdjusted: Bad string arg. o=%s args=%+v", o, args)
|
||||
return core.ErrTypeError
|
||||
}
|
||||
to.renderText(charcodes)
|
||||
err := to.renderText(charcodes)
|
||||
if err != nil {
|
||||
common.Log.Debug("Render text error: %v", err)
|
||||
return err
|
||||
}
|
||||
default:
|
||||
common.Log.Debug("ERROR: showTextAdjusted. Unexpected type (%T) args=%+v", o, args)
|
||||
return core.ErrTypeError
|
||||
@ -736,6 +740,7 @@ func (to *textObject) renderText(data []byte) error {
|
||||
continue
|
||||
}
|
||||
|
||||
// TODO(gunnsth): Assuming 1:1 charcode[i] <-> rune[i] mapping.
|
||||
code := charcodes[i]
|
||||
// The location of the text on the page in device coordinates is given by trm, the text
|
||||
// rendering matrix.
|
||||
@ -785,6 +790,8 @@ func (to *textObject) renderText(data []byte) error {
|
||||
} else if font.Encoder() == nil {
|
||||
common.Log.Debug("ERROR: No encoding. font=%s", font)
|
||||
} else {
|
||||
// TODO: This lookup seems confusing. Went from bytes <-> charcodes already.
|
||||
// NOTE: This is needed to register runes by the font encoder - for subsetting (optimization).
|
||||
original, ok := font.Encoder().CharcodeToRune(code)
|
||||
if ok {
|
||||
mark.original = string(original)
|
||||
|
@ -51,9 +51,7 @@ var doStress bool
|
||||
func init() {
|
||||
flag.BoolVar(&doStress, "extractor-stresstest", false, "Run text extractor stress tests.")
|
||||
common.SetLogger(common.NewConsoleLogger(common.LogLevelInfo))
|
||||
if flag.Lookup("test.v") != nil {
|
||||
isTesting = true
|
||||
}
|
||||
isTesting = true
|
||||
}
|
||||
|
||||
// TestTextExtractionFragments tests text extraction on the PDF fragments in `fragmentTests`.
|
||||
|
5
go.mod
5
go.mod
@ -5,12 +5,15 @@ go 1.11
|
||||
require (
|
||||
github.com/adrg/sysfont v0.1.0
|
||||
github.com/boombuler/barcode v1.0.0
|
||||
github.com/davecgh/go-spew v1.1.1
|
||||
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0
|
||||
github.com/sirupsen/logrus v1.6.0
|
||||
github.com/stretchr/testify v1.4.0
|
||||
github.com/unidoc/pkcs7 v0.0.0-20200411230602-d883fd70d1df
|
||||
github.com/unidoc/timestamp v0.0.0-20200412005513-91597fd3793a
|
||||
github.com/unidoc/unitype v0.1.0
|
||||
github.com/unidoc/unitype v0.2.0
|
||||
golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5
|
||||
golang.org/x/image v0.0.0-20181116024801-cd38e8056d9b
|
||||
golang.org/x/sys v0.0.0-20200615200032-f1bc736245b1 // indirect
|
||||
golang.org/x/text v0.3.2
|
||||
)
|
||||
|
10
go.sum
10
go.sum
@ -15,6 +15,8 @@ github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGw
|
||||
github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
|
||||
github.com/konsorten/go-windows-terminal-sequences v1.0.2 h1:DB17ag19krx9CFsz4o3enTrPXyIXCl+2iCXH/aMAp9s=
|
||||
github.com/konsorten/go-windows-terminal-sequences v1.0.2/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
|
||||
github.com/konsorten/go-windows-terminal-sequences v1.0.3 h1:CE8S1cTafDpPvMhIxNJKvHsGVBgn1xWYf1NbHQhywc8=
|
||||
github.com/konsorten/go-windows-terminal-sequences v1.0.3/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
|
||||
github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI=
|
||||
github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
|
||||
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
|
||||
@ -24,6 +26,8 @@ github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZb
|
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/sirupsen/logrus v1.5.0 h1:1N5EYkVAPEywqZRJd7cwnRtCb6xJx7NH3T3WUTF980Q=
|
||||
github.com/sirupsen/logrus v1.5.0/go.mod h1:+F7Ogzej0PZc/94MaYx/nvG9jOFMD2osvC3s+Squfpo=
|
||||
github.com/sirupsen/logrus v1.6.0 h1:UBcNElsrwanuuMsnGSlYmtmgbb23qDR5dG+6X6Oo89I=
|
||||
github.com/sirupsen/logrus v1.6.0/go.mod h1:7uNnSEd1DgxDLC74fIahvMZmmYsHGZGEOFrfsX/uA88=
|
||||
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
|
||||
github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
|
||||
github.com/stretchr/testify v1.4.0 h1:2E4SXV/wtOkTonXsotYi4li6zVWxYlZuYNCXe9XRJyk=
|
||||
@ -34,6 +38,10 @@ github.com/unidoc/timestamp v0.0.0-20200412005513-91597fd3793a h1:RLtvUhe4DsUDl6
|
||||
github.com/unidoc/timestamp v0.0.0-20200412005513-91597fd3793a/go.mod h1:j+qMWZVpZFTvDey3zxUkSgPJZEX33tDgU/QIA0IzCUw=
|
||||
github.com/unidoc/unitype v0.1.0 h1:6zJYMl8XdwFBD45Cmg8Ge13WyE92jwLuK1tk2IsRb9s=
|
||||
github.com/unidoc/unitype v0.1.0/go.mod h1:mafyug7zYmDOusqa7G0dJV45qp4b6TDAN+pHN7ZUIBU=
|
||||
github.com/unidoc/unitype v0.1.1-0.20200524232639-77d42b645b02 h1:zVMJh0ehLc0amGBcqIh7HWikIGXGBGpmW+Lvz1YVYH8=
|
||||
github.com/unidoc/unitype v0.1.1-0.20200524232639-77d42b645b02/go.mod h1:mafyug7zYmDOusqa7G0dJV45qp4b6TDAN+pHN7ZUIBU=
|
||||
github.com/unidoc/unitype v0.2.0 h1:N+ZKjwz8UDU0qa1IYzstDLffvQEctFo+bo6b6ZqW+9M=
|
||||
github.com/unidoc/unitype v0.2.0/go.mod h1:mafyug7zYmDOusqa7G0dJV45qp4b6TDAN+pHN7ZUIBU=
|
||||
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
|
||||
golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5 h1:58fnuSXlxZmFdJyvtTFVmVhcMLU6v5fEb/ok4wyqtNU=
|
||||
golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
|
||||
@ -45,6 +53,8 @@ golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7w
|
||||
golang.org/x/sys v0.0.0-20190422165155-953cdadca894/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.0.0-20200413165638-669c56c373c4 h1:opSr2sbRXk5X5/givKrrKj9HXxFpW2sdCiP8MJSKLQY=
|
||||
golang.org/x/sys v0.0.0-20200413165638-669c56c373c4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.0.0-20200615200032-f1bc736245b1 h1:ogLJMz+qpzav7lGMh10LMvAkM/fAoGlaiiHYiFYdm80=
|
||||
golang.org/x/sys v0.0.0-20200615200032-f1bc736245b1/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/text v0.3.0 h1:g61tztE5qeGQ89tm6NTjjM9VPIm088od1l6aSorWRWg=
|
||||
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
|
||||
golang.org/x/text v0.3.2 h1:tW2bmiBqwgJj/UpqtC8EpXEZVYOwU0yG4iWbprSVAcs=
|
||||
|
@ -13,47 +13,81 @@ import (
|
||||
"github.com/unidoc/unipdf/v3/core"
|
||||
)
|
||||
|
||||
// IdentityEncoder represents an 2-byte identity encoding
|
||||
// IdentityEncoder represents a 2-byte identity encoding.
|
||||
// NOTE: In many cases this is just used to encode/decode to glyph index and does not have a unicode
|
||||
// meaning, except via the ToUnicode maps.
|
||||
// TODO: The use of runes as indicators for glyph indices and not-utf8 runes is not good and confusing.
|
||||
// Might be better to combine the Identity encoder with a ToUnicode map and keep track of the actual
|
||||
// runes and character codes, CMaps together.
|
||||
type IdentityEncoder struct {
|
||||
// baseName is the encoding name; it is what String() returns and, when non-empty,
// what ToPdfObject() emits as a PDF name.
baseName string
|
||||
|
||||
// runes registered by encoder for tracking what runes are used for subsetting.
// Lazily allocated on the first RuneToCharcode/CharcodeToRune call.
|
||||
registeredMap map[rune]struct{}
|
||||
}
|
||||
|
||||
// NewIdentityTextEncoder returns a new IdentityEncoder based on predefined
|
||||
// encoding `baseName`.
|
||||
func NewIdentityTextEncoder(baseName string) IdentityEncoder {
|
||||
return IdentityEncoder{baseName}
|
||||
func NewIdentityTextEncoder(baseName string) *IdentityEncoder {
|
||||
return &IdentityEncoder{
|
||||
baseName: baseName,
|
||||
}
|
||||
}
|
||||
|
||||
// RegisteredRunes returns the slice of runes that have been registered as used by the encoder.
|
||||
func (enc *IdentityEncoder) RegisteredRunes() []rune {
|
||||
runes := make([]rune, len(enc.registeredMap))
|
||||
i := 0
|
||||
for r := range enc.registeredMap {
|
||||
runes[i] = r
|
||||
i++
|
||||
}
|
||||
return runes
|
||||
}
|
||||
|
||||
// String returns a string that describes `enc`.
|
||||
func (enc IdentityEncoder) String() string {
|
||||
func (enc *IdentityEncoder) String() string {
|
||||
return enc.baseName
|
||||
}
|
||||
|
||||
// Encode converts the Go unicode string to a PDF encoded string.
|
||||
func (enc IdentityEncoder) Encode(str string) []byte {
|
||||
func (enc *IdentityEncoder) Encode(str string) []byte {
|
||||
return encodeString16bit(enc, str)
|
||||
}
|
||||
|
||||
// Decode converts PDF encoded string to a Go unicode string.
|
||||
func (enc IdentityEncoder) Decode(raw []byte) string {
|
||||
func (enc *IdentityEncoder) Decode(raw []byte) string {
|
||||
return decodeString16bit(enc, raw)
|
||||
}
|
||||
|
||||
// RuneToCharcode converts rune `r` to a PDF character code.
|
||||
// The bool return flag is true if there was a match, and false otherwise.
|
||||
func (enc IdentityEncoder) RuneToCharcode(r rune) (CharCode, bool) {
|
||||
// TODO: Here the `r` is an actual rune.
|
||||
func (enc *IdentityEncoder) RuneToCharcode(r rune) (CharCode, bool) {
|
||||
if enc.registeredMap == nil {
|
||||
enc.registeredMap = map[rune]struct{}{}
|
||||
}
|
||||
enc.registeredMap[r] = struct{}{} // Register use (subsetting).
|
||||
|
||||
return CharCode(r), true
|
||||
}
|
||||
|
||||
// CharcodeToRune converts PDF character code `code` to a rune.
|
||||
// The bool return flag is true if there was a match, and false otherwise.
|
||||
func (enc IdentityEncoder) CharcodeToRune(code CharCode) (rune, bool) {
|
||||
// TODO: Here the `r` is not necessarily an actual rune but a glyph index (unless both).
|
||||
func (enc *IdentityEncoder) CharcodeToRune(code CharCode) (rune, bool) {
|
||||
if enc.registeredMap == nil {
|
||||
enc.registeredMap = map[rune]struct{}{}
|
||||
}
|
||||
|
||||
// TODO: The rune(code) is confusing and is not an actual utf8 rune.
|
||||
enc.registeredMap[rune(code)] = struct{}{}
|
||||
return rune(code), true
|
||||
}
|
||||
|
||||
// RuneToGlyph returns the glyph name for rune `r`.
|
||||
// The bool return flag is true if there was a match, and false otherwise.
|
||||
func (enc IdentityEncoder) RuneToGlyph(r rune) (GlyphName, bool) {
|
||||
func (enc *IdentityEncoder) RuneToGlyph(r rune) (GlyphName, bool) {
|
||||
if r == ' ' {
|
||||
return "space", true
|
||||
}
|
||||
@ -63,7 +97,7 @@ func (enc IdentityEncoder) RuneToGlyph(r rune) (GlyphName, bool) {
|
||||
|
||||
// GlyphToRune returns the rune corresponding to glyph name `glyph`.
|
||||
// The bool return flag is true if there was a match, and false otherwise.
|
||||
func (enc IdentityEncoder) GlyphToRune(glyph GlyphName) (rune, bool) {
|
||||
func (enc *IdentityEncoder) GlyphToRune(glyph GlyphName) (rune, bool) {
|
||||
// String with "uniXXXX" format where XXXX is the hexcode.
|
||||
if glyph == "space" {
|
||||
return ' ', true
|
||||
@ -78,7 +112,7 @@ func (enc IdentityEncoder) GlyphToRune(glyph GlyphName) (rune, bool) {
|
||||
}
|
||||
|
||||
// ToPdfObject returns a nil as it is not truly a PDF object and should not be attempted to store in file.
|
||||
func (enc IdentityEncoder) ToPdfObject() core.PdfObject {
|
||||
func (enc *IdentityEncoder) ToPdfObject() core.PdfObject {
|
||||
if enc.baseName != "" {
|
||||
return core.MakeName(enc.baseName)
|
||||
}
|
||||
|
@ -103,6 +103,9 @@ type simpleEncoding struct {
|
||||
// one byte encoding: CharCode <-> byte
|
||||
encode map[rune]byte
|
||||
decode map[byte]rune
|
||||
|
||||
// runes registered by encoder for tracking what runes are used for subsetting.
|
||||
registeredMap map[rune]struct{}
|
||||
}
|
||||
|
||||
// Encode converts the Go unicode string to a PDF encoded string.
|
||||
@ -213,6 +216,10 @@ func (enc *simpleEncoding) Charcodes() []CharCode {
|
||||
|
||||
func (enc *simpleEncoding) RuneToCharcode(r rune) (CharCode, bool) {
|
||||
b, ok := enc.encode[r]
|
||||
if enc.registeredMap == nil {
|
||||
enc.registeredMap = map[rune]struct{}{}
|
||||
}
|
||||
enc.registeredMap[r] = struct{}{} // Register use (subsetting).
|
||||
return CharCode(b), ok
|
||||
}
|
||||
|
||||
@ -222,6 +229,10 @@ func (enc *simpleEncoding) CharcodeToRune(code CharCode) (rune, bool) {
|
||||
}
|
||||
b := byte(code)
|
||||
r, ok := enc.decode[b]
|
||||
if enc.registeredMap == nil {
|
||||
enc.registeredMap = map[rune]struct{}{}
|
||||
}
|
||||
enc.registeredMap[r] = struct{}{} // Register use (subsetting).
|
||||
return r, ok
|
||||
}
|
||||
|
||||
|
@ -103,31 +103,31 @@ type PdfAnnotationLink struct {
|
||||
}
|
||||
|
||||
// GetAction returns the PDF action for the annotation link.
|
||||
func (a *PdfAnnotationLink) GetAction() (*PdfAction, error) {
|
||||
if a.action != nil {
|
||||
return a.action, nil
|
||||
func (link *PdfAnnotationLink) GetAction() (*PdfAction, error) {
|
||||
if link.action != nil {
|
||||
return link.action, nil
|
||||
}
|
||||
if a.A == nil {
|
||||
if link.A == nil {
|
||||
return nil, nil
|
||||
}
|
||||
if a.reader == nil {
|
||||
if link.reader == nil {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
action, err := a.reader.loadAction(a.A)
|
||||
action, err := link.reader.loadAction(link.A)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
a.action = action
|
||||
link.action = action
|
||||
|
||||
return a.action, nil
|
||||
return link.action, nil
|
||||
}
|
||||
|
||||
// SetAction sets the PDF action for the annotation link.
|
||||
func (a *PdfAnnotationLink) SetAction(action *PdfAction) {
|
||||
a.action = action
|
||||
func (link *PdfAnnotationLink) SetAction(action *PdfAction) {
|
||||
link.action = action
|
||||
if action == nil {
|
||||
a.A = nil
|
||||
link.A = nil
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -50,6 +50,7 @@ func (font *PdfFont) SubsetRegistered() error {
|
||||
case *pdfFontType0:
|
||||
err := t.subsetRegistered()
|
||||
if err != nil {
|
||||
common.Log.Debug("Subset error: %v", err)
|
||||
return err
|
||||
}
|
||||
if t.container != nil {
|
||||
@ -401,6 +402,7 @@ func (font *PdfFont) BytesToCharcodes(data []byte) []textencoding.CharCode {
|
||||
|
||||
charcodes := make([]textencoding.CharCode, 0, len(data)+len(data)%2)
|
||||
if font.baseFields().isCIDFont() {
|
||||
// Identity only?
|
||||
if len(data) == 1 {
|
||||
data = []byte{0, data[0]}
|
||||
}
|
||||
@ -413,6 +415,7 @@ func (font *PdfFont) BytesToCharcodes(data []byte) []textencoding.CharCode {
|
||||
charcodes = append(charcodes, textencoding.CharCode(b))
|
||||
}
|
||||
} else {
|
||||
// Simple font: byte -> charcode.
|
||||
for _, b := range data {
|
||||
charcodes = append(charcodes, textencoding.CharCode(b))
|
||||
}
|
||||
@ -755,8 +758,7 @@ func (base fontCommon) isCIDFont() bool {
|
||||
// newFontBaseFieldsFromPdfObject returns `fontObj` as a dictionary the common fields from that
|
||||
// dictionary in the fontCommon return. If there is a problem an error is returned.
|
||||
// The fontCommon is the group of fields common to all PDF fonts.
|
||||
func newFontBaseFieldsFromPdfObject(fontObj core.PdfObject) (*core.PdfObjectDictionary, *fontCommon,
|
||||
error) {
|
||||
func newFontBaseFieldsFromPdfObject(fontObj core.PdfObject) (*core.PdfObjectDictionary, *fontCommon, error) {
|
||||
font := &fontCommon{}
|
||||
|
||||
if obj, ok := fontObj.(*core.PdfIndirectObject); ok {
|
||||
|
@ -127,6 +127,9 @@ func (font *pdfFontType0) baseFields() *fontCommon {
|
||||
}
|
||||
|
||||
func (font *pdfFontType0) getFontDescriptor() *PdfFontDescriptor {
|
||||
if font.fontDescriptor == nil && font.DescendantFont != nil {
|
||||
return font.DescendantFont.FontDescriptor()
|
||||
}
|
||||
return font.fontDescriptor
|
||||
}
|
||||
|
||||
@ -210,14 +213,19 @@ func (font *pdfFontType0) subsetRegistered() error {
|
||||
common.Log.Debug("Missing font descriptor")
|
||||
return nil
|
||||
}
|
||||
if font.encoder == nil {
|
||||
common.Log.Debug("No encoder - subsetting ignored")
|
||||
return nil
|
||||
}
|
||||
|
||||
stream, ok := core.GetStream(cidfnt.fontDescriptor.FontFile2)
|
||||
if !ok {
|
||||
common.Log.Debug("Embedded font object not found -- ABORT subsseting")
|
||||
common.Log.Debug("Embedded font object not found -- ABORT subsetting")
|
||||
return errors.New("fontfile2 not found")
|
||||
}
|
||||
decoded, err := core.DecodeStream(stream)
|
||||
if err != nil {
|
||||
common.Log.Debug("Decode error: %v", err)
|
||||
return err
|
||||
}
|
||||
|
||||
@ -227,21 +235,52 @@ func (font *pdfFontType0) subsetRegistered() error {
|
||||
return err
|
||||
}
|
||||
|
||||
tenc, ok := font.encoder.(*textencoding.TrueTypeFontEncoder)
|
||||
if !ok {
|
||||
return fmt.Errorf("unsupported encoder for subsetting: %T", cidfnt.encoder)
|
||||
var runes []rune
|
||||
var subset *unitype.Font
|
||||
switch tenc := font.encoder.(type) {
|
||||
case *textencoding.TrueTypeFontEncoder:
|
||||
// Means the font has been loaded from TTF file.
|
||||
runes = tenc.RegisteredRunes()
|
||||
subset, err = fnt.SubsetKeepRunes(runes)
|
||||
if err != nil {
|
||||
common.Log.Debug("ERROR: %v", err)
|
||||
return err
|
||||
}
|
||||
// Reduce the encoder also.
|
||||
tenc.SubsetRegistered()
|
||||
case *textencoding.IdentityEncoder:
|
||||
// IdentityEncoder typically means font was parsed from PDF file.
|
||||
// TODO: These are not actual runes... but glyph ids ? Very confusing.
|
||||
runes = tenc.RegisteredRunes()
|
||||
indices := make([]unitype.GlyphIndex, len(runes))
|
||||
for i, r := range runes {
|
||||
indices[i] = unitype.GlyphIndex(r)
|
||||
}
|
||||
|
||||
subset, err = fnt.SubsetKeepIndices(indices)
|
||||
if err != nil {
|
||||
common.Log.Debug("ERROR: %v", err)
|
||||
return err
|
||||
}
|
||||
case textencoding.SimpleEncoder:
|
||||
// Simple encoding, bytes are 0-255
|
||||
charcodes := tenc.Charcodes()
|
||||
for _, c := range charcodes {
|
||||
r, ok := tenc.CharcodeToRune(c)
|
||||
if !ok {
|
||||
common.Log.Debug("ERROR: unable convert charcode to rune: %d", c)
|
||||
continue
|
||||
}
|
||||
runes = append(runes, r)
|
||||
}
|
||||
default:
|
||||
return fmt.Errorf("unsupported encoder for subsetting: %T", font.encoder)
|
||||
}
|
||||
|
||||
runes := tenc.RegisteredRunes()
|
||||
subset, err := fnt.SubsetKeepRunes(runes)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
// Reduce the encoder also.
|
||||
tenc.SubsetRegistered()
|
||||
var buf bytes.Buffer
|
||||
err = subset.Write(&buf)
|
||||
if err != nil {
|
||||
common.Log.Debug("ERROR: %v", err)
|
||||
return err
|
||||
}
|
||||
|
||||
@ -249,7 +288,7 @@ func (font *pdfFontType0) subsetRegistered() error {
|
||||
if font.toUnicodeCmap != nil {
|
||||
codeToUnicode := make(map[cmap.CharCode]rune, len(runes))
|
||||
for _, r := range runes {
|
||||
cc, ok := tenc.RuneToCharcode(r)
|
||||
cc, ok := font.encoder.RuneToCharcode(r)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
@ -260,9 +299,16 @@ func (font *pdfFontType0) subsetRegistered() error {
|
||||
|
||||
stream, err = core.MakeStream(buf.Bytes(), core.NewFlateEncoder())
|
||||
if err != nil {
|
||||
common.Log.Debug("ERROR: %v", err)
|
||||
return err
|
||||
}
|
||||
cidfnt.fontDescriptor.FontFile2 = stream
|
||||
stream.Set("Length1", core.MakeInteger(int64(buf.Len())))
|
||||
if curstr, ok := core.GetStream(cidfnt.fontDescriptor.FontFile2); ok {
|
||||
// Replace the current stream (keep same object).
|
||||
*curstr = *stream
|
||||
} else {
|
||||
cidfnt.fontDescriptor.FontFile2 = stream
|
||||
}
|
||||
|
||||
// Set subset name.
|
||||
tag := genSubsetTag()
|
||||
@ -334,6 +380,7 @@ func newPdfFontType0FromPdfObject(d *core.PdfObjectDictionary, base *fontCommon)
|
||||
|
||||
encoderName, ok := core.GetNameVal(d.Get("Encoding"))
|
||||
if ok {
|
||||
// TODO: Identity-H maps 16-bit character codes straight to glyph index (don't need actual runes).
|
||||
if encoderName == "Identity-H" || encoderName == "Identity-V" {
|
||||
font.encoder = textencoding.NewIdentityTextEncoder(encoderName)
|
||||
} else if cmap.IsPredefinedCMap(encoderName) {
|
||||
|
139
model/optimize/clean_contentstream.go
Normal file
139
model/optimize/clean_contentstream.go
Normal file
@ -0,0 +1,139 @@
|
||||
/*
|
||||
* This file is subject to the terms and conditions defined in
|
||||
* file 'LICENSE.md', which is part of this source code package.
|
||||
*/
|
||||
|
||||
package optimize
|
||||
|
||||
import (
|
||||
"github.com/unidoc/unipdf/v3/contentstream"
|
||||
"github.com/unidoc/unipdf/v3/core"
|
||||
)
|
||||
|
||||
// CleanContentstream cleans up redundant operands in content streams, including Page and XObject Form
|
||||
// contents. This process includes:
|
||||
// 1. Marked content operators are removed.
|
||||
// 2. Some operands are simplified (shorter form).
|
||||
// TODO: Add more reduction methods and improve the methods for identifying unnecessary operands.
|
||||
type CleanContentstream struct {
|
||||
}
|
||||
|
||||
// filterOps cleans up the content stream in `ops`:
|
||||
// 1. Marked content operators are cleaned.
|
||||
// 2. Tm with 1 0 0 1 params are converted to Td (slightly shorter for same transformation).
|
||||
// TODO: Add operations that track the state and remove unnecessary operands, such as duplicates
|
||||
// or ones setting default values, or ones not drawing anything.
|
||||
func filterOps(ops *contentstream.ContentStreamOperations) *contentstream.ContentStreamOperations {
|
||||
if ops == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
filtered := contentstream.ContentStreamOperations{}
|
||||
for _, op := range *ops {
|
||||
switch op.Operand {
|
||||
case "BDC", "BMC", "EMC":
|
||||
continue
|
||||
case "Tm":
|
||||
if len(op.Params) == 6 {
|
||||
if nums, err := core.GetNumbersAsFloat(op.Params); err == nil {
|
||||
if nums[0] == 1 && nums[1] == 0 && nums[2] == 0 && nums[3] == 1 {
|
||||
op = &contentstream.ContentStreamOperation{
|
||||
Params: []core.PdfObject{
|
||||
op.Params[4],
|
||||
op.Params[5],
|
||||
},
|
||||
Operand: "Td",
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
filtered = append(filtered, op)
|
||||
}
|
||||
return &filtered
|
||||
}
|
||||
|
||||
// reduceContent performs content stream optimization of contents in `cstream` which can either be
|
||||
// from Page Contents or XObject Form.
|
||||
// NOTE: If from a Contents array, the operations may be unbalanced.
|
||||
func reduceContent(cstream *core.PdfObjectStream) error {
|
||||
decoded, err := core.DecodeStream(cstream)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
csp := contentstream.NewContentStreamParser(string(decoded))
|
||||
ops, err := csp.Parse()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
ops = filterOps(ops)
|
||||
cleaned := ops.Bytes()
|
||||
if len(cleaned) >= len(decoded) {
|
||||
// No need to replace if no improvement.
|
||||
return nil
|
||||
}
|
||||
|
||||
newstream, err := core.MakeStream(ops.Bytes(), core.NewFlateEncoder())
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
cstream.Stream = newstream.Stream
|
||||
cstream.Merge(newstream.PdfObjectDictionary)
|
||||
return nil
|
||||
}
|
||||
|
||||
// Optimize optimizes PDF objects to decrease PDF size.
|
||||
func (c *CleanContentstream) Optimize(objects []core.PdfObject) (optimizedObjects []core.PdfObject, err error) {
|
||||
// Track which content streams to process.
|
||||
queuedMap := map[*core.PdfObjectStream]struct{}{}
|
||||
var queued []*core.PdfObjectStream
|
||||
appendQueue := func(stream *core.PdfObjectStream) {
|
||||
if _, has := queuedMap[stream]; !has {
|
||||
queuedMap[stream] = struct{}{}
|
||||
queued = append(queued, stream)
|
||||
}
|
||||
}
|
||||
|
||||
// Collect objects to process: XObject Form and Page Content streams.
|
||||
for _, obj := range objects {
|
||||
switch t := obj.(type) {
|
||||
case *core.PdfIndirectObject:
|
||||
switch ti := t.PdfObject.(type) {
|
||||
case *core.PdfObjectDictionary:
|
||||
if name, ok := core.GetName(ti.Get("Type")); !ok || name.String() != "Page" {
|
||||
continue
|
||||
}
|
||||
|
||||
if stream, ok := core.GetStream(ti.Get("Contents")); ok {
|
||||
appendQueue(stream)
|
||||
} else if array, ok := core.GetArray(ti.Get("Contents")); ok {
|
||||
for _, el := range array.Elements() {
|
||||
if stream, ok := core.GetStream(el); ok {
|
||||
appendQueue(stream)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
case *core.PdfObjectStream:
|
||||
if name, ok := core.GetName(t.Get("Type")); !ok || name.String() != "XObject" {
|
||||
continue
|
||||
}
|
||||
if name, ok := core.GetName(t.Get("Subtype")); !ok || name.String() != "Form" {
|
||||
continue
|
||||
}
|
||||
appendQueue(t)
|
||||
}
|
||||
}
|
||||
|
||||
// Process the queued content streams.
|
||||
for _, stream := range queued {
|
||||
err = reduceContent(stream)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
return objects, nil
|
||||
}
|
353
model/optimize/clean_fonts.go
Normal file
353
model/optimize/clean_fonts.go
Normal file
@ -0,0 +1,353 @@
|
||||
/*
|
||||
* This file is subject to the terms and conditions defined in
|
||||
* file 'LICENSE.md', which is part of this source code package.
|
||||
*/
|
||||
|
||||
package optimize
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"errors"
|
||||
|
||||
"github.com/unidoc/unitype"
|
||||
|
||||
"github.com/unidoc/unipdf/v3/common"
|
||||
"github.com/unidoc/unipdf/v3/core"
|
||||
"github.com/unidoc/unipdf/v3/extractor"
|
||||
"github.com/unidoc/unipdf/v3/internal/textencoding"
|
||||
"github.com/unidoc/unipdf/v3/model"
|
||||
)
|
||||
|
||||
// CleanFonts is an optimizer that cleans up embedded fonts, reducing font sizes.
// Its Optimize method rewrites TrueType font program streams found among the
// PDF objects, optionally subsetting the fonts that are in use first.
|
||||
type CleanFonts struct {
|
||||
// Subset embedded fonts if encountered (if true).
|
||||
// Otherwise attempts to reduce the font program.
// Font streams that subsetting did not handle still go through the
// generic font program reduction.
|
||||
Subset bool
|
||||
}
|
||||
|
||||
func optimizeFontsWithSubsetting(objects []core.PdfObject) (processed map[*core.PdfObjectStream]struct{}, err error) {
|
||||
// 1. Identify all fonts.
|
||||
// 2. Identify content streams and their Resources dictionaries (both via page, forms and annotations).
|
||||
// 3. Process content streams.
|
||||
processed = map[*core.PdfObjectStream]struct{}{}
|
||||
|
||||
fontMap := map[*model.PdfFont]struct{}{}
|
||||
|
||||
objstr := getObjectStructure(objects)
|
||||
for _, p := range objstr.pages {
|
||||
pdict, ok := core.GetDict(p.PdfObject)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
resourcesDict, ok := core.GetDict(pdict.Get("Resources"))
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
contents, _ := getPageContents(pdict.Get("Contents"))
|
||||
resources, err := model.NewPdfPageResourcesFromDict(resourcesDict)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
allContents := []content{
|
||||
{
|
||||
content: contents,
|
||||
resources: resources,
|
||||
},
|
||||
}
|
||||
|
||||
annotContents := getAnnotationContents(pdict.Get("Annots"))
|
||||
if annotContents != nil {
|
||||
allContents = append(allContents, annotContents...)
|
||||
}
|
||||
|
||||
for _, cont := range allContents {
|
||||
e, err := extractor.NewFromContents(cont.content, cont.resources)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
pt, _, _, err := e.ExtractPageText()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
for _, el := range pt.Marks().Elements() {
|
||||
if el.Font == nil {
|
||||
continue
|
||||
}
|
||||
if _, has := fontMap[el.Font]; !has {
|
||||
fontMap[el.Font] = struct{}{}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Map of font program stream to font. Multiple fonts can use the same font program.
|
||||
fontFileMap := map[*core.PdfObjectStream][]*model.PdfFont{}
|
||||
for font := range fontMap {
|
||||
fontDesc := font.FontDescriptor()
|
||||
if fontDesc == nil || fontDesc.FontFile2 == nil {
|
||||
continue
|
||||
}
|
||||
stream, ok := core.GetStream(fontDesc.FontFile2)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
fontFileMap[stream] = append(fontFileMap[stream], font)
|
||||
}
|
||||
|
||||
for stream := range fontFileMap {
|
||||
var allRunes []rune
|
||||
var allIndices []unitype.GlyphIndex
|
||||
|
||||
for _, font := range fontFileMap[stream] {
|
||||
switch t := font.Encoder().(type) {
|
||||
case *textencoding.IdentityEncoder:
|
||||
// TODO: This terminology is wrong as those are not runes, just charcodes cast as runes.
|
||||
// Identity encoder maps via 2-byte encoding directly from 2byte charcode to glyph index.
|
||||
runes := t.RegisteredRunes()
|
||||
indices := make([]unitype.GlyphIndex, len(runes))
|
||||
for i, r := range runes {
|
||||
indices[i] = unitype.GlyphIndex(r)
|
||||
}
|
||||
allIndices = append(allIndices, indices...)
|
||||
case *textencoding.TrueTypeFontEncoder:
|
||||
runes := t.RegisteredRunes()
|
||||
allRunes = append(allRunes, runes...)
|
||||
case textencoding.SimpleEncoder:
|
||||
charcodes := t.Charcodes()
|
||||
for _, c := range charcodes {
|
||||
r, ok := t.CharcodeToRune(c)
|
||||
if !ok {
|
||||
common.Log.Debug("Charcode<->rune not found: %d", c)
|
||||
continue
|
||||
}
|
||||
allRunes = append(allRunes, r)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
err = subsetFontStream(stream, allRunes, allIndices)
|
||||
if err != nil {
|
||||
common.Log.Debug("ERROR subsetting font stream: %v", err)
|
||||
return nil, err
|
||||
}
|
||||
processed[stream] = struct{}{}
|
||||
}
|
||||
return processed, nil
|
||||
}
|
||||
|
||||
// Subsets the font program in `stream` with the subset based on the `runes` and glyph `indices`.
|
||||
func subsetFontStream(stream *core.PdfObjectStream, runes []rune, indices []unitype.GlyphIndex) error {
|
||||
stream, ok := core.GetStream(stream)
|
||||
if !ok {
|
||||
common.Log.Debug("Embedded font object not found -- ABORT subsetting")
|
||||
return errors.New("fontfile2 not found")
|
||||
}
|
||||
decoded, err := core.DecodeStream(stream)
|
||||
if err != nil {
|
||||
common.Log.Debug("Decode error: %v", err)
|
||||
return err
|
||||
}
|
||||
|
||||
fnt, err := unitype.Parse(bytes.NewReader(decoded))
|
||||
if err != nil {
|
||||
common.Log.Debug("Error parsing %d byte font", len(stream.Stream))
|
||||
return err
|
||||
}
|
||||
|
||||
allIndices := indices
|
||||
if len(runes) > 0 {
|
||||
indices := fnt.LookupRunes(runes)
|
||||
allIndices = append(allIndices, indices...)
|
||||
}
|
||||
|
||||
fnt, err = fnt.SubsetKeepIndices(allIndices)
|
||||
if err != nil {
|
||||
common.Log.Debug("ERROR subsetting font: %v", err)
|
||||
return err
|
||||
}
|
||||
|
||||
var buf bytes.Buffer
|
||||
err = fnt.Write(&buf)
|
||||
if err != nil {
|
||||
common.Log.Debug("ERROR Writing font: %v", err)
|
||||
return err
|
||||
}
|
||||
if buf.Len() > len(decoded) {
|
||||
common.Log.Debug("Re-written font is larger than original - skip")
|
||||
return nil
|
||||
}
|
||||
|
||||
newstream, err := core.MakeStream(buf.Bytes(), core.NewFlateEncoder())
|
||||
if err != nil {
|
||||
common.Log.Debug("ERROR Writing font: %v", err)
|
||||
return err
|
||||
}
|
||||
// Overwrite.
|
||||
*stream = *newstream
|
||||
stream.Set("Length1", core.MakeInteger(int64(buf.Len())))
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Optimize optimizes PDF objects to decrease PDF size.
|
||||
func (c *CleanFonts) Optimize(objects []core.PdfObject) (optimizedObjects []core.PdfObject, err error) {
|
||||
var processed map[*core.PdfObjectStream]struct{}
|
||||
if c.Subset {
|
||||
var err error
|
||||
processed, err = optimizeFontsWithSubsetting(objects)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
// Clean font streams by loading and rewriting with minimal needed tables.
|
||||
for _, obj := range objects {
|
||||
stream, isStreamObj := core.GetStream(obj)
|
||||
if !isStreamObj {
|
||||
continue
|
||||
}
|
||||
if _, has := processed[stream]; has {
|
||||
// Skip - has been processed.
|
||||
continue
|
||||
}
|
||||
|
||||
encoder, err := core.NewEncoderFromStream(stream)
|
||||
if err != nil {
|
||||
common.Log.Debug("ERROR getting encoder: %v - ignoring", err)
|
||||
continue
|
||||
}
|
||||
|
||||
decoded, err := encoder.DecodeStream(stream)
|
||||
if err != nil {
|
||||
common.Log.Debug("Decoding error : %v - ignoring", err)
|
||||
continue
|
||||
}
|
||||
if len(decoded) < 4 {
|
||||
continue
|
||||
}
|
||||
|
||||
version := string(decoded[:4])
|
||||
if version == "OTTO" {
|
||||
// Fonts based on PostScript outlines not supported yet.
|
||||
// See https://docs.microsoft.com/en-us/typography/opentype/spec/otff
|
||||
continue
|
||||
}
|
||||
if version != "\x00\x01\x00\x00" && version != "true" {
|
||||
continue
|
||||
}
|
||||
|
||||
fnt, err := unitype.Parse(bytes.NewReader(decoded))
|
||||
if err != nil {
|
||||
common.Log.Debug("ERROR Parsing font: %v - ignoring", err)
|
||||
continue
|
||||
}
|
||||
err = fnt.Optimize()
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
var buf bytes.Buffer
|
||||
err = fnt.Write(&buf)
|
||||
if err != nil {
|
||||
common.Log.Debug("ERROR Writing font: %v - ignoring", err)
|
||||
continue
|
||||
}
|
||||
if buf.Len() > len(decoded) {
|
||||
common.Log.Debug("Re-written font is larger than original - skip")
|
||||
continue
|
||||
}
|
||||
|
||||
newstream, err := core.MakeStream(buf.Bytes(), core.NewFlateEncoder())
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
// Overwrite.
|
||||
*stream = *newstream
|
||||
stream.Set("Length1", core.MakeInteger(int64(buf.Len())))
|
||||
}
|
||||
return objects, nil
|
||||
}
|
||||
|
||||
// content describes page or annotation contents, which is a content stream (as a string)
// along with the resources used when processing it.
|
||||
type content struct {
|
||||
// content is the content stream data.
content string
|
||||
// resources are the resources that accompany the content stream.
resources *model.PdfPageResources
|
||||
}
|
||||
|
||||
// Best effort to get annotation contents.
|
||||
func getAnnotationContents(annotsObj core.PdfObject) []content {
|
||||
if annotsObj == nil {
|
||||
return nil
|
||||
}
|
||||
annotsArr, ok := core.GetArray(annotsObj)
|
||||
if !ok {
|
||||
common.Log.Debug("Annots not an array")
|
||||
return nil
|
||||
}
|
||||
|
||||
var annotContents []content
|
||||
for _, obj := range annotsArr.Elements() {
|
||||
annotDict, ok := core.GetDict(obj)
|
||||
if !ok {
|
||||
// Ignore any non dict elements.
|
||||
common.Log.Debug("Ignoring non-dict element in Annots")
|
||||
continue
|
||||
}
|
||||
|
||||
// Appearance.
|
||||
appDict, ok := core.GetDict(annotDict.Get("AP"))
|
||||
if !ok {
|
||||
common.Log.Debug("No AP entry - skipping")
|
||||
continue
|
||||
}
|
||||
|
||||
normal := core.TraceToDirectObject(appDict.Get("N"))
|
||||
if normal == nil {
|
||||
common.Log.Debug("No N entry - skipping")
|
||||
continue
|
||||
}
|
||||
|
||||
var stream *core.PdfObjectStream
|
||||
switch t := normal.(type) {
|
||||
case *core.PdfObjectDictionary:
|
||||
appState, ok := core.GetName(annotDict.Get("AS"))
|
||||
if !ok {
|
||||
common.Log.Debug("No AS entry - skipping")
|
||||
continue
|
||||
}
|
||||
stream, ok = core.GetStream(t.Get(*appState))
|
||||
if !ok {
|
||||
common.Log.Debug("Form not found - skipping")
|
||||
continue
|
||||
}
|
||||
case *core.PdfObjectStream:
|
||||
stream = t
|
||||
}
|
||||
if stream == nil {
|
||||
common.Log.Debug("Form not found (nil) - skipping")
|
||||
continue
|
||||
}
|
||||
|
||||
xform, err := model.NewXObjectFormFromStream(stream)
|
||||
if err != nil {
|
||||
common.Log.Debug("Error loading form: %v - ignoring", err)
|
||||
continue
|
||||
}
|
||||
|
||||
contents, err := xform.GetContentStream()
|
||||
if err != nil {
|
||||
common.Log.Debug("Error decoding contents: %v", err)
|
||||
continue
|
||||
}
|
||||
|
||||
annotContents = append(annotContents, content{
|
||||
content: string(contents),
|
||||
resources: xform.Resources,
|
||||
})
|
||||
}
|
||||
|
||||
return annotContents
|
||||
}
|
@ -23,9 +23,17 @@ func (c *CompressStreams) Optimize(objects []core.PdfObject) (optimizedObjects [
|
||||
if !isStreamObj {
|
||||
continue
|
||||
}
|
||||
if _, found := core.GetName(stream.PdfObjectDictionary.Get("Filter")); found {
|
||||
continue
|
||||
// Skip objects that are already encoded.
|
||||
// TODO: Try filter combinations, and ignoring inefficient filters.
|
||||
if obj := stream.Get("Filter"); obj != nil {
|
||||
if _, skip := core.GetName(obj); skip {
|
||||
continue
|
||||
}
|
||||
if arr, ok := core.GetArray(obj); ok && arr.Len() > 0 {
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
encoder := core.NewFlateEncoder() // Most mainstream compressor and probably most robust.
|
||||
var data []byte
|
||||
data, err = encoder.EncodeBytes(stream.Stream)
|
||||
|
@ -110,28 +110,51 @@ func (i *Image) Optimize(objects []core.PdfObject) (optimizedObjects []core.PdfO
|
||||
common.Log.Warning("Error decode the image stream %s")
|
||||
continue
|
||||
}
|
||||
encoder := core.NewDCTEncoder()
|
||||
encoder.ColorComponents = img.ColorComponents
|
||||
encoder.Quality = i.ImageQuality
|
||||
encoder.BitsPerComponent = img.BitsPerComponent
|
||||
encoder.Width = img.Width
|
||||
encoder.Height = img.Height
|
||||
streamData, err := encoder.EncodeBytes(data)
|
||||
dctenc := core.NewDCTEncoder()
|
||||
dctenc.ColorComponents = img.ColorComponents
|
||||
dctenc.Quality = i.ImageQuality
|
||||
dctenc.BitsPerComponent = img.BitsPerComponent
|
||||
dctenc.Width = img.Width
|
||||
dctenc.Height = img.Height
|
||||
streamData, err := dctenc.EncodeBytes(data)
|
||||
if err != nil {
|
||||
common.Log.Debug("ERROR: %v", err)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var filter core.StreamEncoder
|
||||
filter = dctenc
|
||||
|
||||
// Check if combining with FlateEncoding improves things further.
|
||||
{
|
||||
flate := core.NewFlateEncoder()
|
||||
multienc := core.NewMultiEncoder()
|
||||
multienc.AddEncoder(flate)
|
||||
multienc.AddEncoder(dctenc)
|
||||
|
||||
encoded, err := multienc.EncodeBytes(data)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if len(encoded) < len(streamData) {
|
||||
common.Log.Debug("Multi enc improves: %d to %d (orig %d)",
|
||||
len(streamData), len(encoded), len(stream.Stream))
|
||||
streamData = encoded
|
||||
filter = multienc
|
||||
}
|
||||
}
|
||||
|
||||
originalSize := len(stream.Stream)
|
||||
if originalSize < len(streamData) {
|
||||
// Worse - ignoring.
|
||||
continue
|
||||
}
|
||||
newStream := &core.PdfObjectStream{Stream: streamData}
|
||||
newStream.PdfObjectReference = stream.PdfObjectReference
|
||||
newStream.PdfObjectDictionary = core.MakeDict()
|
||||
newStream.PdfObjectDictionary.Merge(stream.PdfObjectDictionary)
|
||||
fn := core.PdfObjectName(encoder.GetFilterName())
|
||||
newStream.PdfObjectDictionary.Set(core.PdfObjectName("Filter"), &fn)
|
||||
ln := core.PdfObjectInteger(int64(len(streamData)))
|
||||
newStream.PdfObjectDictionary.Set(core.PdfObjectName("Length"), &ln)
|
||||
newStream.Merge(stream.PdfObjectDictionary)
|
||||
newStream.Merge(filter.MakeStreamDict())
|
||||
newStream.Set("Length", core.MakeInteger(int64(len(streamData))))
|
||||
replaceTable[stream] = newStream
|
||||
images[index].Stream = newStream
|
||||
}
|
||||
|
@ -12,6 +12,12 @@ import (
|
||||
// New creates an optimizer chain from options.
|
||||
func New(options Options) *Chain {
|
||||
chain := new(Chain)
|
||||
if options.CleanFonts || options.SubsetFonts {
|
||||
chain.Append(&CleanFonts{Subset: options.SubsetFonts})
|
||||
}
|
||||
if options.CleanContentstream {
|
||||
chain.Append(new(CleanContentstream))
|
||||
}
|
||||
if options.ImageUpperPPI > 0 {
|
||||
imageOptimizer := new(ImagePPI)
|
||||
imageOptimizer.ImageUpperPPI = options.ImageUpperPPI
|
||||
|
@ -14,4 +14,7 @@ type Options struct {
|
||||
UseObjectStreams bool
|
||||
CombineIdenticalIndirectObjects bool
|
||||
CompressStreams bool
|
||||
CleanFonts bool
|
||||
SubsetFonts bool
|
||||
CleanContentstream bool
|
||||
}
|
||||
|
102
model/optimize/utils.go
Normal file
102
model/optimize/utils.go
Normal file
@ -0,0 +1,102 @@
|
||||
/*
|
||||
* This file is subject to the terms and conditions defined in
|
||||
* file 'LICENSE.md', which is part of this source code package.
|
||||
*/
|
||||
|
||||
package optimize
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
|
||||
"github.com/unidoc/unipdf/v3/core"
|
||||
)
|
||||
|
||||
// objectStructure holds the top-level document structure located among a list of PDF objects:
// the catalog, the pages dictionary it refers to and the entries of that dictionary's Kids array.
type objectStructure struct {
|
||||
// catalogDict is the dictionary with Type "Catalog".
catalogDict *core.PdfObjectDictionary
|
||||
// pagesDict is the dictionary under the catalog's "Pages" entry.
pagesDict *core.PdfObjectDictionary
|
||||
// pages are the indirect objects listed in the "Kids" array of pagesDict.
pages []*core.PdfIndirectObject
|
||||
}
|
||||
|
||||
// getObjectStructure identifies the Catalog and Pages dictionary and finds a list of pages.
|
||||
func getObjectStructure(objects []core.PdfObject) objectStructure {
|
||||
objstr := objectStructure{}
|
||||
found := false
|
||||
for _, obj := range objects {
|
||||
switch t := obj.(type) {
|
||||
case *core.PdfIndirectObject:
|
||||
dict, is := core.GetDict(t)
|
||||
if !is {
|
||||
continue
|
||||
}
|
||||
kind, is := core.GetName(dict.Get("Type"))
|
||||
if !is {
|
||||
continue
|
||||
}
|
||||
|
||||
switch kind.String() {
|
||||
case "Catalog":
|
||||
objstr.catalogDict = dict
|
||||
found = true
|
||||
}
|
||||
}
|
||||
if found {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if !found {
|
||||
return objstr
|
||||
}
|
||||
|
||||
pagesDict, ok := core.GetDict(objstr.catalogDict.Get("Pages"))
|
||||
if !ok {
|
||||
return objstr
|
||||
}
|
||||
objstr.pagesDict = pagesDict
|
||||
|
||||
kids, ok := core.GetArray(pagesDict.Get("Kids"))
|
||||
if !ok {
|
||||
return objstr
|
||||
}
|
||||
for _, obj := range kids.Elements() {
|
||||
pobj, ok := core.GetIndirect(obj)
|
||||
if !ok {
|
||||
break
|
||||
}
|
||||
objstr.pages = append(objstr.pages, pobj)
|
||||
}
|
||||
|
||||
return objstr
|
||||
}
|
||||
|
||||
// getPageContents loads the page content stream as a string from a /Contents entry.
|
||||
// Either a single stream, or an array of streams. Returns the list of objects that
|
||||
// can be used if need to replace.
|
||||
func getPageContents(contentsObj core.PdfObject) (contents string, objs []core.PdfObject) {
|
||||
var buf bytes.Buffer
|
||||
|
||||
switch t := contentsObj.(type) {
|
||||
case *core.PdfIndirectObject:
|
||||
objs = append(objs, t)
|
||||
contentsObj = t.PdfObject
|
||||
}
|
||||
|
||||
switch t := contentsObj.(type) {
|
||||
case *core.PdfObjectStream:
|
||||
if decoded, err := core.DecodeStream(t); err == nil {
|
||||
buf.Write(decoded)
|
||||
objs = append(objs, t)
|
||||
}
|
||||
case *core.PdfObjectArray:
|
||||
for _, elobj := range t.Elements() {
|
||||
switch el := elobj.(type) {
|
||||
case *core.PdfObjectStream:
|
||||
if decoded, err := core.DecodeStream(el); err == nil {
|
||||
buf.Write(decoded)
|
||||
objs = append(objs, el)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return buf.String(), objs
|
||||
}
|
@ -281,42 +281,42 @@ func (r *PdfReader) newPdfPageFromDict(p *core.PdfObjectDictionary) (*PdfPage, e
|
||||
|
||||
// GetAnnotations returns the list of page annotations for `page`. If not loaded attempts to load the
|
||||
// annotations, otherwise returns the loaded list.
|
||||
func (page *PdfPage) GetAnnotations() ([]*PdfAnnotation, error) {
|
||||
if page.annotations != nil {
|
||||
return page.annotations, nil
|
||||
func (p *PdfPage) GetAnnotations() ([]*PdfAnnotation, error) {
|
||||
if p.annotations != nil {
|
||||
return p.annotations, nil
|
||||
}
|
||||
if page.Annots == nil {
|
||||
page.annotations = []*PdfAnnotation{}
|
||||
if p.Annots == nil {
|
||||
p.annotations = []*PdfAnnotation{}
|
||||
return nil, nil
|
||||
}
|
||||
if page.reader == nil {
|
||||
page.annotations = []*PdfAnnotation{}
|
||||
if p.reader == nil {
|
||||
p.annotations = []*PdfAnnotation{}
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
annots, err := page.reader.loadAnnotations(page.Annots)
|
||||
annots, err := p.reader.loadAnnotations(p.Annots)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if annots == nil {
|
||||
page.annotations = []*PdfAnnotation{}
|
||||
p.annotations = []*PdfAnnotation{}
|
||||
}
|
||||
|
||||
page.annotations = annots
|
||||
return page.annotations, nil
|
||||
p.annotations = annots
|
||||
return p.annotations, nil
|
||||
}
|
||||
|
||||
// AddAnnotation appends `annot` to the list of page annotations.
|
||||
func (page *PdfPage) AddAnnotation(annot *PdfAnnotation) {
|
||||
if page.annotations == nil {
|
||||
page.GetAnnotations() // Ensure has been loaded.
|
||||
func (p *PdfPage) AddAnnotation(annot *PdfAnnotation) {
|
||||
if p.annotations == nil {
|
||||
p.GetAnnotations() // Ensure has been loaded.
|
||||
}
|
||||
page.annotations = append(page.annotations, annot)
|
||||
p.annotations = append(p.annotations, annot)
|
||||
}
|
||||
|
||||
// SetAnnotations sets the annotations list.
|
||||
func (page *PdfPage) SetAnnotations(annotations []*PdfAnnotation) {
|
||||
page.annotations = annotations
|
||||
func (p *PdfPage) SetAnnotations(annotations []*PdfAnnotation) {
|
||||
p.annotations = annotations
|
||||
}
|
||||
|
||||
// loadAnnotations loads and returns the PDF annotations from the input annotations object (array).
|
||||
|
Loading…
x
Reference in New Issue
Block a user