Skip referenced pages which are not present in the catalog (#377)

* Skip referenced pages which are not present in the catalog

* Improve documentation for the copyObject method of the writer

* Add creator test case for checking referenced page destinations
This commit is contained in:
Adrian-George Bostan 2020-06-18 18:06:06 +03:00 committed by GitHub
parent ae20c30ae4
commit 7bf2f62c3b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 155 additions and 51 deletions

View File

@ -3067,6 +3067,77 @@ func TestPageLabels(t *testing.T) {
require.Equal(t, core.EqualObjects(genPageLabels, pageLabels), true)
}
// TestReferencedPageDestinations verifies how outline destinations are written
// when some of the pages they reference are not added to the output document:
// destinations of pages present in the catalog must resolve to a page, while
// the remaining destinations are expected to be null objects.
func TestReferencedPageDestinations(t *testing.T) {
	// verifyDestinations parses the PDF held in data, then checks the catalog
	// page count and classifies every top-level outline entry destination as
	// either a resolvable page or a null object.
	verifyDestinations := func(data *bytes.Buffer, expectedPages, expectedNullDestPages int) {
		pdfReader, err := model.NewPdfReader(bytes.NewReader(data.Bytes()))
		require.NoError(t, err)

		// The catalog must list exactly the expected number of pages.
		pageCount, err := pdfReader.GetNumPages()
		require.NoError(t, err)
		require.Equal(t, expectedPages, pageCount)

		// Inspect the destination page of each outline entry.
		outlines, err := pdfReader.GetOutlines()
		require.NoError(t, err)

		validCount, nullCount := 0, 0
		for _, item := range outlines.Entries {
			dest := item.Dest.PageObj
			require.NotNil(t, dest)

			if !core.IsNullObject(dest) {
				// A non-null destination has to map back to a page of the document.
				_, _, err := pdfReader.PageFromIndirectObject(dest)
				require.NoError(t, err)
				validCount++
			} else {
				nullCount++
			}
		}

		require.Equal(t, expectedPages, validCount)
		require.Equal(t, expectedNullDestPages, nullCount)
	}

	// Build the input document: one chapter per page, with the table of
	// contents enabled.
	const chapterCount = 10

	c := New()
	c.AddTOC = true
	for n := 1; n <= chapterCount; n++ {
		chapter := c.NewChapter(fmt.Sprintf("Chapter %d", n))
		content := c.NewParagraph(fmt.Sprintf("Content for chapter %d", n))
		chapter.Add(content)
		require.NoError(t, c.Draw(chapter))

		// Start a fresh page for every chapter except the last one.
		if n != chapterCount {
			c.NewPage()
		}
	}

	buf := bytes.NewBuffer(nil)
	require.NoError(t, c.Write(buf))

	// The full document is expected to have 11 pages and no null destinations.
	verifyDestinations(buf, 11, 0)

	// Build the split document: keep only the pages at even indices, but carry
	// over the complete outline tree of the input document.
	reader, err := model.NewPdfReader(bytes.NewReader(buf.Bytes()))
	require.NoError(t, err)

	writer := model.NewPdfWriter()
	for idx, page := range reader.PageList {
		if idx%2 != 0 {
			continue
		}
		require.NoError(t, writer.AddPage(page))
	}
	writer.AddOutlineTree(reader.GetOutlineTree())

	buf = bytes.NewBuffer(nil)
	require.NoError(t, writer.Write(buf))

	// Six pages are kept; the five destinations of the dropped pages are
	// expected to be null.
	verifyDestinations(buf, 6, 5)
}
var errRenderNotSupported = errors.New("rendering pdf is not supported on this system")
// renderPDFToPNGs uses ghostscript (gs) to render specified PDF file into a set of PNG images (one per page).

View File

@ -133,6 +133,7 @@ func SetPdfTitle(title string) {
type PdfWriter struct {
root *core.PdfIndirectObject
pages *core.PdfIndirectObject
pagesMap map[core.PdfObject]struct{} // Pages lookup table.
objects []core.PdfObject // Objects to write.
objectsMap map[core.PdfObject]struct{} // Quick lookup table.
outlines []*core.PdfIndirectObject
@ -256,6 +257,7 @@ func NewPdfWriter() PdfWriter {
pages.PdfObject = pagedict
w.pages = &pages
w.pagesMap = map[core.PdfObject]struct{}{}
w.addObject(w.pages)
catalogDict.Set("Pages", &pages)
@ -270,114 +272,143 @@ func NewPdfWriter() PdfWriter {
// fills objectToObjectCopyMap to replace the old object with its copy if needed.
// The objectToObjectCopyMap parameter is needed to replace object references with
// their copies, because many objects contain references to other objects (e.g. pages to images).
func copyObject(obj core.PdfObject, objectToObjectCopyMap map[core.PdfObject]core.PdfObject) core.PdfObject {
// If a skip map is provided and the writer is not set to append mode, the
// children objects of pages which are not present in the catalog are added to
// the map and the page dictionaries are replaced with null objects.
func (w *PdfWriter) copyObject(obj core.PdfObject,
objectToObjectCopyMap map[core.PdfObject]core.PdfObject,
skipMap map[core.PdfObject]struct{}, skip bool) core.PdfObject {
if newObj, ok := objectToObjectCopyMap[obj]; ok {
return newObj
}
newObj := obj
skipUnusedPages := !w.appendMode && skipMap != nil
switch t := obj.(type) {
case *core.PdfObjectArray:
newObj := &core.PdfObjectArray{}
arrObj := core.MakeArray()
newObj = arrObj
objectToObjectCopyMap[obj] = newObj
for _, val := range t.Elements() {
newObj.Append(copyObject(val, objectToObjectCopyMap))
arrObj.Append(w.copyObject(val, objectToObjectCopyMap, skipMap, skip))
}
return newObj
case *core.PdfObjectStreams:
newObj := &core.PdfObjectStreams{PdfObjectReference: t.PdfObjectReference}
streamsObj := &core.PdfObjectStreams{PdfObjectReference: t.PdfObjectReference}
newObj = streamsObj
objectToObjectCopyMap[obj] = newObj
for _, val := range t.Elements() {
newObj.Append(copyObject(val, objectToObjectCopyMap))
streamsObj.Append(w.copyObject(val, objectToObjectCopyMap, skipMap, skip))
}
return newObj
case *core.PdfObjectStream:
newObj := &core.PdfObjectStream{
streamObj := &core.PdfObjectStream{
Stream: t.Stream,
PdfObjectReference: t.PdfObjectReference,
}
newObj = streamObj
objectToObjectCopyMap[obj] = newObj
newObj.PdfObjectDictionary = copyObject(t.PdfObjectDictionary, objectToObjectCopyMap).(*core.PdfObjectDictionary)
return newObj
streamObj.PdfObjectDictionary = w.copyObject(t.PdfObjectDictionary, objectToObjectCopyMap, skipMap, skip).(*core.PdfObjectDictionary)
case *core.PdfObjectDictionary:
newObj := core.MakeDict()
// Check if the object is a page dictionary and search it in the
// writer pages. If not found, replace it with a null object and add
// the chain of children objects to the skip map.
var unused bool
if skipUnusedPages && !skip {
if dictType, _ := core.GetNameVal(t.Get("Type")); dictType == "Page" {
_, ok := w.pagesMap[t]
skip = !ok
unused = skip
}
}
dictObj := core.MakeDict()
newObj = dictObj
objectToObjectCopyMap[obj] = newObj
for _, key := range t.Keys() {
val := t.Get(key)
newObj.Set(key, copyObject(val, objectToObjectCopyMap))
dictObj.Set(key, w.copyObject(t.Get(key), objectToObjectCopyMap, skipMap, skip))
}
// If an unused page dictionary is found, replace it with a null object.
if unused {
newObj = core.MakeNull()
skip = false
}
return newObj
case *core.PdfIndirectObject:
newObj := &core.PdfIndirectObject{
indObj := &core.PdfIndirectObject{
PdfObjectReference: t.PdfObjectReference,
}
newObj = indObj
objectToObjectCopyMap[obj] = newObj
newObj.PdfObject = copyObject(t.PdfObject, objectToObjectCopyMap)
return newObj
indObj.PdfObject = w.copyObject(t.PdfObject, objectToObjectCopyMap, skipMap, skip)
case *core.PdfObjectString:
newObj := &core.PdfObjectString{}
*newObj = *t
strObj := *t
newObj = &strObj
objectToObjectCopyMap[obj] = newObj
return newObj
case *core.PdfObjectName:
newObj := core.PdfObjectName(*t)
objectToObjectCopyMap[obj] = &newObj
return &newObj
nameObj := core.PdfObjectName(*t)
newObj = &nameObj
objectToObjectCopyMap[obj] = newObj
case *core.PdfObjectNull:
newObj := core.PdfObjectNull{}
objectToObjectCopyMap[obj] = &newObj
return &newObj
newObj = core.MakeNull()
objectToObjectCopyMap[obj] = newObj
case *core.PdfObjectInteger:
newObj := core.PdfObjectInteger(*t)
objectToObjectCopyMap[obj] = &newObj
return &newObj
intObj := core.PdfObjectInteger(*t)
newObj = &intObj
objectToObjectCopyMap[obj] = newObj
case *core.PdfObjectReference:
newObj := core.PdfObjectReference(*t)
objectToObjectCopyMap[obj] = &newObj
return &newObj
refObj := core.PdfObjectReference(*t)
newObj = &refObj
objectToObjectCopyMap[obj] = newObj
case *core.PdfObjectFloat:
newObj := core.PdfObjectFloat(*t)
objectToObjectCopyMap[obj] = &newObj
return &newObj
floatObj := core.PdfObjectFloat(*t)
newObj = &floatObj
objectToObjectCopyMap[obj] = newObj
case *core.PdfObjectBool:
newObj := core.PdfObjectBool(*t)
objectToObjectCopyMap[obj] = &newObj
return &newObj
boolObj := core.PdfObjectBool(*t)
newObj = &boolObj
objectToObjectCopyMap[obj] = newObj
case *pdfSignDictionary:
newObj := &pdfSignDictionary{
sigObj := &pdfSignDictionary{
PdfObjectDictionary: core.MakeDict(),
handler: t.handler,
signature: t.signature,
}
newObj = sigObj
objectToObjectCopyMap[obj] = newObj
for _, key := range t.Keys() {
val := t.Get(key)
newObj.Set(key, copyObject(val, objectToObjectCopyMap))
sigObj.Set(key, w.copyObject(t.Get(key), objectToObjectCopyMap, skipMap, skip))
}
return newObj
default:
common.Log.Info("TODO(a5i): implement copyObject for %+v", obj)
}
// return other objects as is
return obj
if skipUnusedPages && skip {
skipMap[obj] = struct{}{}
}
return newObj
}
// copyObjects makes objects copy and set as working.
func (w *PdfWriter) copyObjects() {
objectToObjectCopyMap := make(map[core.PdfObject]core.PdfObject)
objects := make([]core.PdfObject, len(w.objects))
objects := make([]core.PdfObject, 0, len(w.objects))
objectsMap := make(map[core.PdfObject]struct{}, len(w.objects))
for i, obj := range w.objects {
newObject := copyObject(obj, objectToObjectCopyMap)
objects[i] = newObject
skipMap := make(map[core.PdfObject]struct{})
for _, obj := range w.objects {
newObject := w.copyObject(obj, objectToObjectCopyMap, skipMap, false)
if _, ok := skipMap[obj]; ok {
continue
}
objects = append(objects, newObject)
objectsMap[newObject] = struct{}{}
}
w.objects = objects
w.objectsMap = objectsMap
w.infoObj = copyObject(w.infoObj, objectToObjectCopyMap).(*core.PdfIndirectObject)
w.root = copyObject(w.root, objectToObjectCopyMap).(*core.PdfIndirectObject)
w.infoObj = w.copyObject(w.infoObj, objectToObjectCopyMap, nil, false).(*core.PdfIndirectObject)
w.root = w.copyObject(w.root, objectToObjectCopyMap, nil, false).(*core.PdfIndirectObject)
if w.encryptObj != nil {
w.encryptObj = copyObject(w.encryptObj, objectToObjectCopyMap).(*core.PdfIndirectObject)
w.encryptObj = w.copyObject(w.encryptObj, objectToObjectCopyMap, nil, false).(*core.PdfIndirectObject)
}
// Update replace map.
@ -631,6 +662,8 @@ func (w *PdfWriter) AddPage(page *PdfPage) error {
return errors.New("invalid Pages Kids obj (not an array)")
}
kids.Append(pageObj)
w.pagesMap[pDict] = struct{}{}
pageCount, ok := core.GetInt(pagesDict.Get("Count"))
if !ok {
return errors.New("invalid Pages Count object (not an integer)")