2020-03-27 12:47:41 +01:00
|
|
|
/*
|
|
|
|
* This file is subject to the terms and conditions defined in
|
|
|
|
* file 'LICENSE.md', which is part of this source code package.
|
|
|
|
*/
|
|
|
|
|
|
|
|
package document
|
|
|
|
|
|
|
|
import (
|
|
|
|
"encoding/binary"
|
|
|
|
"io"
|
|
|
|
"runtime/debug"
|
|
|
|
|
|
|
|
"github.com/unidoc/unipdf/v3/common"
|
|
|
|
|
|
|
|
"github.com/unidoc/unipdf/v3/internal/jbig2/bitmap"
|
|
|
|
"github.com/unidoc/unipdf/v3/internal/jbig2/document/segments"
|
|
|
|
"github.com/unidoc/unipdf/v3/internal/jbig2/encoder/classer"
|
|
|
|
"github.com/unidoc/unipdf/v3/internal/jbig2/errors"
|
|
|
|
"github.com/unidoc/unipdf/v3/internal/jbig2/reader"
|
|
|
|
"github.com/unidoc/unipdf/v3/internal/jbig2/writer"
|
|
|
|
)
|
|
|
|
|
|
|
|
// fileHeaderID is the constant 8-byte ID string that starts a jbig2 encoded file, see D.4.1.
|
|
|
|
var fileHeaderID = []byte{0x97, 0x4A, 0x42, 0x32, 0x0D, 0x0A, 0x1A, 0x0A}
|
|
|
|
|
|
|
|
// Document is the jbig2 document model containing pages and global segments.
|
|
|
|
// By creating new document with method New or NewWithGlobals all the jbig2
|
|
|
|
// encoded data segment headers are decoded. In order to decode whole
|
|
|
|
// document, all of it's pages should be decoded using GetBitmap method.
|
|
|
|
// PDF encoded documents should contains only one Page with the number 1.
|
|
|
|
type Document struct {
|
|
|
|
// Pages contains all pages of this document.
|
|
|
|
Pages map[int]*Page
|
|
|
|
// NumberOfPagesUnknown defines if the amout of the pages is known.
|
|
|
|
NumberOfPagesUnknown bool
|
|
|
|
// NumberOfPages - D.4.3 - Number of pages field (4 bytes). Only presented if NumberOfPagesUnknown is true.
|
|
|
|
NumberOfPages uint32
|
|
|
|
// GBUseExtTemplate defines wether extended Template is used.
|
|
|
|
GBUseExtTemplate bool
|
|
|
|
// SubInputStream is the source data stream wrapped into a SubInputStream.
|
|
|
|
InputStream reader.StreamReader
|
|
|
|
// GlobalSegments contains all segments that aren't associated with a page.
|
|
|
|
GlobalSegments *Globals
|
|
|
|
// OrganizationType is the document segment organization.
|
|
|
|
OrganizationType segments.OrganizationType
|
|
|
|
|
|
|
|
// Encoder variables
|
|
|
|
// Classer is the document encoding classifier.
|
|
|
|
Classer *classer.Classer
|
|
|
|
// XRes and YRes are the PPI for the x and y direction.
|
|
|
|
XRes, YRes int
|
|
|
|
// FullHeaders is a flag that defines if the encoder should produce full JBIG2 files.
|
|
|
|
FullHeaders bool
|
|
|
|
|
|
|
|
// CurrentSegmentNumber current symbol number.
|
|
|
|
CurrentSegmentNumber uint32
|
|
|
|
|
|
|
|
// AverageTemplates are the grayed templates.
|
|
|
|
AverageTemplates *bitmap.Bitmaps
|
|
|
|
BaseIndexes []int
|
|
|
|
|
|
|
|
Refinement bool
|
|
|
|
RefineLevel int
|
|
|
|
|
|
|
|
fileHeaderLength uint8
|
|
|
|
|
|
|
|
w *writer.Buffer
|
|
|
|
|
|
|
|
EncodeGlobals bool
|
|
|
|
// globalSymbolsNumber is the number of globally defined symbols.
|
|
|
|
globalSymbolsNumber int
|
|
|
|
// singleUseSymbols is the mapping between the page and exclusively used components for that page.
|
|
|
|
// This means that if some symbols are mapped to the page 1 only then that information would be stored here.
|
|
|
|
singleUseSymbols map[int][]int
|
|
|
|
// pageComponents is the mapping from page number to the list of connected components for that page.
|
|
|
|
// i.e. 1 map[1]=[]int{1,2,3} means that page 1 contains components [1,2,3]
|
|
|
|
pageComponents map[int][]int
|
|
|
|
// symbolsUsed is the array that contains the number of a symbol being used.
|
|
|
|
// Array index is the class-id of the symbol, where the value is it's occurrence.
|
|
|
|
symbolsUsed []int
|
|
|
|
// symbolIndexMap is the mapping between symbol class and the number it represents.
|
|
|
|
symbolIndexMap map[int]int
|
|
|
|
}
|
|
|
|
|
|
|
|
// DecodeDocument decodes provided document based on the provided 'input' data stream
|
|
|
|
// and with optional Global defined segments 'globals'.
|
|
|
|
func DecodeDocument(input reader.StreamReader, globals *Globals) (*Document, error) {
|
|
|
|
return decodeWithGlobals(input, globals)
|
|
|
|
}
|
|
|
|
|
|
|
|
// InitEncodeDocument initializes the jbig2 document for the encoding process.
|
|
|
|
func InitEncodeDocument(fullHeaders bool) *Document {
|
|
|
|
return &Document{FullHeaders: fullHeaders, w: writer.BufferedMSB(), Pages: map[int]*Page{}, singleUseSymbols: map[int][]int{}, symbolIndexMap: map[int]int{}, pageComponents: map[int][]int{}}
|
|
|
|
}
|
|
|
|
|
|
|
|
// AddGenericPage creates the jbig2 page based on the provided bitmap. The data provided
|
|
|
|
func (d *Document) AddGenericPage(bm *bitmap.Bitmap, duplicateLineRemoval bool) (err error) {
|
|
|
|
const processName = "Document.AddGenericPage"
|
|
|
|
// check if this is PDFMode and there is already a page
|
|
|
|
if !d.FullHeaders && d.NumberOfPages != 0 {
|
2020-04-03 22:54:59 +02:00
|
|
|
return errors.Error(processName, "document already contains page. FileMode disallows adding more than one page")
|
2020-03-27 12:47:41 +01:00
|
|
|
}
|
|
|
|
// initialize page
|
|
|
|
page := &Page{
|
|
|
|
Segments: []*segments.Header{},
|
|
|
|
Bitmap: bm,
|
|
|
|
Document: d,
|
|
|
|
FinalHeight: bm.Height,
|
|
|
|
FinalWidth: bm.Width,
|
|
|
|
IsLossless: true,
|
|
|
|
}
|
|
|
|
page.PageNumber = int(d.nextPageNumber())
|
|
|
|
d.Pages[page.PageNumber] = page
|
|
|
|
|
|
|
|
// add page information segment.
|
|
|
|
page.AddPageInformationSegment()
|
|
|
|
// add the generic region for given bitmap
|
|
|
|
if err = page.AddGenericRegion(bm, 0, 0, 0, segments.TImmediateGenericRegion, duplicateLineRemoval); err != nil {
|
|
|
|
return errors.Wrap(err, processName, "")
|
|
|
|
}
|
|
|
|
if d.FullHeaders {
|
|
|
|
// finish the page
|
|
|
|
page.AddEndOfPageSegment()
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// AddClassifiedPage adds the bitmap page with a classification 'method'.
|
|
|
|
func (d *Document) AddClassifiedPage(bm *bitmap.Bitmap, method classer.Method) (err error) {
|
|
|
|
const processName = "Document.AddClassifiedPage"
|
|
|
|
// check if this is PDFMode and there is already a page
|
|
|
|
if !d.FullHeaders && d.NumberOfPages != 0 {
|
|
|
|
return errors.Error(processName, "document already contains page. FileMode disallows adding more than one page")
|
|
|
|
}
|
|
|
|
// initialize the classer if not set yet
|
|
|
|
if d.Classer == nil {
|
|
|
|
if d.Classer, err = classer.Init(classer.DefaultSettings()); err != nil {
|
|
|
|
return errors.Wrap(err, processName, "")
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
pageNumber := int(d.nextPageNumber())
|
|
|
|
p := &Page{
|
|
|
|
Segments: []*segments.Header{},
|
|
|
|
Bitmap: bm,
|
|
|
|
Document: d,
|
|
|
|
FinalHeight: bm.Height,
|
|
|
|
FinalWidth: bm.Width,
|
|
|
|
PageNumber: pageNumber,
|
|
|
|
}
|
|
|
|
d.Pages[pageNumber] = p
|
|
|
|
switch method {
|
|
|
|
case classer.RankHaus:
|
|
|
|
p.EncodingMethod = RankHausEM
|
|
|
|
case classer.Correlation:
|
|
|
|
p.EncodingMethod = CorrelationEM
|
|
|
|
}
|
|
|
|
|
|
|
|
p.AddPageInformationSegment()
|
|
|
|
if err = d.Classer.AddPage(bm, pageNumber, method); err != nil {
|
|
|
|
return errors.Wrap(err, processName, "")
|
|
|
|
}
|
|
|
|
|
|
|
|
if d.FullHeaders {
|
|
|
|
// finish the page
|
|
|
|
p.AddEndOfPageSegment()
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// completeClassifiedPages completes the pages with the classification encoding type.
|
|
|
|
// creates global symbol dictionary segment.
|
|
|
|
func (d *Document) completeClassifiedPages() (err error) {
|
|
|
|
const processName = "completeClassifiedPages"
|
|
|
|
|
|
|
|
// check if the Classer is initialized
|
|
|
|
if d.Classer == nil {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
// map symbol number to the times it was used
|
|
|
|
d.symbolsUsed = make([]int, d.Classer.UndilatedTemplates.Size())
|
|
|
|
for i := 0; i < d.Classer.ClassIDs.Size(); i++ {
|
|
|
|
classID, err := d.Classer.ClassIDs.Get(i)
|
|
|
|
if err != nil {
|
|
|
|
return errors.Wrapf(err, processName, "class with id: '%d'", i)
|
|
|
|
}
|
|
|
|
d.symbolsUsed[classID]++
|
|
|
|
}
|
|
|
|
|
|
|
|
// find if there are any symbols used on multiple pages at once - Globals.
|
|
|
|
var globalSymbols []int
|
|
|
|
for i := 0; i < d.Classer.UndilatedTemplates.Size(); i++ {
|
|
|
|
if d.NumberOfPages == 1 || d.symbolsUsed[i] > 1 {
|
|
|
|
globalSymbols = append(globalSymbols, i)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// build page components map
|
|
|
|
var (
|
|
|
|
page *Page
|
|
|
|
ok bool
|
|
|
|
)
|
|
|
|
// iterate over all classified pages.
|
|
|
|
// create a page components map which maps the page number to the list of connected components
|
|
|
|
// for that page.
|
|
|
|
for i, pageNumber := range *d.Classer.ComponentPageNumbers {
|
|
|
|
if page, ok = d.Pages[pageNumber]; !ok {
|
|
|
|
return errors.Errorf(processName, "page: '%d' not found", i)
|
|
|
|
}
|
|
|
|
// make sure that the page is not Generic Encoded.
|
|
|
|
if page.EncodingMethod == GenericEM {
|
|
|
|
common.Log.Error("Generic page with number: '%d' mapped as classified page", i)
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
// add component to the given page number components.
|
|
|
|
d.pageComponents[pageNumber] = append(d.pageComponents[pageNumber], i)
|
|
|
|
|
|
|
|
// check if the symbol was used on this page exclusively.
|
|
|
|
symbol, err := d.Classer.ClassIDs.Get(i)
|
|
|
|
if err != nil {
|
|
|
|
return errors.Wrapf(err, processName, "no such classID: %d", i)
|
|
|
|
}
|
|
|
|
if d.symbolsUsed[symbol] == 1 && d.NumberOfPages != 1 {
|
|
|
|
sus := append(d.singleUseSymbols[pageNumber], symbol)
|
|
|
|
d.singleUseSymbols[pageNumber] = sus
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if err = d.Classer.ComputeLLCorners(); err != nil {
|
|
|
|
return errors.Wrap(err, processName, "")
|
|
|
|
}
|
|
|
|
|
|
|
|
// TODO: set the page number of the symold dictionary depending if there are more than one page.
|
|
|
|
// This is related with the mapping[*bitmap.Bitmap]int
|
|
|
|
// create global symbols
|
|
|
|
if _, err = d.addSymbolDictionary(0, d.Classer.UndilatedTemplates, globalSymbols, d.symbolIndexMap, false); err != nil {
|
|
|
|
return errors.Wrap(err, processName, "")
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func (d *Document) produceClassifiedPages() (err error) {
|
|
|
|
const processName = "produceClassifiedPages"
|
|
|
|
if d.Classer == nil {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
var (
|
|
|
|
page *Page
|
|
|
|
ok bool
|
|
|
|
globalSD *segments.Header
|
|
|
|
)
|
|
|
|
// iterate over all pages and find if it is of type different than GenericEM.
|
|
|
|
for i := 1; i <= int(d.NumberOfPages); i++ {
|
|
|
|
if page, ok = d.Pages[i]; !ok {
|
|
|
|
return errors.Errorf(processName, "page: '%d' not found", i)
|
|
|
|
}
|
|
|
|
if page.EncodingMethod == GenericEM {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
if globalSD == nil {
|
|
|
|
if globalSD, err = d.GlobalSegments.GetSymbolDictionary(); err != nil {
|
|
|
|
return errors.Wrap(err, processName, "")
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// produce new classified page.
|
|
|
|
if err = d.produceClassifiedPage(page, globalSD); err != nil {
|
|
|
|
return errors.Wrapf(err, processName, "page: '%d'", i)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func (d *Document) produceClassifiedPage(page *Page, globalSD *segments.Header) (err error) {
|
|
|
|
const processName = "produceClassifiedPage"
|
|
|
|
// if given page contains it's own unique symbols
|
|
|
|
// create additional symbols dictionary
|
|
|
|
var secondSymbolMap map[int]int
|
|
|
|
numSyms := d.globalSymbolsNumber
|
|
|
|
refererTo := []*segments.Header{globalSD}
|
|
|
|
|
|
|
|
// check if there are symbols used only for given page. This would be present only if there is
|
|
|
|
// more then 1 page in the document.
|
|
|
|
if len(d.singleUseSymbols[page.PageNumber]) > 0 {
|
|
|
|
// create new symbols dictionary
|
|
|
|
secondSymbolMap = map[int]int{}
|
|
|
|
extraSDHeader, err := d.addSymbolDictionary(page.PageNumber, d.Classer.UndilatedTemplates, d.singleUseSymbols[page.PageNumber], secondSymbolMap, false)
|
|
|
|
if err != nil {
|
|
|
|
return errors.Wrap(err, processName, "")
|
|
|
|
}
|
|
|
|
refererTo = append(refererTo, extraSDHeader)
|
|
|
|
numSyms += len(d.singleUseSymbols[page.PageNumber])
|
|
|
|
}
|
|
|
|
// get components array for given page.
|
|
|
|
comps := d.pageComponents[page.PageNumber]
|
|
|
|
common.Log.Debug("Page: '%d' comps: %v", page.PageNumber, comps)
|
|
|
|
// add the text region to the page
|
|
|
|
page.addTextRegionSegment(refererTo, d.symbolIndexMap, secondSymbolMap, d.pageComponents[page.PageNumber],
|
|
|
|
d.Classer.PtaLL, d.Classer.UndilatedTemplates, d.Classer.ClassIDs, nil,
|
|
|
|
log2up(numSyms), len(d.pageComponents[page.PageNumber]))
|
|
|
|
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// log2up returns the base 2 logarithm of 'v' rounded up, which is the number
// of bits required to distinguish 'v' different values.
// For 'v' lower or equal to 1 (including negative values) the result is 0.
func log2up(v int) int {
	// guard non-positive values - shifting a negative value right never reaches zero.
	if v <= 1 {
		return 0
	}
	// the result is the bit length of v-1.
	r := 0
	for x := v - 1; x > 0; x >>= 1 {
		r++
	}
	return r
}
|
|
|
|
|
|
|
|
func (d *Document) addSymbolDictionary(
|
|
|
|
pageNumber int, symbols *bitmap.Bitmaps, symbolList []int,
|
|
|
|
symbolMap map[int]int, unborderSymbols bool,
|
|
|
|
) (*segments.Header, error) {
|
|
|
|
const processName = "addSymbolDictionary"
|
|
|
|
// add symbolTable
|
|
|
|
sd := &segments.SymbolDictionary{}
|
|
|
|
if err := sd.InitEncode(symbols, symbolList, symbolMap, unborderSymbols); err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
sh := &segments.Header{
|
|
|
|
Type: segments.TSymbolDictionary,
|
|
|
|
PageAssociation: pageNumber,
|
|
|
|
SegmentData: sd,
|
|
|
|
}
|
|
|
|
if pageNumber == 0 {
|
|
|
|
if d.GlobalSegments == nil {
|
|
|
|
d.GlobalSegments = &Globals{}
|
|
|
|
}
|
|
|
|
d.GlobalSegments.AddSegment(sh)
|
|
|
|
return sh, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// find the page at Number: 'pageNumber'
|
|
|
|
// add the symbol dictionary after page info header
|
|
|
|
page, ok := d.Pages[pageNumber]
|
|
|
|
if !ok {
|
|
|
|
return nil, errors.Errorf(processName, "page: '%d' not found", pageNumber)
|
|
|
|
}
|
|
|
|
var (
|
|
|
|
i int
|
|
|
|
seg *segments.Header
|
|
|
|
)
|
|
|
|
for i, seg = range page.Segments {
|
|
|
|
if seg.Type == segments.TPageInformation {
|
|
|
|
break
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// the segment should be after page info
|
|
|
|
i++
|
|
|
|
page.Segments = append(page.Segments, nil)
|
|
|
|
copy(page.Segments[i+1:], page.Segments[i:])
|
|
|
|
page.Segments[i] = sh
|
|
|
|
|
|
|
|
return sh, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func (d *Document) completeSymbols() (err error) {
|
|
|
|
const processName = "completeSymbols"
|
|
|
|
// check if classer was used.
|
|
|
|
if d.Classer == nil {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
if d.Classer.UndilatedTemplates == nil {
|
|
|
|
return errors.Error(processName, "no templates defined for the classer")
|
|
|
|
}
|
|
|
|
|
|
|
|
// select the symbols used only once per page
|
|
|
|
// map these symbols
|
|
|
|
singlePage := len(d.Pages) == 1
|
|
|
|
symbolsUsed := make([]int, d.Classer.UndilatedTemplates.Size())
|
|
|
|
var n int
|
|
|
|
for i := 0; i < d.Classer.ClassIDs.Size(); i++ {
|
|
|
|
n, err = d.Classer.ClassIDs.Get(i)
|
|
|
|
if err != nil {
|
|
|
|
return errors.Wrap(err, processName, "class ID's")
|
|
|
|
}
|
|
|
|
symbolsUsed[n]++
|
|
|
|
}
|
|
|
|
|
|
|
|
var multiuseSymbols []int
|
|
|
|
for i := 0; i < d.Classer.UndilatedTemplates.Size(); i++ {
|
|
|
|
if symbolsUsed[i] == 0 {
|
|
|
|
return errors.Error(processName, "no symbols instances found for given class? ")
|
|
|
|
}
|
|
|
|
if symbolsUsed[i] > 1 || singlePage {
|
|
|
|
multiuseSymbols = append(multiuseSymbols, i)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
d.globalSymbolsNumber = len(multiuseSymbols)
|
|
|
|
|
|
|
|
// build page components map for page number to the list of connected components for that page.
|
|
|
|
var pageNum, symNum int
|
|
|
|
for i := 0; i < d.Classer.ComponentPageNumbers.Size(); i++ {
|
|
|
|
pageNum, err = d.Classer.ComponentPageNumbers.Get(i)
|
|
|
|
if err != nil {
|
|
|
|
return errors.Wrapf(err, processName, "page: '%d' not found in the classer pagenumbers", i)
|
|
|
|
}
|
|
|
|
symNum, err = d.Classer.ClassIDs.Get(i)
|
|
|
|
if err != nil {
|
|
|
|
return errors.Wrapf(err, processName, "can't get symbol for page '%d' from classer", pageNum)
|
|
|
|
}
|
|
|
|
if symbolsUsed[symNum] == 1 && !singlePage {
|
|
|
|
d.singleUseSymbols[pageNum] = append(d.singleUseSymbols[pageNum], symNum)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if err = d.Classer.ComputeLLCorners(); err != nil {
|
|
|
|
return errors.Wrap(err, processName, "")
|
|
|
|
}
|
|
|
|
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// AutoThreshold gathers classes of symbols and uses a single representative to stand for them all.
|
|
|
|
// func (d *Document) AutoThreshold() error {
|
|
|
|
// bms := d.Classer.Pixat
|
|
|
|
// for i := 0; i < len(bm); i++ {
|
|
|
|
// bm := bms[i]
|
|
|
|
//
|
|
|
|
// for j := i + 1; j < len(bms); j++ {
|
|
|
|
// if bm.Equivalent(bms[j]) {
|
|
|
|
// if err := d.uniteTemplatesWithIndexes(i, j); err != nil {
|
|
|
|
// return err
|
|
|
|
// }
|
|
|
|
// j--
|
|
|
|
// }
|
|
|
|
// }
|
|
|
|
// }
|
|
|
|
// return nil
|
|
|
|
// }
|
|
|
|
|
|
|
|
// Encode encodes the given document and stores into 'w' writer.
|
|
|
|
func (d *Document) Encode() (data []byte, err error) {
|
|
|
|
const processName = "Document.Encode"
|
|
|
|
var n, temp int
|
|
|
|
// if the full headers flag is on, encode file header
|
|
|
|
if d.FullHeaders {
|
|
|
|
if n, err = d.encodeFileHeader(d.w); err != nil {
|
|
|
|
return nil, errors.Wrap(err, processName, "")
|
|
|
|
}
|
|
|
|
}
|
|
|
|
var (
|
|
|
|
ok bool
|
|
|
|
seg *segments.Header
|
|
|
|
page *Page
|
|
|
|
)
|
|
|
|
// complete classified pages
|
|
|
|
if err = d.completeClassifiedPages(); err != nil {
|
|
|
|
return nil, errors.Wrap(err, processName, "")
|
|
|
|
}
|
|
|
|
// produce classified pages
|
|
|
|
if err = d.produceClassifiedPages(); err != nil {
|
|
|
|
return nil, errors.Wrap(err, processName, "")
|
|
|
|
}
|
|
|
|
|
|
|
|
if d.GlobalSegments != nil {
|
|
|
|
for _, seg = range d.GlobalSegments.Segments {
|
|
|
|
if err = d.encodeSegment(seg, &n); err != nil {
|
|
|
|
return nil, errors.Wrap(err, processName, "")
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
for i := 1; i <= int(d.NumberOfPages); i++ {
|
|
|
|
if page, ok = d.Pages[i]; !ok {
|
|
|
|
return nil, errors.Errorf(processName, "page: '%d' not found", i)
|
|
|
|
}
|
|
|
|
for _, seg = range page.Segments {
|
|
|
|
if err = d.encodeSegment(seg, &n); err != nil {
|
|
|
|
return nil, errors.Wrap(err, processName, "")
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// with full headers the End Of File header must be encoded
|
|
|
|
if d.FullHeaders {
|
|
|
|
if temp, err = d.encodeEOFHeader(d.w); err != nil {
|
|
|
|
return nil, errors.Wrap(err, processName, "")
|
|
|
|
}
|
|
|
|
n += temp
|
|
|
|
}
|
|
|
|
|
|
|
|
// get the encoded data from the 'w' writer
|
|
|
|
data = d.w.Data()
|
|
|
|
if len(data) != n {
|
|
|
|
common.Log.Debug("Bytes written (n): '%d' is not equal to the length of the data encoded: '%d'", n, len(data))
|
|
|
|
}
|
|
|
|
return data, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func (d *Document) encodeSegment(seg *segments.Header, n *int) error {
|
|
|
|
const processName = "encodeSegment"
|
|
|
|
seg.SegmentNumber = d.nextSegmentNumber()
|
|
|
|
|
|
|
|
temp, err := seg.Encode(d.w)
|
|
|
|
if err != nil {
|
|
|
|
return errors.Wrapf(err, processName, "segment: '%d'", seg.SegmentNumber)
|
|
|
|
}
|
|
|
|
*n += temp
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// GetGlobalSegment implements segments.Documenter interface.
|
|
|
|
func (d *Document) GetGlobalSegment(i int) (*segments.Header, error) {
|
|
|
|
h, err := d.GlobalSegments.GetSegment(i)
|
|
|
|
if err != nil {
|
|
|
|
return nil, errors.Wrap(err, "GetGlobalSegment", "")
|
|
|
|
}
|
|
|
|
return h, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// GetNumberOfPages gets the amount of Pages in the given document.
|
|
|
|
func (d *Document) GetNumberOfPages() (uint32, error) {
|
|
|
|
if d.NumberOfPagesUnknown || d.NumberOfPages == 0 {
|
|
|
|
if len(d.Pages) == 0 {
|
|
|
|
if err := d.mapData(); err != nil {
|
|
|
|
return 0, errors.Wrap(err, "Document.GetNumberOfPages", "")
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return uint32(len(d.Pages)), nil
|
|
|
|
}
|
|
|
|
return d.NumberOfPages, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// GetPage implements segments.Documenter interface.
|
|
|
|
// NOTE: in order to decode all document images, get page by page (page numeration starts from '1') and
|
|
|
|
// decode them by calling 'GetBitmap' method.
|
|
|
|
func (d *Document) GetPage(pageNumber int) (segments.Pager, error) {
|
|
|
|
const processName = "Document.GetPage"
|
|
|
|
if pageNumber < 0 {
|
|
|
|
common.Log.Debug("JBIG2 Page - GetPage: %d. Page cannot be lower than 0. %s", pageNumber, debug.Stack())
|
|
|
|
return nil, errors.Errorf(processName, "invalid jbig2 document - provided invalid page number: %d", pageNumber)
|
|
|
|
}
|
|
|
|
|
|
|
|
if pageNumber > len(d.Pages) {
|
|
|
|
common.Log.Debug("Page not found: %d. %s", pageNumber, debug.Stack())
|
|
|
|
return nil, errors.Error(processName, "invalid jbig2 document - page not found")
|
|
|
|
}
|
|
|
|
|
|
|
|
p, ok := d.Pages[pageNumber]
|
|
|
|
if !ok {
|
|
|
|
common.Log.Debug("Page not found: %d. %s", pageNumber, debug.Stack())
|
|
|
|
return nil, errors.Errorf(processName, "invalid jbig2 document - page not found")
|
|
|
|
}
|
|
|
|
|
|
|
|
return p, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
|
|
|
Private document methods
|
|
|
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
func (d *Document) determineRandomDataOffsets(segmentHeaders []*segments.Header, offset uint64) {
|
|
|
|
if d.OrganizationType != segments.ORandom {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
for _, s := range segmentHeaders {
|
|
|
|
s.SegmentDataStartOffset = offset
|
|
|
|
offset += s.SegmentDataLength
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// encodeEOFHeader writes the end of file header segment into 'w' writer.
|
|
|
|
func (d *Document) encodeEOFHeader(w writer.BinaryWriter) (n int, err error) {
|
|
|
|
h := &segments.Header{SegmentNumber: d.nextSegmentNumber(), Type: segments.TEndOfFile}
|
|
|
|
if n, err = h.Encode(w); err != nil {
|
|
|
|
return 0, errors.Wrap(err, "encodeEOFHeader", "")
|
|
|
|
}
|
|
|
|
return n, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// encodeFileHeader writes the file header segment into the 'w' writer.
|
|
|
|
func (d *Document) encodeFileHeader(w writer.BinaryWriter) (n int, err error) {
|
|
|
|
const processName = "encodeFileHeader"
|
|
|
|
// file header contains following fields:
|
|
|
|
// ID string - constant 8-byte sequence
|
|
|
|
n, err = w.Write(fileHeaderID)
|
|
|
|
if err != nil {
|
|
|
|
return n, errors.Wrap(err, processName, "id")
|
|
|
|
}
|
|
|
|
|
|
|
|
// file header flags one byte field
|
|
|
|
// where:
|
|
|
|
// 0th bit - file organization type - '1' for sequential, '0' for random-access
|
|
|
|
// 1st bit - unknown number of pages - '1' if true
|
|
|
|
// this encoder stores only sequential organization type with known number of pages
|
|
|
|
// thus file header flags are equal to 0x01 byte
|
|
|
|
if err = w.WriteByte(0x01); err != nil {
|
|
|
|
return n, errors.Wrap(err, processName, "flags")
|
|
|
|
}
|
|
|
|
n++
|
|
|
|
|
|
|
|
temp := make([]byte, 4)
|
|
|
|
// last 4 bytes is the number of pages - used if the number of pages is known
|
|
|
|
binary.BigEndian.PutUint32(temp, d.NumberOfPages)
|
|
|
|
nt, err := w.Write(temp)
|
|
|
|
if err != nil {
|
|
|
|
return nt, errors.Wrap(err, processName, "page number")
|
|
|
|
}
|
|
|
|
n += nt
|
|
|
|
return n, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func (d *Document) isFileHeaderPresent() (bool, error) {
|
|
|
|
d.InputStream.Mark()
|
|
|
|
|
|
|
|
for _, magicByte := range fileHeaderID {
|
|
|
|
b, err := d.InputStream.ReadByte()
|
|
|
|
if err != nil {
|
|
|
|
return false, err
|
|
|
|
}
|
|
|
|
|
|
|
|
if magicByte != b {
|
|
|
|
d.InputStream.Reset()
|
|
|
|
return false, nil
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
d.InputStream.Reset()
|
|
|
|
return true, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func (d *Document) mapData() error {
|
|
|
|
const processName = "mapData"
|
|
|
|
// Get the header list
|
|
|
|
var (
|
|
|
|
segmentHeaders []*segments.Header
|
|
|
|
offset int64
|
|
|
|
kind segments.Type
|
|
|
|
)
|
|
|
|
|
|
|
|
isFileHeaderPresent, err := d.isFileHeaderPresent()
|
|
|
|
if err != nil {
|
|
|
|
return errors.Wrap(err, processName, "")
|
|
|
|
}
|
|
|
|
|
|
|
|
// Parse the file header if exists.
|
|
|
|
if isFileHeaderPresent {
|
|
|
|
if err = d.parseFileHeader(); err != nil {
|
|
|
|
return errors.Wrap(err, processName, "")
|
|
|
|
}
|
|
|
|
offset += int64(d.fileHeaderLength)
|
|
|
|
d.FullHeaders = true
|
|
|
|
}
|
|
|
|
|
|
|
|
var (
|
|
|
|
page *Page
|
|
|
|
reachedEOF bool
|
|
|
|
)
|
|
|
|
|
|
|
|
// type 51 is the EndOfFile segment kind
|
|
|
|
for kind != 51 && !reachedEOF {
|
|
|
|
// get new segment
|
|
|
|
segment, err := segments.NewHeader(d, d.InputStream, offset, d.OrganizationType)
|
|
|
|
if err != nil {
|
|
|
|
return errors.Wrap(err, processName, "")
|
|
|
|
}
|
|
|
|
|
|
|
|
common.Log.Trace("Decoding segment number: %d, Type: %s", segment.SegmentNumber, segment.Type)
|
|
|
|
|
|
|
|
kind = segment.Type
|
|
|
|
if kind != segments.TEndOfFile {
|
|
|
|
if segment.PageAssociation != 0 {
|
|
|
|
page = d.Pages[segment.PageAssociation]
|
|
|
|
if page == nil {
|
|
|
|
page = newPage(d, segment.PageAssociation)
|
|
|
|
d.Pages[segment.PageAssociation] = page
|
|
|
|
if d.NumberOfPagesUnknown {
|
|
|
|
d.NumberOfPages++
|
|
|
|
}
|
|
|
|
}
|
|
|
|
page.Segments = append(page.Segments, segment)
|
|
|
|
} else {
|
|
|
|
d.GlobalSegments.AddSegment(segment)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
segmentHeaders = append(segmentHeaders, segment)
|
|
|
|
offset = d.InputStream.StreamPosition()
|
|
|
|
|
|
|
|
if d.OrganizationType == segments.OSequential {
|
|
|
|
offset += int64(segment.SegmentDataLength)
|
|
|
|
}
|
|
|
|
|
|
|
|
reachedEOF, err = d.reachedEOF(offset)
|
|
|
|
if err != nil {
|
|
|
|
common.Log.Debug("jbig2 document reached EOF with error: %v", err)
|
|
|
|
return errors.Wrap(err, processName, "")
|
|
|
|
}
|
|
|
|
}
|
|
|
|
d.determineRandomDataOffsets(segmentHeaders, uint64(offset))
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func (d *Document) nextPageNumber() uint32 {
|
|
|
|
d.NumberOfPages++
|
|
|
|
return d.NumberOfPages
|
|
|
|
}
|
|
|
|
|
|
|
|
func (d *Document) nextSegmentNumber() uint32 {
|
|
|
|
s := d.CurrentSegmentNumber
|
|
|
|
d.CurrentSegmentNumber++
|
|
|
|
return s
|
|
|
|
}
|
|
|
|
|
|
|
|
func (d *Document) parseFileHeader() error {
|
|
|
|
const processName = "parseFileHeader"
|
|
|
|
// D.4.1 ID string read will be skipped.
|
|
|
|
_, err := d.InputStream.Seek(8, io.SeekStart)
|
|
|
|
if err != nil {
|
|
|
|
return errors.Wrap(err, processName, "id")
|
|
|
|
}
|
|
|
|
|
|
|
|
// D.4.2 Header flag (1 byte)
|
|
|
|
// Bit 3-7 are reserved and must be 0
|
|
|
|
_, err = d.InputStream.ReadBits(5)
|
|
|
|
if err != nil {
|
|
|
|
return errors.Wrap(err, processName, "reserved bits")
|
|
|
|
}
|
|
|
|
|
|
|
|
// Bit 2 - extended templates are used
|
|
|
|
b, err := d.InputStream.ReadBit()
|
|
|
|
if err != nil {
|
|
|
|
return errors.Wrap(err, processName, "extended templates")
|
|
|
|
}
|
|
|
|
if b == 1 {
|
|
|
|
d.GBUseExtTemplate = true
|
|
|
|
}
|
|
|
|
|
|
|
|
// Bit 1 - Indicates if amount of pages are unknown.
|
|
|
|
b, err = d.InputStream.ReadBit()
|
|
|
|
if err != nil {
|
|
|
|
return errors.Wrap(err, processName, "unknown page number")
|
|
|
|
}
|
|
|
|
if b != 1 {
|
|
|
|
d.NumberOfPagesUnknown = false
|
|
|
|
}
|
|
|
|
|
|
|
|
// Bit 0 - Indicates file organization type.
|
|
|
|
b, err = d.InputStream.ReadBit()
|
|
|
|
if err != nil {
|
|
|
|
return errors.Wrap(err, processName, "organization type")
|
|
|
|
}
|
|
|
|
d.OrganizationType = segments.OrganizationType(b)
|
|
|
|
|
|
|
|
// D.4.3 Number of pages
|
|
|
|
if !d.NumberOfPagesUnknown {
|
|
|
|
d.NumberOfPages, err = d.InputStream.ReadUint32()
|
|
|
|
if err != nil {
|
|
|
|
return errors.Wrap(err, processName, "number of pages")
|
|
|
|
}
|
|
|
|
d.fileHeaderLength = 13
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func (d *Document) reachedEOF(offset int64) (bool, error) {
|
|
|
|
const processName = "reachedEOF"
|
|
|
|
_, err := d.InputStream.Seek(offset, io.SeekStart)
|
|
|
|
if err != nil {
|
|
|
|
common.Log.Debug("reachedEOF - d.InputStream.Seek failed: %v", err)
|
|
|
|
return false, errors.Wrap(err, processName, "input stream seek failed")
|
|
|
|
}
|
|
|
|
|
|
|
|
_, err = d.InputStream.ReadBits(32)
|
|
|
|
if err == io.EOF {
|
|
|
|
return true, nil
|
|
|
|
} else if err != nil {
|
|
|
|
return false, errors.Wrap(err, processName, "")
|
|
|
|
}
|
|
|
|
return false, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func decodeWithGlobals(input reader.StreamReader, globals *Globals) (*Document, error) {
|
|
|
|
d := &Document{
|
|
|
|
Pages: make(map[int]*Page),
|
|
|
|
InputStream: input,
|
|
|
|
OrganizationType: segments.OSequential,
|
|
|
|
NumberOfPagesUnknown: true,
|
|
|
|
GlobalSegments: globals,
|
|
|
|
fileHeaderLength: 9,
|
|
|
|
}
|
|
|
|
|
|
|
|
if d.GlobalSegments == nil {
|
|
|
|
d.GlobalSegments = &Globals{}
|
|
|
|
}
|
|
|
|
|
|
|
|
// mapData map the data stream
|
|
|
|
if err := d.mapData(); err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
return d, nil
|
|
|
|
}
|