805 lines
24 KiB
Go
Raw Normal View History

JBIG2 Generic Encoder (#264) * Prepared skeleton and basic component implementations for the jbig2 encoding. * Added Bitset. Implemented Bitmap. * Decoder with old Arithmetic Decoder * Partly working arithmetic * Working arithmetic decoder. * MMR patched. * rebuild to apache. * Working generic * Working generic * Decoded full document * Update Jenkinsfile go version [master] (#398) * Update Jenkinsfile go version * Decoded AnnexH document * Minor issues fixed. * Update README.md * Fixed generic region errors. Added benchmark. Added bitmap unpadder. Added Bitmap toImage method. * Fixed endofpage error * Added integration test. * Decoded all test files without errors. Implemented JBIG2Global. * Merged with v3 version * Fixed the EOF in the globals issue * Fixed the JBIG2 ChocolateData Decode * JBIG2 Added license information * Minor fix in jbig2 encoding. * Applied the logging convention * Cleaned unnecessary imports * Go modules clear unused imports * checked out the README.md * Moved trace to Debug. Fixed the build integrate tag in the document_decode_test.go * Initial encoder skeleton * Applied UniPDF Developer Guide. Fixed lint issues. * Cleared documentation, fixed style issues. * Added jbig2 doc.go files. Applied unipdf guide style. * Minor code style changes. * Minor naming and style issues fixes. * Minor naming changes. Style issues fixed. * Review r11 fixes. * Added JBIG2 Encoder skeleton. * Moved Document and Page to jbig2/document package. Created decoder package responsible for decoding jbig2 stream. * Implemented raster functions. * Added raster uni low test funcitons. * Added raster low test functions * untracked files on jbig2-encoder: c869089 Added raster low test functions * index on jbig2-encoder: c869089 Added raster low test functions * Added morph files. 
* implemented jbig2 encoder basics * JBIG2 Encoder - Generic method * Added jbig2 image encode ttests, black/white image tests * cleaned and tested jbig2 package * unfinished jbig2 classified encoder * jbig2 minor style changes * minor jbig2 encoder changes * prepared JBIG2 Encoder * Style and lint fixes * Minor changes and lints * Fixed shift unsinged value build errors * Minor naming change * Added jbig2 encode, image gondels. Fixed jbig2 decode bug. * Provided jbig2 core.DecodeGlobals function. * Fixed JBIG2Encoder `r6` revision issues. * Removed public JBIG2Encoder document. * Minor style changes * added NewJBIG2Encoder function. * fixed JBIG2Encoder 'r9' revision issues. * Cleared 'r9' commented code. * Updated ACKNOWLEDGEMENETS. Fixed JBIG2Encoder 'r10' revision issues. Co-authored-by: Gunnsteinn Hall <gunnsteinn.hall@gmail.com>
2020-03-27 12:47:41 +01:00
/*
* This file is subject to the terms and conditions defined in
* file 'LICENSE.md', which is part of this source code package.
*/
package document
import (
"encoding/binary"
"io"
"runtime/debug"
"github.com/unidoc/unipdf/v3/common"
"github.com/unidoc/unipdf/v3/internal/jbig2/bitmap"
"github.com/unidoc/unipdf/v3/internal/jbig2/document/segments"
"github.com/unidoc/unipdf/v3/internal/jbig2/encoder/classer"
"github.com/unidoc/unipdf/v3/internal/jbig2/errors"
"github.com/unidoc/unipdf/v3/internal/jbig2/reader"
"github.com/unidoc/unipdf/v3/internal/jbig2/writer"
)
// fileHeaderID is the 8-byte ID string that starts a jbig2 encoded file, see D.4.1.
// It is written by encodeFileHeader and looked for by isFileHeaderPresent.
var fileHeaderID = []byte{0x97, 0x4A, 0x42, 0x32, 0x0D, 0x0A, 0x1A, 0x0A}
// Document is the jbig2 document model containing pages and global segments.
// Decoding a document with DecodeDocument maps all the jbig2 encoded data
// segment headers. In order to decode the whole document, all of its pages
// should be decoded using the GetBitmap method.
// PDF encoded documents should contain only one Page with the number 1.
type Document struct {
// Pages contains all pages of this document, keyed by the page number.
Pages map[int]*Page
// NumberOfPagesUnknown defines if the amount of the pages is unknown.
NumberOfPagesUnknown bool
// NumberOfPages - D.4.3 - Number of pages field (4 bytes).
// It is present in the file header only if NumberOfPagesUnknown is false.
NumberOfPages uint32
// GBUseExtTemplate defines whether the extended template is used.
GBUseExtTemplate bool
// InputStream is the source data stream.
InputStream reader.StreamReader
// GlobalSegments contains all segments that aren't associated with a page.
GlobalSegments *Globals
// OrganizationType is the document segment organization.
OrganizationType segments.OrganizationType
// Encoder variables
// Classer is the document encoding classifier.
Classer *classer.Classer
// XRes and YRes are the PPI for the x and y direction.
XRes, YRes int
// FullHeaders is a flag that defines if the encoder should produce full JBIG2 files.
// It is also set while decoding a stream that starts with the file header.
FullHeaders bool
// CurrentSegmentNumber is the number that gets assigned to the next encoded segment.
CurrentSegmentNumber uint32
// AverageTemplates are the grayed templates.
AverageTemplates *bitmap.Bitmaps
BaseIndexes []int
Refinement bool
RefineLevel int
// fileHeaderLength is the length in bytes of the file header:
// 9 when it has no number of pages field and 13 when it has one.
fileHeaderLength uint8
// w is the buffer where the encoded document data is written to.
w *writer.Buffer
EncodeGlobals bool
// globalSymbolsNumber is the number of globally defined symbols.
globalSymbolsNumber int
// singleUseSymbols is the mapping between the page and exclusively used components for that page.
// This means that if some symbols are mapped to the page 1 only then that information would be stored here.
singleUseSymbols map[int][]int
// pageComponents is the mapping from page number to the list of connected components for that page.
// i.e. pageComponents[1] = []int{1,2,3} means that page 1 contains components [1,2,3]
pageComponents map[int][]int
// symbolsUsed is the array that contains the number of times a symbol is being used.
// Array index is the class-id of the symbol, where the value is its occurrence.
symbolsUsed []int
// symbolIndexMap is the mapping between symbol class and the number it represents.
symbolIndexMap map[int]int
}
// DecodeDocument creates the Document based on the provided 'input' data stream
// and with the optional globally defined segments 'globals', which may be nil.
// The work is delegated to decodeWithGlobals, which maps the segment headers of the stream.
func DecodeDocument(input reader.StreamReader, globals *Globals) (*Document, error) {
return decodeWithGlobals(input, globals)
}
// InitEncodeDocument creates a new, empty jbig2 document prepared for the encoding process.
// The 'fullHeaders' argument is stored as the document's FullHeaders flag.
func InitEncodeDocument(fullHeaders bool) *Document {
	d := &Document{
		FullHeaders: fullHeaders,
		w:           writer.BufferedMSB(),
	}
	// every map used by the encoder needs to be ready for writes.
	d.Pages = map[int]*Page{}
	d.singleUseSymbols = map[int][]int{}
	d.symbolIndexMap = map[int]int{}
	d.pageComponents = map[int][]int{}
	return d
}
// AddGenericPage creates the jbig2 page based on the provided bitmap. The data provided
func (d *Document) AddGenericPage(bm *bitmap.Bitmap, duplicateLineRemoval bool) (err error) {
const processName = "Document.AddGenericPage"
// check if this is PDFMode and there is already a page
if !d.FullHeaders && d.NumberOfPages != 0 {
return errors.Error(processName, "document already contains page. FileMode disallows adding more than one page")
JBIG2 Generic Encoder (#264) * Prepared skeleton and basic component implementations for the jbig2 encoding. * Added Bitset. Implemented Bitmap. * Decoder with old Arithmetic Decoder * Partly working arithmetic * Working arithmetic decoder. * MMR patched. * rebuild to apache. * Working generic * Working generic * Decoded full document * Update Jenkinsfile go version [master] (#398) * Update Jenkinsfile go version * Decoded AnnexH document * Minor issues fixed. * Update README.md * Fixed generic region errors. Added benchmark. Added bitmap unpadder. Added Bitmap toImage method. * Fixed endofpage error * Added integration test. * Decoded all test files without errors. Implemented JBIG2Global. * Merged with v3 version * Fixed the EOF in the globals issue * Fixed the JBIG2 ChocolateData Decode * JBIG2 Added license information * Minor fix in jbig2 encoding. * Applied the logging convention * Cleaned unnecessary imports * Go modules clear unused imports * checked out the README.md * Moved trace to Debug. Fixed the build integrate tag in the document_decode_test.go * Initial encoder skeleton * Applied UniPDF Developer Guide. Fixed lint issues. * Cleared documentation, fixed style issues. * Added jbig2 doc.go files. Applied unipdf guide style. * Minor code style changes. * Minor naming and style issues fixes. * Minor naming changes. Style issues fixed. * Review r11 fixes. * Added JBIG2 Encoder skeleton. * Moved Document and Page to jbig2/document package. Created decoder package responsible for decoding jbig2 stream. * Implemented raster functions. * Added raster uni low test funcitons. * Added raster low test functions * untracked files on jbig2-encoder: c869089 Added raster low test functions * index on jbig2-encoder: c869089 Added raster low test functions * Added morph files. 
* implemented jbig2 encoder basics * JBIG2 Encoder - Generic method * Added jbig2 image encode ttests, black/white image tests * cleaned and tested jbig2 package * unfinished jbig2 classified encoder * jbig2 minor style changes * minor jbig2 encoder changes * prepared JBIG2 Encoder * Style and lint fixes * Minor changes and lints * Fixed shift unsinged value build errors * Minor naming change * Added jbig2 encode, image gondels. Fixed jbig2 decode bug. * Provided jbig2 core.DecodeGlobals function. * Fixed JBIG2Encoder `r6` revision issues. * Removed public JBIG2Encoder document. * Minor style changes * added NewJBIG2Encoder function. * fixed JBIG2Encoder 'r9' revision issues. * Cleared 'r9' commented code. * Updated ACKNOWLEDGEMENETS. Fixed JBIG2Encoder 'r10' revision issues. Co-authored-by: Gunnsteinn Hall <gunnsteinn.hall@gmail.com>
2020-03-27 12:47:41 +01:00
}
// initialize page
page := &Page{
Segments: []*segments.Header{},
Bitmap: bm,
Document: d,
FinalHeight: bm.Height,
FinalWidth: bm.Width,
IsLossless: true,
}
page.PageNumber = int(d.nextPageNumber())
d.Pages[page.PageNumber] = page
// add page information segment.
page.AddPageInformationSegment()
// add the generic region for given bitmap
if err = page.AddGenericRegion(bm, 0, 0, 0, segments.TImmediateGenericRegion, duplicateLineRemoval); err != nil {
return errors.Wrap(err, processName, "")
}
if d.FullHeaders {
// finish the page
page.AddEndOfPageSegment()
}
return nil
}
// AddClassifiedPage adds the bitmap page with a classification 'method'.
// The document classer is initialized with the default settings on the first call.
// The page encoding method is set for the classer.RankHaus and classer.Correlation methods,
// and the bitmap is registered in the classer under the new page number.
// With the FullHeaders flag set the page is finished with the end of page segment.
// Without the FullHeaders flag the document may contain only a single page.
// Returns an error if the bitmap is nil, the page limit is exceeded or the classer fails.
func (d *Document) AddClassifiedPage(bm *bitmap.Bitmap, method classer.Method) (err error) {
	const processName = "Document.AddClassifiedPage"
	// the bitmap dimensions are read below, thus a nil bitmap must be rejected instead of panicking.
	if bm == nil {
		return errors.Error(processName, "bitmap not defined")
	}
	// check if this is PDFMode and there is already a page
	if !d.FullHeaders && d.NumberOfPages != 0 {
		return errors.Error(processName, "document already contains page. FileMode disallows adding more than one page")
	}
	// initialize the classer if not set yet
	if d.Classer == nil {
		if d.Classer, err = classer.Init(classer.DefaultSettings()); err != nil {
			return errors.Wrap(err, processName, "")
		}
	}
	pageNumber := int(d.nextPageNumber())
	p := &Page{
		Segments:    []*segments.Header{},
		Bitmap:      bm,
		Document:    d,
		FinalHeight: bm.Height,
		FinalWidth:  bm.Width,
		PageNumber:  pageNumber,
	}
	d.Pages[pageNumber] = p
	// any other method leaves the page encoding method at its zero value.
	switch method {
	case classer.RankHaus:
		p.EncodingMethod = RankHausEM
	case classer.Correlation:
		p.EncodingMethod = CorrelationEM
	}
	p.AddPageInformationSegment()
	if err = d.Classer.AddPage(bm, pageNumber, method); err != nil {
		return errors.Wrap(err, processName, "")
	}
	if d.FullHeaders {
		// finish the page
		p.AddEndOfPageSegment()
	}
	return nil
}
// completeClassifiedPages completes the pages with the classification encoding type.
// It counts the usage of each symbol class, selects the global symbols, fills the page components
// and single use symbols mappings and creates the global symbol dictionary segment.
// It is a no-op if the classer was not initialized.
func (d *Document) completeClassifiedPages() (err error) {
	const processName = "completeClassifiedPages"
	// check if the Classer is initialized
	if d.Classer == nil {
		return nil
	}
	// map symbol number to the times it was used
	d.symbolsUsed = make([]int, d.Classer.UndilatedTemplates.Size())
	for i := 0; i < d.Classer.ClassIDs.Size(); i++ {
		classID, err := d.Classer.ClassIDs.Get(i)
		if err != nil {
			return errors.Wrapf(err, processName, "class with id: '%d'", i)
		}
		// a class without a template would otherwise panic on the index below.
		if classID < 0 || classID >= len(d.symbolsUsed) {
			return errors.Errorf(processName, "class id: '%d' is out of range of the templates", classID)
		}
		d.symbolsUsed[classID]++
	}
	// find if there are any symbols used on multiple pages at once - Globals.
	var globalSymbols []int
	for i := 0; i < d.Classer.UndilatedTemplates.Size(); i++ {
		if d.NumberOfPages == 1 || d.symbolsUsed[i] > 1 {
			globalSymbols = append(globalSymbols, i)
		}
	}
	// produceClassifiedPage derives the symbol code length from this number,
	// thus it must match the size of the global symbol dictionary created below.
	d.globalSymbolsNumber = len(globalSymbols)
	// iterate over all classified components.
	// create a page components map which maps the page number to the list of connected components
	// for that page.
	for i, pageNumber := range *d.Classer.ComponentPageNumbers {
		page, ok := d.Pages[pageNumber]
		if !ok {
			return errors.Errorf(processName, "page: '%d' not found", pageNumber)
		}
		// make sure that the page is not Generic Encoded.
		if page.EncodingMethod == GenericEM {
			common.Log.Error("Generic page with number: '%d' mapped as classified page", pageNumber)
			continue
		}
		// add component to the given page number components.
		d.pageComponents[pageNumber] = append(d.pageComponents[pageNumber], i)
		// check if the symbol was used on this page exclusively.
		symbol, err := d.Classer.ClassIDs.Get(i)
		if err != nil {
			return errors.Wrapf(err, processName, "no such classID: %d", i)
		}
		if d.symbolsUsed[symbol] == 1 && d.NumberOfPages != 1 {
			d.singleUseSymbols[pageNumber] = append(d.singleUseSymbols[pageNumber], symbol)
		}
	}
	if err = d.Classer.ComputeLLCorners(); err != nil {
		return errors.Wrap(err, processName, "")
	}
	// TODO: set the page number of the symbol dictionary depending if there are more than one page.
	// This is related with the mapping[*bitmap.Bitmap]int
	// create global symbols
	if _, err = d.addSymbolDictionary(0, d.Classer.UndilatedTemplates, globalSymbols, d.symbolIndexMap, false); err != nil {
		return errors.Wrap(err, processName, "")
	}
	return nil
}
// produceClassifiedPages produces the classified content for every page of the document
// that is not encoded with the GenericEM method. The global symbol dictionary is
// obtained lazily - only when the first such page is found.
// It is a no-op if the classer was not initialized.
func (d *Document) produceClassifiedPages() (err error) {
	const processName = "produceClassifiedPages"
	if d.Classer == nil {
		return nil
	}
	var globalSD *segments.Header
	for pageNumber := 1; pageNumber <= int(d.NumberOfPages); pageNumber++ {
		page, found := d.Pages[pageNumber]
		if !found {
			return errors.Errorf(processName, "page: '%d' not found", pageNumber)
		}
		// generic pages have nothing to classify.
		if page.EncodingMethod == GenericEM {
			continue
		}
		if globalSD == nil {
			globalSD, err = d.GlobalSegments.GetSymbolDictionary()
			if err != nil {
				return errors.Wrap(err, processName, "")
			}
		}
		// produce new classified page.
		err = d.produceClassifiedPage(page, globalSD)
		if err != nil {
			return errors.Wrapf(err, processName, "page: '%d'", pageNumber)
		}
	}
	return nil
}
// produceClassifiedPage adds the text region segment to the provided classified 'page'.
// The text region refers to the global symbol dictionary 'globalSD' and, if the page
// has symbols used exclusively on it, to an additional page symbol dictionary created here.
func (d *Document) produceClassifiedPage(page *Page, globalSD *segments.Header) (err error) {
	const processName = "produceClassifiedPage"
	var localSymbolMap map[int]int
	symbolsCount := d.globalSymbolsNumber
	referredTo := []*segments.Header{globalSD}
	// symbols used only on this page are present only if the document has more than one page.
	if pageOnly := d.singleUseSymbols[page.PageNumber]; len(pageOnly) > 0 {
		// these symbols get their own dictionary associated with the page.
		localSymbolMap = map[int]int{}
		pageSD, err := d.addSymbolDictionary(page.PageNumber, d.Classer.UndilatedTemplates, pageOnly, localSymbolMap, false)
		if err != nil {
			return errors.Wrap(err, processName, "")
		}
		referredTo = append(referredTo, pageSD)
		symbolsCount += len(pageOnly)
	}
	// the connected components found on this page.
	comps := d.pageComponents[page.PageNumber]
	common.Log.Debug("Page: '%d' comps: %v", page.PageNumber, comps)
	// add the text region to the page
	page.addTextRegionSegment(
		referredTo, d.symbolIndexMap, localSymbolMap, comps,
		d.Classer.PtaLL, d.Classer.UndilatedTemplates, d.Classer.ClassIDs, nil,
		log2up(symbolsCount), len(comps),
	)
	return nil
}
// log2up returns the base 2 logarithm of 'v' rounded up to the nearest integer,
// which is the number of bits required to distinguish 'v' different values.
// For 'v' lower than or equal to 1 (including negative values) the result is 0.
func log2up(v int) int {
	// negative values must not reach the shift loop: an arithmetic right shift
	// of a negative number never reaches zero.
	if v <= 1 {
		return 0
	}
	// ceil(log2(v)) equals the bit length of v-1.
	r := 0
	for n := v - 1; n != 0; n >>= 1 {
		r++
	}
	return r
}
// addSymbolDictionary creates the symbol dictionary segment for the 'symbolList' selection
// of the 'symbols' and stores it in the document.
// With the 'pageNumber' equal to 0 the segment is added to the global segments, which
// are created if not defined yet. Otherwise the segment is inserted into the segments of
// the page 'pageNumber', right after its page information segment. If the page has no
// page information segment the dictionary is appended at the end of the page segments.
// The 'symbolMap' and 'unborderSymbols' are passed to the symbol dictionary initializer.
// Returns the header of the created segment or an error if the dictionary can't be
// initialized or the page doesn't exist.
func (d *Document) addSymbolDictionary(
	pageNumber int, symbols *bitmap.Bitmaps, symbolList []int,
	symbolMap map[int]int, unborderSymbols bool,
) (*segments.Header, error) {
	const processName = "addSymbolDictionary"
	// add symbolTable
	sd := &segments.SymbolDictionary{}
	if err := sd.InitEncode(symbols, symbolList, symbolMap, unborderSymbols); err != nil {
		return nil, errors.Wrap(err, processName, "")
	}
	sh := &segments.Header{
		Type:            segments.TSymbolDictionary,
		PageAssociation: pageNumber,
		SegmentData:     sd,
	}
	if pageNumber == 0 {
		if d.GlobalSegments == nil {
			d.GlobalSegments = &Globals{}
		}
		d.GlobalSegments.AddSegment(sh)
		return sh, nil
	}
	// find the page at Number: 'pageNumber'
	page, ok := d.Pages[pageNumber]
	if !ok {
		return nil, errors.Errorf(processName, "page: '%d' not found", pageNumber)
	}
	// the segment should be after page info. Defaulting to the end of the list keeps
	// the insertion index valid also for a page without any segments.
	insertAt := len(page.Segments)
	for i, seg := range page.Segments {
		if seg.Type == segments.TPageInformation {
			insertAt = i + 1
			break
		}
	}
	// grow the slice by one and shift the tail to make room for the dictionary.
	page.Segments = append(page.Segments, nil)
	copy(page.Segments[insertAt+1:], page.Segments[insertAt:])
	page.Segments[insertAt] = sh
	return sh, nil
}
// completeSymbols counts the usage of the classified symbols, stores the number of the
// global symbols and fills the mapping of the symbols used on a single page only.
// Finally the lower-left corners are computed by the classer.
// It is a no-op if the classer was not used, and fails if the classer has no templates
// or some template class has no instances.
func (d *Document) completeSymbols() (err error) {
	const processName = "completeSymbols"
	// check if classer was used.
	if d.Classer == nil {
		return nil
	}
	if d.Classer.UndilatedTemplates == nil {
		return errors.Error(processName, "no templates defined for the classer")
	}
	// with a single page every symbol is treated as a global one.
	singlePage := len(d.Pages) == 1
	// count the instances of each symbol class.
	usage := make([]int, d.Classer.UndilatedTemplates.Size())
	for i := 0; i < d.Classer.ClassIDs.Size(); i++ {
		classID, err := d.Classer.ClassIDs.Get(i)
		if err != nil {
			return errors.Wrap(err, processName, "class ID's")
		}
		usage[classID]++
	}
	// count the symbols that are used more than once (or all of them for a single page).
	globalCount := 0
	for _, uses := range usage {
		if uses == 0 {
			return errors.Error(processName, "no symbols instances found for given class? ")
		}
		if uses > 1 || singlePage {
			globalCount++
		}
	}
	d.globalSymbolsNumber = globalCount
	// map the symbols used exactly once to the page they occur on.
	for i := 0; i < d.Classer.ComponentPageNumbers.Size(); i++ {
		pageNum, err := d.Classer.ComponentPageNumbers.Get(i)
		if err != nil {
			return errors.Wrapf(err, processName, "page: '%d' not found in the classer pagenumbers", i)
		}
		symNum, err := d.Classer.ClassIDs.Get(i)
		if err != nil {
			return errors.Wrapf(err, processName, "can't get symbol for page '%d' from classer", pageNum)
		}
		if usage[symNum] != 1 || singlePage {
			continue
		}
		d.singleUseSymbols[pageNum] = append(d.singleUseSymbols[pageNum], symNum)
	}
	err = d.Classer.ComputeLLCorners()
	if err != nil {
		return errors.Wrap(err, processName, "")
	}
	return nil
}
// AutoThreshold gathers classes of symbols and uses a single representative to stand for them all.
// func (d *Document) AutoThreshold() error {
// bms := d.Classer.Pixat
// for i := 0; i < len(bm); i++ {
// bm := bms[i]
//
// for j := i + 1; j < len(bms); j++ {
// if bm.Equivalent(bms[j]) {
// if err := d.uniteTemplatesWithIndexes(i, j); err != nil {
// return err
// }
// j--
// }
// }
// }
// return nil
// }
// Encode encodes the document into the internal 'w' writer and returns the encoded data.
// The output is composed of: the file header (only with the FullHeaders flag), the global
// segments, the segments of each page in the page number order and the end of file
// segment (only with the FullHeaders flag).
func (d *Document) Encode() (data []byte, err error) {
	const processName = "Document.Encode"
	// written tracks the number of bytes reported by the encoders.
	var written int
	// if the full headers flag is on, encode file header
	if d.FullHeaders {
		written, err = d.encodeFileHeader(d.w)
		if err != nil {
			return nil, errors.Wrap(err, processName, "")
		}
	}
	// complete classified pages
	err = d.completeClassifiedPages()
	if err != nil {
		return nil, errors.Wrap(err, processName, "")
	}
	// produce classified pages
	err = d.produceClassifiedPages()
	if err != nil {
		return nil, errors.Wrap(err, processName, "")
	}
	// global segments go before any page segment.
	if d.GlobalSegments != nil {
		for _, globalSegment := range d.GlobalSegments.Segments {
			if err = d.encodeSegment(globalSegment, &written); err != nil {
				return nil, errors.Wrap(err, processName, "")
			}
		}
	}
	for pageNumber := 1; pageNumber <= int(d.NumberOfPages); pageNumber++ {
		page, found := d.Pages[pageNumber]
		if !found {
			return nil, errors.Errorf(processName, "page: '%d' not found", pageNumber)
		}
		for _, pageSegment := range page.Segments {
			if err = d.encodeSegment(pageSegment, &written); err != nil {
				return nil, errors.Wrap(err, processName, "")
			}
		}
	}
	// with full headers the End Of File header must be encoded
	if d.FullHeaders {
		eofLength, err := d.encodeEOFHeader(d.w)
		if err != nil {
			return nil, errors.Wrap(err, processName, "")
		}
		written += eofLength
	}
	// get the encoded data from the 'w' writer
	data = d.w.Data()
	if written != len(data) {
		common.Log.Debug("Bytes written (n): '%d' is not equal to the length of the data encoded: '%d'", written, len(data))
	}
	return data, nil
}
// encodeSegment assigns the next segment number to the 'seg' header and encodes it
// into the document writer. The number of bytes reported by the encoder is added to '*n'.
func (d *Document) encodeSegment(seg *segments.Header, n *int) error {
	const processName = "encodeSegment"
	// the segment number is taken before encoding, even if the encoding fails.
	seg.SegmentNumber = d.nextSegmentNumber()
	written, encodeErr := seg.Encode(d.w)
	if encodeErr != nil {
		return errors.Wrapf(encodeErr, processName, "segment: '%d'", seg.SegmentNumber)
	}
	*n = *n + written
	return nil
}
// GetGlobalSegment implements segments.Documenter interface.
// It returns the header of the global segment requested with 'i' from the document's global segments.
func (d *Document) GetGlobalSegment(i int) (*segments.Header, error) {
	const processName = "GetGlobalSegment"
	header, err := d.GlobalSegments.GetSegment(i)
	if err != nil {
		return nil, errors.Wrap(err, processName, "")
	}
	return header, nil
}
// GetNumberOfPages gets the amount of Pages in the given document.
// A known, non-zero NumberOfPages is returned directly. Otherwise the pages are counted,
// mapping the input data first if no page is stored in the document yet.
func (d *Document) GetNumberOfPages() (uint32, error) {
	// the number of pages is known - no need to count.
	if !d.NumberOfPagesUnknown && d.NumberOfPages != 0 {
		return d.NumberOfPages, nil
	}
	if len(d.Pages) == 0 {
		err := d.mapData()
		if err != nil {
			return 0, errors.Wrap(err, "Document.GetNumberOfPages", "")
		}
	}
	return uint32(len(d.Pages)), nil
}
// GetPage implements segments.Documenter interface.
// It returns the page stored in the document under the 'pageNumber' or an error
// if the number is negative or no such page exists.
// NOTE: in order to decode all document images, get page by page (page numeration starts from '1') and
// decode them by calling 'GetBitmap' method.
func (d *Document) GetPage(pageNumber int) (segments.Pager, error) {
	const processName = "Document.GetPage"
	if pageNumber < 0 {
		common.Log.Debug("JBIG2 Page - GetPage: %d. Page cannot be lower than 0. %s", pageNumber, debug.Stack())
		return nil, errors.Errorf(processName, "invalid jbig2 document - provided invalid page number: %d", pageNumber)
	}
	// the pages are keyed by their page association number, which doesn't need to be
	// contiguous, thus the lookup is the only reliable existence check.
	p, ok := d.Pages[pageNumber]
	if !ok {
		common.Log.Debug("Page not found: %d. %s", pageNumber, debug.Stack())
		return nil, errors.Error(processName, "invalid jbig2 document - page not found")
	}
	return p, nil
}
/**
Private document methods
*/
// determineRandomDataOffsets sets the data start offset of each provided segment header
// for the documents with the random-access organization. The first segment data starts at
// the 'offset' and each following one starts right after the data of the previous segment.
// Documents with other organization types are left untouched.
func (d *Document) determineRandomDataOffsets(segmentHeaders []*segments.Header, offset uint64) {
	if d.OrganizationType == segments.ORandom {
		next := offset
		for i := range segmentHeaders {
			header := segmentHeaders[i]
			header.SegmentDataStartOffset = next
			next += header.SegmentDataLength
		}
	}
}
// encodeEOFHeader writes the end of file segment header into the 'w' writer.
// The header takes the next segment number of the document.
// Returns the number of bytes reported by the header encoder.
func (d *Document) encodeEOFHeader(w writer.BinaryWriter) (n int, err error) {
	eof := &segments.Header{
		Type:          segments.TEndOfFile,
		SegmentNumber: d.nextSegmentNumber(),
	}
	n, err = eof.Encode(w)
	if err != nil {
		return 0, errors.Wrap(err, "encodeEOFHeader", "")
	}
	return n, nil
}
// encodeFileHeader writes the file header into the 'w' writer.
// The header is composed of the 8-byte ID string, a single flags byte
// and the 4-byte, big endian number of pages.
// Returns the number of bytes written, also when an error occurs in the middle of the header.
func (d *Document) encodeFileHeader(w writer.BinaryWriter) (n int, err error) {
	const processName = "encodeFileHeader"
	// file header contains following fields:
	// ID string - constant 8-byte sequence
	n, err = w.Write(fileHeaderID)
	if err != nil {
		return n, errors.Wrap(err, processName, "id")
	}
	// file header flags one byte field
	// where:
	// 0th bit - file organization type - '1' for sequential, '0' for random-access
	// 1st bit - unknown number of pages - '1' if true
	// this encoder stores only sequential organization type with known number of pages
	// thus file header flags are equal to 0x01 byte
	if err = w.WriteByte(0x01); err != nil {
		return n, errors.Wrap(err, processName, "flags")
	}
	n++
	// last 4 bytes is the number of pages - used if the number of pages is known
	var pages [4]byte
	binary.BigEndian.PutUint32(pages[:], d.NumberOfPages)
	written, err := w.Write(pages[:])
	// count the bytes before checking the error so that the result covers the whole header.
	n += written
	if err != nil {
		return n, errors.Wrap(err, processName, "page number")
	}
	return n, nil
}
// isFileHeaderPresent checks if the input stream, at its current position, contains
// the file header ID string. The stream position is marked before the check and
// reset back to the mark on every return path, including the read errors.
func (d *Document) isFileHeaderPresent() (bool, error) {
	d.InputStream.Mark()
	// restore the marked position no matter how the comparison ends.
	defer d.InputStream.Reset()
	for _, magicByte := range fileHeaderID {
		b, err := d.InputStream.ReadByte()
		if err != nil {
			return false, err
		}
		if magicByte != b {
			return false, nil
		}
	}
	return true, nil
}
// mapData reads all the segment headers from the input stream and stores them in the document.
// The headers associated with a page are added to that page (created on the first occurrence),
// all the others - except the end of file segment - are added to the global segments.
// The file header is parsed first, if the stream contains one.
// The reading stops on the end of file segment or when the end of the stream is reached.
func (d *Document) mapData() error {
const processName = "mapData"
// Get the header list
var (
segmentHeaders []*segments.Header
offset int64
kind segments.Type
)
isFileHeaderPresent, err := d.isFileHeaderPresent()
if err != nil {
return errors.Wrap(err, processName, "")
}
// Parse the file header if exists.
if isFileHeaderPresent {
if err = d.parseFileHeader(); err != nil {
return errors.Wrap(err, processName, "")
}
// the first segment header starts right after the file header.
offset += int64(d.fileHeaderLength)
d.FullHeaders = true
}
var (
page *Page
reachedEOF bool
)
// type 51 is the EndOfFile segment kind
for kind != 51 && !reachedEOF {
// get new segment
segment, err := segments.NewHeader(d, d.InputStream, offset, d.OrganizationType)
if err != nil {
return errors.Wrap(err, processName, "")
}
common.Log.Trace("Decoding segment number: %d, Type: %s", segment.SegmentNumber, segment.Type)
kind = segment.Type
if kind != segments.TEndOfFile {
// page association '0' means that the segment is not bound to any page.
if segment.PageAssociation != 0 {
page = d.Pages[segment.PageAssociation]
if page == nil {
page = newPage(d, segment.PageAssociation)
d.Pages[segment.PageAssociation] = page
// the pages are counted here only if the file header didn't provide their number.
if d.NumberOfPagesUnknown {
d.NumberOfPages++
}
}
page.Segments = append(page.Segments, segment)
} else {
d.GlobalSegments.AddSegment(segment)
}
}
segmentHeaders = append(segmentHeaders, segment)
// continue from the position where the stream was left after reading the header.
offset = d.InputStream.StreamPosition()
// in the sequential organization skip over the segment data to reach the next header.
if d.OrganizationType == segments.OSequential {
offset += int64(segment.SegmentDataLength)
}
reachedEOF, err = d.reachedEOF(offset)
if err != nil {
common.Log.Debug("jbig2 document reached EOF with error: %v", err)
return errors.Wrap(err, processName, "")
}
}
// for the random-access organization the data offsets are assigned starting from the final offset.
d.determineRandomDataOffsets(segmentHeaders, uint64(offset))
return nil
}
// nextPageNumber increments the document's NumberOfPages and returns the new value,
// thus the first number returned for an empty document is 1.
func (d *Document) nextPageNumber() uint32 {
d.NumberOfPages++
return d.NumberOfPages
}
// nextSegmentNumber returns the value of the CurrentSegmentNumber and then increments it,
// thus the first number returned for a document with a zero counter is 0.
func (d *Document) nextSegmentNumber() uint32 {
s := d.CurrentSegmentNumber
d.CurrentSegmentNumber++
return s
}
// parseFileHeader reads the file header fields that follow the ID string:
// the flags byte (D.4.2) and, if the number of pages is known, the number of pages (D.4.3).
// The results are stored in the GBUseExtTemplate, NumberOfPagesUnknown, OrganizationType,
// NumberOfPages and fileHeaderLength fields of the document.
func (d *Document) parseFileHeader() error {
	const processName = "parseFileHeader"
	// D.4.1 ID string read will be skipped.
	if _, err := d.InputStream.Seek(8, io.SeekStart); err != nil {
		return errors.Wrap(err, processName, "id")
	}
	// D.4.2 Header flag (1 byte)
	// Bit 3-7 are reserved and must be 0
	if _, err := d.InputStream.ReadBits(5); err != nil {
		return errors.Wrap(err, processName, "reserved bits")
	}
	// Bit 2 - extended templates are used
	b, err := d.InputStream.ReadBit()
	if err != nil {
		return errors.Wrap(err, processName, "extended templates")
	}
	d.GBUseExtTemplate = b == 1
	// Bit 1 - Indicates if amount of pages are unknown.
	b, err = d.InputStream.ReadBit()
	if err != nil {
		return errors.Wrap(err, processName, "unknown page number")
	}
	// take the value straight from the header instead of relying on the document defaults.
	d.NumberOfPagesUnknown = b == 1
	// Bit 0 - Indicates file organization type.
	b, err = d.InputStream.ReadBit()
	if err != nil {
		return errors.Wrap(err, processName, "organization type")
	}
	d.OrganizationType = segments.OrganizationType(b)
	if d.NumberOfPagesUnknown {
		// the header ends here: 8 bytes of the ID string and 1 byte of the flags.
		d.fileHeaderLength = 9
		return nil
	}
	// D.4.3 Number of pages
	d.NumberOfPages, err = d.InputStream.ReadUint32()
	if err != nil {
		return errors.Wrap(err, processName, "number of pages")
	}
	// the 4-byte number of pages field extends the header to 13 bytes.
	d.fileHeaderLength = 13
	return nil
}
// reachedEOF checks if the input stream ends at the provided 'offset'.
// It seeks to the 'offset' and tries to read 32 bits: the io.EOF error
// means that the end was reached, any other error is returned to the caller.
func (d *Document) reachedEOF(offset int64) (bool, error) {
	const processName = "reachedEOF"
	if _, seekErr := d.InputStream.Seek(offset, io.SeekStart); seekErr != nil {
		common.Log.Debug("reachedEOF - d.InputStream.Seek failed: %v", seekErr)
		return false, errors.Wrap(seekErr, processName, "input stream seek failed")
	}
	switch _, readErr := d.InputStream.ReadBits(32); readErr {
	case nil:
		// there is still some data to read.
		return false, nil
	case io.EOF:
		return true, nil
	default:
		return false, errors.Wrap(readErr, processName, "")
	}
}
// decodeWithGlobals creates the document for the 'input' stream and maps its segment headers.
// The document starts with the sequential organization, unknown number of pages and
// the file header length of 9. If no 'globals' are provided, empty ones are created.
func decodeWithGlobals(input reader.StreamReader, globals *Globals) (*Document, error) {
	// the document always needs some globals to store the segments without page association.
	if globals == nil {
		globals = &Globals{}
	}
	d := &Document{
		Pages:                make(map[int]*Page),
		InputStream:          input,
		OrganizationType:     segments.OSequential,
		NumberOfPagesUnknown: true,
		GlobalSegments:       globals,
		fileHeaderLength:     9,
	}
	// mapData map the data stream
	err := d.mapData()
	if err != nil {
		return nil, err
	}
	return d, nil
}