unipdf/pdf/extractor/extractor.go

/*
 * This file is subject to the terms and conditions defined in
 * file 'LICENSE.md', which is part of this source code package.
 */

package extractor

import "github.com/unidoc/unidoc/pdf/model"

// Extractor stores and offers functionality for extracting content from PDF pages.
// Instances are created with New, which fills in every field except accessCount
// (left at its zero value).
type Extractor struct {
	// contents holds the value returned by page.GetAllContentStreams() and resources holds
	// page.Resources for the page the Extractor was created from.
	contents  string
	resources *model.PdfPageResources

	// fontCache is a simple LRU cache that is used to prevent redundant constructions of PdfFonts
	// from PDF objects. NOTE: This is not a conventional glyph cache. It only caches PdfFonts.
	// The lookup and eviction logic is not part of this type definition; New only initializes the
	// map to empty.
	fontCache map[string]fontEntry

	// accessCount is used to set fontEntry.access to an incrementing number.
	accessCount int64
}

// New builds an Extractor for the given PDF page.
//
// It reads the page's content via page.GetAllContentStreams and keeps a reference to
// page.Resources. If reading the content streams fails, that error is returned unchanged
// together with a nil Extractor. The returned Extractor starts with an empty font cache.
func New(page *model.PdfPage) (*Extractor, error) {
	streams, err := page.GetAllContentStreams()
	if err != nil {
		return nil, err
	}

	// For debugging: uncomment the lines below to dump the contents of the page.
	// fmt.Println("========================= +++ =========================")
	// fmt.Printf("%s\n", streams)
	// fmt.Println("========================= ::: =========================")

	return &Extractor{
		contents:  streams,
		resources: page.Resources,
		fontCache: make(map[string]fontEntry),
	}, nil
}