Moved font cache from global variable to Extractor.

This commit is contained in:
Peter Williams 2018-09-22 09:28:18 +10:00
parent 69be54d501
commit c76fa6985e
3 changed files with 35 additions and 26 deletions

View File

@ -11,6 +11,13 @@ import "github.com/unidoc/unidoc/pdf/model"
type Extractor struct {
contents string
resources *model.PdfPageResources
// fontCache is a simple LRU cache that is used to prevent redundant constructions of PdfFont's from
// PDF objects. NOTE: This is not a conventional glyph cache. It only caches PdfFont's.
fontCache map[string]fontEntry
// accessCount is used to set fontEntry.access to an incrementing number.
accessCount int64
}
// New returns an Extractor instance for extracting content from the input PDF page.
@ -25,6 +32,10 @@ func New(page *model.PdfPage) (*Extractor, error) {
// fmt.Printf("%s\n", contents)
// fmt.Println("========================= ::: =========================")
e := &Extractor{contents: contents, resources: page.Resources}
e := &Extractor{
contents: contents,
resources: page.Resources,
fontCache: map[string]fontEntry{},
}
return e, nil
}

View File

@ -624,11 +624,13 @@ func (tl *TextList) ToText() string {
// getFont returns the font named `name` if it exists in the page's resources or an error if it
// doesn't. It caches the returned fonts.
func (to *textObject) getFont(name string) (*model.PdfFont, error) {
accessCount++
entry, ok := fontCache[name]
if ok {
entry.access = accessCount
return entry.font, nil
if to.e.fontCache != nil {
to.e.accessCount++
entry, ok := to.e.fontCache[name]
if ok {
entry.access = to.e.accessCount
return entry.font, nil
}
}
// Font not in cache. Load it.
@ -636,20 +638,23 @@ func (to *textObject) getFont(name string) (*model.PdfFont, error) {
if err != nil {
return nil, err
}
entry = fontEntry{font, accessCount}
// Eject a victim if the cache is full.
if len(fontCache) >= maxFontCache {
names := []string{}
for name := range fontCache {
names = append(names, name)
if to.e.fontCache != nil {
entry := fontEntry{font, to.e.accessCount}
// Eject a victim if the cache is full.
if len(to.e.fontCache) >= maxFontCache {
names := []string{}
for name := range to.e.fontCache {
names = append(names, name)
}
sort.Slice(names, func(i, j int) bool {
return to.e.fontCache[names[i]].access < to.e.fontCache[names[j]].access
})
delete(to.e.fontCache, names[0])
}
sort.Slice(names, func(i, j int) bool {
return fontCache[names[i]].access < fontCache[names[j]].access
})
delete(fontCache, names[0])
to.e.fontCache[name] = entry
}
fontCache[name] = entry
return font, nil
}
@ -660,19 +665,12 @@ type fontEntry struct {
access int64 // Last access. Used to determine LRU cache victims.
}
// fontCache is a simple LRU cache that is used to prevent redundant constructions of PdfFont's from
// PDF objects. NOTE: This is not a conventional glyph cache. It only caches PdfFont's.
var fontCache = map[string]fontEntry{}
// maxFontCache is the maximum number of PdfFont's in fontCache.
const maxFontCache = 10
// accessCount is used to set fontEntry.access to an incrementing number.
var accessCount int64
// getFontDirect returns the font named `name` if it exists in the page's resources or an error if
// is doesn't.
// This is a direct (uncached access)
// This is a direct (uncached access).
func (to *textObject) getFontDirect(name string) (*model.PdfFont, error) {
// This is a hack for testing.

View File

@ -573,7 +573,7 @@ func newFontBaseFieldsFromPdfObject(fontObj core.PdfObject) (*core.PdfObjectDict
// toUnicodeToCmap returns a CMap of `toUnicode` if it exists.
func toUnicodeToCmap(toUnicode core.PdfObject, font *fontCommon) (*cmap.CMap, error) {
toUnicodeStream, ok := toUnicode.(*core.PdfObjectStream)
toUnicodeStream, ok := core.GetStream(toUnicode)
if !ok {
common.Log.Debug("ERROR: toUnicodeToCmap: Not a stream (%T)", toUnicode)
return nil, core.ErrTypeError