2018-03-22 14:03:47 +00:00
|
|
|
/*
|
|
|
|
* This file is subject to the terms and conditions defined in
|
|
|
|
* file 'LICENSE.md', which is part of this source code package.
|
|
|
|
*/
|
|
|
|
|
2018-03-22 13:01:04 +00:00
|
|
|
package extractor
|
|
|
|
|
2018-12-27 20:51:34 +11:00
|
|
|
import (
	"errors"

	"github.com/unidoc/unipdf/v3/model"
)
|
2018-03-22 13:01:04 +00:00
|
|
|
|
|
|
|
// Extractor stores and offers functionality for extracting content from PDF pages.
|
|
|
|
type Extractor struct {
|
2018-12-27 20:51:34 +11:00
|
|
|
// stream contents and resources for page
|
2018-12-27 21:33:31 +11:00
|
|
|
contents string
|
|
|
|
resources *model.PdfPageResources
|
2018-09-22 09:28:18 +10:00
|
|
|
|
|
|
|
// fontCache is a simple LRU cache that is used to prevent redundant constructions of PdfFont's from
|
|
|
|
// PDF objects. NOTE: This is not a conventional glyph cache. It only caches PdfFont's.
|
|
|
|
fontCache map[string]fontEntry
|
|
|
|
|
2018-12-27 20:51:34 +11:00
|
|
|
// text results from running extractXYText on forms within the page.
|
|
|
|
// TODO(peterwilliams): Cache this map accross all pages in a PDF to speed up processig.
|
|
|
|
formResults map[string]textResult
|
|
|
|
|
2018-09-22 09:28:18 +10:00
|
|
|
// accessCount is used to set fontEntry.access to an incrementing number.
|
|
|
|
accessCount int64
|
2018-11-28 18:06:03 +11:00
|
|
|
|
|
|
|
// textCount is an incrementing number used to identify XYTest objects.
|
|
|
|
textCount int64
|
2018-03-22 13:01:04 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// New returns an Extractor instance for extracting content from the input PDF page.
|
|
|
|
func New(page *model.PdfPage) (*Extractor, error) {
|
|
|
|
contents, err := page.GetAllContentStreams()
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
2018-07-24 21:32:02 +10:00
|
|
|
// Uncomment these lines to see the contents of the page. For debugging.
|
2018-07-03 14:26:42 +10:00
|
|
|
// fmt.Println("========================= +++ =========================")
|
|
|
|
// fmt.Printf("%s\n", contents)
|
|
|
|
// fmt.Println("========================= ::: =========================")
|
2018-03-22 13:01:04 +00:00
|
|
|
|
2018-09-22 09:28:18 +10:00
|
|
|
e := &Extractor{
|
2018-12-27 21:33:31 +11:00
|
|
|
contents: contents,
|
|
|
|
resources: page.Resources,
|
|
|
|
fontCache: map[string]fontEntry{},
|
2018-12-27 20:51:34 +11:00
|
|
|
formResults: map[string]textResult{},
|
2018-09-22 09:28:18 +10:00
|
|
|
}
|
2018-03-22 13:01:04 +00:00
|
|
|
return e, nil
|
|
|
|
}
|