mirror of
https://github.com/unidoc/unipdf.git
synced 2025-04-27 13:48:51 +08:00

* Fixed filename:page in logging * Got CMap working for multi-rune entries * Treat CMap entries as strings instead of runes to handle multi-byte encodings. * Added a test for multibyte encoding. * First version of text extraction that recognizes columns * Added an explanation of the text columns code to README.md. * fixed typos * Abstracted textWord depth calculation. This required change textMark to *textMark in a lot of code. * Added function comments. * Fixed text state save/restore. * Adjusted inter-word search distance to make paragraph division work for thanh.pdf * Got text_test.go passing. * Reinstated hyphen suppression * Handle more cases of fonts not being set in text extraction code. * Fixed typo * More verbose logging * Adding tables to text extractor. * Added tests for columns extraction. * Removed commented code * Check for textParas that are on the same line when writing out extracted text. * Absorb text to the left of paras into paras e.g. Footnote numbers * Removed funny character from text_test.go * Commented out a creator_test.go test that was broken by my text extraction changes. * Big changes to columns text extraction code for PR. Performance improvements in several places. Commented code. * Updated extractor/README * Cleaned up some comments and removed a panic * Increased threshold for truncating extracted text when there is no license 100 -> 102. This is a workaround to let a test in creator_test.go pass. With the old text extraction code the following extracted text was 100 chars. With the new code it is 102 chars which looks correct. "你好\n你好你好你好你好\n河上白云\n\nUnlicensed UniDoc - Get a license on https://unidoc.io\n\n" * Improved an error message. * Removed irrelevant spaces * Commented code and removed unused functions. * Reverted PdfRectangle changes * Added duplicate text detection. * Combine diacritic textMarks in text extraction * Reinstated a diacritic recombination test. 
* Small code reorganisation * Reinstated handling of rotated text * Addressed issues in PR review * Added color fields to TextMark * Updated README * Reinstated the disabled tests I missed before. * Tightened definition for tables to prevent detection of tables where there weren't any. * Compute line splitting search range based on fontsize of first word in word bag. * Use errors.Is(err, core.ErrNotSupported) to distinguish unsupported font errors. See https://blog.golang.org/go1.13-errors * Fixed some naming and added some comments. * errors.Is -> xerrors.Is and %w -> %v for go 1.12 compatibility * Removed code that doesn't ever get called. * Removed unused test
72 lines
2.1 KiB
Go
72 lines
2.1 KiB
Go
/*
|
|
* This file is subject to the terms and conditions defined in
|
|
* file 'LICENSE.md', which is part of this source code package.
|
|
*/
|
|
|
|
package extractor
|
|
|
|
import (
|
|
"fmt"
|
|
|
|
"github.com/unidoc/unipdf/v3/model"
|
|
)
|
|
|
|
// Extractor stores and offers functionality for extracting content from PDF pages.
|
|
type Extractor struct {
|
|
// stream contents and resources for page
|
|
contents string
|
|
resources *model.PdfPageResources
|
|
mediaBox model.PdfRectangle
|
|
|
|
// fontCache is a simple LRU cache that is used to prevent redundant constructions of PdfFonts
|
|
// from PDF objects. NOTE: This is not a conventional glyph cache. It only caches PdfFonts.
|
|
fontCache map[string]fontEntry
|
|
|
|
// text results from running extractXYText on forms within the page.
|
|
// TODO(peterwilliams97): Cache this map accross all pages in a PDF to speed up processing.
|
|
formResults map[string]textResult
|
|
|
|
// accessCount is used to set fontEntry.access to an incrementing number.
|
|
accessCount int64
|
|
|
|
// textCount is an incrementing number used to identify XYTest objects.
|
|
textCount int
|
|
}
|
|
|
|
// New returns an Extractor instance for extracting content from the input PDF page.
|
|
func New(page *model.PdfPage) (*Extractor, error) {
|
|
contents, err := page.GetAllContentStreams()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// Uncomment these lines to see the contents of the page. For debugging.
|
|
// fmt.Println("========================= +++ =========================")
|
|
// fmt.Printf("%s\n", contents)
|
|
// fmt.Println("========================= ::: =========================")
|
|
|
|
mediaBox, err := page.GetMediaBox()
|
|
if err != nil {
|
|
return nil, fmt.Errorf("extractor requires mediaBox. %v", err)
|
|
}
|
|
e := &Extractor{
|
|
contents: contents,
|
|
resources: page.Resources,
|
|
mediaBox: *mediaBox,
|
|
fontCache: map[string]fontEntry{},
|
|
formResults: map[string]textResult{},
|
|
}
|
|
return e, nil
|
|
}
|
|
|
|
// NewFromContents creates a new extractor from contents and page resources.
|
|
func NewFromContents(contents string, resources *model.PdfPageResources) (*Extractor, error) {
|
|
e := &Extractor{
|
|
contents: contents,
|
|
resources: resources,
|
|
fontCache: map[string]fontEntry{},
|
|
formResults: map[string]textResult{},
|
|
}
|
|
return e, nil
|
|
}
|