unipdf/extractor/utils.go
Peter Williams 88fda44e0a
Text extraction code for columns. (#366)
* Fixed filename:page in logging

* Got CMap working for multi-rune entries

* Treat CMap entries as strings instead of runes to handle multi-byte encodings.

* Added a test for multibyte encoding.

* First version of text extraction that recognizes columns

* Added an explanation of the text columns code to README.md.

* fixed typos

* Abstracted textWord depth calculation. This required change textMark to *textMark in a lot of code.

* Added function comments.

* Fixed text state save/restore.

* Adjusted inter-word search distance to make paragraph division work for thanh.pdf

* Got text_test.go passing.

* Reinstated hyphen suppression

* Handle more cases of fonts not being set in text extraction code.

* Fixed typo

* More verbose logging

* Adding tables to text extractor.

* Added tests for columns extraction.

* Removed commented code

* Check for textParas that are on the same line when writing out extracted text.

* Absorb text to the left of paras into paras e.g. Footnote numbers

* Removed funny character from text_test.go

* Commented out a creator_test.go test that was broken by my text extraction changes.

* Big changes to columns text extraction code for PR.

Performance improvements in several places.
Commented code.

* Updated extractor/README

* Cleaned up some comments and removed a panic

* Increased threshold for truncating extracted text when there is no license 100 -> 102.

This is a workaround to let a test in creator_test.go pass.

With the old text extraction code the following extracted text was 100 chars. With the new code it
is 102 chars which looks correct.

"你好\n你好你好你好你好\n河上白云\n\nUnlicensed UniDoc - Get a license on https://unidoc.io\n\n"

* Improved an error message.

* Removed irrelevant spaces

* Commented code and removed unused functions.

* Reverted PdfRectangle changes

* Added duplicate text detection.

* Combine diacritic textMarks in text extraction

* Reinstated a diacritic recombination test.

* Small code reorganisation

* Reinstated handling of rotated text

* Addressed issues in PR review

* Added color fields to TextMark

* Updated README

* Reinstated the disabled tests I missed before.

* Tightened definition for tables to prevent detection of tables where there weren't any.

* Compute line splitting search range based on fontsize of first word in word bag.

* Use errors.Is(err, core.ErrNotSupported) to distinguish unsupported font errors.

See https://blog.golang.org/go1.13-errors

* Fixed some naming and added some comments.

* errors.Is -> xerrors.Is and %w -> %v for go 1.12 compatibility

* Removed code that doesn't ever get called.

* Removed unused test
2020-06-30 19:33:10 +00:00

108 lines
2.9 KiB
Go

/*
* This file is subject to the terms and conditions defined in
* file 'LICENSE.md', which is part of this source code package.
*/
package extractor
import (
"bytes"
"fmt"
"image/color"
"github.com/unidoc/unipdf/v3/common"
"github.com/unidoc/unipdf/v3/common/license"
"github.com/unidoc/unipdf/v3/core"
"github.com/unidoc/unipdf/v3/model"
)
// RenderMode describes the text rendering mode (Tmode). The mode controls whether
// showing text causes glyph outlines to be stroked, filled, used as a clipping
// boundary, or any combination of those three. For a text object, stroking, filling
// and clipping have the same effects as they do for a path object
// (see 8.5.3, "Path-Painting Operators" and 8.5.4, "Clipping Path Operators").
type RenderMode int

// Render mode flags. Every flag occupies its own bit, so a combination of modes can
// be expressed by OR-ing the flags together.
const (
	// RenderModeStroke is the "stroke" flag (bit 0).
	RenderModeStroke RenderMode = 1 << 0
	// RenderModeFill is the "fill" flag (bit 1).
	RenderModeFill RenderMode = 1 << 1
	// RenderModeClip is the "clip" flag (bit 2).
	RenderModeClip RenderMode = 1 << 2
)
// toFloatXY interprets `objs` as a pair of numbers and returns them as (x, y).
// An error is returned when `objs` does not contain exactly two elements, or when
// core.GetNumbersAsFloat fails to convert them to floats.
func toFloatXY(objs []core.PdfObject) (x, y float64, err error) {
	if n := len(objs); n != 2 {
		return 0, 0, fmt.Errorf("invalid number of params: %d", n)
	}

	pair, convErr := core.GetNumbersAsFloat(objs)
	if convErr != nil {
		return 0, 0, convErr
	}

	x, y = pair[0], pair[1]
	return x, y, nil
}
// procBuf applies the unlicensed-usage restrictions to the extracted page text `pt`.
//
// Nothing is changed when the package-level `isTesting` flag is set, or when
// license.GetLicenseKey returns a key that reports itself as licensed. Otherwise a
// notice is printed to standard output, a watermark is appended to `pt.viewText`
// (with the last 100 bytes of the text dropped first when it is longer than 102
// bytes), and `pt.marks` and `pt.viewMarks` are each capped at 200 entries.
func procBuf(pt *PageText) {
	if isTesting {
		return
	}
	if key := license.GetLicenseKey(); key != nil && key.IsLicensed() {
		return
	}

	fmt.Printf("Unlicensed copy of unidoc\n")
	fmt.Printf("To get rid of the watermark and keep entire text - Please get a license on https://unidoc.io\n")

	// Largest number of text marks retained for unlicensed use.
	const maxMarks = 200

	text := bytes.NewBufferString(pt.viewText)
	watermark := "- [Unlicensed UniDoc - Get a license on https://unidoc.io]"
	if size := text.Len(); size > 102 {
		watermark = "... [Truncated - Unlicensed UniDoc - Get a license on https://unidoc.io]"
		// NOTE(review): Truncate counts bytes, so the cut may land inside a
		// multi-byte UTF-8 character.
		text.Truncate(size - 100)
	}
	text.WriteString(watermark)
	pt.viewText = text.String()

	if len(pt.marks) > maxMarks {
		pt.marks = pt.marks[:maxMarks]
	}
	if len(pt.viewMarks) > maxMarks {
		pt.viewMarks = pt.viewMarks[:maxMarks]
	}
}
// truncate returns the first `n` characters (Unicode code points) of string `s`.
//
// If `s` holds `n` or fewer characters it is returned unchanged. A zero or negative
// `n` yields the empty string rather than panicking. The cut is always made on a
// character boundary, so a multi-byte UTF-8 sequence is never split in half.
func truncate(s string, n int) string {
	if n <= 0 {
		return ""
	}
	// Fast path: a string of at most `n` bytes cannot hold more than `n` characters.
	if len(s) <= n {
		return s
	}
	seen := 0
	// Ranging over a string yields the byte offset at which each character starts.
	for offset := range s {
		if seen == n {
			return s[:offset]
		}
		seen++
	}
	// Fewer than or exactly `n` characters in total.
	return s
}
// pdfColorToGoColor translates the PDF color `c`, interpreted in colorspace `space`,
// into a Go color. color.Black is returned when either argument is nil, when the
// conversion to RGB fails, or when the converted color is not a
// *model.PdfColorDeviceRGB. On success the result is a fully opaque color.NRGBA.
func pdfColorToGoColor(space model.PdfColorspace, c model.PdfColor) color.Color {
	if c == nil || space == nil {
		return color.Black
	}

	converted, err := space.ColorToRGB(c)
	if err != nil {
		common.Log.Debug("WARN: could not convert color %v (%v) to RGB: %s", c, space, err)
		return color.Black
	}

	if rgb, isRGB := converted.(*model.PdfColorDeviceRGB); isRGB {
		// Scale each component by 255 before narrowing it to 8 bits.
		red, green, blue := rgb.R()*255, rgb.G()*255, rgb.B()*255
		return color.NRGBA{
			R: uint8(red),
			G: uint8(green),
			B: uint8(blue),
			A: uint8(255),
		}
	}

	common.Log.Debug("WARN: converted color is not in the RGB colorspace: %v", converted)
	return color.Black
}