mirror of
https://github.com/unidoc/unipdf.git
synced 2025-05-02 22:17:06 +08:00

* Fixed filename:page in logging * Got CMap working for multi-rune entries * Treat CMap entries as strings instead of runes to handle multi-byte encodings. * Added a test for multibyte encoding. * First version of text extraction that recognizes columns * Added an explanation of the text columns code to README.md. * fixed typos * Abstracted textWord depth calculation. This required changing textMark to *textMark in a lot of code. * Added function comments. * Fixed text state save/restore. * Adjusted inter-word search distance to make paragraph division work for thanh.pdf * Got text_test.go passing. * Reinstated hyphen suppression * Handle more cases of fonts not being set in text extraction code. * Fixed typo * More verbose logging * Adding tables to text extractor. * Added tests for columns extraction. * Removed commented code * Check for textParas that are on the same line when writing out extracted text. * Absorb text to the left of paras into paras e.g. Footnote numbers * Removed funny character from text_test.go * Commented out a creator_test.go test that was broken by my text extraction changes. * Big changes to columns text extraction code for PR. Performance improvements in several places. Commented code. * Updated extractor/README * Cleaned up some comments and removed a panic * Increased threshold for truncating extracted text when there is no license 100 -> 102. This is a workaround to let a test in creator_test.go pass. With the old text extraction code the following extracted text was 100 chars. With the new code it is 102 chars which looks correct. "你好\n你好你好你好你好\n河上白云\n\nUnlicensed UniDoc - Get a license on https://unidoc.io\n\n" * Improved an error message. * Removed irrelevant spaces * Commented code and removed unused functions. * Reverted PdfRectangle changes * Added duplicate text detection. * Combine diacritic textMarks in text extraction * Reinstated a diacritic recombination test. 
* Small code reorganisation * Reinstated handling of rotated text * Addressed issues in PR review * Added color fields to TextMark * Updated README * Reinstated the disabled tests I missed before. * Tightened definition for tables to prevent detection of tables where there weren't any. * Compute line splitting search range based on fontsize of first word in word bag. * Use errors.Is(err, core.ErrNotSupported) to distinguish unsupported font errors. See https://blog.golang.org/go1.13-errors * Fixed some naming and added some comments. * errors.Is -> xerrors.Is and %w -> %v for go 1.12 compatibility * Removed code that doesn't ever get called. * Removed unused test
108 lines
2.9 KiB
Go
108 lines
2.9 KiB
Go
/*
|
|
* This file is subject to the terms and conditions defined in
|
|
* file 'LICENSE.md', which is part of this source code package.
|
|
*/
|
|
|
|
package extractor
|
|
|
|
import (
|
|
"bytes"
|
|
"fmt"
|
|
"image/color"
|
|
|
|
"github.com/unidoc/unipdf/v3/common"
|
|
"github.com/unidoc/unipdf/v3/common/license"
|
|
"github.com/unidoc/unipdf/v3/core"
|
|
"github.com/unidoc/unipdf/v3/model"
|
|
)
|
|
|
|
// RenderMode specifies the text rendering mode (Tmode), which determines whether showing text shall cause
// glyph outlines to be stroked, filled, used as a clipping boundary, or some combination of the three.
// Stroking, filling, and clipping shall have the same effects for a text object as they do for a path object
// (see 8.5.3, "Path-Painting Operators" and 8.5.4, "Clipping Path Operators").
type RenderMode int

// Render mode flags. Each constant occupies its own bit (values 1, 2 and 4), so
// a RenderMode can hold any combination of them joined with bitwise OR.
const (
	// RenderModeStroke indicates that glyph outlines are stroked.
	RenderModeStroke RenderMode = 1 << iota
	// RenderModeFill indicates that glyph outlines are filled.
	RenderModeFill
	// RenderModeClip indicates that glyph outlines are used as a clipping boundary.
	RenderModeClip
)
|
|
|
|
// toFloatXY returns `objs` as 2 floats, if that's what `objs` is, or an error if it isn't.
|
|
func toFloatXY(objs []core.PdfObject) (x, y float64, err error) {
|
|
if len(objs) != 2 {
|
|
return 0, 0, fmt.Errorf("invalid number of params: %d", len(objs))
|
|
}
|
|
floats, err := core.GetNumbersAsFloat(objs)
|
|
if err != nil {
|
|
return 0, 0, err
|
|
}
|
|
return floats[0], floats[1], nil
|
|
}
|
|
|
|
func procBuf(pt *PageText) {
|
|
if isTesting {
|
|
return
|
|
}
|
|
|
|
lk := license.GetLicenseKey()
|
|
if lk != nil && lk.IsLicensed() {
|
|
return
|
|
}
|
|
fmt.Printf("Unlicensed copy of unidoc\n")
|
|
fmt.Printf("To get rid of the watermark and keep entire text - Please get a license on https://unidoc.io\n")
|
|
|
|
var buf bytes.Buffer
|
|
buf.WriteString(pt.viewText)
|
|
|
|
s := "- [Unlicensed UniDoc - Get a license on https://unidoc.io]"
|
|
if buf.Len() > 102 {
|
|
s = "... [Truncated - Unlicensed UniDoc - Get a license on https://unidoc.io]"
|
|
buf.Truncate(buf.Len() - 100)
|
|
}
|
|
buf.WriteString(s)
|
|
pt.viewText = buf.String()
|
|
|
|
if len(pt.marks) > 200 {
|
|
pt.marks = pt.marks[:200]
|
|
}
|
|
if len(pt.viewMarks) > 200 {
|
|
pt.viewMarks = pt.viewMarks[:200]
|
|
}
|
|
}
|
|
|
|
// truncate returns the first `n` characters (runes) of string `s`.
//
// If `s` holds `n` or fewer characters it is returned unchanged. The cut is
// always made on a rune boundary, so a multi-byte UTF-8 sequence is never split.
// A zero or negative `n` yields the empty string.
func truncate(s string, n int) string {
	if n <= 0 {
		return ""
	}
	// Every rune occupies at least one byte, so a string of at most n bytes
	// cannot contain more than n runes.
	if len(s) <= n {
		return s
	}

	seen := 0
	for i := range s {
		// i is the byte offset at which the (seen+1)-th rune starts.
		if seen == n {
			return s[:i]
		}
		seen++
	}
	return s
}
|
|
|
|
// pdfColorToGoColor converts the specified color to a Go color, using the
|
|
// provided colorspace. If unsuccessful, color.Black is returned.
|
|
func pdfColorToGoColor(space model.PdfColorspace, c model.PdfColor) color.Color {
|
|
if space == nil || c == nil {
|
|
return color.Black
|
|
}
|
|
|
|
conv, err := space.ColorToRGB(c)
|
|
if err != nil {
|
|
common.Log.Debug("WARN: could not convert color %v (%v) to RGB: %s", c, space, err)
|
|
return color.Black
|
|
}
|
|
rgb, ok := conv.(*model.PdfColorDeviceRGB)
|
|
if !ok {
|
|
common.Log.Debug("WARN: converted color is not in the RGB colorspace: %v", conv)
|
|
return color.Black
|
|
}
|
|
|
|
return color.NRGBA{
|
|
R: uint8(rgb.R() * 255),
|
|
G: uint8(rgb.G() * 255),
|
|
B: uint8(rgb.B() * 255),
|
|
A: uint8(255),
|
|
}
|
|
}
|