mirror of
https://github.com/unidoc/unipdf.git
synced 2025-04-27 13:48:51 +08:00

* Fixed filename:page in logging * Got CMap working for multi-rune entries * Treat CMap entries as strings instead of runes to handle multi-byte encodings. * Added a test for multibyte encoding. * First version of text extraction that recognizes columns * Added an explanation of the text columns code to README.md. * fixed typos * Abstracted textWord depth calculation. This required changing textMark to *textMark in a lot of code. * Added function comments. * Fixed text state save/restore. * Adjusted inter-word search distance to make paragraph division work for thanh.pdf * Got text_test.go passing. * Reinstated hyphen suppression * Handle more cases of fonts not being set in text extraction code. * Fixed typo * More verbose logging * Adding tables to text extractor. * Added tests for columns extraction. * Removed commented code * Check for textParas that are on the same line when writing out extracted text. * Absorb text to the left of paras into paras e.g. Footnote numbers * Removed funny character from text_test.go * Commented out a creator_test.go test that was broken by my text extraction changes. * Big changes to columns text extraction code for PR. Performance improvements in several places. Commented code. * Updated extractor/README * Cleaned up some comments and removed a panic * Increased threshold for truncating extracted text when there is no license 100 -> 102. This is a workaround to let a test in creator_test.go pass. With the old text extraction code the following extracted text was 100 chars. With the new code it is 102 chars which looks correct. "你好\n你好你好你好你好\n河上白云\n\nUnlicensed UniDoc - Get a license on https://unidoc.io\n\n" * Improved an error message. * Removed irrelevant spaces * Commented code and removed unused functions. * Reverted PdfRectangle changes * Added duplicate text detection. * Combine diacritic textMarks in text extraction * Reinstated a diacritic recombination test. 
* Small code reorganisation * Reinstated handling of rotated text * Addressed issues in PR review * Added color fields to TextMark * Updated README * Reinstated the disabled tests I missed before. * Tightened definition for tables to prevent detection of tables where there weren't any. * Compute line splitting search range based on fontsize of first word in word bag. * Use errors.Is(err, core.ErrNotSupported) to distinguish unsupported font errors. See https://blog.golang.org/go1.13-errors * Fixed some naming and added some comments. * errors.Is -> xerrors.Is and %w -> %v for go 1.12 compatibility * Removed code that doesn't ever get called. * Removed unused test
190 lines
6.0 KiB
Go
190 lines
6.0 KiB
Go
/*
|
|
* This file is subject to the terms and conditions defined in
|
|
* file 'LICENSE.md', which is part of this source code package.
|
|
*/
|
|
|
|
package extractor
|
|
|
|
import (
|
|
"fmt"
|
|
"image/color"
|
|
"math"
|
|
|
|
"github.com/unidoc/unipdf/v3/common"
|
|
"github.com/unidoc/unipdf/v3/internal/transform"
|
|
"github.com/unidoc/unipdf/v3/model"
|
|
)
|
|
|
|
// textMark represents text drawn on a page and its position in device coordinates.
|
|
// All dimensions are in device coordinates.
|
|
type textMark struct {
|
|
model.PdfRectangle // Bounding box oriented so character base is at bottom
|
|
orient int // Orientation
|
|
text string // The text (decoded via ToUnicode).
|
|
original string // Original text (decoded).
|
|
font *model.PdfFont // The font the mark was drawn with.
|
|
fontsize float64 // The font size the mark was drawn with.
|
|
charspacing float64 // TODO (peterwilliams97: Should this be exposed in TextMark?
|
|
trm transform.Matrix // The current text rendering matrix (TRM above).
|
|
end transform.Point // The end of character device coordinates.
|
|
originaBBox model.PdfRectangle // Bounding box without orientation correction.
|
|
fillColor color.Color // Text fill color.
|
|
strokeColor color.Color // Text stroke color.
|
|
}
|
|
|
|
// newTextMark returns a textMark for text `text` rendered with text rendering matrix (TRM) `trm`
|
|
// and end of character device coordinates `end`. `spaceWidth` is our best guess at the width of a
|
|
// space in the font the text is rendered in device coordinates.
|
|
func (to *textObject) newTextMark(text string, trm transform.Matrix, end transform.Point,
|
|
spaceWidth float64, font *model.PdfFont, charspacing float64,
|
|
fillColor, strokeColor color.Color) (textMark, bool) {
|
|
theta := trm.Angle()
|
|
orient := nearestMultiple(theta, orientationGranularity)
|
|
var height float64
|
|
if orient%180 != 90 {
|
|
height = trm.ScalingFactorY()
|
|
} else {
|
|
height = trm.ScalingFactorX()
|
|
}
|
|
|
|
start := translation(trm)
|
|
bbox := model.PdfRectangle{Llx: start.X, Lly: start.Y, Urx: end.X, Ury: end.Y}
|
|
switch orient % 360 {
|
|
case 90:
|
|
bbox.Urx -= height
|
|
case 180:
|
|
bbox.Ury -= height
|
|
case 270:
|
|
bbox.Urx += height
|
|
case 0:
|
|
bbox.Ury += height
|
|
default:
|
|
// This is a hack to capture diagonal text.
|
|
// TODO(peterwilliams97): Extract diagonal text.
|
|
orient = 0
|
|
bbox.Ury += height
|
|
}
|
|
if bbox.Llx > bbox.Urx {
|
|
bbox.Llx, bbox.Urx = bbox.Urx, bbox.Llx
|
|
}
|
|
if bbox.Lly > bbox.Ury {
|
|
bbox.Lly, bbox.Ury = bbox.Ury, bbox.Lly
|
|
}
|
|
|
|
clipped, onPage := rectIntersection(bbox, to.e.mediaBox)
|
|
if !onPage {
|
|
common.Log.Debug("Text mark outside page. bbox=%g mediaBox=%g text=%q",
|
|
bbox, to.e.mediaBox, text)
|
|
}
|
|
bbox = clipped
|
|
|
|
// The orientedBBox is bbox rotated and translated so the base of the character is at Lly.
|
|
orientedBBox := bbox
|
|
orientedMBox := to.e.mediaBox
|
|
|
|
switch orient % 360 {
|
|
case 90:
|
|
orientedMBox.Urx, orientedMBox.Ury = orientedMBox.Ury, orientedMBox.Urx
|
|
orientedBBox = model.PdfRectangle{
|
|
Llx: orientedMBox.Urx - bbox.Ury,
|
|
Urx: orientedMBox.Urx - bbox.Lly,
|
|
Lly: bbox.Llx,
|
|
Ury: bbox.Urx}
|
|
case 180:
|
|
orientedBBox = model.PdfRectangle{
|
|
Llx: orientedMBox.Urx - bbox.Llx,
|
|
Urx: orientedMBox.Urx - bbox.Urx,
|
|
Lly: orientedMBox.Ury - bbox.Lly,
|
|
Ury: orientedMBox.Ury - bbox.Ury}
|
|
case 270:
|
|
orientedMBox.Urx, orientedMBox.Ury = orientedMBox.Ury, orientedMBox.Urx
|
|
orientedBBox = model.PdfRectangle{
|
|
Llx: bbox.Ury,
|
|
Urx: bbox.Lly,
|
|
Lly: orientedMBox.Ury - bbox.Llx,
|
|
Ury: orientedMBox.Ury - bbox.Urx}
|
|
}
|
|
if orientedBBox.Llx > orientedBBox.Urx {
|
|
orientedBBox.Llx, orientedBBox.Urx = orientedBBox.Urx, orientedBBox.Llx
|
|
}
|
|
if orientedBBox.Lly > orientedBBox.Ury {
|
|
orientedBBox.Lly, orientedBBox.Ury = orientedBBox.Ury, orientedBBox.Lly
|
|
}
|
|
|
|
tm := textMark{
|
|
text: text,
|
|
PdfRectangle: orientedBBox,
|
|
originaBBox: bbox,
|
|
font: font,
|
|
fontsize: height,
|
|
charspacing: charspacing,
|
|
trm: trm,
|
|
end: end,
|
|
orient: orient,
|
|
fillColor: fillColor,
|
|
strokeColor: strokeColor,
|
|
}
|
|
if verboseGeom {
|
|
common.Log.Info("newTextMark: start=%.2f end=%.2f %s", start, end, tm.String())
|
|
}
|
|
return tm, onPage
|
|
}
|
|
|
|
// String returns a description of `tm`.
|
|
func (tm *textMark) String() string {
|
|
return fmt.Sprintf("%.2f fontsize=%.2f \"%s\"", tm.PdfRectangle, tm.fontsize, tm.text)
|
|
}
|
|
|
|
// bbox makes textMark implement the `bounded` interface.
|
|
func (tm *textMark) bbox() model.PdfRectangle {
|
|
return tm.PdfRectangle
|
|
}
|
|
|
|
// ToTextMark returns the public view of `tm`.
|
|
func (tm *textMark) ToTextMark() TextMark {
|
|
return TextMark{
|
|
Text: tm.text,
|
|
Original: tm.original,
|
|
BBox: tm.originaBBox,
|
|
Font: tm.font,
|
|
FontSize: tm.fontsize,
|
|
FillColor: tm.fillColor,
|
|
StrokeColor: tm.strokeColor,
|
|
}
|
|
}
|
|
|
|
// inDiacriticArea returns true if `diacritic` is in the area where it could be a diacritic of `tm`.
|
|
func (tm *textMark) inDiacriticArea(diacritic *textMark) bool {
|
|
dLlx := tm.Llx - diacritic.Llx
|
|
dUrx := tm.Urx - diacritic.Urx
|
|
dLly := tm.Lly - diacritic.Lly
|
|
return math.Abs(dLlx+dUrx) < tm.Width()*diacriticRadiusR &&
|
|
math.Abs(dLly) < tm.Height()*diacriticRadiusR
|
|
}
|
|
|
|
// appendTextMark appends `mark` to `marks` and updates `offset`, the offset of `mark` in the extracted
|
|
// text.
|
|
func appendTextMark(marks []TextMark, offset *int, mark TextMark) []TextMark {
|
|
mark.Offset = *offset
|
|
marks = append(marks, mark)
|
|
*offset += len(mark.Text)
|
|
return marks
|
|
}
|
|
|
|
// appendSpaceMark appends a spaceMark with space character `space` to `marks` and updates `offset`,
|
|
// the offset of `mark` in the extracted text.
|
|
func appendSpaceMark(marks []TextMark, offset *int, spaceChar string) []TextMark {
|
|
mark := spaceMark
|
|
mark.Text = spaceChar
|
|
return appendTextMark(marks, offset, mark)
|
|
}
|
|
|
|
// nearestMultiple returns the integer multiple of `m` that is closest to `x`.
// A value of `x` exactly halfway between two multiples is rounded away from zero (math.Round).
// `m` == 0 is treated as 1, so in that case `x` is rounded to the nearest integer.
func nearestMultiple(x float64, m int) int {
	if m == 0 {
		m = 1
	}
	fac := float64(m)
	return int(math.Round(x/fac) * fac)
}
|