mirror of
https://github.com/unidoc/unipdf.git
synced 2025-05-02 22:17:06 +08:00

* Fixed filename:page in logging * Got CMap working for multi-rune entries * Treat CMap entries as strings instead of runes to handle multi-byte encodings. * Added a test for multibyte encoding. * First version of text extraction that recognizes columns * Added an expanation of the text columns code to README.md. * fixed typos * Abstracted textWord depth calculation. This required change textMark to *textMark in a lot of code. * Added function comments. * Fixed text state save/restore. * Adjusted inter-word search distance to make paragrah division work for thanh.pdf * Got text_test.go passing. * Reinstated hyphen suppression * Handle more cases of fonts not being set in text extraction code. * Fixed typo * More verbose logging * Adding tables to text extractor. * Added tests for columns extraction. * Removed commented code * Check for textParas that are on the same line when writing out extracted text. * Absorb text to the left of paras into paras e.g. Footnote numbers * Removed funny character from text_test.go * Commented out a creator_test.go test that was broken by my text extraction changes. * Big changes to columns text extraction code for PR. Performance improvements in several places. Commented code. * Updated extractor/README * Cleaned up some comments and removed a panic * Increased threshold for truncating extracted text when there is no license 100 -> 102. This is a workaround to let a test in creator_test.go pass. With the old text extraction code the following extracted text was 100 chars. With the new code it is 102 chars which looks correct. "你好\n你好你好你好你好\n河上白云\n\nUnlicensed UniDoc - Get a license on https://unidoc.io\n\n" * Improved an error message. * Removed irrelevant spaces * Commented code and removed unused functions. * Reverted PdfRectangle changes * Added duplicate text detection. * Combine diacritic textMarks in text extraction * Reinstated a diacritic recombination test. 
* Small code reorganisation * Reinstated handling of rotated text * Addressed issues in PR review * Added color fields to TextMark * Updated README * Reinstated the disabled tests I missed before. * Tightened definition for tables to prevent detection of tables where there weren't any. * Compute line splitting search range based on font size of first word in word bag. * Use errors.Is(err, core.ErrNotSupported) to distinguish unsupported font errors. See https://blog.golang.org/go1.13-errors * Fixed some naming and added some comments. * errors.Is -> xerrors.Is and %w -> %v for go 1.12 compatibility * Removed code that doesn't ever get called. * Removed unused test
304 lines
7.4 KiB
Go
304 lines
7.4 KiB
Go
/*
|
|
* This file is subject to the terms and conditions defined in
|
|
* file 'LICENSE.md', which is part of this source code package.
|
|
*/
|
|
|
|
package extractor
|
|
|
|
import (
|
|
"fmt"
|
|
"sort"
|
|
|
|
"github.com/unidoc/unipdf/v3/common"
|
|
"github.com/unidoc/unipdf/v3/model"
|
|
)
|
|
|
|
// textTable is a table of `w` x `h` textPara cells.
|
|
type textTable struct {
|
|
model.PdfRectangle // Bounding rectangle.
|
|
w, h int // w=number of columns. h=number of rows.
|
|
cells map[uint64]*textPara // The cells
|
|
}
|
|
|
|
// String returns a description of `t`.
|
|
func (t *textTable) String() string {
|
|
return fmt.Sprintf("%d x %d", t.w, t.h)
|
|
}
|
|
|
|
// bbox makes textLine implement the `bounded` interface.
|
|
func (t *textTable) bbox() model.PdfRectangle {
|
|
return t.PdfRectangle
|
|
}
|
|
|
|
// extractTables converts the`paras` that are table cells to tables containing those cells.
|
|
func (paras paraList) extractTables() paraList {
|
|
if verboseTable {
|
|
common.Log.Debug("extractTables=%d ===========x=============", len(paras))
|
|
}
|
|
if len(paras) < minTableParas {
|
|
return paras
|
|
}
|
|
tables := paras.findTables()
|
|
if verboseTable {
|
|
common.Log.Info("combined tables %d ================", len(tables))
|
|
for i, t := range tables {
|
|
t.log(fmt.Sprintf("combined %d", i))
|
|
}
|
|
}
|
|
return paras.applyTables(tables)
|
|
}
|
|
|
|
// findTables returns all the tables in `paras`.
|
|
func (paras paraList) findTables() []*textTable {
|
|
paras.addNeighbours()
|
|
// Pre-sort by reading direction then depth
|
|
sort.Slice(paras, func(i, j int) bool {
|
|
return diffReadingDepth(paras[i], paras[j]) < 0
|
|
})
|
|
|
|
var tables []*textTable
|
|
for _, para := range paras {
|
|
if para.isCell {
|
|
continue
|
|
}
|
|
table := para.isAtom()
|
|
if table == nil {
|
|
continue
|
|
}
|
|
|
|
table.growTable()
|
|
if table.w*table.h < minTableParas {
|
|
continue
|
|
}
|
|
table.markCells()
|
|
table.log("grown")
|
|
tables = append(tables, table)
|
|
|
|
}
|
|
return tables
|
|
}
|
|
|
|
// isAtom atempts to build the smallest possible table fragment of 2 x 2 cells.
|
|
// If a table can be built then it is returned. Otherwise nil is returned.
|
|
// The smallest possible table is
|
|
// a b
|
|
// c d
|
|
// where
|
|
// a is `para`.
|
|
// b is immediately to the right of a and overlaps it in the y axis.
|
|
// c is immediately below a and overlaps it in the x axis.
|
|
// d is immediately to the right of c and overlaps it in the y axis and
|
|
// immediately below b and ooverlaps it in the s axis.
|
|
// None of a, b, c or d are cells in existing tables.
|
|
func (para *textPara) isAtom() *textTable {
|
|
a := para
|
|
b := para.right
|
|
c := para.below
|
|
if !(b != nil && !b.isCell && c != nil && !c.isCell) {
|
|
return nil
|
|
}
|
|
d := b.below
|
|
if !(d != nil && !d.isCell && d == c.right) {
|
|
return nil
|
|
}
|
|
|
|
if b.left != a || c.above != a || d.left != c || d.above != b {
|
|
return nil
|
|
}
|
|
return newTableAtom(a, b, c, d)
|
|
}
|
|
|
|
// newTable returns a table containing the a, b, c, d elements from isAtom().
|
|
func newTableAtom(a, b, c, d *textPara) *textTable {
|
|
t := &textTable{w: 2, h: 2, cells: map[uint64]*textPara{}}
|
|
t.put(0, 0, a)
|
|
t.put(1, 0, b)
|
|
t.put(0, 1, c)
|
|
t.put(1, 1, d)
|
|
return t
|
|
}
|
|
|
|
// growTable grows `t` to the largest w x h it can while remaining a valid table.
|
|
// It repeatedly tries to extend by one row and/or column
|
|
// - down and right, then
|
|
// - down, then
|
|
// - right.
|
|
func (t *textTable) growTable() {
|
|
growDown := func(down paraList) {
|
|
t.h++
|
|
for x := 0; x < t.w; x++ {
|
|
cell := down[x]
|
|
t.put(x, t.h-1, cell)
|
|
}
|
|
}
|
|
growRight := func(right paraList) {
|
|
t.w++
|
|
for y := 0; y < t.h; y++ {
|
|
cell := right[y]
|
|
t.put(t.w-1, y, cell)
|
|
}
|
|
}
|
|
|
|
for {
|
|
changed := false
|
|
down := t.getDown()
|
|
right := t.getRight()
|
|
if down != nil && right != nil {
|
|
downRight := down[len(down)-1]
|
|
if downRight != nil && !downRight.isCell && downRight == right[len(right)-1] {
|
|
growDown(down)
|
|
growRight(right)
|
|
t.put(t.w-1, t.h-1, downRight)
|
|
changed = true
|
|
}
|
|
}
|
|
if !changed && down != nil {
|
|
growDown(down)
|
|
changed = true
|
|
}
|
|
if !changed && right != nil {
|
|
growRight(right)
|
|
changed = true
|
|
}
|
|
if !changed {
|
|
break
|
|
}
|
|
}
|
|
}
|
|
|
|
// getDown returns the row of cells below `t` if they are a valid extension to `t` or nil if they aren't.
|
|
func (t *textTable) getDown() paraList {
|
|
cells := make(paraList, t.w)
|
|
for x := 0; x < t.w; x++ {
|
|
cell := t.get(x, t.h-1).below
|
|
if cell == nil || cell.isCell {
|
|
return nil
|
|
}
|
|
cells[x] = cell
|
|
}
|
|
for x := 0; x < t.w-1; x++ {
|
|
if cells[x].right != cells[x+1] {
|
|
return nil
|
|
}
|
|
}
|
|
return cells
|
|
}
|
|
|
|
// getRight returns the column of cells to the right `t` if they are a valid extension to `t` or nil
|
|
// if they aren't.
|
|
func (t *textTable) getRight() paraList {
|
|
cells := make(paraList, t.h)
|
|
for y := 0; y < t.h; y++ {
|
|
cell := t.get(t.w-1, y).right
|
|
if cell == nil || cell.isCell {
|
|
return nil
|
|
}
|
|
cells[y] = cell
|
|
}
|
|
for y := 0; y < t.h-1; y++ {
|
|
if cells[y].below != cells[y+1] {
|
|
return nil
|
|
}
|
|
}
|
|
return cells
|
|
}
|
|
|
|
// applyTables replaces the paras that are cells in `tables` with paras containing the tables in
|
|
//`tables`. This, of course, reduces the number of paras.
|
|
func (paras paraList) applyTables(tables []*textTable) paraList {
|
|
consumed := map[*textPara]struct{}{}
|
|
var tabled paraList
|
|
for _, table := range tables {
|
|
for _, para := range table.cells {
|
|
consumed[para] = struct{}{}
|
|
}
|
|
tabled = append(tabled, table.newTablePara())
|
|
}
|
|
for _, para := range paras {
|
|
if _, ok := consumed[para]; !ok {
|
|
tabled = append(tabled, para)
|
|
}
|
|
}
|
|
return tabled
|
|
}
|
|
|
|
// markCells marks the paras that are cells in `t` with isCell=true so that the won't be considered
|
|
// as cell candidates for tables in the future.
|
|
func (t *textTable) markCells() {
|
|
for y := 0; y < t.h; y++ {
|
|
for x := 0; x < t.w; x++ {
|
|
para := t.get(x, y)
|
|
para.isCell = true
|
|
}
|
|
}
|
|
}
|
|
|
|
// newTablePara returns a textPara containing `t`.
|
|
func (t *textTable) newTablePara() *textPara {
|
|
bbox := t.computeBbox()
|
|
return &textPara{
|
|
PdfRectangle: bbox,
|
|
eBBox: bbox,
|
|
table: t,
|
|
}
|
|
}
|
|
|
|
// computeBbox computes and returns the bounding box of `t`.
|
|
func (t *textTable) computeBbox() model.PdfRectangle {
|
|
r := t.get(0, 0).PdfRectangle
|
|
for x := 1; x < t.w; x++ {
|
|
r = rectUnion(r, t.get(x, 0).PdfRectangle)
|
|
}
|
|
for y := 1; y < t.h; y++ {
|
|
for x := 0; x < t.w; x++ {
|
|
r = rectUnion(r, t.get(x, y).PdfRectangle)
|
|
}
|
|
}
|
|
return r
|
|
}
|
|
|
|
// toTextTable returns the TextTable corresponding to `t`.
|
|
func (t *textTable) toTextTable() TextTable {
|
|
cells := make([][]TableCell, t.h)
|
|
for y := 0; y < t.h; y++ {
|
|
cells[y] = make([]TableCell, t.w)
|
|
for x := 0; x < t.w; x++ {
|
|
c := t.get(x, y)
|
|
cells[y][x].Text = c.text()
|
|
offset := 0
|
|
cells[y][x].Marks.marks = c.toTextMarks(&offset)
|
|
}
|
|
}
|
|
return TextTable{W: t.w, H: t.h, Cells: cells}
|
|
}
|
|
|
|
// get returns the cell at `x`, `y`.
|
|
func (t *textTable) get(x, y int) *textPara {
|
|
return t.cells[cellIndex(x, y)]
|
|
}
|
|
|
|
// put sets the cell at `x`, `y` to `cell`.
|
|
func (t *textTable) put(x, y int, cell *textPara) {
|
|
t.cells[cellIndex(x, y)] = cell
|
|
}
|
|
|
|
// cellIndex returns a map key that is different for every different (`x`, `y`) pair as long as
// both `x` and `y` are non-negative and less than 2^32, which will be the case for any table found
// in a PDF.
// `x` is packed into the high 32 bits of the key and `y` into the low 32 bits.
func cellIndex(x, y int) uint64 {
	return uint64(x)<<32 | uint64(y)
}
|
|
|
|
func (t *textTable) log(title string) {
|
|
if !verboseTable {
|
|
return
|
|
}
|
|
common.Log.Info("~~~ %s: %d x %d\n %6.2f", title,
|
|
t.w, t.h, t.PdfRectangle)
|
|
for y := 0; y < t.h; y++ {
|
|
for x := 0; x < t.w; x++ {
|
|
p := t.get(x, y)
|
|
fmt.Printf("%4d %2d: %6.2f %q\n", x, y, p.PdfRectangle, truncate(p.text(), 50))
|
|
}
|
|
}
|
|
}
|