unipdf/extractor/text_table.go
Peter Williams 88fda44e0a
Text extraction code for columns. (#366)
* Fixed filename:page in logging

* Got CMap working for multi-rune entries

* Treat CMap entries as strings instead of runes to handle multi-byte encodings.

* Added a test for multibyte encoding.

* First version of text extraction that recognizes columns

* Added an explanation of the text columns code to README.md.

* fixed typos

* Abstracted textWord depth calculation. This required change textMark to *textMark in a lot of code.

* Added function comments.

* Fixed text state save/restore.

* Adjusted inter-word search distance to make paragraph division work for thanh.pdf

* Got text_test.go passing.

* Reinstated hyphen suppression

* Handle more cases of fonts not being set in text extraction code.

* Fixed typo

* More verbose logging

* Adding tables to text extractor.

* Added tests for columns extraction.

* Removed commented code

* Check for textParas that are on the same line when writing out extracted text.

* Absorb text to the left of paras into paras e.g. Footnote numbers

* Removed funny character from text_test.go

* Commented out a creator_test.go test that was broken by my text extraction changes.

* Big changes to columns text extraction code for PR.

Performance improvements in several places.
Commented code.

* Updated extractor/README

* Cleaned up some comments and removed a panic

* Increased threshold for truncating extracted text when there is no license 100 -> 102.

This is a workaround to let a test in creator_test.go pass.

With the old text extraction code the following extracted text was 100 chars. With the new code it
is 102 chars which looks correct.

"你好\n你好你好你好你好\n河上白云\n\nUnlicensed UniDoc - Get a license on https://unidoc.io\n\n"

* Improved an error message.

* Removed irrelevant spaces

* Commented code and removed unused functions.

* Reverted PdfRectangle changes

* Added duplicate text detection.

* Combine diacritic textMarks in text extraction

* Reinstated a diacritic recombination test.

* Small code reorganisation

* Reinstated handling of rotated text

* Addressed issues in PR review

* Added color fields to TextMark

* Updated README

* Reinstated the disabled tests I missed before.

* Tightened definition for tables to prevent detection of tables where there weren't any.

* Compute line splitting search range based on fontsize of first word in word bag.

* Use errors.Is(err, core.ErrNotSupported) to distinguish unsupported font errors.

See https://blog.golang.org/go1.13-errors

* Fixed some naming and added some comments.

* errors.Is -> xerrors.Is and %w -> %v for go 1.12 compatibility

* Removed code that doesn't ever get called.

* Removed unused test
2020-06-30 19:33:10 +00:00

304 lines
7.4 KiB
Go

/*
* This file is subject to the terms and conditions defined in
* file 'LICENSE.md', which is part of this source code package.
*/
package extractor
import (
"fmt"
"sort"
"github.com/unidoc/unipdf/v3/common"
"github.com/unidoc/unipdf/v3/model"
)
// textTable is a rectangular grid of textPara cells, `w` cells wide and `h` cells high.
type textTable struct {
	// Bounding rectangle.
	model.PdfRectangle
	// Number of columns.
	w int
	// Number of rows.
	h int
	// The cells, keyed by cellIndex(x, y).
	cells map[uint64]*textPara
}
// String returns the dimensions of `t` formatted as "<columns> x <rows>".
func (t *textTable) String() string {
	return fmt.Sprint(t.w, " x ", t.h)
}
// bbox returns the bounding rectangle of `t`.
// It makes textTable implement the `bounded` interface.
func (t *textTable) bbox() model.PdfRectangle {
	rect := t.PdfRectangle
	return rect
}
// extractTables finds the tables among `paras` and returns a paraList in which the paras that are
// table cells have been replaced by paras holding the tables that contain them.
// `paras` is returned unchanged when it has fewer than minTableParas elements.
func (paras paraList) extractTables() paraList {
	numParas := len(paras)
	if verboseTable {
		common.Log.Debug("extractTables=%d ===========x=============", numParas)
	}
	if numParas < minTableParas {
		// Too few paras to form a table.
		return paras
	}

	found := paras.findTables()
	if verboseTable {
		common.Log.Info("combined tables %d ================", len(found))
		for idx := range found {
			found[idx].log(fmt.Sprintf("combined %d", idx))
		}
	}

	return paras.applyTables(found)
}
// findTables returns all the tables in `paras`.
// NOTE: `paras` is sorted in place as a side effect.
func (paras paraList) findTables() []*textTable {
	paras.addNeighbours()

	// Pre-sort by reading direction then depth.
	byReadingDepth := func(i, j int) bool {
		return diffReadingDepth(paras[i], paras[j]) < 0
	}
	sort.Slice(paras, byReadingDepth)

	var found []*textTable
	for _, seed := range paras {
		// Paras already claimed by an earlier table cannot seed a new one.
		if seed.isCell {
			continue
		}
		candidate := seed.isAtom()
		if candidate == nil {
			continue
		}
		candidate.growTable()
		// Keep only tables with at least minTableParas cells.
		if candidate.w*candidate.h >= minTableParas {
			candidate.markCells()
			candidate.log("grown")
			found = append(found, candidate)
		}
	}
	return found
}
// isAtom attempts to build the smallest possible table fragment of 2 x 2 cells with `para` as its
// top-left cell. If such a table can be built it is returned. Otherwise nil is returned.
// The smallest possible table is
//   a b
//   c d
// where
//   a is `para`.
//   b is immediately to the right of a and overlaps it in the y axis.
//   c is immediately below a and overlaps it in the x axis.
//   d is immediately to the right of c and overlaps it in the y axis and
//     immediately below b and overlaps it in the x axis.
// None of b, c or d may be cells in existing tables.
func (para *textPara) isAtom() *textTable {
	right, below := para.right, para.below
	if right == nil || below == nil || right.isCell || below.isCell {
		return nil
	}

	// The diagonal cell must be reachable both via b.below and via c.right.
	diag := right.below
	if diag == nil || diag.isCell || diag != below.right {
		return nil
	}

	// The neighbour links must also hold in the reverse directions.
	switch {
	case right.left != para,
		below.above != para,
		diag.left != below,
		diag.above != right:
		return nil
	}
	return newTableAtom(para, right, below, diag)
}
// newTableAtom returns a 2 x 2 table containing the a, b, c, d elements from isAtom(), laid out as
//   a b
//   c d
func newTableAtom(a, b, c, d *textPara) *textTable {
	atom := &textTable{w: 2, h: 2, cells: make(map[uint64]*textPara)}
	// Row-major order: index i maps to column i%2, row i/2.
	for i, cell := range [...]*textPara{a, b, c, d} {
		atom.put(i%2, i/2, cell)
	}
	return atom
}
// growTable grows `t` to the largest w x h it can while remaining a valid table.
// It repeatedly tries to extend by one row and/or column
//  - down and right, then
//  - down, then
//  - right.
// It stops when none of these extensions is possible.
func (t *textTable) growTable() {
	// growDown appends `down` as a new bottom row. `down` must have t.w elements.
	growDown := func(down paraList) {
		t.h++
		for x := 0; x < t.w; x++ {
			t.put(x, t.h-1, down[x])
		}
	}
	// growRight appends `right` as a new rightmost column. `right` must have t.h elements.
	growRight := func(right paraList) {
		t.w++
		for y := 0; y < t.h; y++ {
			t.put(t.w-1, y, right[y])
		}
	}

	for {
		down := t.getDown()
		right := t.getRight()

		if down != nil && right != nil {
			// The diagonal cell is the one to the right of the new row and below the new column.
			// It must not belong to another table and must be reached consistently from both
			// directions, which is what getDown/getRight would require of it in the grown table.
			corner := down[len(down)-1].right
			if corner != nil && !corner.isCell && corner == right[len(right)-1].below {
				growDown(down)
				// growDown increased t.h, so the new column needs the corner as its last cell.
				growRight(append(right, corner))
				continue
			}
		}

		switch {
		case down != nil:
			growDown(down)
		case right != nil:
			growRight(right)
		default:
			return
		}
	}
}
// getDown returns the row of cells directly below `t` if they form a valid extension to `t`, or
// nil if they don't. The row is valid when every bottom-row cell of `t` has a para below it that
// is not already a table cell and each of those paras is the left neighbour of the next one.
func (t *textTable) getDown() paraList {
	lastRow := t.h - 1
	row := make(paraList, 0, t.w)
	for x := 0; x < t.w; x++ {
		below := t.get(x, lastRow).below
		if below == nil || below.isCell {
			return nil
		}
		row = append(row, below)
	}
	// The candidate cells must be chained left to right.
	for x := 1; x < len(row); x++ {
		if row[x-1].right != row[x] {
			return nil
		}
	}
	return row
}
// getRight returns the column of cells directly to the right of `t` if they form a valid extension
// to `t`, or nil if they don't. The column is valid when every rightmost cell of `t` has a para to
// its right that is not already a table cell and each of those paras is directly above the next.
func (t *textTable) getRight() paraList {
	lastCol := t.w - 1
	column := make(paraList, 0, t.h)
	for y := 0; y < t.h; y++ {
		next := t.get(lastCol, y).right
		if next == nil || next.isCell {
			return nil
		}
		column = append(column, next)
	}
	// The candidate cells must be chained top to bottom.
	for y := 1; y < len(column); y++ {
		if column[y-1].below != column[y] {
			return nil
		}
	}
	return column
}
// applyTables replaces the paras that are cells in `tables` with paras containing the tables in
// `tables`. This, of course, reduces the number of paras.
// The returned list holds one para per table, in `tables` order, followed by the paras of `paras`
// that are not cells of any table, in their original order.
func (paras paraList) applyTables(tables []*textTable) paraList {
	inTable := make(map[*textPara]bool)
	var result paraList
	for _, tbl := range tables {
		for _, cell := range tbl.cells {
			inTable[cell] = true
		}
		result = append(result, tbl.newTablePara())
	}
	for _, p := range paras {
		if inTable[p] {
			continue
		}
		result = append(result, p)
	}
	return result
}
// markCells sets isCell=true on every para that is a cell of `t` so that they won't be considered
// as cell candidates for other tables in the future.
func (t *textTable) markCells() {
	for row := 0; row < t.h; row++ {
		for col := 0; col < t.w; col++ {
			t.get(col, row).isCell = true
		}
	}
}
// newTablePara returns a new textPara that contains `t`. Both the para's bounding rectangle and
// its eBBox are set to the bounding box computed from the cells of `t`.
func (t *textTable) newTablePara() *textPara {
	bounds := t.computeBbox()
	para := &textPara{table: t}
	para.PdfRectangle = bounds
	para.eBBox = bounds
	return para
}
// computeBbox computes and returns the bounding box of `t`, which is the union of the bounding
// rectangles of all its cells. It does not modify `t`.
func (t *textTable) computeBbox() model.PdfRectangle {
	bounds := t.get(0, 0).PdfRectangle
	// include extends `bounds` to cover the cell at `x`, `y`.
	include := func(x, y int) {
		bounds = rectUnion(bounds, t.get(x, y).PdfRectangle)
	}

	// Remainder of the first row. Cell 0,0 is the starting rectangle.
	for x := 1; x < t.w; x++ {
		include(x, 0)
	}
	// All the other rows.
	for y := 1; y < t.h; y++ {
		for x := 0; x < t.w; x++ {
			include(x, y)
		}
	}
	return bounds
}
// toTextTable returns the TextTable corresponding to `t`.
// The returned Cells are indexed [row][column]. Each cell holds the text of the corresponding para
// and its text marks, whose offsets start from 0 in every cell.
func (t *textTable) toTextTable() TextTable {
	rows := make([][]TableCell, t.h)
	for y := range rows {
		row := make([]TableCell, t.w)
		for x := range row {
			para := t.get(x, y)
			row[x].Text = para.text()
			offset := 0
			row[x].Marks.marks = para.toTextMarks(&offset)
		}
		rows[y] = row
	}
	return TextTable{W: t.w, H: t.h, Cells: rows}
}
// get returns the cell in column `x`, row `y` of `t`, or nil if no cell is stored there.
func (t *textTable) get(x, y int) *textPara {
	key := cellIndex(x, y)
	return t.cells[key]
}
// put stores `cell` in column `x`, row `y` of `t`, replacing any cell already stored there.
func (t *textTable) put(x, y int, cell *textPara) {
	key := cellIndex(x, y)
	t.cells[key] = cell
}
// cellIndex returns a map key that is different for every different `x`, `y` pair with
// 0 <= x, y < 2^32, which covers any table found in a PDF.
// `x` occupies the upper 32 bits of the key and `y` the lower 32 bits.
func cellIndex(x, y int) uint64 {
	return uint64(x)<<32 | uint64(y)
}
// log writes a description of `t` to the package logger under the heading `title`: first the
// dimensions and bounding rectangle of `t`, then one line per cell in row-major order giving the
// cell's position, rectangle and the start of its text.
// It does nothing unless verboseTable is set.
func (t *textTable) log(title string) {
	if !verboseTable {
		return
	}
	common.Log.Info("~~~ %s: %d x %d\n %6.2f", title,
		t.w, t.h, t.PdfRectangle)
	for y := 0; y < t.h; y++ {
		for x := 0; x < t.w; x++ {
			p := t.get(x, y)
			// Use the package logger rather than stdout so that the cell lines go to the same
			// destination as the heading line above.
			common.Log.Info("%4d %2d: %6.2f %q", x, y, p.PdfRectangle, truncate(p.text(), 50))
		}
	}
}