unipdf/extractor/text_table.go

/*
 * This file is subject to the terms and conditions defined in
 * file 'LICENSE.md', which is part of this source code package.
 */

package extractor

import (
	"fmt"
	"sort"

	"github.com/unidoc/unipdf/v3/common"
	"github.com/unidoc/unipdf/v3/model"
)

// textTable is a table of `w` x `h` textPara cells.
type textTable struct {
	model.PdfRectangle                      // Bounding rectangle.
	w, h               int                  // w=number of columns. h=number of rows.
	cells              map[uint64]*textPara // The cells
}

// String returns a description of `t`.
func (t *textTable) String() string {
	return fmt.Sprintf("%d x %d", t.w, t.h)
}

// bbox makes textLine implement the `bounded` interface.
func (t *textTable) bbox() model.PdfRectangle {
	return t.PdfRectangle
}

// extractTables converts the`paras` that are table cells to tables containing those cells.
func (paras paraList) extractTables() paraList {
	if verboseTable {
		common.Log.Debug("extractTables=%d ===========x=============", len(paras))
	}
	if len(paras) < minTableParas {
		return paras
	}
	tables := paras.findTables()
	if verboseTable {
		common.Log.Info("combined tables %d ================", len(tables))
		for i, t := range tables {
			t.log(fmt.Sprintf("combined %d", i))
		}
	}
	return paras.applyTables(tables)
}

// findTables returns all the tables  in `paras`.
func (paras paraList) findTables() []*textTable {
	paras.addNeighbours()
	// Pre-sort by reading direction then depth
	sort.Slice(paras, func(i, j int) bool {
		return diffReadingDepth(paras[i], paras[j]) < 0
	})

	var tables []*textTable
	for _, para := range paras {
		if para.isCell {
			continue
		}
		table := para.isAtom()
		if table == nil {
			continue
		}

		table.growTable()
		if table.w*table.h < minTableParas {
			continue
		}
		table.markCells()
		table.log("grown")
		tables = append(tables, table)

	}
	return tables
}

// isAtom atempts to build the smallest possible table fragment of 2 x 2 cells.
// If a table can be built then it is returned. Otherwise nil is returned.
// The smallest possible table is
//   a b
//   c d
// where
//   a is `para`.
//   b is immediately to the right of a and overlaps it in the y axis.
//   c is immediately below a and overlaps it in the x axis.
//   d is immediately to the right of c and overlaps it in the y axis and
//        immediately below b and ooverlaps it in the s axis.
//   None of a, b, c or d are cells in existing tables.
func (para *textPara) isAtom() *textTable {
	a := para
	b := para.right
	c := para.below
	if !(b != nil && !b.isCell && c != nil && !c.isCell) {
		return nil
	}
	d := b.below
	if !(d != nil && !d.isCell && d == c.right) {
		return nil
	}

	if b.left != a || c.above != a || d.left != c || d.above != b {
		return nil
	}
	return newTableAtom(a, b, c, d)
}

// newTable returns a table containing the a, b, c, d elements from isAtom().
func newTableAtom(a, b, c, d *textPara) *textTable {
	t := &textTable{w: 2, h: 2, cells: map[uint64]*textPara{}}
	t.put(0, 0, a)
	t.put(1, 0, b)
	t.put(0, 1, c)
	t.put(1, 1, d)
	return t
}

// growTable grows `t` to the largest w x h it can while remaining a valid table.
// It repeatedly tries to extend by one row and/or column
//    - down and right, then
//    - down, then
//    - right.
func (t *textTable) growTable() {
	growDown := func(down paraList) {
		t.h++
		for x := 0; x < t.w; x++ {
			cell := down[x]
			t.put(x, t.h-1, cell)
		}
	}
	growRight := func(right paraList) {
		t.w++
		for y := 0; y < t.h; y++ {
			cell := right[y]
			t.put(t.w-1, y, cell)
		}
	}

	for {
		changed := false
		down := t.getDown()
		right := t.getRight()
		if down != nil && right != nil {
			downRight := down[len(down)-1]
			if downRight != nil && !downRight.isCell && downRight == right[len(right)-1] {
				growDown(down)
				growRight(right)
				t.put(t.w-1, t.h-1, downRight)
				changed = true
			}
		}
		if !changed && down != nil {
			growDown(down)
			changed = true
		}
		if !changed && right != nil {
			growRight(right)
			changed = true
		}
		if !changed {
			break
		}
	}
}

// getDown returns the row of cells below `t` if they are a valid extension to `t` or nil if they aren't.
func (t *textTable) getDown() paraList {
	cells := make(paraList, t.w)
	for x := 0; x < t.w; x++ {
		cell := t.get(x, t.h-1).below
		if cell == nil || cell.isCell {
			return nil
		}
		cells[x] = cell
	}
	for x := 0; x < t.w-1; x++ {
		if cells[x].right != cells[x+1] {
			return nil
		}
	}
	return cells
}

// getRight returns the column of cells to the right `t` if they are a valid extension to `t` or nil
// if they aren't.
func (t *textTable) getRight() paraList {
	cells := make(paraList, t.h)
	for y := 0; y < t.h; y++ {
		cell := t.get(t.w-1, y).right
		if cell == nil || cell.isCell {
			return nil
		}
		cells[y] = cell
	}
	for y := 0; y < t.h-1; y++ {
		if cells[y].below != cells[y+1] {
			return nil
		}
	}
	return cells
}

// applyTables replaces the paras that are cells in `tables` with paras containing the tables in
//`tables`. This, of course, reduces the number of paras.
func (paras paraList) applyTables(tables []*textTable) paraList {
	consumed := map[*textPara]struct{}{}
	var tabled paraList
	for _, table := range tables {
		for _, para := range table.cells {
			consumed[para] = struct{}{}
		}
		tabled = append(tabled, table.newTablePara())
	}
	for _, para := range paras {
		if _, ok := consumed[para]; !ok {
			tabled = append(tabled, para)
		}
	}
	return tabled
}

// markCells marks the paras that are cells in `t` with isCell=true so that the won't be considered
// as cell candidates for tables in the future.
func (t *textTable) markCells() {
	for y := 0; y < t.h; y++ {
		for x := 0; x < t.w; x++ {
			para := t.get(x, y)
			para.isCell = true
		}
	}
}

// newTablePara returns a textPara containing `t`.
func (t *textTable) newTablePara() *textPara {
	bbox := t.computeBbox()
	return &textPara{
		PdfRectangle: bbox,
		eBBox:        bbox,
		table:        t,
	}
}

// computeBbox computes and returns the bounding box of `t`.
func (t *textTable) computeBbox() model.PdfRectangle {
	r := t.get(0, 0).PdfRectangle
	for x := 1; x < t.w; x++ {
		r = rectUnion(r, t.get(x, 0).PdfRectangle)
	}
	for y := 1; y < t.h; y++ {
		for x := 0; x < t.w; x++ {
			r = rectUnion(r, t.get(x, y).PdfRectangle)
		}
	}
	return r
}

// toTextTable returns the TextTable corresponding to `t`.
func (t *textTable) toTextTable() TextTable {
	cells := make([][]TableCell, t.h)
	for y := 0; y < t.h; y++ {
		cells[y] = make([]TableCell, t.w)
		for x := 0; x < t.w; x++ {
			c := t.get(x, y)
			cells[y][x].Text = c.text()
			offset := 0
			cells[y][x].Marks.marks = c.toTextMarks(&offset)
		}
	}
	return TextTable{W: t.w, H: t.h, Cells: cells}
}

// get returns the cell at `x`, `y`.
func (t *textTable) get(x, y int) *textPara {
	return t.cells[cellIndex(x, y)]
}

// put sets the cell at `x`, `y` to `cell`.
func (t *textTable) put(x, y int, cell *textPara) {
	t.cells[cellIndex(x, y)] = cell
}

// cellIndex returns a number that will be different for different `x` and `y` for any table found
// in a PDF which will less than 2^32 wide and hight.
func cellIndex(x, y int) uint64 {
	return uint64(x)*0x1000000 + uint64(y)
}

func (t *textTable) log(title string) {
	if !verboseTable {
		return
	}
	common.Log.Info("~~~ %s: %d x %d\n      %6.2f", title,
		t.w, t.h, t.PdfRectangle)
	for y := 0; y < t.h; y++ {
		for x := 0; x < t.w; x++ {
			p := t.get(x, y)
			fmt.Printf("%4d %2d: %6.2f %q\n", x, y, p.PdfRectangle, truncate(p.text(), 50))
		}
	}
}