unipdf/extractor/text_mark.go

/*
 * This file is subject to the terms and conditions defined in
 * file 'LICENSE.md', which is part of this source code package.
 */

package extractor

import (
	"fmt"
	"image/color"
	"math"

	"github.com/unidoc/unipdf/v3/common"
	"github.com/unidoc/unipdf/v3/internal/transform"
	"github.com/unidoc/unipdf/v3/model"
)

// textMark represents text drawn on a page and its position in device coordinates.
// All dimensions are in device coordinates.
// Two bounding boxes are kept: the embedded PdfRectangle, which is corrected for the text
// orientation, and originaBBox, which is not.
type textMark struct {
	model.PdfRectangle                    // Bounding box, oriented so that the character base is at the bottom.
	orient             int                // Orientation; newTextMark sets it to the rounded TRM angle, or 0 for diagonal text.
	text               string             // The text (decoded via ToUnicode).
	original           string             // Original text (decoded). Not set by newTextMark.
	font               *model.PdfFont     // The font the mark was drawn with.
	fontsize           float64            // The font size the mark was drawn with (a TRM scaling factor in newTextMark).
	charspacing        float64            // Character spacing passed to newTextMark. TODO(peterwilliams97): Should this be exposed in TextMark?
	trm                transform.Matrix   // The text rendering matrix (TRM) the mark was drawn with.
	end                transform.Point    // The end of the character(s) in device coordinates.
	originaBBox        model.PdfRectangle // Bounding box without orientation correction.
	fillColor          color.Color        // Text fill color.
	strokeColor        color.Color        // Text stroke color.
}

// newTextMark builds the textMark for `text` drawn with text rendering matrix (TRM) `trm`, where
// `end` is the device coordinates of the end of the text.
// `font`, `charspacing`, `fillColor` and `strokeColor` are stored in the mark unchanged.
// `spaceWidth` is the caller's best guess at the width of a space in the font, in device
// coordinates; this function does not currently use it.
// The orientation is the TRM angle rounded to a multiple of orientationGranularity. Orientations
// other than 0, 90, 180 and 270 (modulo 360) are handled as if they were 0.
// The returned bool is the second result of rectIntersection for the mark's bounding box and the
// page media box; it is false when the mark is outside the page.
func (to *textObject) newTextMark(text string, trm transform.Matrix, end transform.Point,
	spaceWidth float64, font *model.PdfFont, charspacing float64,
	fillColor, strokeColor color.Color) (textMark, bool) {
	orient := nearestMultiple(trm.Angle(), orientationGranularity)

	// Sideways text (90 or 270) takes its height from the X scale of the TRM.
	var height float64
	if orient%180 == 90 {
		height = trm.ScalingFactorX()
	} else {
		height = trm.ScalingFactorY()
	}

	// normalize swaps corners where needed so that Llx <= Urx and Lly <= Ury.
	normalize := func(r *model.PdfRectangle) {
		if r.Llx > r.Urx {
			r.Llx, r.Urx = r.Urx, r.Llx
		}
		if r.Lly > r.Ury {
			r.Lly, r.Ury = r.Ury, r.Lly
		}
	}

	// The unclipped box runs from the TRM translation to `end`, grown by `height` in the direction
	// perpendicular to the text.
	origin := translation(trm)
	raw := model.PdfRectangle{Llx: origin.X, Lly: origin.Y, Urx: end.X, Ury: end.Y}
	quadrant := orient % 360
	switch quadrant {
	case 0:
		raw.Ury += height
	case 90:
		raw.Urx -= height
	case 180:
		raw.Ury -= height
	case 270:
		raw.Urx += height
	default:
		// This is a hack to capture diagonal text.
		// TODO(peterwilliams97): Extract diagonal text.
		orient, quadrant = 0, 0
		raw.Ury += height
	}
	normalize(&raw)

	page := to.e.mediaBox
	bbox, onPage := rectIntersection(raw, page)
	if !onPage {
		common.Log.Debug("Text mark outside page. bbox=%g mediaBox=%g text=%q",
			raw, page, text)
	}

	// `oriented` is bbox rotated and translated so the base of the character is at Lly.
	// For 90 and 270 the page width and height trade places.
	oriented := bbox
	switch quadrant {
	case 90:
		oriented = model.PdfRectangle{
			Llx: page.Ury - bbox.Ury,
			Lly: bbox.Llx,
			Urx: page.Ury - bbox.Lly,
			Ury: bbox.Urx,
		}
	case 180:
		oriented = model.PdfRectangle{
			Llx: page.Urx - bbox.Llx,
			Lly: page.Ury - bbox.Lly,
			Urx: page.Urx - bbox.Urx,
			Ury: page.Ury - bbox.Ury,
		}
	case 270:
		oriented = model.PdfRectangle{
			Llx: bbox.Ury,
			Lly: page.Urx - bbox.Llx,
			Urx: bbox.Lly,
			Ury: page.Urx - bbox.Urx,
		}
	}
	normalize(&oriented)

	tm := textMark{
		PdfRectangle: oriented,
		originaBBox:  bbox,
		orient:       orient,
		text:         text,
		font:         font,
		fontsize:     height,
		charspacing:  charspacing,
		trm:          trm,
		end:          end,
		fillColor:    fillColor,
		strokeColor:  strokeColor,
	}
	if verboseGeom {
		common.Log.Info("newTextMark: start=%.2f end=%.2f %s", origin, end, tm.String())
	}
	return tm, onPage
}

// String returns a description of `tm`: its oriented bounding box, font size and quoted text.
func (tm *textMark) String() string {
	const layout = "%.2f fontsize=%.2f \"%s\""
	return fmt.Sprintf(layout, tm.PdfRectangle, tm.fontsize, tm.text)
}

// bbox returns the oriented bounding box embedded in `tm`.
// It makes textMark implement the `bounded` interface.
func (tm *textMark) bbox() model.PdfRectangle {
	box := tm.PdfRectangle
	return box
}

// ToTextMark returns the public view of `tm`.
// The BBox of the result is the bounding box without orientation correction (originaBBox), not
// the oriented box embedded in `tm`. Fields of TextMark not listed here keep their zero values.
func (tm *textMark) ToTextMark() TextMark {
	var public TextMark
	public.Text = tm.text
	public.Original = tm.original
	public.BBox = tm.originaBBox
	public.Font = tm.font
	public.FontSize = tm.fontsize
	public.FillColor = tm.fillColor
	public.StrokeColor = tm.strokeColor
	return public
}

// inDiacriticArea reports whether `diacritic` is in the area where it could be a diacritic of `tm`.
// Both tests are scaled by diacriticRadiusR: the summed left and right edge offsets (twice the
// horizontal distance between the box centres) are compared with the width of `tm`, and the
// offset between the bottom edges is compared with the height of `tm`.
func (tm *textMark) inDiacriticArea(diacritic *textMark) bool {
	xOffset := (tm.Llx - diacritic.Llx) + (tm.Urx - diacritic.Urx)
	if !(math.Abs(xOffset) < tm.Width()*diacriticRadiusR) {
		return false
	}
	yOffset := tm.Lly - diacritic.Lly
	return math.Abs(yOffset) < tm.Height()*diacriticRadiusR
}

// appendTextMark records the current value of `*offset` as the Offset of `mark`, appends `mark`
// to `marks` and advances `*offset` by the length in bytes of the mark's text.
// It returns the extended slice.
func appendTextMark(marks []TextMark, offset *int, mark TextMark) []TextMark {
	start := *offset
	mark.Offset = start
	*offset = start + len(mark.Text)
	return append(marks, mark)
}

// appendSpaceMark appends a copy of spaceMark whose text is `spaceChar` to `marks` and updates
// `offset`, the offset of the new mark in the extracted text. It returns the extended slice.
func appendSpaceMark(marks []TextMark, offset *int, spaceChar string) []TextMark {
	space := spaceMark
	space.Text = spaceChar
	return appendTextMark(marks, offset, space)
}

// nearestMultiple returns the integer multiple of `m` that is closest to `x`.
// Ties are rounded away from zero (math.Round). An `m` of 0 is treated as 1.
func nearestMultiple(x float64, m int) int {
	step := float64(m)
	if m == 0 {
		step = 1
	}
	quotient := math.Round(x / step)
	return int(quotient * step)
}