mirror of
https://github.com/unidoc/unipdf.git
synced 2025-04-29 13:48:54 +08:00
Merge branch 'extract.text' of https://github.com/peterwilliams97/unidoc into v3-peterwilliams97-extract.text
This commit is contained in:
commit
f04f83b271
@ -18,6 +18,9 @@ type Extractor struct {
|
||||
|
||||
// accessCount is used to set fontEntry.access to an incrementing number.
|
||||
accessCount int64
|
||||
|
||||
// textCount is an incrementing number used to identify XYText objects.
|
||||
textCount int64
|
||||
}
|
||||
|
||||
// New returns an Extractor instance for extracting content from the input PDF page.
|
||||
|
@ -13,11 +13,13 @@ import (
|
||||
"runtime"
|
||||
"sort"
|
||||
"strings"
|
||||
"unicode"
|
||||
|
||||
"github.com/unidoc/unidoc/common"
|
||||
"github.com/unidoc/unidoc/pdf/contentstream"
|
||||
"github.com/unidoc/unidoc/pdf/core"
|
||||
"github.com/unidoc/unidoc/pdf/model"
|
||||
"golang.org/x/text/unicode/norm"
|
||||
)
|
||||
|
||||
// ExtractText processes and extracts all text data in content streams and returns as a string.
|
||||
@ -49,7 +51,7 @@ func (e *Extractor) ExtractXYText() (*TextList, int, int, error) {
|
||||
cstreamParser := contentstream.NewContentStreamParser(e.contents)
|
||||
operations, err := cstreamParser.Parse()
|
||||
if err != nil {
|
||||
common.Log.Debug("ExtractXYText: parse failed. err=%v", err)
|
||||
common.Log.Debug("ERROR: ExtractXYText parse failed. err=%v", err)
|
||||
return textList, state.numChars, state.numMisses, err
|
||||
}
|
||||
|
||||
@ -309,7 +311,7 @@ func (to *textObject) nextLine() {
|
||||
|
||||
// setTextMatrix "Tm".
|
||||
// Set the text matrix, Tm, and the text line matrix, Tlm to the Matrix specified by the 6 numbers
|
||||
// in `f` (page 250)
|
||||
// in `f` (page 250).
|
||||
func (to *textObject) setTextMatrix(f []float64) {
|
||||
if len(f) != 6 {
|
||||
common.Log.Debug("ERROR: len(f) != 6 (%d)", len(f))
|
||||
@ -317,25 +319,23 @@ func (to *textObject) setTextMatrix(f []float64) {
|
||||
}
|
||||
a, b, c, d, tx, ty := f[0], f[1], f[2], f[3], f[4], f[5]
|
||||
to.Tm = contentstream.NewMatrix(a, b, c, d, tx, ty)
|
||||
to.Tlm = contentstream.NewMatrix(a, b, c, d, tx, ty)
|
||||
common.Log.Debug("setTextMatrix: Tm=%s", to.Tm)
|
||||
to.Tlm = to.Tm
|
||||
}
|
||||
|
||||
// showText "Tj" Show a text string.
|
||||
// showText "Tj". Show a text string.
|
||||
func (to *textObject) showText(charcodes []byte) error {
|
||||
return to.renderText(charcodes)
|
||||
}
|
||||
|
||||
// showTextAdjusted "TJ" Show text with adjustable spacing.
|
||||
// showTextAdjusted "TJ". Show text with adjustable spacing.
|
||||
func (to *textObject) showTextAdjusted(args *core.PdfObjectArray) error {
|
||||
vertical := false
|
||||
for _, o := range args.Elements() {
|
||||
switch o.(type) {
|
||||
case *core.PdfObjectFloat, *core.PdfObjectInteger:
|
||||
// The following is supposed to be equivalent to the existing Unidoc implementation.
|
||||
x, err := core.GetNumberAsFloat(o)
|
||||
if err != nil {
|
||||
common.Log.Debug("showTextAdjusted: Bad numerical arg. o=%s args=%+v", o, args)
|
||||
common.Log.Debug("ERROR: showTextAdjusted. Bad numerical arg. o=%s args=%+v", o, args)
|
||||
return err
|
||||
}
|
||||
dx, dy := -x*0.001*to.State.Tfs, 0.0
|
||||
@ -344,23 +344,23 @@ func (to *textObject) showTextAdjusted(args *core.PdfObjectArray) error {
|
||||
}
|
||||
td := translationMatrix(Point{X: dx, Y: dy})
|
||||
to.Tm = td.Mult(to.Tm)
|
||||
common.Log.Debug("showTextAdjusted: dx,dy=%3f,%.3f Tm=%s", dx, dy, to.Tm)
|
||||
common.Log.Trace("showTextAdjusted: dx,dy=%3f,%.3f Tm=%s", dx, dy, to.Tm)
|
||||
case *core.PdfObjectString:
|
||||
charcodes, ok := core.GetStringBytes(o)
|
||||
if !ok {
|
||||
common.Log.Debug("showTextAdjusted: Bad string arg. o=%s args=%+v", o, args)
|
||||
common.Log.Trace("showTextAdjusted: Bad string arg. o=%s args=%+v", o, args)
|
||||
return core.ErrTypeError
|
||||
}
|
||||
to.renderText(charcodes)
|
||||
default:
|
||||
common.Log.Debug("showTextAdjusted. Unexpected type (%T) args=%+v", o, args)
|
||||
common.Log.Debug("ERROR: showTextAdjusted. Unexpected type (%T) args=%+v", o, args)
|
||||
return core.ErrTypeError
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// setTextLeading "TL" Set text leading.
|
||||
// setTextLeading "TL". Set text leading.
|
||||
func (to *textObject) setTextLeading(y float64) {
|
||||
if to == nil || to.State == nil {
|
||||
return
|
||||
@ -368,7 +368,7 @@ func (to *textObject) setTextLeading(y float64) {
|
||||
to.State.Tl = y
|
||||
}
|
||||
|
||||
// setCharSpacing "Tc" Set character spacing.
|
||||
// setCharSpacing "Tc". Set character spacing.
|
||||
func (to *textObject) setCharSpacing(x float64) {
|
||||
if to == nil {
|
||||
return
|
||||
@ -376,7 +376,7 @@ func (to *textObject) setCharSpacing(x float64) {
|
||||
to.State.Tc = x
|
||||
}
|
||||
|
||||
// setFont "Tf" Set font.
|
||||
// setFont "Tf". Set font.
|
||||
func (to *textObject) setFont(name string, size float64) error {
|
||||
if to == nil {
|
||||
return nil
|
||||
@ -399,7 +399,7 @@ func (to *textObject) setFont(name string, size float64) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// setTextRenderMode "Tr" Set text rendering mode.
|
||||
// setTextRenderMode "Tr". Set text rendering mode.
|
||||
func (to *textObject) setTextRenderMode(mode int) {
|
||||
if to == nil {
|
||||
return
|
||||
@ -407,7 +407,7 @@ func (to *textObject) setTextRenderMode(mode int) {
|
||||
to.State.Tmode = RenderMode(mode)
|
||||
}
|
||||
|
||||
// setTextRise "Ts" Set text rise.
|
||||
// setTextRise "Ts". Set text rise.
|
||||
func (to *textObject) setTextRise(y float64) {
|
||||
if to == nil {
|
||||
return
|
||||
@ -415,7 +415,7 @@ func (to *textObject) setTextRise(y float64) {
|
||||
to.State.Trise = y
|
||||
}
|
||||
|
||||
// setWordSpacing "Tw" Set word spacing.
|
||||
// setWordSpacing "Tw". Set word spacing.
|
||||
func (to *textObject) setWordSpacing(y float64) {
|
||||
if to == nil {
|
||||
return
|
||||
@ -423,7 +423,7 @@ func (to *textObject) setWordSpacing(y float64) {
|
||||
to.State.Tw = y
|
||||
}
|
||||
|
||||
// setHorizScaling "Tz" Set horizontal scaling.
|
||||
// setHorizScaling "Tz". Set horizontal scaling.
|
||||
func (to *textObject) setHorizScaling(y float64) {
|
||||
if to == nil {
|
||||
return
|
||||
@ -577,9 +577,6 @@ type textObject struct {
|
||||
Tm contentstream.Matrix // Text matrix. For the character pointer.
|
||||
Tlm contentstream.Matrix // Text line matrix. For the start of line pointer.
|
||||
Texts []XYText // Text gets written here.
|
||||
|
||||
// These fields are used to implement existing UniDoc behaviour.
|
||||
xPos, yPos float64
|
||||
}
|
||||
|
||||
// newTextState returns a default textState.
|
||||
@ -610,6 +607,9 @@ func (to *textObject) renderText(data []byte) error {
|
||||
charcodes := font.BytesToCharcodes(data)
|
||||
|
||||
runes, numChars, numMisses := font.CharcodesToUnicode(charcodes)
|
||||
if numMisses > 0 {
|
||||
common.Log.Debug("renderText: numChars=%d numMisses=%d", numChars, numMisses)
|
||||
}
|
||||
|
||||
to.State.numChars += numChars
|
||||
to.State.numMisses += numMisses
|
||||
@ -629,10 +629,14 @@ func (to *textObject) renderText(data []byte) error {
|
||||
0, tfs,
|
||||
0, state.Trise)
|
||||
|
||||
common.Log.Debug("==========================================")
|
||||
common.Log.Debug("%d codes=%+v runes=%q", len(charcodes), charcodes, runes)
|
||||
common.Log.Trace("renderText: %d codes=%+v runes=%q", len(charcodes), charcodes, runes)
|
||||
|
||||
for i, r := range runes {
|
||||
// XXX(peterwilliams97) Need to find and fix cases where this happens.
|
||||
if r == "\x00" {
|
||||
continue
|
||||
}
|
||||
|
||||
code := charcodes[i]
|
||||
// The location of the text on the page in device coordinates is given by trm, the text
|
||||
// rendering matrix.
|
||||
@ -656,14 +660,13 @@ func (to *textObject) renderText(data []byte) error {
|
||||
// c is the character size in unscaled text units.
|
||||
c := Point{X: m.Wx * glyphTextRatio, Y: m.Wy * glyphTextRatio}
|
||||
|
||||
// t0 is the end of this character.
|
||||
// t is the displacement of the text cursor when the character is rendered.
|
||||
// float tx = displacementX * fontSize * horizontalScaling;
|
||||
// w = 0
|
||||
t0 := Point{X: (c.X*tfs + w) * th}
|
||||
t := Point{X: (c.X*tfs + state.Tc + w) * th}
|
||||
|
||||
// td, td0 are t, t0 in matrix form.
|
||||
// td0 is where this char ends. td is where the next char stats.
|
||||
// td0 is where this character ends. td is where the next character starts.
|
||||
td0 := translationMatrix(t0)
|
||||
td := translationMatrix(t)
|
||||
|
||||
@ -671,19 +674,17 @@ func (to *textObject) renderText(data []byte) error {
|
||||
common.Log.Trace("tfs=%.3f th=%.3f Tc=%.3f w=%.3f (Tw=%.3f)", tfs, th, state.Tc, w, state.Tw)
|
||||
common.Log.Trace("m=%s c=%+v t0=%+v td0=%s trm0=%s", m, c, t0, td0, td0.Mult(to.Tm).Mult(to.gs.CTM))
|
||||
|
||||
nextTm := td.Mult(to.Tm)
|
||||
common.Log.Trace("nextTm=%s", nextTm)
|
||||
|
||||
xyt := newXYText(
|
||||
xyt := to.newXYText(
|
||||
string(r),
|
||||
trm,
|
||||
translation(td0.Mult(to.Tm).Mult(to.gs.CTM)),
|
||||
1.0*trm.ScalingFactorY(),
|
||||
spaceWidth*trm.ScalingFactorX())
|
||||
common.Log.Trace("i=%d code=%d, xyt=%s", i, code, xyt)
|
||||
common.Log.Trace("i=%d code=%d xyt=%s trm=%s", i, code, xyt, trm)
|
||||
to.Texts = append(to.Texts, xyt)
|
||||
|
||||
// update the text matrix by the displacement of the text location.
|
||||
to.Tm = nextTm
|
||||
to.Tm = td.Mult(to.Tm)
|
||||
common.Log.Trace("to.Tm=%s", to.Tm)
|
||||
}
|
||||
|
||||
@ -714,36 +715,45 @@ func (to *textObject) moveTo(tx, ty float64) {
|
||||
}
|
||||
|
||||
// XYText represents text drawn on a page and its position in device coordinates.
|
||||
// All dimensions are in device coordinates.
|
||||
type XYText struct {
|
||||
Trm contentstream.Matrix
|
||||
Text string // The text.
|
||||
Orient int // The text orientation.
|
||||
OrientedStart Point // Left of text in orientation where text is horizontal.
|
||||
OrientedEnd Point // Right of text in orientation where text is horizontal.
|
||||
ColorStroking model.PdfColor // Colour that text is stroked with, if any.
|
||||
ColorNonStroking model.PdfColor // Colour that text is filled with, if any.
|
||||
Orient int
|
||||
Text string
|
||||
SpaceWidth float64
|
||||
Font string
|
||||
FontSize float64
|
||||
Height float64 // Text height.
|
||||
SpaceWidth float64 // Best guess at the width of a space in the font the text was rendered with.
|
||||
count int64 // To help with reading debug logs.
|
||||
}
|
||||
|
||||
func newXYText(text string, trm contentstream.Matrix, end Point, spaceWidth float64) XYText {
|
||||
// newXYText returns an XYText for text `text` rendered with text rendering matrix `trm` and end
|
||||
// of character device coordinates `end`. `spaceWidth` is our best guess at the width of a space in
|
||||
// the font the text is rendered in, in device coordinates.
|
||||
func (to *textObject) newXYText(text string, trm contentstream.Matrix, end Point,
|
||||
height, spaceWidth float64) XYText {
|
||||
to.e.textCount++
|
||||
theta := trm.Angle()
|
||||
if theta%180 == 0 {
|
||||
height = trm.ScalingFactorY()
|
||||
} else {
|
||||
height = trm.ScalingFactorX()
|
||||
}
|
||||
|
||||
return XYText{
|
||||
Text: text,
|
||||
Trm: trm,
|
||||
Orient: theta,
|
||||
OrientedStart: translation(trm).Rotate(theta),
|
||||
OrientedEnd: end.Rotate(theta),
|
||||
Orient: theta,
|
||||
Height: height,
|
||||
SpaceWidth: spaceWidth,
|
||||
count: to.e.textCount,
|
||||
}
|
||||
}
|
||||
|
||||
// String returns a string describing `t`.
|
||||
func (t XYText) String() string {
|
||||
return fmt.Sprintf("XYText{%s %.1f |%d| [%.3f,%.3f] %q}",
|
||||
t.Trm.String(), t.Width(), t.Orient, t.OrientedStart.X, t.OrientedStart.Y,
|
||||
truncate(t.Text, 100))
|
||||
return fmt.Sprintf("XYText{@%03d [%.3f,%.3f] %.1f %d° %q}",
|
||||
t.count, t.OrientedStart.X, t.OrientedStart.Y, t.Width(), t.Orient, truncate(t.Text, 100))
|
||||
}
|
||||
|
||||
// Width returns the width of `t`.Text in the text direction.
|
||||
@ -755,13 +765,25 @@ func (t XYText) Width() float64 {
|
||||
type TextList []XYText
|
||||
|
||||
// Length returns the number of elements in `tl`.
|
||||
func (tl *TextList) Length() int {
|
||||
return len(*tl)
|
||||
func (tl TextList) Length() int {
|
||||
return len(tl)
|
||||
}
|
||||
|
||||
// height returns the max height of the elements in `tl`.
|
||||
func (tl TextList) height() float64 {
|
||||
fontHeight := 0.0
|
||||
for _, t := range tl {
|
||||
if t.Height > fontHeight {
|
||||
fontHeight = t.Height
|
||||
}
|
||||
}
|
||||
return fontHeight
|
||||
}
|
||||
|
||||
// ToText returns the contents of `tl` as a single string.
|
||||
func (tl *TextList) ToText() string {
|
||||
func (tl TextList) ToText() string {
|
||||
tl.printTexts("ToText: before sorting")
|
||||
|
||||
tl.SortPosition()
|
||||
|
||||
lines := tl.toLines()
|
||||
@ -776,12 +798,16 @@ func (tl *TextList) ToText() string {
|
||||
// Sorting is by orientation then top to bottom, left to right when page is orientated so that text
|
||||
// is horizontal.
|
||||
func (tl *TextList) SortPosition() {
|
||||
fontHeight := tl.height()
|
||||
// We sort with a y tolerance to allow for subscripts, diacritics etc.
|
||||
tol := min(fontHeight*0.2, 5.0)
|
||||
common.Log.Trace("SortPosition: fontHeight=%.1f tol=%.1f", fontHeight, tol)
|
||||
sort.SliceStable(*tl, func(i, j int) bool {
|
||||
ti, tj := (*tl)[i], (*tl)[j]
|
||||
if ti.Orient != tj.Orient {
|
||||
return ti.Orient < tj.Orient
|
||||
}
|
||||
if ti.OrientedStart.Y != tj.OrientedStart.Y {
|
||||
if math.Abs(ti.OrientedStart.Y-tj.OrientedStart.Y) > tol {
|
||||
return ti.OrientedStart.Y > tj.OrientedStart.Y
|
||||
}
|
||||
return ti.OrientedStart.X < tj.OrientedStart.X
|
||||
@ -793,26 +819,31 @@ type Line struct {
|
||||
Y float64 // y position of line.
|
||||
Dx []float64 // x distance between successive words in line.
|
||||
Text string // text in the line.
|
||||
Words []string // words in the line
|
||||
Words []string // words in the line.
|
||||
}
|
||||
|
||||
// toLines returns the text and positions in `tl` as a slice of Line.
|
||||
// NOTE: Caller must sort the text list by top-to-bottom, left-to-write (for orientation adjusted so
|
||||
// NOTE: Caller must sort the text list top-to-bottom, left-to-right (for orientation adjusted so
|
||||
// that text is horizontal) before calling this function.
|
||||
func (tl TextList) toLines() []Line {
|
||||
// We divide `tl` into slices which contain texts with the same orientation, extract the lines
|
||||
// for each orientation then return the concatenation of these lines sorted by orientation.
|
||||
tlOrient := make(map[int]TextList, len(tl))
|
||||
for _, t := range tl {
|
||||
tlOrient[t.Orient] = append(tlOrient[t.Orient], t)
|
||||
}
|
||||
lines := make([]Line, 0, 4)
|
||||
for _, o := range []int{0, 90, 180, 270} {
|
||||
var lines []Line
|
||||
for _, o := range orientKeys(tlOrient) {
|
||||
lines = append(lines, tlOrient[o].toLinesOrient()...)
|
||||
}
|
||||
return lines
|
||||
}
|
||||
|
||||
// toLinesOrient returns the text and positions in `tl` as a slice of Line.
|
||||
// NOTE: Caller must sort the text list top-to-bottom, left-to-write before calling this function.
|
||||
// NOTE: This function only works on text lists where all text is the same orientation so it should
|
||||
// only be called from toLines.
|
||||
// Caller must sort the text list top-to-bottom, left-to-right (for orientation adjusted so
|
||||
// that text is horizontal) before calling this function.
|
||||
func (tl TextList) toLinesOrient() []Line {
|
||||
tl.printTexts("toLines: before")
|
||||
if len(tl) == 0 {
|
||||
@ -827,13 +858,14 @@ func (tl TextList) toLinesOrient() []Line {
|
||||
|
||||
averageCharWidth := exponAve{}
|
||||
wordSpacing := exponAve{}
|
||||
lastEndX := 0.0 // tl[i-1].End.X
|
||||
lastEndX := 0.0 // lastEndX is tl[i-1].OrientedEnd.X
|
||||
|
||||
for _, t := range tl {
|
||||
if t.OrientedStart.Y < y {
|
||||
if len(words) > 0 {
|
||||
line := newLine(y, x, words)
|
||||
if averageCharWidth.running {
|
||||
line = combineDiacritics(line, averageCharWidth.ave)
|
||||
line = removeDuplicates(line, averageCharWidth.ave)
|
||||
}
|
||||
lines = append(lines, line)
|
||||
@ -893,6 +925,24 @@ func (tl TextList) toLinesOrient() []Line {
|
||||
return lines
|
||||
}
|
||||
|
||||
// orientKeys returns the keys of `tlOrient` as a sorted slice.
|
||||
func orientKeys(tlOrient map[int]TextList) []int {
|
||||
keys := []int{}
|
||||
for k := range tlOrient {
|
||||
keys = append(keys, k)
|
||||
}
|
||||
sort.Ints(keys)
|
||||
return keys
|
||||
}
|
||||
|
||||
// min returns whichever of `a` and `b` is smaller.
// `b` is returned whenever `a < b` does not hold, which includes the case where either is NaN.
func min(a, b float64) float64 {
	lesser := b
	if a < b {
		lesser = a
	}
	return lesser
}
|
||||
|
||||
// exponAve implements an exponential average.
|
||||
type exponAve struct {
|
||||
ave float64 // Current average value.
|
||||
@ -974,6 +1024,144 @@ func removeDuplicates(line Line, charWidth float64) Line {
|
||||
return Line{Y: line.Y, Dx: dxList, Text: strings.Join(words, ""), Words: words}
|
||||
}
|
||||
|
||||
// combineDiacritics returns `line` with diacritics close to characters combined with the characters.
|
||||
// `charWidth` is the average character width for the line.
|
||||
// We have to do this because PDF can render diacritics separately to the characters they attach to
|
||||
// in extracted text.
|
||||
func combineDiacritics(line Line, charWidth float64) Line {
|
||||
if len(line.Dx) == 0 {
|
||||
return line
|
||||
}
|
||||
|
||||
tol := charWidth * 0.2
|
||||
common.Log.Trace("combineDiacritics: charWidth=%.2f tol=%.2f", charWidth, tol)
|
||||
|
||||
words := []string{}
|
||||
dxList := []float64{}
|
||||
w := line.Words[0]
|
||||
w, c := countDiacritic(w)
|
||||
delta := 0.0
|
||||
dx0 := 0.0
|
||||
parts := []string{w}
|
||||
numChars := c
|
||||
|
||||
for i := 0; i < len(line.Dx); i++ {
|
||||
w = line.Words[i+1]
|
||||
w, c := countDiacritic(w)
|
||||
dx := line.Dx[i]
|
||||
if numChars+c <= 1 && delta+dx <= tol {
|
||||
if len(parts) == 0 {
|
||||
dx0 = dx
|
||||
} else {
|
||||
delta += dx
|
||||
}
|
||||
parts = append(parts, w)
|
||||
numChars += c
|
||||
} else {
|
||||
if len(parts) > 0 {
|
||||
if len(words) > 0 {
|
||||
dxList = append(dxList, dx0)
|
||||
}
|
||||
words = append(words, combine(parts))
|
||||
}
|
||||
parts = []string{w}
|
||||
numChars = c
|
||||
dx0 = dx
|
||||
delta = 0.0
|
||||
}
|
||||
}
|
||||
if len(parts) > 0 {
|
||||
if len(words) > 0 {
|
||||
dxList = append(dxList, dx0)
|
||||
}
|
||||
words = append(words, combine(parts))
|
||||
}
|
||||
|
||||
if len(words) != len(dxList)+1 {
|
||||
common.Log.Error("Inconsistent: \nwords=%d %q\ndxList=%d %.2f",
|
||||
len(words), words, len(dxList), dxList)
|
||||
return line
|
||||
}
|
||||
return Line{Y: line.Y, Dx: dxList, Text: strings.Join(words, ""), Words: words}
|
||||
}
|
||||
|
||||
// combine combines any diacritics in `parts` with the single non-diacritic character in `parts`.
|
||||
func combine(parts []string) string {
|
||||
if len(parts) == 1 {
|
||||
// Must be a non-diacritic.
|
||||
return parts[0]
|
||||
}
|
||||
|
||||
// We need to put the diacritics before the non-diacritic for NFKC normalization to work.
|
||||
diacritic := map[string]bool{}
|
||||
for _, w := range parts {
|
||||
r := []rune(w)[0]
|
||||
diacritic[w] = unicode.Is(unicode.Mn, r) || unicode.Is(unicode.Sk, r)
|
||||
}
|
||||
sort.SliceStable(parts, func(i, j int) bool { return !diacritic[parts[i]] && diacritic[parts[j]] })
|
||||
|
||||
// Construct the NFKC-normalized concatenation of the diacritics and the non-diacritic.
|
||||
for i, w := range parts {
|
||||
parts[i] = strings.TrimSpace(norm.NFKC.String(w))
|
||||
}
|
||||
return strings.Join(parts, "")
|
||||
}
|
||||
|
||||
// countDiacritic returns the combining diacritic version of `w` (usually itself) and the number of
|
||||
// non-diacritics in `w` (0 or 1)
|
||||
func countDiacritic(w string) (string, int) {
|
||||
runes := []rune(w)
|
||||
if len(runes) != 1 {
|
||||
return w, 1
|
||||
}
|
||||
r := runes[0]
|
||||
c := 1
|
||||
if unicode.Is(unicode.Mn, r) || unicode.Is(unicode.Sk, r) {
|
||||
c = 0
|
||||
}
|
||||
if w2, ok := diacritics[r]; ok {
|
||||
c = 0
|
||||
w = w2
|
||||
}
|
||||
return w, c
|
||||
}
|
||||
|
||||
// diacritics maps diacritic characters that are not classified as unicode.Mn or unicode.Sk to the
// corresponding unicode.Mn or unicode.Sk characters. This map was copied from PdfBox.
// Entries are grouped by the combining mark they map to.
var diacritics = map[rune]string{
	// U+0300 combining grave accent.
	0x0060: "\u0300",
	0x02CB: "\u0300",

	// U+0301 combining acute accent.
	0x0027: "\u0301",
	0x02B9: "\u0301",
	0x02CA: "\u0301",

	// U+0302 combining circumflex accent.
	0x005E: "\u0302",
	0x02C6: "\u0302",

	// U+0303 combining tilde, U+0304 combining macron, U+030A combining ring above.
	0x007E: "\u0303",
	0x02C9: "\u0304",
	0x00B0: "\u030A",

	// U+030B double acute, U+030C caron, U+030D vertical line above,
	// U+030E double vertical line above.
	0x02BA: "\u030B",
	0x02C7: "\u030C",
	0x02C8: "\u030D",
	0x0022: "\u030E",

	// U+0312 combining turned comma above.
	0x02BB: "\u0312",

	// U+0313 combining comma above.
	0x02BC: "\u0313",
	0x0486: "\u0313",
	0x055A: "\u0313",

	// U+0314 combining reversed comma above.
	0x02BD: "\u0314",
	0x0485: "\u0314",
	0x0559: "\u0314",

	// Marks placed below the base character.
	0x02D4: "\u031D", // Up tack below.
	0x02D5: "\u031E", // Down tack below.
	0x02D6: "\u031F", // Plus sign below.
	0x02D7: "\u0320", // Minus sign below.
	0x02B2: "\u0321", // Palatalized hook below.
	0x02CC: "\u0329", // Vertical line below.
	0x02B7: "\u032B", // Inverted double arch below.
	0x02CD: "\u0331", // Macron below.
	0x005F: "\u0332", // Low line.
	0x204E: "\u0359", // Asterisk below.
}
|
||||
|
||||
// getCurrentFont returns the font on top of the font stack, or DefaultFont if the font stack is
|
||||
// empty.
|
||||
func (to *textObject) getCurrentFont() *model.PdfFont {
|
||||
|
@ -16,6 +16,7 @@ import (
|
||||
|
||||
"github.com/unidoc/unidoc/common"
|
||||
"github.com/unidoc/unidoc/pdf/model"
|
||||
"golang.org/x/text/unicode/norm"
|
||||
)
|
||||
|
||||
// NOTE: We do a best effort at finding the PDF file because we don't keep PDF test files in this repo so you
|
||||
@ -196,8 +197,10 @@ var fileExtractionTests = []struct {
|
||||
},
|
||||
{filename: "Ito_Formula.pdf",
|
||||
expectedPageText: map[int][]string{
|
||||
// 1: []string{"In the Itô stochastic calculus"},
|
||||
1: []string{"In standard, non-stochastic calculus, one computes a derivative"},
|
||||
1: []string{
|
||||
"In the Itô stochastic calculus",
|
||||
"In standard, non-stochastic calculus, one computes a derivative"},
|
||||
2: []string{"Financial Economics Itô’s Formula"},
|
||||
},
|
||||
},
|
||||
{filename: "circ2.pdf",
|
||||
@ -215,6 +218,11 @@ var fileExtractionTests = []struct {
|
||||
1: []string{"entropy of a system of n identical resonators in a stationary radiation field"},
|
||||
},
|
||||
},
|
||||
{filename: "thanh.pdf",
|
||||
expectedPageText: map[int][]string{
|
||||
1: []string{"Hàn Thé̂ Thành"},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
// testExtractFile tests the ExtractTextWithStats text extractor on `filename` and compares the extracted
|
||||
@ -244,6 +252,7 @@ func testExtractFile(t *testing.T, filename string, expectedPageText map[int][]s
|
||||
if !ok {
|
||||
t.Fatalf("%q doesn't have page %d", filename, pageNum)
|
||||
}
|
||||
actualText = norm.NFKC.String(actualText)
|
||||
if !containsSentences(t, expectedSentences, actualText) {
|
||||
t.Fatalf("Text mismatch filepath=%q page=%d", filepath, pageNum)
|
||||
}
|
||||
@ -291,8 +300,9 @@ func extractPageTexts(t *testing.T, filename string) (int, map[int]string) {
|
||||
// containsSentences returns true if all strings `expectedSentences` are contained in `actualText`.
|
||||
func containsSentences(t *testing.T, expectedSentences []string, actualText string) bool {
|
||||
for _, e := range expectedSentences {
|
||||
e = norm.NFKC.String(e)
|
||||
if !strings.Contains(actualText, e) {
|
||||
t.Errorf("No match for %#q", e)
|
||||
t.Errorf("No match for %q", e)
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
@ -56,7 +56,7 @@ func (font PdfFont) GetFontDescriptor() (*PdfFontDescriptor, error) {
|
||||
case *pdfCIDFontType2:
|
||||
return t.fontDescriptor, nil
|
||||
}
|
||||
common.Log.Debug("ERROR: Cannot get font descriptor for font type %t (%s)", font, font)
|
||||
common.Log.Debug("ERROR: Cannot get font descriptor for font type %T (%s)", font, font)
|
||||
return nil, errors.New("font descriptor not found")
|
||||
}
|
||||
|
||||
@ -67,7 +67,6 @@ func (font PdfFont) String() string {
|
||||
enc = font.context.Encoder().String()
|
||||
}
|
||||
return fmt.Sprintf("FONT{%T %s %s}", font.context, font.baseFields().coreString(), enc)
|
||||
|
||||
}
|
||||
|
||||
// BaseFont returns the font's "BaseFont" field.
|
||||
@ -214,7 +213,7 @@ func NewStandard14FontWithEncoding(basefont Standard14Font, alphabet map[rune]in
|
||||
return &PdfFont{context: &std}, encoder, nil
|
||||
}
|
||||
|
||||
// GetAlphabet returns a map of the runes in `text`.
|
||||
// GetAlphabet returns a map of the runes in `text` and their frequencies.
|
||||
func GetAlphabet(text string) map[rune]int {
|
||||
alphabet := map[rune]int{}
|
||||
for _, r := range text {
|
||||
@ -477,7 +476,7 @@ func (font PdfFont) ToPdfObject() core.PdfObject {
|
||||
if t := font.actualFont(); t != nil {
|
||||
return t.ToPdfObject()
|
||||
}
|
||||
common.Log.Debug("ERROR: ToPdfObject Not implemented for font type=%#T. Returning null object",
|
||||
common.Log.Debug("ERROR: ToPdfObject Not implemented for font type=%#T. Returning null object.",
|
||||
font.context)
|
||||
return core.MakeNull()
|
||||
}
|
||||
@ -560,7 +559,7 @@ func (font PdfFont) actualFont() fonts.Font {
|
||||
case *pdfCIDFontType2:
|
||||
return t
|
||||
default:
|
||||
common.Log.Debug("ERROR: actualFont. Unknown font type %t. font=%s", t, font)
|
||||
common.Log.Debug("ERROR: actualFont. Unknown font type %T. font=%s", t, font)
|
||||
return nil
|
||||
}
|
||||
}
|
||||
@ -581,7 +580,7 @@ func (font PdfFont) baseFields() *fontCommon {
|
||||
case *pdfCIDFontType2:
|
||||
return t.baseFields()
|
||||
default:
|
||||
common.Log.Debug("ERROR: base. Unknown font type %t. font=%s", t, font.String())
|
||||
common.Log.Debug("ERROR: base. Unknown font type %T. font=%s", t, font.String())
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
@ -131,7 +131,6 @@ func (font pdfFontSimple) GetGlyphCharMetrics(glyph string) (fonts.CharMetrics,
|
||||
// returned to indicate whether or not the entry was found in the glyph to charcode mapping.
|
||||
func (font pdfFontSimple) GetCharMetrics(code uint16) (fonts.CharMetrics, bool) {
|
||||
if width, ok := font.charWidths[code]; ok {
|
||||
common.Log.Debug("GetCharMetrics 1: code=%d width=%.1f font=%s", code, width, font)
|
||||
return fonts.CharMetrics{Wx: width}, true
|
||||
}
|
||||
if isBuiltin(Standard14Font(font.basefont)) {
|
||||
@ -142,12 +141,11 @@ func (font pdfFontSimple) GetCharMetrics(code uint16) (fonts.CharMetrics, bool)
|
||||
if glyph, ok := font.encoder.CharcodeToGlyph(code); ok {
|
||||
if metrics, ok := font.fontMetrics[glyph]; ok {
|
||||
font.charWidths[code] = metrics.Wx
|
||||
common.Log.Debug("GetCharMetrics 2: code=%d glyph=%q width=%.1f", code, glyph, metrics.Wx)
|
||||
return metrics, true
|
||||
}
|
||||
}
|
||||
}
|
||||
common.Log.Debug("GetCharMetrics 3: code=%d font=%s", code, font)
|
||||
common.Log.Debug("GetCharMetrics: No match for code=%d font=%s", code, font)
|
||||
return fonts.CharMetrics{}, false
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user