From f373881a48afd519e46bf20e1c8038f28aad7ecf Mon Sep 17 00:00:00 2001 From: Peter Williams Date: Tue, 27 Nov 2018 13:37:12 +1100 Subject: [PATCH 1/3] Removed some unused struct fields. --- pdf/extractor/text.go | 119 +++++++++++++++++++++------------------- pdf/model/font.go | 13 ++--- pdf/model/fonts/font.go | 2 +- 3 files changed, 69 insertions(+), 65 deletions(-) diff --git a/pdf/extractor/text.go b/pdf/extractor/text.go index f23a59a2..c6e99440 100644 --- a/pdf/extractor/text.go +++ b/pdf/extractor/text.go @@ -49,7 +49,7 @@ func (e *Extractor) ExtractXYText() (*TextList, int, int, error) { cstreamParser := contentstream.NewContentStreamParser(e.contents) operations, err := cstreamParser.Parse() if err != nil { - common.Log.Debug("ExtractXYText: parse failed. err=%v", err) + common.Log.Debug("ERROR: ExtractXYText parse failed. err=%v", err) return textList, state.numChars, state.numMisses, err } @@ -309,29 +309,27 @@ func (to *textObject) nextLine() { // setTextMatrix "Tm". // Set the text matrix, Tm, and the text line matrix, Tlm to the Matrix specified by the 6 numbers -// in `f` (page 250) +// in `f` (page 250). func (to *textObject) setTextMatrix(f []float64) { a, b, c, d, tx, ty := f[0], f[1], f[2], f[3], f[4], f[5] to.Tm = contentstream.NewMatrix(a, b, c, d, tx, ty) - to.Tlm = contentstream.NewMatrix(a, b, c, d, tx, ty) - common.Log.Debug("setTextMatrix: Tm=%s", to.Tm) + to.Tlm = to.Tm } -// showText "Tj" Show a text string. +// showText "Tj". Show a text string. func (to *textObject) showText(charcodes []byte) error { return to.renderText(charcodes) } -// showTextAdjusted "TJ" Show text with adjustable spacing. +// showTextAdjusted "TJ". Show text with adjustable spacing. func (to *textObject) showTextAdjusted(args *core.PdfObjectArray) error { vertical := false for _, o := range args.Elements() { switch o.(type) { case *core.PdfObjectFloat, *core.PdfObjectInteger: - // The following is supposed to be equivalent to the existing Unidoc implementation. x, err := core.GetNumberAsFloat(o) if err != nil { - common.Log.Debug("showTextAdjusted: Bad numerical arg. o=%s args=%+v", o, args) + common.Log.Debug("ERROR: showTextAdjusted. Bad numerical arg. o=%s args=%+v", o, args) return err } dx, dy := -x*0.001*to.State.Tfs, 0.0 @@ -340,23 +338,23 @@ func (to *textObject) showTextAdjusted(args *core.PdfObjectArray) error { } td := translationMatrix(Point{X: dx, Y: dy}) to.Tm = td.Mult(to.Tm) - common.Log.Debug("showTextAdjusted: dx,dy=%3f,%.3f Tm=%s", dx, dy, to.Tm) + common.Log.Trace("showTextAdjusted: dx,dy=%3f,%.3f Tm=%s", dx, dy, to.Tm) case *core.PdfObjectString: charcodes, ok := core.GetStringBytes(o) if !ok { - common.Log.Debug("showTextAdjusted: Bad string arg. o=%s args=%+v", o, args) + common.Log.Trace("showTextAdjusted: Bad string arg. o=%s args=%+v", o, args) return core.ErrTypeError } to.renderText(charcodes) default: - common.Log.Debug("showTextAdjusted. Unexpected type (%T) args=%+v", o, args) + common.Log.Debug("ERROR: showTextAdjusted. Unexpected type (%T) args=%+v", o, args) return core.ErrTypeError } } return nil } -// setTextLeading "TL" Set text leading. +// setTextLeading "TL". Set text leading. func (to *textObject) setTextLeading(y float64) { if to == nil { return @@ -364,7 +362,7 @@ func (to *textObject) setTextLeading(y float64) { to.State.Tl = y } -// setCharSpacing "Tc" Set character spacing. +// setCharSpacing "Tc". Set character spacing. func (to *textObject) setCharSpacing(x float64) { if to == nil { return @@ -372,7 +370,7 @@ func (to *textObject) setCharSpacing(x float64) { to.State.Tc = x } -// setFont "Tf" Set font. +// setFont "Tf". Set font. func (to *textObject) setFont(name string, size float64) error { if to == nil { return nil @@ -395,7 +393,7 @@ func (to *textObject) setFont(name string, size float64) error { return nil } -// setTextRenderMode "Tr" Set text rendering mode. +// setTextRenderMode "Tr". Set text rendering mode. func (to *textObject) setTextRenderMode(mode int) { if to == nil { return @@ -403,7 +401,7 @@ func (to *textObject) setTextRenderMode(mode int) { to.State.Tmode = RenderMode(mode) } -// setTextRise "Ts" Set text rise. +// setTextRise "Ts". Set text rise. func (to *textObject) setTextRise(y float64) { if to == nil { return @@ -411,7 +409,7 @@ func (to *textObject) setTextRise(y float64) { to.State.Trise = y } -// setWordSpacing "Tw" Set word spacing. +// setWordSpacing "Tw". Set word spacing. func (to *textObject) setWordSpacing(y float64) { if to == nil { return @@ -419,7 +417,7 @@ func (to *textObject) setWordSpacing(y float64) { to.State.Tw = y } -// setHorizScaling "Tz" Set horizontal scaling. +// setHorizScaling "Tz". Set horizontal scaling. func (to *textObject) setHorizScaling(y float64) { if to == nil { return @@ -573,9 +571,6 @@ type textObject struct { Tm contentstream.Matrix // Text matrix. For the character pointer. Tlm contentstream.Matrix // Text line matrix. For the start of line pointer. Texts []XYText // Text gets written here. - - // These fields are used to implement existing UniDoc behaviour. - xPos, yPos float64 } // newTextState returns a default textState. @@ -625,8 +620,7 @@ func (to *textObject) renderText(data []byte) error { 0, tfs, 0, state.Trise) - common.Log.Debug("==========================================") - common.Log.Debug("%d codes=%+v runes=%q", len(charcodes), charcodes, runes) + common.Log.Trace("renderText: %d codes=%+v runes=%q", len(charcodes), charcodes, runes) for i, r := range runes { @@ -653,14 +647,13 @@ func (to *textObject) renderText(data []byte) error { // c is the character size in unscaled text units. c := Point{X: m.Wx * glyphTextRatio, Y: m.Wy * glyphTextRatio} + // t0 is the end of this character. // t is the displacement of the text cursor when the character is rendered. - // float tx = displacementX * fontSize * horizontalScaling; - // w = 0 t0 := Point{X: (c.X*tfs + w) * th} t := Point{X: (c.X*tfs + state.Tc + w) * th} // td, td0 are t, t0 in matrix form. - // td0 is where this char ends. td is where the next char stats. + // td0 is where this character ends. td is where the next character starts. td0 := translationMatrix(t0) td := translationMatrix(t) @@ -668,19 +661,16 @@ func (to *textObject) renderText(data []byte) error { common.Log.Trace("tfs=%.3f th=%.3f Tc=%.3f w=%.3f (Tw=%.3f)", tfs, th, state.Tc, w, state.Tw) common.Log.Trace("m=%s c=%+v t0=%+v td0=%s trm0=%s", m, c, t0, td0, td0.Mult(to.Tm).Mult(to.gs.CTM)) - nextTm := td.Mult(to.Tm) - common.Log.Trace("nextTm=%s", nextTm) - - xyt := newXYText( + xyt := to.newXYText( string(r), trm, translation(td0.Mult(to.Tm).Mult(to.gs.CTM)), spaceWidth*trm.ScalingFactorX()) - common.Log.Trace("i=%d code=%d, xyt=%s", i, code, xyt) + common.Log.Trace("i=%d code=%d xyt=%s trm=%s", i, code, xyt, trm) to.Texts = append(to.Texts, xyt) // update the text matrix by the displacement of the text location. - to.Tm = nextTm + to.Tm = td.Mult(to.Tm) common.Log.Trace("to.Tm=%s", to.Tm) } @@ -711,36 +701,36 @@ func (to *textObject) moveTo(tx, ty float64) { } // XYText represents text drawn on a page and its position in device coordinates. +// All dimensions are in device coordinates. type XYText struct { - Trm contentstream.Matrix - OrientedStart Point // Left of text in orientation where text is horizontal. - OrientedEnd Point // Right of text in orientation where text is horizontal. - ColorStroking model.PdfColor // Colour that text is stroked with, if any. - ColorNonStroking model.PdfColor // Colour that text is filled with, if any. - Orient int - Text string - SpaceWidth float64 - Font string - FontSize float64 + Text string // The text. + Orient int // The text orientation. + OrientedStart Point // Left of text in orientation where text is horizontal. + OrientedEnd Point // Right of text in orientation where text is horizontal. + SpaceWidth float64 // Best guess at the width of a space in the font the text was rendered with. + count int64 // To help with reading debug logs. } -func newXYText(text string, trm contentstream.Matrix, end Point, spaceWidth float64) XYText { +// newXYText returns an XYText for text `text` rendered with text rendering matrix `trm` and end +// of character device coordinates `end`. `spaceWidth` is our best guess at the width of a space in +// the font the text is rendered in device coordinates. +func (to *textObject) newXYText(text string, trm contentstream.Matrix, end Point, spaceWidth float64) XYText { + to.e.textCount++ theta := trm.Angle() return XYText{ Text: text, - Trm: trm, + Orient: theta, OrientedStart: translation(trm).Rotate(theta), OrientedEnd: end.Rotate(theta), - Orient: theta, SpaceWidth: spaceWidth, + count: to.e.textCount, } } // String returns a string describing `t`. func (t XYText) String() string { - return fmt.Sprintf("XYText{%s %.1f |%d| [%.3f,%.3f] %q}", - t.Trm.String(), t.Width(), t.Orient, t.OrientedStart.X, t.OrientedStart.Y, - truncate(t.Text, 100)) + return fmt.Sprintf("XYText{@%03d [%.3f,%.3f] %.1f |%d| %q}", + t.count, t.OrientedStart.X, t.OrientedStart.Y, t.Width(), t.Orient, truncate(t.Text, 100)) } // Width returns the width of `t`.Text in the text direction. @@ -752,12 +742,12 @@ func (t XYText) Width() float64 { type TextList []XYText // Length returns the number of elements in `tl`. -func (tl *TextList) Length() int { - return len(*tl) +func (tl TextList) Length() int { + return len(tl) } // ToText returns the contents of `tl` as a single string. -func (tl *TextList) ToText() string { +func (tl TextList) ToText() string { tl.printTexts("ToText: before sorting") tl.SortPosition() @@ -790,26 +780,31 @@ type Line struct { Y float64 // y position of line. Dx []float64 // x distance between successive words in line. Text string // text in the line. - Words []string // words in the line + Words []string // words in the line. } // toLines returns the text and positions in `tl` as a slice of Line. -// NOTE: Caller must sort the text list by top-to-bottom, left-to-write (for orientation adjusted so +// NOTE: Caller must sort the text list top-to-bottom, left-to-write (for orientation adjusted so // that text is horizontal) before calling this function. func (tl TextList) toLines() []Line { + // We divide `tl` into slices which contain texts with the same orientation, extract the lines + // for each orientation then return the concatention of these lines sorted by orientation. tlOrient := map[int]TextList{} for _, t := range tl { tlOrient[t.Orient] = append(tlOrient[t.Orient], t) } lines := []Line{} - for _, o := range []int{0, 90, 180, 270} { + for _, o := range orientKeys(tlOrient) { lines = append(lines, tlOrient[o].toLinesOrient()...) } return lines } // toLinesOrient returns the text and positions in `tl` as a slice of Line. -// NOTE: Caller must sort the text list top-to-bottom, left-to-write before calling this function. +// NOTE: This function only works on text lists where all text is the same orientation so it should +// only be called from toLines. +// Caller must sort the text list top-to-bottom, left-to-write (for orientation adjusted so +// that text is horizontal) before calling this function. func (tl TextList) toLinesOrient() []Line { tl.printTexts("toLines: before") if len(tl) == 0 { @@ -824,7 +819,7 @@ func (tl TextList) toLinesOrient() []Line { averageCharWidth := ExponAve{} wordSpacing := ExponAve{} - lastEndX := 0.0 // tl[i-1].End.X + lastEndX := 0.0 // lastEndX is tl[i-1].OrientedEnd.X for _, t := range tl { if t.OrientedStart.Y < y { @@ -865,7 +860,7 @@ func (tl TextList) toLinesOrient() []Line { common.Log.Trace("width=%.2f delta=%.2f deltaSpace=%.2g deltaCharWidth=%.2g", t.Width(), min(deltaSpace, deltaCharWidth), deltaSpace, deltaCharWidth) common.Log.Trace("%+q [%.1f, %.1f] lastEndX=%.2f nextWordX=%.2f (%.2f) isSpace=%t", - t.Text, t.OrientedStart.X, t.OrientedStart.Y, lastEndX, nextWordX, + t.Text, t.OrientedStart.X, t.OrientedStart.Y, lastEndX, nextWordX, nextWordX-t.OrientedStart.X, isSpace) if isSpace { @@ -890,6 +885,16 @@ func (tl TextList) toLinesOrient() []Line { return lines } +// orientKeys returns the keys of `tlOrient` as a sorted slice. +func orientKeys(tlOrient map[int]TextList) []int { + keys := []int{} + for k := range tlOrient { + keys = append(keys, k) + } + sort.Ints(keys) + return keys +} + // min returns the lesser of `a` and `b`. func min(a, b float64) float64 { if a < b { diff --git a/pdf/model/font.go b/pdf/model/font.go index 6c6e51d0..d8a745e7 100644 --- a/pdf/model/font.go +++ b/pdf/model/font.go @@ -27,7 +27,7 @@ type Font interface { SetEncoder(encoder textencoding.TextEncoder) GetGlyphCharMetrics(glyph string) (fonts.CharMetrics, bool) GetCharMetrics(code uint16) (fonts.CharMetrics, bool) - GetAverageCharWidth() float64 + GetAverageCharWidth() float64 // XXX(peterwilliams97) Not used. Remove. ToPdfObject() core.PdfObject } @@ -52,7 +52,7 @@ func (font PdfFont) GetFontDescriptor() (*PdfFontDescriptor, error) { case *pdfCIDFontType2: return t.fontDescriptor, nil } - common.Log.Debug("ERROR: Cannot get font descriptor for font type %t (%s)", font, font) + common.Log.Debug("ERROR: Cannot get font descriptor for font type %T (%s)", font, font) return nil, errors.New("font descriptor not found") } @@ -63,7 +63,6 @@ func (font PdfFont) String() string { enc = font.context.Encoder().String() } return fmt.Sprintf("FONT{%T %s %s}", font.context, font.baseFields().coreString(), enc) - } // BaseFont returns the font's "BaseFont" field. @@ -210,7 +209,7 @@ func NewStandard14FontWithEncoding(basefont Standard14Font, alphabet map[rune]in return &PdfFont{context: &std}, encoder, nil } -// GetAlphabet returns a map of the runes in `text`. +// GetAlphabet returns a map of the runes in `text` and their frequencies. func GetAlphabet(text string) map[rune]int { alphabet := map[rune]int{} for _, r := range text { @@ -473,7 +472,7 @@ func (font PdfFont) ToPdfObject() core.PdfObject { if t := font.actualFont(); t != nil { return t.ToPdfObject() } - common.Log.Debug("ERROR: ToPdfObject Not implemented for font type=%#T. Returning null object", + common.Log.Debug("ERROR: ToPdfObject Not implemented for font type=%#T. Returning null object.", font.context) return core.MakeNull() } @@ -576,7 +575,7 @@ func (font PdfFont) actualFont() Font { case *pdfCIDFontType2: return t default: - common.Log.Debug("ERROR: actualFont. Unknown font type %t. font=%s", t, font) + common.Log.Debug("ERROR: actualFont. Unknown font type %T. font=%s", t, font) return nil } } @@ -597,7 +596,7 @@ func (font PdfFont) baseFields() *fontCommon { case *pdfCIDFontType2: return t.baseFields() default: - common.Log.Debug("ERROR: base. Unknown font type %t. font=%s", t, font.String()) + common.Log.Debug("ERROR: base. Unknown font type %T. font=%s", t, font.String()) return nil } } diff --git a/pdf/model/fonts/font.go b/pdf/model/fonts/font.go index 34755e7b..8b7b3cf5 100644 --- a/pdf/model/fonts/font.go +++ b/pdf/model/fonts/font.go @@ -18,7 +18,7 @@ type Font interface { Encoder() textencoding.TextEncoder SetEncoder(encoder textencoding.TextEncoder) GetGlyphCharMetrics(glyph string) (CharMetrics, bool) - GetAverageCharWidth() float64 + GetAverageCharWidth() float64 // XXX(peterwilliams97) Not used. Remove. ToPdfObject() core.PdfObject } From 36a1148962ceaad33c523e70b9192d6f4e733619 Mon Sep 17 00:00:00 2001 From: Peter Williams Date: Wed, 28 Nov 2018 18:06:03 +1100 Subject: [PATCH 2/3] Combine diacritics in text extraction. --- pdf/extractor/extractor.go | 3 + pdf/extractor/text.go | 184 ++++++++++++++++++++++++++++++++++++- pdf/extractor/text_test.go | 16 +++- pdf/model/font_simple.go | 4 +- 4 files changed, 198 insertions(+), 9 deletions(-) diff --git a/pdf/extractor/extractor.go b/pdf/extractor/extractor.go index 12c2488e..dad88a7c 100644 --- a/pdf/extractor/extractor.go +++ b/pdf/extractor/extractor.go @@ -18,6 +18,9 @@ type Extractor struct { // accessCount is used to set fontEntry.access to an incrementing number. accessCount int64 + + // textCount is an incrementing number used to identify XYTest objects. + textCount int64 } // New returns an Extractor instance for extracting content from the input PDF page. diff --git a/pdf/extractor/text.go b/pdf/extractor/text.go index c6e99440..d6a47fc9 100644 --- a/pdf/extractor/text.go +++ b/pdf/extractor/text.go @@ -13,11 +13,13 @@ import ( "runtime" "sort" "strings" + "unicode" "github.com/unidoc/unidoc/common" "github.com/unidoc/unidoc/pdf/contentstream" "github.com/unidoc/unidoc/pdf/core" "github.com/unidoc/unidoc/pdf/model" + "golang.org/x/text/unicode/norm" ) // ExtractText processes and extracts all text data in content streams and returns as a string. @@ -601,6 +603,9 @@ func (to *textObject) renderText(data []byte) error { charcodes := font.BytesToCharcodes(data) runes, numChars, numMisses := font.CharcodesToUnicode(charcodes) + if numMisses > 0 { + common.Log.Debug("renderText: numChars=%d numMisses=%d", numChars, numMisses) + } to.State.numChars += numChars to.State.numMisses += numMisses @@ -624,6 +629,11 @@ func (to *textObject) renderText(data []byte) error { for i, r := range runes { + // XXX(peterwilliams97) Need to find and fix cases where this happens. + if r == "\x00" { + continue + } + code := charcodes[i] // The location of the text on the page in device coordinates is given by trm, the text // rendering matrix. @@ -665,6 +675,7 @@ func (to *textObject) renderText(data []byte) error { string(r), trm, translation(td0.Mult(to.Tm).Mult(to.gs.CTM)), + 1.0*trm.ScalingFactorY(), spaceWidth*trm.ScalingFactorX()) common.Log.Trace("i=%d code=%d xyt=%s trm=%s", i, code, xyt, trm) to.Texts = append(to.Texts, xyt) @@ -707,6 +718,7 @@ type XYText struct { Orient int // The text orientation. OrientedStart Point // Left of text in orientation where text is horizontal. OrientedEnd Point // Right of text in orientation where text is horizontal. + Height float64 // Text height. SpaceWidth float64 // Best guess at the width of a space in the font the text was rendered with. count int64 // To help with reading debug logs. } @@ -714,14 +726,22 @@ type XYText struct { // newXYText returns an XYText for text `text` rendered with text rendering matrix `trm` and end // of character device coordinates `end`. `spaceWidth` is our best guess at the width of a space in // the font the text is rendered in device coordinates. -func (to *textObject) newXYText(text string, trm contentstream.Matrix, end Point, spaceWidth float64) XYText { +func (to *textObject) newXYText(text string, trm contentstream.Matrix, end Point, + height, spaceWidth float64) XYText { to.e.textCount++ theta := trm.Angle() + if theta%180 == 0 { + height = trm.ScalingFactorY() + } else { + height = trm.ScalingFactorX() + } + return XYText{ Text: text, Orient: theta, OrientedStart: translation(trm).Rotate(theta), OrientedEnd: end.Rotate(theta), + Height: height, SpaceWidth: spaceWidth, count: to.e.textCount, } @@ -729,7 +749,7 @@ func (to *textObject) newXYText(text string, trm contentstream.Matrix, end Point // String returns a string describing `t`. func (t XYText) String() string { - return fmt.Sprintf("XYText{@%03d [%.3f,%.3f] %.1f |%d| %q}", + return fmt.Sprintf("XYText{@%03d [%.3f,%.3f] %.1f %d° %q}", t.count, t.OrientedStart.X, t.OrientedStart.Y, t.Width(), t.Orient, truncate(t.Text, 100)) } @@ -746,9 +766,21 @@ func (tl TextList) Length() int { return len(tl) } +// height returns the max height of the elements in `tl`. +func (tl TextList) height() float64 { + fontHeight := 0.0 + for _, t := range tl { + if t.Height > fontHeight { + fontHeight = t.Height + } + } + return fontHeight +} + // ToText returns the contents of `tl` as a single string. func (tl TextList) ToText() string { tl.printTexts("ToText: before sorting") + tl.SortPosition() lines := tl.toLines() @@ -763,12 +795,16 @@ func (tl TextList) ToText() string { // Sorting is by orientation then top to bottom, left to right when page is orientated so that text // is horizontal. func (tl *TextList) SortPosition() { + fontHeight := tl.height() + // We sort with a y tolerance to allow for subscripts, diacritics etc. + tol := min(fontHeight*0.2, 5.0) + common.Log.Trace("SortPosition: fontHeight=%.1f tol=%.1f", fontHeight, tol) sort.SliceStable(*tl, func(i, j int) bool { ti, tj := (*tl)[i], (*tl)[j] if ti.Orient != tj.Orient { return ti.Orient < tj.Orient } - if ti.OrientedStart.Y != tj.OrientedStart.Y { + if math.Abs(ti.OrientedStart.Y-tj.OrientedStart.Y) > tol { return ti.OrientedStart.Y > tj.OrientedStart.Y } return ti.OrientedStart.X < tj.OrientedStart.X @@ -826,6 +862,7 @@ func (tl TextList) toLinesOrient() []Line { if len(words) > 0 { line := newLine(y, x, words) if averageCharWidth.running { + line = combineDiacritics(line, averageCharWidth.ave) line = removeDuplicates(line, averageCharWidth.ave) } lines = append(lines, line) @@ -978,6 +1015,147 @@ func removeDuplicates(line Line, charWidth float64) Line { return Line{Y: line.Y, Dx: dxList, Text: strings.Join(words, ""), Words: words} } +// combineDiacritics returns `line` with diacritics close to characters combined with the characters. +// `charWidth` is the average character width for the line. +// We have to do this because PDF can render diacritics separately to the characters they attach to +// in extracted text. +func combineDiacritics(line Line, charWidth float64) Line { + if len(line.Dx) == 0 { + return line + } + + tol := charWidth * 0.2 + common.Log.Trace("combineDiacritics: charWidth=%.2f tol=%.2f", charWidth, tol) + + words := []string{} + dxList := []float64{} + w := line.Words[0] + w, c := countDiacritic(w) + delta := 0.0 + dx0 := 0.0 + parts := []string{w} + numChars := c + + for i := 0; i < len(line.Dx); i++ { + w = line.Words[i+1] + w, c := countDiacritic(w) + dx := line.Dx[i] + if numChars+c <= 1 && delta+dx <= tol { + if len(parts) == 0 { + dx0 = dx + } else { + delta += dx + } + parts = append(parts, w) + numChars += c + } else { + if len(parts) > 0 { + if len(words) > 0 { + dxList = append(dxList, dx0) + } + words = append(words, combine(parts)) + } + parts = []string{w} + numChars = c + dx0 = dx + delta = 0.0 + } + } + if len(parts) > 0 { + if len(words) > 0 { + dxList = append(dxList, dx0) + } + words = append(words, combine(parts)) + } + + if len(words) != len(dxList)+1 { + common.Log.Error("Inconsistent: \nwords=%d %q\ndxList=%d %.2f", + len(words), words, len(dxList), dxList) + return line + } + return Line{Y: line.Y, Dx: dxList, Text: strings.Join(words, ""), Words: words} +} + +// combine combines any diacritics in `parts` with the single non-diacritic character in `parts`. +func combine(parts []string) string { + if len(parts) == 1 { + // Must be a non-diacritic. + return parts[0] + } + + // We need to put the diacritics before the non-diacritic for NFKC normalization to work. + diacritic := map[string]bool{} + for _, w := range parts { + r := []rune(w)[0] + diacritic[w] = unicode.Is(unicode.Mn, r) || unicode.Is(unicode.Sk, r) + } + sort.SliceStable(parts, func(i, j int) bool { return !diacritic[parts[i]] && diacritic[parts[j]] }) + + // Construct the NFKC-normalized concatenation of the diacritics and the non-diacritic. + for i, w := range parts { + parts[i] = strings.TrimSpace(norm.NFKC.String(w)) + } + return strings.Join(parts, "") +} + +// countDiacritic returns the combining diacritic version of `w` (usually itself) and the number of +// non-diacritics in `w` (0 or 1) +func countDiacritic(w string) (string, int) { + runes := []rune(w) + if len(runes) != 1 { + return w, 1 + } + r := runes[0] + if w2, ok := diacritics[r]; ok { + w = w2 + } + c := 1 + if unicode.Is(unicode.Mn, r) || unicode.Is(unicode.Sk, r) { + c = 0 + } + if w2, ok := diacritics[r]; ok { + c = 0 + w = w2 + } + return w, c +} + +// diacritics is a map of diacritic characters that are not classified as unicode.Mn or unicode.Sk +// and the corresponding unicode.Mn or unicode.Sk characters. This map was copied from PdfBox. +var diacritics = map[rune]string{ + 0x0060: "\u0300", + 0x02CB: "\u0300", + 0x0027: "\u0301", + 0x02B9: "\u0301", + 0x02CA: "\u0301", + 0x005e: "\u0302", + 0x02C6: "\u0302", + 0x007E: "\u0303", + 0x02C9: "\u0304", + 0x00B0: "\u030A", + 0x02BA: "\u030B", + 0x02C7: "\u030C", + 0x02C8: "\u030D", + 0x0022: "\u030E", + 0x02BB: "\u0312", + 0x02BC: "\u0313", + 0x0486: "\u0313", + 0x055A: "\u0313", + 0x02BD: "\u0314", + 0x0485: "\u0314", + 0x0559: "\u0314", + 0x02D4: "\u031D", + 0x02D5: "\u031E", + 0x02D6: "\u031F", + 0x02D7: "\u0320", + 0x02B2: "\u0321", + 0x02CC: "\u0329", + 0x02B7: "\u032B", + 0x02CD: "\u0331", + 0x005F: "\u0332", + 0x204E: "\u0359", +} + // getCurrentFont returns the font on top of the font stack, or DefaultFont if the font stack is // empty. func (to *textObject) getCurrentFont() *model.PdfFont { diff --git a/pdf/extractor/text_test.go b/pdf/extractor/text_test.go index d9f84b46..c3a4fb7a 100644 --- a/pdf/extractor/text_test.go +++ b/pdf/extractor/text_test.go @@ -17,6 +17,7 @@ import ( "github.com/unidoc/unidoc/common" "github.com/unidoc/unidoc/pdf/model" + "golang.org/x/text/unicode/norm" ) // XXX(peterwilliams97) NOTE: We do a best effort at finding the PDF file because we don't keep PDF @@ -187,8 +188,10 @@ var extract2Tests = []struct { }, {filename: "Ito_Formula.pdf", expectedPageText: map[int][]string{ - // 1: []string{"In the Itô stochastic calculus"}, - 1: []string{"In standard, non-stochastic calculus, one computes a derivative"}, + 1: []string{ + "In the Itô stochastic calculus", + "In standard, non-stochastic calculus, one computes a derivative"}, + 2: []string{"Financial Economics Itô’s Formula"}, }, }, {filename: "circ2.pdf", @@ -206,6 +209,11 @@ var extract2Tests = []struct { 1: []string{"entropy of a system of n identical resonators in a stationary radiation field"}, }, }, + {filename: "thanh.pdf", + expectedPageText: map[int][]string{ + 1: []string{"Hàn Thé̂ Thành"}, + }, + }, } // testExtract2 tests the ExtractText2 text extractor on `filename` and compares the extracted @@ -229,6 +237,7 @@ func testExtract2(t *testing.T, filename string, expectedPageText map[int][]stri if !ok { t.Fatalf("%q doesn't have page %d", filename, pageNum) } + actualText = norm.NFKC.String(actualText) if !containsSentences(t, expectedSentences, actualText) { t.Fatalf("Text mismatch filename=%q page=%d", path, pageNum) } @@ -276,8 +285,9 @@ func extractPageTexts(t *testing.T, filename string) (int, map[int]string) { // containsSentences returns true if all strings `expectedSentences` are contained in `actualText`. func containsSentences(t *testing.T, expectedSentences []string, actualText string) bool { for _, e := range expectedSentences { + e = norm.NFKC.String(e) if !strings.Contains(actualText, e) { - t.Errorf("No match for %#q", e) + t.Errorf("No match for %q", e) return false } } diff --git a/pdf/model/font_simple.go b/pdf/model/font_simple.go index 5c509714..8e570c1a 100644 --- a/pdf/model/font_simple.go +++ b/pdf/model/font_simple.go @@ -131,7 +131,6 @@ func (font pdfFontSimple) GetGlyphCharMetrics(glyph string) (fonts.CharMetrics, // returned to indicate whether or not the entry was found in the glyph to charcode mapping. func (font pdfFontSimple) GetCharMetrics(code uint16) (fonts.CharMetrics, bool) { if width, ok := font.charWidths[code]; ok { - common.Log.Debug("GetCharMetrics 1: code=%d width=%.1f font=%s", code, width, font) return fonts.CharMetrics{Wx: width}, true } if isBuiltin(Standard14Font(font.basefont)) { @@ -142,12 +141,11 @@ func (font pdfFontSimple) GetCharMetrics(code uint16) (fonts.CharMetrics, bool) if glyph, ok := font.encoder.CharcodeToGlyph(code); ok { if metrics, ok := font.fontMetrics[glyph]; ok { font.charWidths[code] = metrics.Wx - common.Log.Debug("GetCharMetrics 2: code=%d glyph=%q width=%.1f", code, glyph, metrics.Wx) return metrics, true } } } - common.Log.Debug("GetCharMetrics 3: code=%d font=%s", code, font) + common.Log.Debug("GetCharMetrics: No match for code=%d font=%s", code, font) return fonts.CharMetrics{}, false } From 6529b42a704857e3f4da52696d431eba26d16950 Mon Sep 17 00:00:00 2001 From: Peter Williams Date: Wed, 28 Nov 2018 18:22:42 +1100 Subject: [PATCH 3/3] Remove duplicate code. --- pdf/extractor/text.go | 3 --- 1 file changed, 3 deletions(-) diff --git a/pdf/extractor/text.go b/pdf/extractor/text.go index d6a47fc9..523bd1f3 100644 --- a/pdf/extractor/text.go +++ b/pdf/extractor/text.go @@ -1106,9 +1106,6 @@ func countDiacritic(w string) (string, int) { return w, 1 } r := runes[0] - if w2, ok := diacritics[r]; ok { - w = w2 - } c := 1 if unicode.Is(unicode.Mn, r) || unicode.Is(unicode.Sk, r) { c = 0