Fixed text position tracking.

This commit is contained in:
Peter Williams 2018-10-30 21:55:30 +11:00
parent ee3e2a45a0
commit b0c440dd00
6 changed files with 170 additions and 40 deletions

View File

@ -49,5 +49,5 @@ func (p *Point) transformByMatrix(m contentstream.Matrix) {
// String returns a string describing `p`.
func (p *Point) String() string {
return fmt.Sprintf("(%.1f,%.1f)", p.X, p.Y)
return fmt.Sprintf("(%.2f,%.2f)", p.X, p.Y)
}

View File

@ -9,6 +9,8 @@ import (
"errors"
"fmt"
"math"
"path/filepath"
"runtime"
"sort"
"strings"
@ -585,8 +587,10 @@ func newTextObject(e *Extractor, gs contentstream.GraphicsState, state *textStat
func (to *textObject) renderText(data []byte) error {
font := to.getCurrentFont()
text, numChars, numMisses := font.CharcodeBytesToUnicode(data)
runes := []rune(text)
charcodes := font.BytesToCharcodes(data)
runes, numChars, numMisses := font.CharcodesToUnicode(charcodes)
to.State.numChars += numChars
to.State.numMisses += numMisses
@ -598,7 +602,7 @@ func (to *textObject) renderText(data []byte) error {
spaceMetrics, _ = model.DefaultFont().GetRuneCharMetrics(' ')
}
spaceWidth := spaceMetrics.Wx * glyphTextRatio
common.Log.Debug("spaceWidth=%.2f text=%q font=%s fontSize=%.1f", spaceWidth, text,
common.Log.Debug("spaceWidth=%.2f text=%q font=%s fontSize=%.1f", spaceWidth, runes,
font, tfs)
stateMatrix := contentstream.NewMatrix(
@ -606,7 +610,8 @@ func (to *textObject) renderText(data []byte) error {
0, tfs,
0, state.Trise)
for _, r := range runes {
for i, r := range runes {
code := charcodes[i]
// The location of the text on the page in device coordinates is given by trm, the text
// rendering matrix.
trm := stateMatrix.Mult(to.Tm).Mult(to.gs.CTM)
@ -616,40 +621,33 @@ func (to *textObject) renderText(data []byte) error {
// w is the unscaled movement at the end of a word.
w := 0.0
if r == ' ' {
if r == " " {
w = state.Tw
}
m, err := font.GetRuneCharMetrics(r)
if err != nil {
common.Log.Debug("ERROR: No metric for 0x%04x=%c %s", r, r, font)
return err
m, ok := font.GetCharMetrics(code)
if !ok {
common.Log.Debug("ERROR: No metric for code=%d r=0x%04x=%c %s", code, r, r, font)
return errors.New("no char metrics")
}
// c is the character size in unscaled text units.
c := Point{X: m.Wx * glyphTextRatio, Y: m.Wy * glyphTextRatio}
// cScaled is the character size
cScaled := Point{X: c.X * tfs * th}
// t is the displacement of the text cursor when the character is rendered.
t := Point{X: (c.X*tfs + state.Tc + w) * th}
common.Log.Debug("t=%s cScaled=%s c=%s tfs=%.2f state.Tc=%.2f w=%.2f th=%.2f",
t.String(), cScaled.String(), c.String(), tfs, state.Tc, w, th)
// td is t in matrix from
// td is t in matrix form.
td := translationMatrix(t)
common.Log.Debug("displacement=%s t=%s td=%s m=%s",
c.String(), t.String(), td.String(), m.String())
nextTm := to.Tm.Mult(td)
common.Log.Debug(" next: td=%s %s->%s", td, to.Tm, nextTm)
xyt := XYText{Text: string(r),
Point: translation(trm),
End: translation(trm).Displace(cScaled),
End: translation(to.Tm.Mult(td).Mult(to.gs.CTM)),
SpaceWidth: spaceWidth * trm.ScalingFactorX(),
}
to.Texts = append(to.Texts, xyt)
common.Log.Debug(" xyt=%s", xyt.String())
// update the text matrix by the displacement of the text location.
to.Tm = nextTm
@ -738,6 +736,7 @@ func (tl *TextList) AppendText(gs contentstream.GraphicsState, p, e Point, text
// ToText returns the contents of `tl` as a single string.
func (tl *TextList) ToText() string {
tl.printTexts("ToText: before sorting")
tl.SortPosition()
lines := tl.toLines()
@ -770,7 +769,7 @@ type Line struct {
// toLines returns the text and positions in `tl` as a slice of Line.
// NOTE: Caller must sort the text list top-to-bottom, left-to-right before calling this function.
func (tl *TextList) toLines() []Line {
tl.printTexts()
tl.printTexts("toLines: before")
if len(*tl) == 0 {
return []Line{}
}
@ -870,13 +869,28 @@ func (exp *ExponAve) update(x float64) float64 {
}
// printTexts is a debugging function. XXX Remove this.
func (tl *TextList) printTexts() {
func (tl *TextList) printTexts(message string) {
return
common.Log.Error("=====================================")
common.Log.Error("%d texts", len(*tl))
for i, t := range (*tl)[1:] {
fmt.Printf("%5d: %s\n", i, t.String())
_, file, line, ok := runtime.Caller(1)
if !ok {
file = "???"
line = 0
} else {
file = filepath.Base(file)
}
prefix := fmt.Sprintf("[%s:%d]", file, line)
common.Log.Error("=====================================")
common.Log.Error("printTexts %s %s", prefix, message)
common.Log.Error("%d texts", len(*tl))
parts := []string{}
for i, t := range *tl {
fmt.Printf("%5d: %s\n", i, t.String())
parts = append(parts, t.Text)
}
common.Log.Error("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
fmt.Printf("%s\n", strings.Join(parts, ""))
common.Log.Error("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^")
}
// newLine returns the Line representation of strings `words` with y coordinate `y` and x

View File

@ -3,3 +3,5 @@ Font Metrics
1 Leave char->unicode until end
2 Build metrics tables for charcodes
3 Remove double Font interface definition
4 Express CharcodeBytesToUnicode2 in terms of

View File

@ -18,13 +18,26 @@ import (
"github.com/unidoc/unidoc/pdf/model/textencoding"
)
// Font is the interface satisfied by the underlying font implementations held in
// PdfFont.context. A font is a series of glyphs; character codes from PDF strings can be mapped
// to and from glyphs, and each glyph has metrics.
//
// XXX: FIXME (peterwilliams97) HACK. This interface is declared here only to add
// GetCharMetrics() for fonts other than the standard 14. Remove this hack.
type Font interface {
	// Text encoder accessors.
	Encoder() textencoding.TextEncoder
	SetEncoder(encoder textencoding.TextEncoder)

	// Metrics lookups. The bool result reports whether metrics were found.
	// GetCharMetrics is keyed by character code, GetGlyphCharMetrics by glyph name.
	GetCharMetrics(code uint16) (fonts.CharMetrics, bool)
	GetGlyphCharMetrics(glyph string) (fonts.CharMetrics, bool)
	GetAverageCharWidth() float64

	// ToPdfObject converts the font to its PDF object representation.
	ToPdfObject() core.PdfObject
}
// PdfFont represents an underlying font structure which can be of type:
// - Type0
// - Type1
// - TrueType
// etc.
type PdfFont struct {
context fonts.Font // The underlying font: Type0, Type1, Truetype, etc..
context Font // The underlying font: Type0, Type1, Truetype, etc..
}
// String returns a string that describes `font`.
@ -303,12 +316,7 @@ func newPdfFontFromPdfObject(fontObj core.PdfObject, allowType0 bool) (*PdfFont,
// conforming writers, instead of using a simple font, shall use a Type 0 font with an Identity-H
// encoding and use the glyph indices as character codes, as described following Table 118.
func (font PdfFont) CharcodeBytesToUnicode(data []byte) (string, int, int) {
_, out, numChars, numMisses := font.CharcodeBytesToUnicode2(data)
return out, numChars, numMisses
}
func (font PdfFont) CharcodeBytesToUnicode2(data []byte) ([]uint16, string, int, int) {
common.Log.Trace("showText: data=[% 02x]=%#q", data, data)
common.Log.Trace("CharcodeBytesToUnicode: data=[% 02x]=%#q", data, data)
charcodes := make([]uint16, 0, len(data)+len(data)%2)
if font.baseFields().isCIDFont() {
@ -363,7 +371,74 @@ func (font PdfFont) CharcodeBytesToUnicode2(data []byte) ([]uint16, string, int,
}
out := strings.Join(charstrings, "")
return charcodes, out, len([]rune(out)), numMisses
return out, len([]rune(out)), numMisses
}
// BytesToCharcodes converts the bytes in a PDF string to character codes.
//
// For CID fonts the bytes are read as big-endian 2-byte codes:
//   - a single byte `b` yields the one code 0x00bb,
//   - any other odd-length input is treated as if it were padded with a trailing zero byte, so
//     the last byte becomes the high byte of the final code (this case is logged).
//
// For all other fonts every byte is one character code.
//
// `data` is never modified. (Padding with `append(data, 0)` could write into spare capacity of
// the caller's backing array, so the trailing byte is handled without appending.)
func (font PdfFont) BytesToCharcodes(data []byte) []uint16 {
	common.Log.Trace("BytesToCharcodes: data=[% 02x]=%#q", data, data)

	if !font.baseFields().isCIDFont() {
		charcodes := make([]uint16, 0, len(data))
		for _, b := range data {
			charcodes = append(charcodes, uint16(b))
		}
		return charcodes
	}

	// A lone byte is the low byte of a single code.
	if len(data) == 1 {
		return []uint16{uint16(data[0])}
	}

	charcodes := make([]uint16, 0, (len(data)+1)/2)
	// paired is the length of the longest even-length prefix of data.
	paired := len(data) - len(data)%2
	for i := 0; i < paired; i += 2 {
		charcodes = append(charcodes, uint16(data[i])<<8|uint16(data[i+1]))
	}
	if paired != len(data) {
		// Odd length: the final byte is the high byte of a code whose low byte is zero.
		common.Log.Debug("ERROR: Padding data=%+v to even length", data)
		charcodes = append(charcodes, uint16(data[paired])<<8)
	}
	return charcodes
}
// CharcodesToUnicode converts the character codes `charcodes` to a slice of unicode strings.
//
// Each code is looked up in the font's ToUnicode cmap first, if the font has one, and then in
// the font's encoder, if it has one. A code that neither can convert is counted as a miss and
// is represented by cmap.MissingCodeString.
//
// Returns:
//   - one string per entry of `charcodes`, in the same order,
//   - the number of strings returned (always len(charcodes)),
//   - the number of misses.
func (font PdfFont) CharcodesToUnicode(charcodes []uint16) ([]string, int, int) {
	charstrings := make([]string, 0, len(charcodes))
	numMisses := 0
	for _, code := range charcodes {
		if font.baseFields().toUnicodeCmap != nil {
			r, ok := font.baseFields().toUnicodeCmap.CharcodeToUnicode(cmap.CharCode(code))
			if ok {
				charstrings = append(charstrings, r)
				continue
			}
		}
		// Fall back to encoding
		encoder := font.Encoder()
		if encoder != nil {
			r, ok := encoder.CharcodeToRune(code)
			if ok {
				charstrings = append(charstrings, textencoding.RuneToString(r))
				continue
			}
		}
		common.Log.Debug("ERROR: No rune. code=0x%04x charcodes=[% 04x] CID=%t\n"+
			"\tfont=%s\n\tencoding=%s",
			code, charcodes, font.baseFields().isCIDFont(), font, encoder)
		numMisses++
		charstrings = append(charstrings, cmap.MissingCodeString)
	}

	if numMisses != 0 {
		common.Log.Debug("ERROR: Couldn't convert to unicode. Using input.\n"+
			"\tnumChars=%d numMisses=%d\n"+
			"\tfont=%s",
			len(charcodes), numMisses, font)
	}

	// Every loop iteration appends exactly one string, so len(charstrings) == len(charcodes)
	// holds by construction and no runtime check (or panic) is needed here.
	return charstrings, len(charstrings), numMisses
}
// ToPdfObject converts the PdfFont object to its PDF representation.
@ -402,9 +477,21 @@ func (font PdfFont) GetGlyphCharMetrics(glyph string) (fonts.CharMetrics, bool)
t := font.actualFont()
if t == nil {
common.Log.Debug("ERROR: GetGlyphCharMetrics Not implemented for font type=%#T", font.context)
return fonts.CharMetrics{GlyphName: glyph}, false
}
metrics, ok := t.GetGlyphCharMetrics(glyph)
return metrics, ok
}
// GetCharMetrics returns the char metrics for character code `code`.
func (font PdfFont) GetCharMetrics(code uint16) (fonts.CharMetrics, bool) {
t := font.actualFont()
if t == nil {
common.Log.Debug("ERROR: GetCharMetrics Not implemented for font type=%#T", font.context)
return fonts.CharMetrics{}, false
}
return t.GetGlyphCharMetrics(glyph)
m, ok := t.GetCharMetrics(code)
return m, ok
}
// GetRuneCharMetrics returns the char metrics for rune `r`.
@ -438,7 +525,7 @@ func (font PdfFont) GetAverageCharWidth() float64 {
}
// actualFont returns the Font in font.context
func (font PdfFont) actualFont() fonts.Font {
func (font PdfFont) actualFont() Font {
if font.context == nil {
common.Log.Debug("ERROR: actualFont. context is nil. font=%s", font)
}

View File

@ -121,6 +121,15 @@ func (font pdfFontType0) GetGlyphCharMetrics(glyph string) (fonts.CharMetrics, b
return font.DescendantFont.GetGlyphCharMetrics(glyph)
}
// GetCharMetrics returns the char metrics for character code `code` by delegating to the
// descendant font. If there is no descendant font it logs the problem and returns empty metrics
// and false.
// !@#$ stub
func (font pdfFontType0) GetCharMetrics(code uint16) (fonts.CharMetrics, bool) {
	if descendant := font.DescendantFont; descendant != nil {
		return descendant.GetCharMetrics(code)
	}
	common.Log.Debug("ERROR: No descendant. font=%s", font)
	return fonts.CharMetrics{}, false
}
// GetAverageCharWidth returns the average width of all the characters in `font`.
func (font pdfFontType0) GetAverageCharWidth() float64 {
if font.DescendantFont == nil {
@ -238,6 +247,11 @@ func (font pdfCIDFontType0) GetGlyphCharMetrics(glyph string) (fonts.CharMetrics
return fonts.CharMetrics{}, true
}
// GetCharMetrics is a placeholder: it ignores `code` and always returns zero-valued metrics
// together with true.
// !@#$ stub
func (font pdfCIDFontType0) GetCharMetrics(code uint16) (fonts.CharMetrics, bool) {
	var metrics fonts.CharMetrics
	return metrics, true
}
// GetAverageCharWidth returns the average width of all the characters in `font`.
func (font pdfCIDFontType0) GetAverageCharWidth() float64 {
return 0.0
@ -347,6 +361,12 @@ func (font pdfCIDFontType2) GetGlyphCharMetrics(glyph string) (fonts.CharMetrics
return metrics, true
}
// GetCharMetrics is a placeholder: it ignores `code` and always returns zero-valued metrics
// together with true.
// !@#$ stub
func (font pdfCIDFontType2) GetCharMetrics(code uint16) (fonts.CharMetrics, bool) {
	return fonts.CharMetrics{}, true
}
// GetAverageCharWidth returns the average width of all the characters in `font`.
func (font pdfCIDFontType2) GetAverageCharWidth() float64 {
if len(font.runeToWidthMap) == 0 {

View File

@ -93,13 +93,20 @@ func (font pdfFontSimple) GetGlyphCharMetrics(glyph string) (fonts.CharMetrics,
return metrics, ok
}
metrics := fonts.CharMetrics{}
code, found := font.encoder.GlyphToCharcode(glyph)
if !found {
return metrics, false
return fonts.CharMetrics{GlyphName: glyph}, false
}
// !@#$ Shouldn't we fall back from GetCharMetrics to GetGlyphCharMetrics?
metrics, ok := font.GetCharMetrics(code)
metrics.GlyphName = glyph
return metrics, ok
}
// GetCharMetrics returns the character metrics for the specified character code. A bool flag is
// returned to indicate whether or not the entry was found in the glyph to charcode mapping.
func (font pdfFontSimple) GetCharMetrics(code uint16) (fonts.CharMetrics, bool) {
metrics := fonts.CharMetrics{}
if int(code) < font.firstChar {
common.Log.Debug("Code lower than firstchar (%d < %d)", code, font.firstChar)