mirror of
https://github.com/unidoc/unipdf.git
synced 2025-04-29 13:48:54 +08:00
Fixed text position tracking.
This commit is contained in:
parent
ee3e2a45a0
commit
b0c440dd00
@ -49,5 +49,5 @@ func (p *Point) transformByMatrix(m contentstream.Matrix) {
|
||||
|
||||
// String returns a string describing `p`.
|
||||
func (p *Point) String() string {
|
||||
return fmt.Sprintf("(%.1f,%.1f)", p.X, p.Y)
|
||||
return fmt.Sprintf("(%.2f,%.2f)", p.X, p.Y)
|
||||
}
|
||||
|
@ -9,6 +9,8 @@ import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"math"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
@ -585,8 +587,10 @@ func newTextObject(e *Extractor, gs contentstream.GraphicsState, state *textStat
|
||||
func (to *textObject) renderText(data []byte) error {
|
||||
font := to.getCurrentFont()
|
||||
|
||||
text, numChars, numMisses := font.CharcodeBytesToUnicode(data)
|
||||
runes := []rune(text)
|
||||
charcodes := font.BytesToCharcodes(data)
|
||||
|
||||
runes, numChars, numMisses := font.CharcodesToUnicode(charcodes)
|
||||
|
||||
to.State.numChars += numChars
|
||||
to.State.numMisses += numMisses
|
||||
|
||||
@ -598,7 +602,7 @@ func (to *textObject) renderText(data []byte) error {
|
||||
spaceMetrics, _ = model.DefaultFont().GetRuneCharMetrics(' ')
|
||||
}
|
||||
spaceWidth := spaceMetrics.Wx * glyphTextRatio
|
||||
common.Log.Debug("spaceWidth=%.2f text=%q font=%s fontSize=%.1f", spaceWidth, text,
|
||||
common.Log.Debug("spaceWidth=%.2f text=%q font=%s fontSize=%.1f", spaceWidth, runes,
|
||||
font, tfs)
|
||||
|
||||
stateMatrix := contentstream.NewMatrix(
|
||||
@ -606,7 +610,8 @@ func (to *textObject) renderText(data []byte) error {
|
||||
0, tfs,
|
||||
0, state.Trise)
|
||||
|
||||
for _, r := range runes {
|
||||
for i, r := range runes {
|
||||
code := charcodes[i]
|
||||
// The location of the text on the page in device coordinates is given by trm, the text
|
||||
// rendering matrix.
|
||||
trm := stateMatrix.Mult(to.Tm).Mult(to.gs.CTM)
|
||||
@ -616,40 +621,33 @@ func (to *textObject) renderText(data []byte) error {
|
||||
|
||||
// w is the unscaled movement at the end of a word.
|
||||
w := 0.0
|
||||
if r == ' ' {
|
||||
if r == " " {
|
||||
w = state.Tw
|
||||
}
|
||||
|
||||
m, err := font.GetRuneCharMetrics(r)
|
||||
if err != nil {
|
||||
common.Log.Debug("ERROR: No metric for 0x%04x=%c %s", r, r, font)
|
||||
return err
|
||||
m, ok := font.GetCharMetrics(code)
|
||||
if !ok {
|
||||
common.Log.Debug("ERROR: No metric for code=%d r=0x%04x=%c %s", code, r, r, font)
|
||||
return errors.New("no char metrics")
|
||||
}
|
||||
|
||||
// c is the character size in unscaled text units.
|
||||
c := Point{X: m.Wx * glyphTextRatio, Y: m.Wy * glyphTextRatio}
|
||||
// cScaled is the character size
|
||||
cScaled := Point{X: c.X * tfs * th}
|
||||
|
||||
// t is the displacement of the text cursor when the character is rendered.
|
||||
t := Point{X: (c.X*tfs + state.Tc + w) * th}
|
||||
|
||||
common.Log.Debug("t=%s cScaled=%s c=%s tfs=%.2f state.Tc=%.2f w=%.2f th=%.2f",
|
||||
t.String(), cScaled.String(), c.String(), tfs, state.Tc, w, th)
|
||||
|
||||
// td is t in matrix from
|
||||
// td is t in matrix form.
|
||||
td := translationMatrix(t)
|
||||
common.Log.Debug("displacement=%s t=%s td=%s m=%s",
|
||||
c.String(), t.String(), td.String(), m.String())
|
||||
|
||||
nextTm := to.Tm.Mult(td)
|
||||
common.Log.Debug(" next: td=%s %s->%s", td, to.Tm, nextTm)
|
||||
|
||||
xyt := XYText{Text: string(r),
|
||||
Point: translation(trm),
|
||||
End: translation(trm).Displace(cScaled),
|
||||
End: translation(to.Tm.Mult(td).Mult(to.gs.CTM)),
|
||||
SpaceWidth: spaceWidth * trm.ScalingFactorX(),
|
||||
}
|
||||
to.Texts = append(to.Texts, xyt)
|
||||
common.Log.Debug(" xyt=%s", xyt.String())
|
||||
|
||||
// update the text matrix by the displacement of the text location.
|
||||
to.Tm = nextTm
|
||||
@ -738,6 +736,7 @@ func (tl *TextList) AppendText(gs contentstream.GraphicsState, p, e Point, text
|
||||
|
||||
// ToText returns the contents of `tl` as a single string.
|
||||
func (tl *TextList) ToText() string {
|
||||
tl.printTexts("ToText: before sorting")
|
||||
tl.SortPosition()
|
||||
|
||||
lines := tl.toLines()
|
||||
@ -770,7 +769,7 @@ type Line struct {
|
||||
// toLines returns the text and positions in `tl` as a slice of Line.
|
||||
// NOTE: Caller must sort the text list top-to-bottom, left-to-right before calling this function.
|
||||
func (tl *TextList) toLines() []Line {
|
||||
tl.printTexts()
|
||||
tl.printTexts("toLines: before")
|
||||
if len(*tl) == 0 {
|
||||
return []Line{}
|
||||
}
|
||||
@ -870,13 +869,28 @@ func (exp *ExponAve) update(x float64) float64 {
|
||||
}
|
||||
|
||||
// printTexts is a debugging function. XXX Remove this.
|
||||
func (tl *TextList) printTexts() {
|
||||
func (tl *TextList) printTexts(message string) {
|
||||
return
|
||||
common.Log.Error("=====================================")
|
||||
common.Log.Error("%d texts", len(*tl))
|
||||
for i, t := range (*tl)[1:] {
|
||||
fmt.Printf("%5d: %s\n", i, t.String())
|
||||
_, file, line, ok := runtime.Caller(1)
|
||||
if !ok {
|
||||
file = "???"
|
||||
line = 0
|
||||
} else {
|
||||
file = filepath.Base(file)
|
||||
}
|
||||
prefix := fmt.Sprintf("[%s:%d]", file, line)
|
||||
|
||||
common.Log.Error("=====================================")
|
||||
common.Log.Error("printTexts %s %s", prefix, message)
|
||||
common.Log.Error("%d texts", len(*tl))
|
||||
parts := []string{}
|
||||
for i, t := range *tl {
|
||||
fmt.Printf("%5d: %s\n", i, t.String())
|
||||
parts = append(parts, t.Text)
|
||||
}
|
||||
common.Log.Error("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
|
||||
fmt.Printf("%s\n", strings.Join(parts, ""))
|
||||
common.Log.Error("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^")
|
||||
}
|
||||
|
||||
// newLine returns the Line representation of strings `words` with y coordinate `y` and x
|
||||
|
@ -3,3 +3,5 @@ Font Metrics
|
||||
|
||||
1 Leave char->unicode until end
|
||||
2 Build metrics tables for charcodes
|
||||
3 Remove double Font interface definition
|
||||
4 Express CharcodeBytesToUnicode2 in terms of
|
||||
|
@ -18,13 +18,26 @@ import (
|
||||
"github.com/unidoc/unidoc/pdf/model/textencoding"
|
||||
)
|
||||
|
||||
// Font represents a font which is a series of glyphs. Character codes from PDF strings can be
|
||||
// mapped to and from glyphs. Each glyph has metrics.
|
||||
// XXX: FIXME (peterwilliams97) HACK to add GetCharMetrics() for fonts other than standard 14
|
||||
// Remove this hack.
|
||||
type Font interface {
|
||||
Encoder() textencoding.TextEncoder
|
||||
SetEncoder(encoder textencoding.TextEncoder)
|
||||
GetGlyphCharMetrics(glyph string) (fonts.CharMetrics, bool)
|
||||
GetCharMetrics(code uint16) (fonts.CharMetrics, bool)
|
||||
GetAverageCharWidth() float64
|
||||
ToPdfObject() core.PdfObject
|
||||
}
|
||||
|
||||
// PdfFont represents an underlying font structure which can be of type:
|
||||
// - Type0
|
||||
// - Type1
|
||||
// - TrueType
|
||||
// etc.
|
||||
type PdfFont struct {
|
||||
context fonts.Font // The underlying font: Type0, Type1, Truetype, etc..
|
||||
context Font // The underlying font: Type0, Type1, Truetype, etc..
|
||||
}
|
||||
|
||||
// String returns a string that describes `font`.
|
||||
@ -303,12 +316,7 @@ func newPdfFontFromPdfObject(fontObj core.PdfObject, allowType0 bool) (*PdfFont,
|
||||
// conforming writers, instead of using a simple font, shall use a Type 0 font with an Identity-H
|
||||
// encoding and use the glyph indices as character codes, as described following Table 118.
|
||||
func (font PdfFont) CharcodeBytesToUnicode(data []byte) (string, int, int) {
|
||||
_, out, numChars, numMisses := font.CharcodeBytesToUnicode2(data)
|
||||
return out, numChars, numMisses
|
||||
}
|
||||
|
||||
func (font PdfFont) CharcodeBytesToUnicode2(data []byte) ([]uint16, string, int, int) {
|
||||
common.Log.Trace("showText: data=[% 02x]=%#q", data, data)
|
||||
common.Log.Trace("CharcodeBytesToUnicode: data=[% 02x]=%#q", data, data)
|
||||
|
||||
charcodes := make([]uint16, 0, len(data)+len(data)%2)
|
||||
if font.baseFields().isCIDFont() {
|
||||
@ -363,7 +371,74 @@ func (font PdfFont) CharcodeBytesToUnicode2(data []byte) ([]uint16, string, int,
|
||||
}
|
||||
|
||||
out := strings.Join(charstrings, "")
|
||||
return charcodes, out, len([]rune(out)), numMisses
|
||||
return out, len([]rune(out)), numMisses
|
||||
}
|
||||
|
||||
// BytesToCharcodes converts the bytes in a PDF string to character codes.
|
||||
func (font PdfFont) BytesToCharcodes(data []byte) []uint16 {
|
||||
common.Log.Trace("BytesToCharcodes: data=[% 02x]=%#q", data, data)
|
||||
charcodes := make([]uint16, 0, len(data)+len(data)%2)
|
||||
if font.baseFields().isCIDFont() {
|
||||
if len(data) == 1 {
|
||||
data = []byte{0, data[0]}
|
||||
}
|
||||
if len(data)%2 != 0 {
|
||||
common.Log.Debug("ERROR: Padding data=%+v to even length", data)
|
||||
data = append(data, 0)
|
||||
}
|
||||
for i := 0; i < len(data); i += 2 {
|
||||
b := uint16(data[i])<<8 | uint16(data[i+1])
|
||||
charcodes = append(charcodes, b)
|
||||
}
|
||||
} else {
|
||||
for _, b := range data {
|
||||
charcodes = append(charcodes, uint16(b))
|
||||
}
|
||||
}
|
||||
return charcodes
|
||||
}
|
||||
|
||||
// CharcodesToUnicode converts the character codes `charcodes` to a slice of unicode strings.
|
||||
func (font PdfFont) CharcodesToUnicode(charcodes []uint16) ([]string, int, int) {
|
||||
charstrings := make([]string, 0, len(charcodes))
|
||||
numMisses := 0
|
||||
for _, code := range charcodes {
|
||||
if font.baseFields().toUnicodeCmap != nil {
|
||||
r, ok := font.baseFields().toUnicodeCmap.CharcodeToUnicode(cmap.CharCode(code))
|
||||
if ok {
|
||||
charstrings = append(charstrings, r)
|
||||
continue
|
||||
}
|
||||
}
|
||||
// Fall back to encoding
|
||||
encoder := font.Encoder()
|
||||
if encoder != nil {
|
||||
r, ok := encoder.CharcodeToRune(code)
|
||||
if ok {
|
||||
charstrings = append(charstrings, textencoding.RuneToString(r))
|
||||
continue
|
||||
}
|
||||
}
|
||||
common.Log.Debug("ERROR: No rune. code=0x%04x charcodes=[% 04x] CID=%t\n"+
|
||||
"\tfont=%s\n\tencoding=%s",
|
||||
code, charcodes, font.baseFields().isCIDFont(), font, encoder)
|
||||
numMisses++
|
||||
charstrings = append(charstrings, cmap.MissingCodeString)
|
||||
|
||||
}
|
||||
|
||||
if numMisses != 0 {
|
||||
common.Log.Debug("ERROR: Couldn't convert to unicode. Using input.\n"+
|
||||
"\tnumChars=%d numMisses=%d\n"+
|
||||
"\tfont=%s",
|
||||
len(charcodes), numMisses, font)
|
||||
}
|
||||
|
||||
if len(charcodes) != len(charstrings) {
|
||||
panic(fmt.Errorf("charcodes=%d charstrings=%d", len(charcodes), len(charstrings)))
|
||||
}
|
||||
|
||||
return charstrings, len(charstrings), numMisses
|
||||
}
|
||||
|
||||
// ToPdfObject converts the PdfFont object to its PDF representation.
|
||||
@ -402,9 +477,21 @@ func (font PdfFont) GetGlyphCharMetrics(glyph string) (fonts.CharMetrics, bool)
|
||||
t := font.actualFont()
|
||||
if t == nil {
|
||||
common.Log.Debug("ERROR: GetGlyphCharMetrics Not implemented for font type=%#T", font.context)
|
||||
return fonts.CharMetrics{GlyphName: glyph}, false
|
||||
}
|
||||
metrics, ok := t.GetGlyphCharMetrics(glyph)
|
||||
return metrics, ok
|
||||
}
|
||||
|
||||
// GetCharMetrics returns the char metrics for character code `code`.
|
||||
func (font PdfFont) GetCharMetrics(code uint16) (fonts.CharMetrics, bool) {
|
||||
t := font.actualFont()
|
||||
if t == nil {
|
||||
common.Log.Debug("ERROR: GetCharMetrics Not implemented for font type=%#T", font.context)
|
||||
return fonts.CharMetrics{}, false
|
||||
}
|
||||
return t.GetGlyphCharMetrics(glyph)
|
||||
m, ok := t.GetCharMetrics(code)
|
||||
return m, ok
|
||||
}
|
||||
|
||||
// GetRuneCharMetrics returns the char metrics for rune `r`.
|
||||
@ -438,7 +525,7 @@ func (font PdfFont) GetAverageCharWidth() float64 {
|
||||
}
|
||||
|
||||
// actualFont returns the Font in font.context
|
||||
func (font PdfFont) actualFont() fonts.Font {
|
||||
func (font PdfFont) actualFont() Font {
|
||||
if font.context == nil {
|
||||
common.Log.Debug("ERROR: actualFont. context is nil. font=%s", font)
|
||||
}
|
||||
|
@ -121,6 +121,15 @@ func (font pdfFontType0) GetGlyphCharMetrics(glyph string) (fonts.CharMetrics, b
|
||||
return font.DescendantFont.GetGlyphCharMetrics(glyph)
|
||||
}
|
||||
|
||||
// !@#$ stub
|
||||
func (font pdfFontType0) GetCharMetrics(code uint16) (fonts.CharMetrics, bool) {
|
||||
if font.DescendantFont == nil {
|
||||
common.Log.Debug("ERROR: No descendant. font=%s", font)
|
||||
return fonts.CharMetrics{}, false
|
||||
}
|
||||
return font.DescendantFont.GetCharMetrics(code)
|
||||
}
|
||||
|
||||
// GetAverageCharWidth returns the average width of all the characters in `font`.
|
||||
func (font pdfFontType0) GetAverageCharWidth() float64 {
|
||||
if font.DescendantFont == nil {
|
||||
@ -238,6 +247,11 @@ func (font pdfCIDFontType0) GetGlyphCharMetrics(glyph string) (fonts.CharMetrics
|
||||
return fonts.CharMetrics{}, true
|
||||
}
|
||||
|
||||
// !@#$ stub
|
||||
func (font pdfCIDFontType0) GetCharMetrics(code uint16) (fonts.CharMetrics, bool) {
|
||||
return fonts.CharMetrics{}, true
|
||||
}
|
||||
|
||||
// GetAverageCharWidth returns the average width of all the characters in `font`.
|
||||
func (font pdfCIDFontType0) GetAverageCharWidth() float64 {
|
||||
return 0.0
|
||||
@ -347,6 +361,12 @@ func (font pdfCIDFontType2) GetGlyphCharMetrics(glyph string) (fonts.CharMetrics
|
||||
return metrics, true
|
||||
}
|
||||
|
||||
// !@#$ stub
|
||||
func (font pdfCIDFontType2) GetCharMetrics(code uint16) (fonts.CharMetrics, bool) {
|
||||
metrics := fonts.CharMetrics{}
|
||||
return metrics, true
|
||||
}
|
||||
|
||||
// GetAverageCharWidth returns the average width of all the characters in `font`.
|
||||
func (font pdfCIDFontType2) GetAverageCharWidth() float64 {
|
||||
if len(font.runeToWidthMap) == 0 {
|
||||
|
@ -93,13 +93,20 @@ func (font pdfFontSimple) GetGlyphCharMetrics(glyph string) (fonts.CharMetrics,
|
||||
return metrics, ok
|
||||
}
|
||||
|
||||
metrics := fonts.CharMetrics{}
|
||||
|
||||
code, found := font.encoder.GlyphToCharcode(glyph)
|
||||
if !found {
|
||||
return metrics, false
|
||||
return fonts.CharMetrics{GlyphName: glyph}, false
|
||||
}
|
||||
// !@#$ Shouldn't we fall back from GetCharMetrics to GetGlyphCharMetrics?
|
||||
metrics, ok := font.GetCharMetrics(code)
|
||||
metrics.GlyphName = glyph
|
||||
return metrics, ok
|
||||
}
|
||||
|
||||
// GetCharMetrics returns the character metrics for the specified character code. A bool flag is
|
||||
// returned to indicate whether or not the entry was found in the glyph to charcode mapping.
|
||||
func (font pdfFontSimple) GetCharMetrics(code uint16) (fonts.CharMetrics, bool) {
|
||||
metrics := fonts.CharMetrics{}
|
||||
|
||||
if int(code) < font.firstChar {
|
||||
common.Log.Debug("Code lower than firstchar (%d < %d)", code, font.firstChar)
|
||||
|
Loading…
x
Reference in New Issue
Block a user