mirror of
https://github.com/unidoc/unipdf.git
synced 2025-05-13 19:29:10 +08:00
Merge branch 'development' of https://github.com/unidoc/unipdf into columns
This commit is contained in:
commit
29f2d9b8cf
@ -13,6 +13,7 @@ import (
|
||||
|
||||
"github.com/unidoc/unipdf/v3/common"
|
||||
"github.com/unidoc/unipdf/v3/contentstream"
|
||||
"github.com/unidoc/unipdf/v3/contentstream/draw"
|
||||
"github.com/unidoc/unipdf/v3/core"
|
||||
"github.com/unidoc/unipdf/v3/internal/textencoding"
|
||||
"github.com/unidoc/unipdf/v3/model"
|
||||
@ -175,12 +176,14 @@ func genFieldTextAppearance(wa *model.PdfAnnotationWidget, ftxt *model.PdfFieldT
|
||||
width := rect.Width()
|
||||
height := rect.Height()
|
||||
|
||||
var rotation float64
|
||||
if mkDict, has := core.GetDict(wa.MK); has {
|
||||
bsDict, _ := core.GetDict(wa.BS)
|
||||
err := style.applyAppearanceCharacteristics(mkDict, bsDict, nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
rotation, _ = core.GetNumberAsFloat(mkDict.Get("R"))
|
||||
}
|
||||
|
||||
// Get and process the default appearance string (DA) operands.
|
||||
@ -192,6 +195,7 @@ func genFieldTextAppearance(wa *model.PdfAnnotationWidget, ftxt *model.PdfFieldT
|
||||
}
|
||||
|
||||
cc := contentstream.NewContentCreator()
|
||||
|
||||
if style.BorderSize > 0 {
|
||||
drawRect(cc, style, width, height)
|
||||
}
|
||||
@ -205,6 +209,28 @@ func genFieldTextAppearance(wa *model.PdfAnnotationWidget, ftxt *model.PdfFieldT
|
||||
|
||||
cc.Add_BMC("Tx")
|
||||
cc.Add_q()
|
||||
|
||||
bboxWidth, bboxHeight := width, height
|
||||
if rotation != 0 {
|
||||
// Calculate bounding box before rotation.
|
||||
revRotation := -rotation
|
||||
bbox := draw.Path{Points: []draw.Point{
|
||||
draw.NewPoint(0, 0).Rotate(revRotation),
|
||||
draw.NewPoint(width, 0).Rotate(revRotation),
|
||||
draw.NewPoint(0, height).Rotate(revRotation),
|
||||
draw.NewPoint(width, height).Rotate(revRotation),
|
||||
}}.GetBoundingBox()
|
||||
|
||||
// Update width and height, as the appearance is generated based on
|
||||
// the bounding of the annotation with no rotation.
|
||||
width = bbox.Width
|
||||
height = bbox.Height
|
||||
|
||||
// Apply rotation.
|
||||
cc.RotateDeg(rotation)
|
||||
cc.Translate(bbox.X, bbox.Y)
|
||||
}
|
||||
|
||||
// Graphic state changes.
|
||||
cc.Add_BT()
|
||||
|
||||
@ -461,7 +487,7 @@ func genFieldTextAppearance(wa *model.PdfAnnotationWidget, ftxt *model.PdfFieldT
|
||||
|
||||
xform := model.NewXObjectForm()
|
||||
xform.Resources = resources
|
||||
xform.BBox = core.MakeArrayFromFloats([]float64{0, 0, width, height})
|
||||
xform.BBox = core.MakeArrayFromFloats([]float64{0, 0, bboxWidth, bboxHeight})
|
||||
xform.SetContentStream(cc.Bytes(), defStreamEncoder())
|
||||
|
||||
apDict := core.MakeDict()
|
||||
|
@ -62,3 +62,54 @@ bruce.pdf for char spacing save/restore.
|
||||
|
||||
challenging-modified.pdf
|
||||
transitions_test.pdf
|
||||
|
||||
|
||||
Code Restructure?
|
||||
-----------------
|
||||
```
|
||||
type textPara struct {
|
||||
serial int // Sequence number for debugging.
|
||||
model.PdfRectangle // Bounding box.
|
||||
w, h int
|
||||
cells []textCell
|
||||
}
|
||||
|
||||
type textCell struct {
|
||||
serial int // Sequence number for debugging.
|
||||
model.PdfRectangle // Bounding box.
|
||||
eBBox model.PdfRectangle // Extended bounding box needed to compute reading order.
|
||||
lines []*textLine // Paragraph text gets broken into lines.
|
||||
}
|
||||
```
|
||||
|
||||
x x x x x x
|
||||
x
|
||||
x x
|
||||
x
|
||||
x x x
|
||||
x
|
||||
x
|
||||
|
||||
1. Compute all row candidates
|
||||
alignedY No intervening paras
|
||||
2. Compute all column candidates
|
||||
alignedX No intervening paras
|
||||
|
||||
Table candidate
|
||||
1. Top row fully populated
|
||||
2. Left column fully populated
|
||||
3. All cells in table are aligned with 1 top row element and 1 left column candidate
|
||||
4. Minimum number of cells must be filled
|
||||
|
||||
Computation time
|
||||
1. Row candidates O(N)
|
||||
Sort top to bottom, left to right
|
||||
Search
|
||||
2. Column candidates O(N)
|
||||
Sort left to right, top to bottom
|
||||
Search
|
||||
3. Find intersections O(N^2)
|
||||
For each row
|
||||
Find columns that start at row -> table candidates
|
||||
Sort table candidates by w x h descending
|
||||
4. Test each candidate O(N^4)
|
||||
|
@ -22,8 +22,6 @@ import (
|
||||
"github.com/unidoc/unipdf/v3/model"
|
||||
)
|
||||
|
||||
const verbose = false
|
||||
|
||||
// maxFormStack is the maximum form stack recursion depth. It has to be low enough to avoid a stack
|
||||
// overflow and high enough to accommodate customers' PDFs.
|
||||
const maxFormStack = 10
|
||||
@ -49,7 +47,7 @@ func (e *Extractor) ExtractTextWithStats() (extracted string, numChars int, numM
|
||||
|
||||
// ExtractPageText returns the text contents of `e` (an Extractor for a page) as a PageText.
|
||||
func (e *Extractor) ExtractPageText() (*PageText, int, int, error) {
|
||||
pt, numChars, numMisses, err := e.extractPageText(e.contents, e.resources, 0)
|
||||
pt, numChars, numMisses, err := e.extractPageText(e.contents, e.resources, transform.IdentityMatrix(), 0)
|
||||
if err != nil {
|
||||
return nil, numChars, numMisses, err
|
||||
}
|
||||
@ -62,7 +60,8 @@ func (e *Extractor) ExtractPageText() (*PageText, int, int, error) {
|
||||
// extractPageText returns the text contents of content stream `contents` and resources `resources` as a
|
||||
// PageText.
|
||||
// This can be called on a page or a form XObject.
|
||||
func (e *Extractor) extractPageText(contents string, resources *model.PdfPageResources, level int) (
|
||||
func (e *Extractor) extractPageText(contents string, resources *model.PdfPageResources,
|
||||
parentCTM transform.Matrix, level int) (
|
||||
*PageText, int, int, error) {
|
||||
common.Log.Trace("extractPageText: level=%d", level)
|
||||
pageText := &PageText{pageSize: e.mediaBox}
|
||||
@ -97,7 +96,7 @@ func (e *Extractor) extractPageText(contents string, resources *model.PdfPageRes
|
||||
|
||||
operand := op.Operand
|
||||
|
||||
if verbose {
|
||||
if verboseGeom {
|
||||
common.Log.Info("&&& op=%s", op)
|
||||
}
|
||||
|
||||
@ -106,7 +105,7 @@ func (e *Extractor) extractPageText(contents string, resources *model.PdfPageRes
|
||||
savedStates.push(&state)
|
||||
// common.Log.Info("Save state: stack=%d\n %s", len(savedStates), state.String())
|
||||
case "Q":
|
||||
if verbose {
|
||||
if verboseGeom {
|
||||
common.Log.Info("Restore state: %s", savedStates.String())
|
||||
}
|
||||
if !savedStates.empty() {
|
||||
@ -129,7 +128,10 @@ func (e *Extractor) extractPageText(contents string, resources *model.PdfPageRes
|
||||
pageText.marks = append(pageText.marks, to.marks...)
|
||||
}
|
||||
inTextObj = true
|
||||
to = newTextObject(e, resources, gs, &state, &savedStates)
|
||||
graphicsState := gs
|
||||
graphicsState.CTM = parentCTM.Mult(graphicsState.CTM)
|
||||
to = newTextObject(e, resources, graphicsState, &state, &savedStates)
|
||||
|
||||
case "ET": // End Text
|
||||
// End text object, discarding text matrix. If the current
|
||||
// text object contains text marks, they are added to the
|
||||
@ -343,8 +345,9 @@ func (e *Extractor) extractPageText(contents string, resources *model.PdfPageRes
|
||||
if formResources == nil {
|
||||
formResources = resources
|
||||
}
|
||||
|
||||
tList, numChars, numMisses, err := e.extractPageText(string(formContent),
|
||||
formResources, level+1)
|
||||
formResources, parentCTM.Mult(gs.CTM), level+1)
|
||||
if err != nil {
|
||||
common.Log.Debug("ERROR: %v", err)
|
||||
return err
|
||||
@ -489,8 +492,8 @@ func (to *textObject) setCharSpacing(x float64) {
|
||||
return
|
||||
}
|
||||
to.state.tc = x
|
||||
if verbose {
|
||||
common.Log.Info("setCharSpacing: %.2f state=%s", to.state.String())
|
||||
if verboseGeom {
|
||||
common.Log.Info("setCharSpacing: %.2f state=%s", x, to.state.String())
|
||||
}
|
||||
}
|
||||
|
||||
@ -758,7 +761,7 @@ func (to *textObject) renderText(data []byte) error {
|
||||
}
|
||||
font := to.getCurrentFont()
|
||||
charcodes := font.BytesToCharcodes(data)
|
||||
runeSlices, numChars, numMisses := font.CharcodesToRuneSlices(charcodes)
|
||||
texts, numChars, numMisses := font.CharcodesToStrings(charcodes)
|
||||
if numMisses > 0 {
|
||||
common.Log.Debug("renderText: numChars=%d numMisses=%d", numChars, numMisses)
|
||||
}
|
||||
@ -777,17 +780,20 @@ func (to *textObject) renderText(data []byte) error {
|
||||
spaceMetrics, _ = model.DefaultFont().GetRuneMetrics(' ')
|
||||
}
|
||||
spaceWidth := spaceMetrics.Wx * glyphTextRatio
|
||||
common.Log.Trace("spaceWidth=%.2f text=%q font=%s fontSize=%.1f", spaceWidth, runeSlices, font, tfs)
|
||||
common.Log.Trace("spaceWidth=%.2f text=%q font=%s fontSize=%.2f", spaceWidth, texts, font, tfs)
|
||||
|
||||
stateMatrix := transform.NewMatrix(
|
||||
tfs*th, 0,
|
||||
0, tfs,
|
||||
0, state.trise)
|
||||
if verbose {
|
||||
common.Log.Info("renderText: %d codes=%+v runes=%q", len(charcodes), charcodes, runeSlices)
|
||||
if verboseGeom {
|
||||
common.Log.Info("renderText: %d codes=%+v texts=%q", len(charcodes), charcodes, texts)
|
||||
}
|
||||
|
||||
for i, r := range runeSlices {
|
||||
common.Log.Trace("renderText: %d codes=%+v runes=%q", len(charcodes), charcodes, len(texts))
|
||||
|
||||
for i, text := range texts {
|
||||
r := []rune(text)
|
||||
if len(r) == 1 && r[0] == '\x00' {
|
||||
continue
|
||||
}
|
||||
@ -819,7 +825,7 @@ func (to *textObject) renderText(data []byte) error {
|
||||
// t is the displacement of the text cursor when the character is rendered.
|
||||
t0 := transform.Point{X: (c.X*tfs + w) * th}
|
||||
t := transform.Point{X: (c.X*tfs + state.tc + w) * th}
|
||||
if verbose {
|
||||
if verboseGeom {
|
||||
common.Log.Info("tfs=%.2f tc=%.2f tw=%.2f th=%.2f", tfs, state.tc, state.tw, th)
|
||||
common.Log.Info("dx,dy=%.3f t0=%.2f t=%.2f", c, t0, t)
|
||||
}
|
||||
@ -830,7 +836,7 @@ func (to *textObject) renderText(data []byte) error {
|
||||
td := translationMatrix(t)
|
||||
end := to.gs.CTM.Mult(to.tm).Mult(td0)
|
||||
|
||||
if verbose {
|
||||
if verboseGeom {
|
||||
common.Log.Info("end:\n\tCTM=%s\n\t tm=%s\n"+
|
||||
"\t td=%s xlat=%s\n"+
|
||||
"\ttd0=%s\n\t → %s xlat=%s",
|
||||
@ -865,7 +871,7 @@ func (to *textObject) renderText(data []byte) error {
|
||||
|
||||
// update the text matrix by the displacement of the text location.
|
||||
to.tm.Concat(td)
|
||||
if i != len(runeSlices)-1 {
|
||||
if i != len(texts)-1 {
|
||||
to.logCursor()
|
||||
}
|
||||
}
|
||||
@ -908,10 +914,11 @@ func isTextSpace(text string) bool {
|
||||
|
||||
// PageText represents the layout of text on a device page.
|
||||
type PageText struct {
|
||||
marks []*textMark // Texts and their positions on a PDF page.
|
||||
viewText string // Extracted page text.
|
||||
viewMarks []TextMark // Public view of `marks`.
|
||||
pageSize model.PdfRectangle
|
||||
marks []*textMark // Texts and their positions on a PDF page.
|
||||
viewText string // Extracted page text.
|
||||
viewMarks []TextMark // Public view of text marks`.
|
||||
viewTables []TextTable // Public view of text table`.
|
||||
pageSize model.PdfRectangle // Page size. Used to calculate depth.
|
||||
}
|
||||
|
||||
// String returns a string describing `pt`.
|
||||
@ -942,6 +949,11 @@ func (pt PageText) Marks() *TextMarkArray {
|
||||
return &TextMarkArray{marks: pt.viewMarks}
|
||||
}
|
||||
|
||||
// Tables returns the tables extracted from the page.
// It returns the `pt.viewTables` slice as stored (no copy is made); that field is populated by
// computeViews.
|
||||
func (pt PageText) Tables() []TextTable {
|
||||
return pt.viewTables
|
||||
}
|
||||
|
||||
// computeViews processes the page TextMarks sorting by position and populates `pt.viewText` and
|
||||
// `pt.viewMarks` which represent the text and marks in the order which it is read on the page.
|
||||
// The comments above the TextMark definition describe how to use the []TextMark to
|
||||
@ -953,6 +965,7 @@ func (pt *PageText) computeViews() {
|
||||
paras.writeText(b)
|
||||
pt.viewText = b.String()
|
||||
pt.viewMarks = paras.toTextMarks()
|
||||
pt.viewTables = paras.toTables()
|
||||
}
|
||||
|
||||
// TextMarkArray is a collection of TextMarks.
|
||||
@ -1119,6 +1132,13 @@ var spaceMark = TextMark{
|
||||
Meta: true,
|
||||
}
|
||||
|
||||
// TextTable represents a table extracted from a page.
|
||||
// Cells are ordered top-to-bottom, left-to-right.
|
||||
type TextTable struct {
|
||||
// NOTE(review): presumably W is the number of columns and H the number of rows — confirm.
W, H int
|
||||
// Cell texts. NOTE(review): the outer/inner index order (row-major vs column-major) is not
// visible here — confirm against toTextTable.
Cells [][]string
|
||||
}
|
||||
|
||||
// getCurrentFont returns the font on top of the font stack, or DefaultFont if the font stack is
|
||||
// empty.
|
||||
func (to *textObject) getCurrentFont() *model.PdfFont {
|
||||
|
@ -19,11 +19,11 @@ import (
|
||||
var serial serialState
|
||||
|
||||
type serialState struct {
|
||||
mark int
|
||||
word int
|
||||
bins int
|
||||
line int
|
||||
para int
|
||||
mark int
|
||||
word int
|
||||
strata int
|
||||
line int
|
||||
para int
|
||||
}
|
||||
|
||||
func (serial *serialState) reset() {
|
||||
@ -65,15 +65,25 @@ func diffReading(a, b bounded) float64 {
|
||||
return a.bbox().Llx - b.bbox().Llx
|
||||
}
|
||||
|
||||
// func boundedUnion(objs ...bounded) model.PdfRectangle {
|
||||
// rect := objs[0].bbox()
|
||||
// for _, r := range objs[1:] {
|
||||
// rect = rectUnion(rect, r.bbox())
|
||||
// }
|
||||
// return rect
|
||||
// }
|
||||
// boundedUnion combines the bounding boxes of all `objs` into one rectangle by folding them
// together with rectUnion, starting from the bounding box of objs[0].
// `objs` must contain at least one element: objs[0] is read unconditionally, so calling this
// with no arguments panics.
func boundedUnion(objs ...bounded) model.PdfRectangle {
|
||||
rect := objs[0].bbox()
|
||||
for _, r := range objs[1:] {
|
||||
rect = rectUnion(rect, r.bbox())
|
||||
}
|
||||
return rect
|
||||
}
|
||||
|
||||
// diffDepth returns `a` - `b` in the depth direction..
|
||||
// rectContainsBounded returns true if rectangle `a` contains the bounding box of `b`.
// It delegates to rectContainsRect using b.bbox().
|
||||
func rectContainsBounded(a model.PdfRectangle, b bounded) bool {
|
||||
return rectContainsRect(a, b.bbox())
|
||||
}
|
||||
|
||||
// rectContainsRect returns true if `a` contains `b`.
// The test is inclusive: `b` is still contained when its edges coincide with the edges of `a`.
|
||||
func rectContainsRect(a, b model.PdfRectangle) bool {
|
||||
return a.Llx <= b.Llx && b.Urx <= a.Urx && a.Lly <= b.Lly && b.Ury <= a.Ury
|
||||
}
|
||||
|
||||
// diffDepth returns `a` - `b` in the depth direction, computed as bboxDepth(a) - bboxDepth(b).
|
||||
func diffDepth(a, b bounded) float64 {
|
||||
return bboxDepth(a) - bboxDepth(b)
|
||||
}
|
||||
@ -151,3 +161,19 @@ func overlappedXRect(r0, r1 model.PdfRectangle) bool {
|
||||
// overlappedYRect returns true if the lower or the upper Y edge of `r1` lies within the closed
// Y interval [r0.Lly, r0.Ury] of `r0`.
// NOTE(review): when `r1` strictly encloses `r0` in Y (r1.Lly < r0.Lly and r1.Ury > r0.Ury)
// neither edge of `r1` falls inside `r0` and this returns false — confirm callers account for
// the asymmetry.
func overlappedYRect(r0, r1 model.PdfRectangle) bool {
|
||||
return (r0.Lly <= r1.Lly && r1.Lly <= r0.Ury) || (r0.Lly <= r1.Ury && r1.Ury <= r0.Ury)
|
||||
}
|
||||
|
||||
// minInt returns the lesser of `a` and `b`.
// If the two values are equal, that value is returned.
func minInt(a, b int) int {
	if a < b {
		return a
	}
	return b
}
|
||||
|
||||
// maxInt returns the greater of `a` and `b`.
// If the two values are equal, that value is returned.
func maxInt(a, b int) int {
	if a > b {
		return a
	}
	return b
}
|
||||
|
@ -5,8 +5,24 @@
|
||||
|
||||
package extractor
|
||||
|
||||
// The following constants configure debugging. All of them are disabled (false) here.
// The notes below describe only the uses visible in this change; other uses may exist.
|
||||
const (
|
||||
// verbose guards the informational logging in computeEBBoxes.
verbose = false
|
||||
// verboseGeom guards logging of operators, graphics state and text geometry in
// extractPageText, setCharSpacing, renderText and newTextMark.
verboseGeom = false
|
||||
// verbosePage guards page-level logging in dividePage, adjMatrix, printAdj and topoOrder.
verbosePage = false
|
||||
// NOTE(review): no use of verbosePara is visible here; presumably paragraph-level logging.
verbosePara = false
|
||||
// NOTE(review): no use of verboseTable is visible here; presumably table-extraction logging.
verboseTable = false
|
||||
)
|
||||
|
||||
// The following constants control the approaches used in the code.
|
||||
const (
|
||||
// useTables: when true, makeTextPage runs paras.extractTables() on the page's paragraphs.
useTables = true
|
||||
// NOTE(review): presumably enables removal of line-ending hyphens when text is written —
// confirm against the code that reads it.
doHyphens = true
|
||||
// useEBBox: when true, computeEBBoxes overwrites each para's PdfRectangle with its eBBox.
useEBBox = false
|
||||
)
|
||||
|
||||
// The following constants are the tuning parameters for text extraction.
|
||||
const (
|
||||
// Size of depth bins in points
|
||||
depthBinPoints = 6
|
||||
|
||||
|
@ -20,10 +20,12 @@ type textLine struct {
|
||||
model.PdfRectangle // Bounding box (union of `marks` bounding boxes).
|
||||
depth float64 // Distance from bottom of line to top of page.
|
||||
words []*textWord // Words in this line.
|
||||
fontsize float64
|
||||
hyphenated bool
|
||||
fontsize float64 // Largest word font size.
|
||||
hyphenated bool // Does line have at least minHyphenation runes and end in a hyphen.
|
||||
}
|
||||
|
||||
// minHyphenation is the minimum number of runes a text must contain for isHyphenated to treat
// it as hyphenated.
const minHyphenation = 4
|
||||
|
||||
// newTextLine creates a line with font and bbox size of `w`, removes `w` from p.bins[bestWordDepthIdx] and adds it to the line
|
||||
func newTextLine(p *textStrata, depthIdx int) *textLine {
|
||||
words := p.getStratum(depthIdx)
|
||||
@ -60,31 +62,22 @@ func (l *textLine) text() string {
|
||||
}
|
||||
}
|
||||
return strings.Join(words, "")
|
||||
|
||||
}
|
||||
|
||||
// toTextMarks returns the TextMarks contained in `l`.text().
|
||||
// `offset` is used to give the TextMarks the correct Offset values.
|
||||
func (l *textLine) toTextMarks(offset *int) []TextMark {
|
||||
var marks []TextMark
|
||||
addMark := func(mark TextMark) {
|
||||
mark.Offset = *offset
|
||||
marks = append(marks, mark)
|
||||
*offset += len(mark.Text)
|
||||
}
|
||||
addSpaceMark := func(spaceChar string) {
|
||||
mark := spaceMark
|
||||
mark.Text = spaceChar
|
||||
addMark(mark)
|
||||
}
|
||||
for _, word := range l.words {
|
||||
for _, tm := range word.marks {
|
||||
addMark(tm.ToTextMark())
|
||||
}
|
||||
wordMarks := word.toTextMarks(offset)
|
||||
marks = append(marks, wordMarks...)
|
||||
if word.spaceAfter {
|
||||
addSpaceMark(" ")
|
||||
marks = appendSpaceMark(marks, offset, " ")
|
||||
}
|
||||
}
|
||||
if len(l.text()) > 0 && len(marks) == 0 {
|
||||
panic(l.text())
|
||||
}
|
||||
return marks
|
||||
}
|
||||
|
||||
@ -130,16 +123,13 @@ func (l *textLine) mergeWordFragments() {
|
||||
}
|
||||
|
||||
// check for hyphen at end of line
|
||||
runes := []rune(l.text())
|
||||
l.hyphenated = len(runes) >= 4 &&
|
||||
l.hyphenated = isHyphenated(l.text())
|
||||
}
|
||||
|
||||
// isHyphenated returns true if `text` is a hyphenated word, i.e. it has at least minHyphenation
// runes, its last rune is in unicode.Hyphen and the rune before that is not whitespace.
|
||||
func isHyphenated(text string) bool {
|
||||
// Work on runes, not bytes, so multi-byte hyphen characters are tested as single characters.
runes := []rune(text)
|
||||
// The length test comes first, so the two index expressions below are never evaluated on a
// text shorter than minHyphenation runes.
return len(runes) >= minHyphenation &&
|
||||
unicode.Is(unicode.Hyphen, runes[len(runes)-1]) &&
|
||||
!unicode.IsSpace(runes[len(runes)-2])
|
||||
// if l.hyphenated {
|
||||
// // fmt.Fprintf(os.Stderr, "\n%q ", l.text())
|
||||
// common.Log.Info("### %d %q\n\t%q:%t\n\t%q:%t",
|
||||
// len(runes), l.text(),
|
||||
// runes[len(runes)-1], unicode.Is(unicode.Hyphen, runes[len(runes)-1]),
|
||||
// runes[len(runes)-2], !unicode.IsSpace(runes[len(runes)-2]),
|
||||
// )
|
||||
// }
|
||||
}
|
||||
|
@ -21,11 +21,6 @@ type textMark struct {
|
||||
model.PdfRectangle // Bounding box.
|
||||
text string // The text (decoded via ToUnicode).
|
||||
original string // Original text (decoded).
|
||||
orient int // The text orientation in degrees. This is the current TRM rounded to 10°.
|
||||
orientedStart transform.Point // Left of text in orientation where text is horizontal.
|
||||
orientedEnd transform.Point // Right of text in orientation where text is horizontal.
|
||||
height float64 // Text height.
|
||||
spaceWidth float64 // Best guess at the width of a space in the font the text was rendered with.
|
||||
font *model.PdfFont // The font the mark was drawn with.
|
||||
fontsize float64 // The font size the mark was drawn with.
|
||||
charspacing float64 // TODO (peterwilliams97: Should this be exposed in TextMark?
|
||||
@ -74,25 +69,20 @@ func (to *textObject) newTextMark(text string, trm transform.Matrix, end transfo
|
||||
bbox = clipped
|
||||
|
||||
tm := textMark{
|
||||
text: text,
|
||||
orient: orient,
|
||||
PdfRectangle: bbox,
|
||||
orientedStart: start.Rotate(theta),
|
||||
orientedEnd: end.Rotate(theta),
|
||||
height: math.Abs(height),
|
||||
spaceWidth: spaceWidth,
|
||||
font: font,
|
||||
fontsize: height,
|
||||
charspacing: charspacing,
|
||||
trm: trm,
|
||||
end: end,
|
||||
serial: serial.mark,
|
||||
text: text,
|
||||
PdfRectangle: bbox,
|
||||
font: font,
|
||||
fontsize: height,
|
||||
charspacing: charspacing,
|
||||
trm: trm,
|
||||
end: end,
|
||||
serial: serial.mark,
|
||||
}
|
||||
serial.mark++
|
||||
if !isTextSpace(tm.text) && tm.Width() == 0.0 {
|
||||
common.Log.Debug("ERROR: Zero width text. tm=%s", tm.String())
|
||||
}
|
||||
if verbose {
|
||||
if verboseGeom {
|
||||
common.Log.Info("newTextMark: start=%.2f end=%.2f %s", start, end, tm.String())
|
||||
}
|
||||
|
||||
@ -110,11 +100,6 @@ func (tm *textMark) bbox() model.PdfRectangle {
|
||||
return tm.PdfRectangle
|
||||
}
|
||||
|
||||
// Width returns the width of `tm`.text in the text direction.
|
||||
func (tm *textMark) Width() float64 {
|
||||
return math.Abs(tm.orientedStart.X - tm.orientedEnd.X)
|
||||
}
|
||||
|
||||
// ToTextMark returns the public view of `tm`.
|
||||
func (tm *textMark) ToTextMark() TextMark {
|
||||
return TextMark{
|
||||
@ -127,6 +112,23 @@ func (tm *textMark) ToTextMark() TextMark {
|
||||
}
|
||||
}
|
||||
|
||||
// appendTextMark sets mark.Offset to the current value of `*offset`, appends `mark` to `marks`
|
||||
// and advances `*offset` by len(mark.Text) (a byte count). It returns the extended slice.
|
||||
func appendTextMark(marks []TextMark, offset *int, mark TextMark) []TextMark {
|
||||
// `mark` is passed by value, so only the appended copy gets this Offset.
mark.Offset = *offset
|
||||
marks = append(marks, mark)
|
||||
*offset += len(mark.Text)
|
||||
return marks
|
||||
}
|
||||
|
||||
// appendSpaceMark appends a copy of spaceMark whose Text is `spaceChar` to `marks` and updates
|
||||
// `*offset` via appendTextMark. It returns the extended slice.
|
||||
func appendSpaceMark(marks []TextMark, offset *int, spaceChar string) []TextMark {
|
||||
// Copy the package-level template so that setting Text does not modify spaceMark itself.
mark := spaceMark
|
||||
mark.Text = spaceChar
|
||||
return appendTextMark(marks, offset, mark)
|
||||
}
|
||||
|
||||
// nearestMultiple returns the integer multiple of `m` that is closest to `x`.
|
||||
func nearestMultiple(x float64, m int) int {
|
||||
if m == 0 {
|
||||
|
@ -9,16 +9,12 @@ import (
|
||||
"fmt"
|
||||
"io"
|
||||
"math"
|
||||
"unicode"
|
||||
"sort"
|
||||
|
||||
"github.com/unidoc/unipdf/v3/common"
|
||||
"github.com/unidoc/unipdf/v3/model"
|
||||
)
|
||||
|
||||
// paraList is a sequence of textPara. We use it so often that it is convenient to have its own
|
||||
// type so we can have methods on it.
|
||||
type paraList []*textPara
|
||||
|
||||
// makeTextPage builds a paraList from `marks`, the textMarks on a page.
|
||||
func makeTextPage(marks []*textMark, pageSize model.PdfRectangle, rot int) paraList {
|
||||
common.Log.Trace("makeTextPage: %d elements pageSize=%.2f", len(marks), pageSize)
|
||||
@ -35,28 +31,21 @@ func makeTextPage(marks []*textMark, pageSize model.PdfRectangle, rot int) paraL
|
||||
for i, para := range paraStratas {
|
||||
paras[i] = composePara(para)
|
||||
}
|
||||
if verbose || true {
|
||||
common.Log.Info("unsorted=========----------=====")
|
||||
for i, para := range paras {
|
||||
common.Log.Info("paras[%d]=%.2f%q", i, para.PdfRectangle, truncate(paras[i].text(), 200))
|
||||
}
|
||||
}
|
||||
|
||||
paras.log("unsorted")
|
||||
// paras.computeEBBoxes()
|
||||
|
||||
if useTables {
|
||||
paras = paras.extractTables()
|
||||
}
|
||||
// paras.log("tables extracted")
|
||||
paras.computeEBBoxes()
|
||||
paras = paras.extractTables()
|
||||
paras.log("EBBoxes 2")
|
||||
|
||||
// Sort the paras into reading order.
|
||||
paras.sortReadingOrder()
|
||||
if verbose || true {
|
||||
common.Log.Info("para sorted in reading order -----------=========")
|
||||
for i, para := range paras {
|
||||
tab := ""
|
||||
if para.table != nil {
|
||||
tab = fmt.Sprintf("[%dx%d]", para.table.w, para.table.h)
|
||||
}
|
||||
fmt.Printf("%4d: %6.2f %s %q\n", i, para.PdfRectangle, tab, truncate(para.text(), 50))
|
||||
}
|
||||
}
|
||||
paras.log("sorted in reading order")
|
||||
|
||||
return paras
|
||||
}
|
||||
|
||||
@ -72,7 +61,7 @@ func dividePage(page *textStrata, pageHeight float64) []*textStrata {
|
||||
// Some bins are emptied before they are iterated to (see "surviving bin" above).
|
||||
// If a `page` survives until it is iterated to then at least one `para` will be built around it.
|
||||
|
||||
if verbose {
|
||||
if verbosePage {
|
||||
common.Log.Info("dividePage")
|
||||
}
|
||||
cnt := 0
|
||||
@ -89,7 +78,7 @@ func dividePage(page *textStrata, pageHeight float64) []*textStrata {
|
||||
firstReadingIdx := page.firstReadingIndex(depthIdx)
|
||||
words := page.getStratum(firstReadingIdx)
|
||||
moveWord(firstReadingIdx, page, para, words[0])
|
||||
if verbose {
|
||||
if verbosePage {
|
||||
common.Log.Info("words[0]=%s", words[0].String())
|
||||
}
|
||||
|
||||
@ -105,7 +94,7 @@ func dividePage(page *textStrata, pageHeight float64) []*textStrata {
|
||||
|
||||
// Add words that are within maxIntraDepthGap of `para` in the depth direction.
|
||||
// i.e. Stretch para in the depth direction, vertically for English text.
|
||||
if verbose {
|
||||
if verbosePage {
|
||||
common.Log.Info("para depth %.2f - %.2f maxIntraDepthGap=%.2f ",
|
||||
para.minDepth(), para.maxDepth(), maxIntraDepthGap)
|
||||
}
|
||||
@ -159,6 +148,9 @@ func dividePage(page *textStrata, pageHeight float64) []*textStrata {
|
||||
|
||||
// Sort the words in `para`'s bins in the reading direction.
|
||||
para.sort()
|
||||
if verbosePage {
|
||||
common.Log.Info("para=%s", para.String())
|
||||
}
|
||||
paraStratas = append(paraStratas, para)
|
||||
}
|
||||
}
|
||||
@ -166,40 +158,11 @@ func dividePage(page *textStrata, pageHeight float64) []*textStrata {
|
||||
return paraStratas
|
||||
}
|
||||
|
||||
const doHyphens = true
|
||||
const useTables = true
|
||||
|
||||
// writeText writes the text in `paras` to `w`.
|
||||
func (paras paraList) writeText(w io.Writer) {
|
||||
for ip, para := range paras {
|
||||
if useTables {
|
||||
para.writeText(w)
|
||||
} else {
|
||||
for il, line := range para.lines {
|
||||
s := line.text()
|
||||
reduced := false
|
||||
if doHyphens {
|
||||
if line.hyphenated && (il != len(para.lines)-1 || ip != len(paras)-1) {
|
||||
// Line ending with hyphen. Remove it.
|
||||
runes := []rune(s)
|
||||
s = string(runes[:len(runes)-1])
|
||||
reduced = true
|
||||
}
|
||||
}
|
||||
w.Write([]byte(s))
|
||||
if reduced {
|
||||
// We removed the hyphen from the end of the line so we don't need a line ending.
|
||||
continue
|
||||
}
|
||||
if il < len(para.lines)-1 && isZero(line.depth-para.lines[il+1].depth) {
|
||||
// Next line is the same depth so it's the same line as this one in the extracted text
|
||||
w.Write([]byte(" "))
|
||||
continue
|
||||
}
|
||||
w.Write([]byte("\n"))
|
||||
}
|
||||
w.Write([]byte("\n"))
|
||||
}
|
||||
for _, para := range paras {
|
||||
para.writeText(w)
|
||||
w.Write([]byte("\n"))
|
||||
}
|
||||
}
|
||||
|
||||
@ -208,69 +171,35 @@ func (paras paraList) writeText(w io.Writer) {
|
||||
func (paras paraList) toTextMarks() []TextMark {
|
||||
offset := 0
|
||||
var marks []TextMark
|
||||
addMark := func(mark TextMark) {
|
||||
mark.Offset = offset
|
||||
marks = append(marks, mark)
|
||||
offset += len(mark.Text)
|
||||
}
|
||||
addSpaceMark := func(spaceChar string) {
|
||||
mark := spaceMark
|
||||
mark.Text = spaceChar
|
||||
addMark(mark)
|
||||
}
|
||||
for ip, para := range paras {
|
||||
if useTables {
|
||||
paraMarks := para.toTextMarks(&offset)
|
||||
marks = append(marks, paraMarks...)
|
||||
} else {
|
||||
for il, line := range para.lines {
|
||||
lineMarks := line.toTextMarks(&offset)
|
||||
marks = append(marks, lineMarks...)
|
||||
reduced := false
|
||||
if doHyphens {
|
||||
if line.hyphenated && (il != len(para.lines)-1 || ip != len(paras)-1) {
|
||||
tm := marks[len(marks)-1]
|
||||
r := []rune(tm.Text)
|
||||
if unicode.IsSpace(r[len(r)-1]) {
|
||||
panic(tm)
|
||||
}
|
||||
if len(r) == 1 {
|
||||
marks = marks[:len(marks)-1]
|
||||
offset = marks[len(marks)-1].Offset + len(marks[len(marks)-1].Text)
|
||||
} else {
|
||||
s := string(r[:len(r)-1])
|
||||
offset += len(s) - len(tm.Text)
|
||||
tm.Text = s
|
||||
}
|
||||
reduced = true
|
||||
}
|
||||
}
|
||||
if reduced {
|
||||
continue
|
||||
}
|
||||
if il != len(para.lines)-1 && isZero(line.depth-para.lines[il+1].depth) {
|
||||
// Next line is the same depth so it's the same line as this one in the extracted text
|
||||
addSpaceMark(" ")
|
||||
continue
|
||||
}
|
||||
addSpaceMark("\n")
|
||||
}
|
||||
if ip != len(paras)-1 {
|
||||
addSpaceMark("\n")
|
||||
}
|
||||
}
|
||||
for _, para := range paras {
|
||||
paraMarks := para.toTextMarks(&offset)
|
||||
marks = append(marks, paraMarks...)
|
||||
marks = appendSpaceMark(marks, &offset, "\n")
|
||||
}
|
||||
return marks
|
||||
}
|
||||
|
||||
// toTables returns the public TextTable view of every para in `paras` that has a table.
// Paras whose `table` field is nil are skipped; the result is nil if no para has a table.
func (paras paraList) toTables() []TextTable {
|
||||
var tables []TextTable
|
||||
for _, para := range paras {
|
||||
if para.table != nil {
|
||||
tables = append(tables, para.table.toTextTable())
|
||||
}
|
||||
}
|
||||
return tables
|
||||
}
|
||||
|
||||
// sortReadingOrder sorts `paras` in reading order.
// It first sorts the paras with diffDepthReading, then builds an adjacency matrix with
// adjMatrix, orders it with topoOrder and applies that order with reorder.
|
||||
func (paras paraList) sortReadingOrder() {
|
||||
common.Log.Debug("sortReadingOrder: paras=%d ===========x=============", len(paras))
|
||||
// Zero or one para is already in order.
if len(paras) <= 1 {
|
||||
return
|
||||
}
|
||||
// NOTE(review): sort.Slice expects a strict "less" function; `<= 0` also reports true for
// elements that compare equal — confirm this is intended.
sort.Slice(paras, func(i, j int) bool { return diffDepthReading(paras[i], paras[j]) <= 0 })
|
||||
paras.log("diffReadingDepth")
|
||||
adj := paras.adjMatrix()
|
||||
order := topoOrder(adj)
|
||||
printAdj(adj)
|
||||
paras.reorder(order)
|
||||
}
|
||||
|
||||
@ -290,22 +219,23 @@ func (paras paraList) adjMatrix() [][]bool {
|
||||
adj[i][j], reasons[i][j] = paras.before(i, j)
|
||||
}
|
||||
}
|
||||
if verbose && false {
|
||||
if verbosePage {
|
||||
show := func(a *textPara) string {
|
||||
return fmt.Sprintf("%6.2f %q", a.eBBox, truncate(a.text(), 70))
|
||||
}
|
||||
common.Log.Info("adjMatrix =======")
|
||||
for i := 0; i < n; i++ {
|
||||
a := paras[i]
|
||||
fmt.Printf("%4d: %q %.2f\n", i, truncate(a.text(), 50), a.PdfRectangle)
|
||||
fmt.Printf("%4d: %s\n", i, show(a))
|
||||
for j := 0; j < n; j++ {
|
||||
if i == j {
|
||||
continue
|
||||
}
|
||||
if !adj[i][j] {
|
||||
if !adj[i][j] && i != 16 {
|
||||
continue
|
||||
}
|
||||
b := paras[j]
|
||||
fmt.Printf("%8d: %10s %q %.2f\n", j,
|
||||
reasons[i][j], truncate(b.text(), 40), b.PdfRectangle)
|
||||
|
||||
fmt.Printf("%8d: %t %10s %s\n", j, adj[i][j], reasons[i][j], show(b))
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -344,7 +274,7 @@ func (paras paraList) before(i, j int) (bool, string) {
|
||||
continue
|
||||
}
|
||||
if overlappedXPara(a, c) && overlappedXPara(c, b) {
|
||||
return false, "Y intervening"
|
||||
return false, fmt.Sprintf("Y intervening: %d: %s", k, c)
|
||||
}
|
||||
}
|
||||
return true, "TO LEFT"
|
||||
@ -358,13 +288,21 @@ func overlappedXPara(r0, r1 *textPara) bool {
|
||||
|
||||
// computeEBBoxes computes the eBBox fields in the elements of `paras`.
|
||||
func (paras paraList) computeEBBoxes() {
|
||||
common.Log.Trace("computeEBBoxes:")
|
||||
if verbose {
|
||||
common.Log.Info("computeEBBoxes:")
|
||||
}
|
||||
|
||||
for i, a := range paras {
|
||||
// [llx, urx] is the reading direction interval for which no paras overlap `a`
|
||||
for _, para := range paras {
|
||||
para.eBBox = para.PdfRectangle
|
||||
}
|
||||
|
||||
for i, aa := range paras {
|
||||
a := aa.eBBox
|
||||
// [llx, urx] is the reading direction interval for which no paras overlap `a`.
|
||||
llx := -1.0e9
|
||||
urx := +1.0e9
|
||||
for j, b := range paras {
|
||||
for j, bb := range paras {
|
||||
b := bb.eBBox
|
||||
if i == j || !(a.Lly <= b.Ury && b.Lly <= a.Ury) {
|
||||
continue
|
||||
}
|
||||
@ -385,27 +323,65 @@ func (paras paraList) computeEBBoxes() {
|
||||
|
||||
// Go through all paras below `a` within interval [llx, urx] in the reading direction and
|
||||
// expand `a` as far as possible to left and right without overlapping any of them.
|
||||
a.eBBox = a.PdfRectangle
|
||||
for j, b := range paras {
|
||||
|
||||
for j, bb := range paras {
|
||||
b := bb.eBBox
|
||||
if i == j || b.Ury > a.Lly {
|
||||
continue
|
||||
}
|
||||
|
||||
// If `b` is completely to right of `llx`, extend `a` left to `b`.
|
||||
if llx <= b.Llx {
|
||||
a.eBBox.Llx = math.Min(a.eBBox.Llx, b.Llx)
|
||||
a.Llx = math.Min(a.Llx, b.Llx)
|
||||
}
|
||||
|
||||
// If `b` is completely to left of `urx`, extend `a` right to `b`.
|
||||
if b.Urx <= urx {
|
||||
a.eBBox.Urx = math.Max(a.eBBox.Urx, b.Urx)
|
||||
a.Urx = math.Max(a.Urx, b.Urx)
|
||||
}
|
||||
}
|
||||
if verbose {
|
||||
fmt.Printf("%4d: %6.2f->%6.2f %q\n", i, aa.eBBox, a, truncate(aa.text(), 50))
|
||||
}
|
||||
aa.eBBox = a
|
||||
}
|
||||
if useEBBox {
|
||||
for _, para := range paras {
|
||||
para.PdfRectangle = para.eBBox
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// printAdj prints `adj` to stdout.
|
||||
func printAdj(adj [][]bool) {
|
||||
if !verbosePage {
|
||||
return
|
||||
}
|
||||
common.Log.Info("printAdj:")
|
||||
n := len(adj)
|
||||
fmt.Printf("%3s:", "")
|
||||
for x := 0; x < n; x++ {
|
||||
fmt.Printf("%3d", x)
|
||||
}
|
||||
fmt.Println()
|
||||
for y := 0; y < n; y++ {
|
||||
fmt.Printf("%3d:", y)
|
||||
for x := 0; x < n; x++ {
|
||||
s := ""
|
||||
if adj[y][x] {
|
||||
s = "X"
|
||||
}
|
||||
fmt.Printf("%3s", s)
|
||||
}
|
||||
fmt.Println()
|
||||
}
|
||||
}
|
||||
|
||||
// topoOrder returns the ordering of the topological sort of the nodes with adjacency matrix `adj`.
|
||||
func topoOrder(adj [][]bool) []int {
|
||||
if verbosePage {
|
||||
common.Log.Info("topoOrder:")
|
||||
}
|
||||
n := len(adj)
|
||||
visited := make([]bool, n)
|
||||
var order []int
|
||||
@ -427,11 +403,16 @@ func topoOrder(adj [][]bool) []int {
|
||||
sortNode(idx)
|
||||
}
|
||||
}
|
||||
// Order is currently reversed so change it to forward order.
|
||||
for i := 0; i < n/2; i++ {
|
||||
order[i], order[n-1-i] = order[n-1-i], order[i]
|
||||
return reversed(order)
|
||||
}
|
||||
|
||||
// reversed returns a new slice holding the elements of `order` in reverse order.
// `order` itself is left unmodified.
func reversed(order []int) []int {
	rev := make([]int, len(order))
	for i, v := range order {
		rev[len(order)-1-i] = v
	}
	return rev
}
|
||||
|
||||
// reorder reorders `para` to the order in `order`.
|
||||
|
@ -12,9 +12,14 @@ import (
|
||||
"sort"
|
||||
"unicode"
|
||||
|
||||
"github.com/unidoc/unipdf/v3/common"
|
||||
"github.com/unidoc/unipdf/v3/model"
|
||||
)
|
||||
|
||||
// paraList is a sequence of textPara. We use it so often that it is convenient to have its own
|
||||
// type so we can have methods on it.
|
||||
type paraList []*textPara
|
||||
|
||||
// textPara is a group of words in a rectangular region of a page that get read together.
|
||||
// A paragraph in a document might span multiple pages. This is the paragraph fragment on one page.
|
||||
// We start by finding paragraph regions on a page, then we break the words into the textPara into
|
||||
@ -22,7 +27,7 @@ import (
|
||||
type textPara struct {
|
||||
serial int // Sequence number for debugging.
|
||||
model.PdfRectangle // Bounding box.
|
||||
eBBox model.PdfRectangle // Extented ounding box needed to compute reading order.
|
||||
eBBox model.PdfRectangle // Extended bounding box needed to compute reading order.
|
||||
lines []*textLine // Paragraph text gets broken into lines.
|
||||
table *textTable
|
||||
}
|
||||
@ -39,8 +44,8 @@ func newTextPara(strata *textStrata) *textPara {
|
||||
|
||||
// String returns a description of `p`.
|
||||
func (p *textPara) String() string {
|
||||
return fmt.Sprintf("serial=%d %.2f %d lines\n%s\n-------------",
|
||||
p.serial, p.PdfRectangle, len(p.lines), p.text())
|
||||
return fmt.Sprintf("serial=%d %.2f %d lines %q",
|
||||
p.serial, p.PdfRectangle, len(p.lines), truncate(p.text(), 50))
|
||||
}
|
||||
|
||||
// text returns the text of the lines in `p`.
|
||||
@ -52,47 +57,21 @@ func (p *textPara) text() string {
|
||||
|
||||
// writeText writes the text of `p` including tables to `w`.
|
||||
func (p *textPara) writeText(w io.Writer) {
|
||||
if p.table != nil {
|
||||
for y := 0; y < p.table.h; y++ {
|
||||
for x := 0; x < p.table.w; x++ {
|
||||
cell := p.table.cells[y*p.table.w+x]
|
||||
cell.writeCellText(w)
|
||||
w.Write([]byte(" "))
|
||||
}
|
||||
w.Write([]byte("\n"))
|
||||
}
|
||||
} else {
|
||||
if p.table == nil {
|
||||
p.writeCellText(w)
|
||||
w.Write([]byte("\n"))
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
// writeCellText writes the text of `p` not including tables to `w`.
|
||||
func (p *textPara) writeCellText(w io.Writer) {
|
||||
// w := new(bytes.Buffer)
|
||||
para := p
|
||||
for il, line := range para.lines {
|
||||
s := line.text()
|
||||
reduced := false
|
||||
if doHyphens {
|
||||
if line.hyphenated && il != len(para.lines)-1 {
|
||||
// Line ending with hyphen. Remove it.
|
||||
runes := []rune(s)
|
||||
s = string(runes[:len(runes)-1])
|
||||
reduced = true
|
||||
for y := 0; y < p.table.h; y++ {
|
||||
for x := 0; x < p.table.w; x++ {
|
||||
cell := p.table.get(x, y)
|
||||
if cell == nil {
|
||||
w.Write([]byte("\t"))
|
||||
} else {
|
||||
cell.writeCellText(w)
|
||||
}
|
||||
}
|
||||
w.Write([]byte(s))
|
||||
if reduced {
|
||||
// We removed the hyphen from the end of the line so we don't need a line ending.
|
||||
continue
|
||||
}
|
||||
if il < len(para.lines)-1 && isZero(line.depth-para.lines[il+1].depth) {
|
||||
// Next line is the same depth so it's the same line as this one in the extracted text
|
||||
w.Write([]byte(" "))
|
||||
continue
|
||||
}
|
||||
if il < len(para.lines)-1 {
|
||||
if y < p.table.h-1 {
|
||||
w.Write([]byte("\n"))
|
||||
}
|
||||
}
|
||||
@ -101,90 +80,103 @@ func (p *textPara) writeCellText(w io.Writer) {
|
||||
// toTextMarks creates the TextMarkArray corresponding to the extracted text created by
|
||||
// paras `p`.writeText().
|
||||
func (p *textPara) toTextMarks(offset *int) []TextMark {
|
||||
if p.table == nil {
|
||||
return p.toCellTextMarks(offset)
|
||||
}
|
||||
var marks []TextMark
|
||||
addMark := func(mark TextMark) {
|
||||
mark.Offset = *offset
|
||||
marks = append(marks, mark)
|
||||
*offset += len(mark.Text)
|
||||
}
|
||||
addSpaceMark := func(spaceChar string) {
|
||||
mark := spaceMark
|
||||
mark.Text = spaceChar
|
||||
addMark(mark)
|
||||
}
|
||||
if p.table != nil {
|
||||
for y := 0; y < p.table.h; y++ {
|
||||
for x := 0; x < p.table.w; x++ {
|
||||
cell := p.table.cells[y*p.table.w+x]
|
||||
for y := 0; y < p.table.h; y++ {
|
||||
for x := 0; x < p.table.w; x++ {
|
||||
cell := p.table.get(x, y)
|
||||
if cell == nil {
|
||||
marks = appendSpaceMark(marks, offset, "\t")
|
||||
} else {
|
||||
cellMarks := cell.toCellTextMarks(offset)
|
||||
marks = append(marks, cellMarks...)
|
||||
addSpaceMark(" ")
|
||||
}
|
||||
addSpaceMark("\n")
|
||||
marks = appendSpaceMark(marks, offset, " ")
|
||||
}
|
||||
if y < p.table.h-1 {
|
||||
marks = appendSpaceMark(marks, offset, "\n")
|
||||
}
|
||||
} else {
|
||||
marks = p.toCellTextMarks(offset)
|
||||
addSpaceMark("\n")
|
||||
}
|
||||
return marks
|
||||
}
|
||||
|
||||
// toTextMarks creates the TextMarkArray corresponding to the extracted text created by
|
||||
// writeCellText writes the text of `p` not including tables to `w`.
|
||||
func (p *textPara) writeCellText(w io.Writer) {
|
||||
for il, line := range p.lines {
|
||||
lineText := line.text()
|
||||
reduced := doHyphens && line.hyphenated && il != len(p.lines)-1
|
||||
if reduced { // Line ending with hyphen. Remove it.
|
||||
lineText = removeLastRune(lineText)
|
||||
}
|
||||
w.Write([]byte(lineText))
|
||||
if !(reduced || il == len(p.lines)-1) {
|
||||
w.Write([]byte(getSpace(line.depth, p.lines[il+1].depth)))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// toCellTextMarks creates the TextMarkArray corresponding to the extracted text created by
|
||||
// paras `paras`.writeCellText().
|
||||
func (p *textPara) toCellTextMarks(offset *int) []TextMark {
|
||||
var marks []TextMark
|
||||
addMark := func(mark TextMark) {
|
||||
mark.Offset = *offset
|
||||
marks = append(marks, mark)
|
||||
*offset += len(mark.Text)
|
||||
}
|
||||
addSpaceMark := func(spaceChar string) {
|
||||
mark := spaceMark
|
||||
mark.Text = spaceChar
|
||||
addMark(mark)
|
||||
}
|
||||
para := p
|
||||
|
||||
for il, line := range para.lines {
|
||||
for il, line := range p.lines {
|
||||
lineMarks := line.toTextMarks(offset)
|
||||
marks = append(marks, lineMarks...)
|
||||
reduced := false
|
||||
if doHyphens {
|
||||
if line.hyphenated && il != len(para.lines)-1 {
|
||||
tm := marks[len(marks)-1]
|
||||
r := []rune(tm.Text)
|
||||
if unicode.IsSpace(r[len(r)-1]) {
|
||||
panic(tm)
|
||||
}
|
||||
if len(r) == 1 {
|
||||
marks = marks[:len(marks)-1]
|
||||
*offset = marks[len(marks)-1].Offset + len(marks[len(marks)-1].Text)
|
||||
} else {
|
||||
s := string(r[:len(r)-1])
|
||||
*offset += len(s) - len(tm.Text)
|
||||
tm.Text = s
|
||||
}
|
||||
reduced = true
|
||||
reduced := doHyphens && line.hyphenated && il != len(p.lines)-1
|
||||
if reduced { // Line ending with hyphen. Remove it.
|
||||
if len([]rune(line.text())) < minHyphenation {
|
||||
panic(line.text())
|
||||
}
|
||||
if len(lineMarks) < 1 {
|
||||
panic(line.text())
|
||||
}
|
||||
lineMarks = removeLastTextMarkRune(lineMarks, offset)
|
||||
}
|
||||
if reduced {
|
||||
continue
|
||||
}
|
||||
if il < len(para.lines)-1 && isZero(line.depth-para.lines[il+1].depth) {
|
||||
// Next line is the same depth so it's the same line as this one in the extracted text
|
||||
addSpaceMark(" ")
|
||||
continue
|
||||
}
|
||||
if il < len(para.lines)-1 {
|
||||
addSpaceMark("\n")
|
||||
marks = append(marks, lineMarks...)
|
||||
if !(reduced || il == len(p.lines)-1) {
|
||||
marks = appendSpaceMark(marks, offset, getSpace(line.depth, p.lines[il+1].depth))
|
||||
}
|
||||
}
|
||||
|
||||
addSpaceMark("\n")
|
||||
|
||||
return marks
|
||||
}
|
||||
|
||||
func removeLastTextMarkRune(marks []TextMark, offset *int) []TextMark {
|
||||
tm := marks[len(marks)-1]
|
||||
runes := []rune(tm.Text)
|
||||
if unicode.IsSpace(runes[len(runes)-1]) {
|
||||
panic(tm)
|
||||
}
|
||||
if len(runes) == 1 {
|
||||
marks = marks[:len(marks)-1]
|
||||
tm1 := marks[len(marks)-1]
|
||||
*offset = tm1.Offset + len(tm1.Text)
|
||||
} else {
|
||||
text := removeLastRune(tm.Text)
|
||||
*offset += len(text) - len(tm.Text)
|
||||
tm.Text = text
|
||||
}
|
||||
return marks
|
||||
}
|
||||
|
||||
// removeLastRune returns `text` with its final rune removed.
// Text of zero or one rune yields the empty string rather than panicking, so unusual input
// (e.g. a line consisting of a single hyphen) cannot crash text extraction.
func removeLastRune(text string) string {
	runes := []rune(text)
	if len(runes) < 2 {
		return ""
	}
	return string(runes[:len(runes)-1])
}
|
||||
|
||||
// getSpace returns the space to insert between lines of depth `depth1` and `depth2`.
|
||||
// Next line is the same depth so it's the same line as this one in the extracted text
|
||||
func getSpace(depth1, depth2 float64) string {
|
||||
eol := !isZero(depth1 - depth2)
|
||||
if eol {
|
||||
return "\n"
|
||||
}
|
||||
return " "
|
||||
}
|
||||
|
||||
// bbox makes textPara implement the `bounded` interface.
|
||||
func (p *textPara) bbox() model.PdfRectangle {
|
||||
return p.PdfRectangle
|
||||
@ -271,5 +263,42 @@ func composePara(strata *textStrata) *textPara {
|
||||
if len(para.lines) == 0 {
|
||||
panic(para)
|
||||
}
|
||||
if verbosePara {
|
||||
common.Log.Info("!!! para=%s", para.String())
|
||||
for i, line := range para.lines {
|
||||
fmt.Printf("%4d: %s\n", i, line)
|
||||
for j, word := range line.words {
|
||||
fmt.Printf("%8d: %s\n", j, word)
|
||||
for k, mark := range word.marks {
|
||||
fmt.Printf("%12d: %s\n", k, mark)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return para
|
||||
}
|
||||
|
||||
// log logs the contents of `paras`.
|
||||
func (paras paraList) log(title string) {
|
||||
if !verbosePage {
|
||||
return
|
||||
}
|
||||
common.Log.Info("%8s: %d paras =======-------=======", title, len(paras))
|
||||
for i, para := range paras {
|
||||
if para == nil {
|
||||
continue
|
||||
}
|
||||
text := para.text()
|
||||
tabl := " "
|
||||
if para.table != nil {
|
||||
tabl = fmt.Sprintf("[%dx%d]", para.table.w, para.table.h)
|
||||
}
|
||||
fmt.Printf("%4d: %6.2f %s %q\n", i, para.PdfRectangle, tabl, truncate(text, 50))
|
||||
if len(text) == 0 {
|
||||
panic("empty")
|
||||
}
|
||||
if para.table != nil && len(para.table.cells) == 0 {
|
||||
panic(para)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -38,14 +38,14 @@ func makeTextStrata(words []*textWord, pageHeight float64) *textStrata {
|
||||
|
||||
// newTextStrata returns an empty textStrata with page height `pageHeight`.
|
||||
func newTextStrata(pageHeight float64) *textStrata {
|
||||
bins := textStrata{
|
||||
serial: serial.bins,
|
||||
strata := textStrata{
|
||||
serial: serial.strata,
|
||||
bins: map[int][]*textWord{},
|
||||
PdfRectangle: model.PdfRectangle{Urx: -1.0, Ury: -1.0},
|
||||
pageHeight: pageHeight,
|
||||
}
|
||||
serial.bins++
|
||||
return &bins
|
||||
serial.strata++
|
||||
return &strata
|
||||
}
|
||||
|
||||
// String returns a description of `s`.
|
||||
@ -57,7 +57,9 @@ func (s *textStrata) String() string {
|
||||
texts = append(texts, w.text())
|
||||
}
|
||||
}
|
||||
return fmt.Sprintf("serial=%d %d %q", s.serial, len(texts), texts)
|
||||
// return fmt.Sprintf("serial=%d %d %q", s.serial, )
|
||||
return fmt.Sprintf("serial=%d %.2f fontsize=%.2f %d %q",
|
||||
s.serial, s.PdfRectangle, s.fontsize, len(texts), texts)
|
||||
}
|
||||
|
||||
// sort sorts the words in each bin in `s` in the reading direction.
|
||||
@ -129,10 +131,24 @@ func (s *textStrata) scanBand(title string, para *textStrata,
|
||||
if !readingOverlap(para, word) {
|
||||
continue
|
||||
}
|
||||
if fontTol > 0 && math.Abs(word.fontsize-fontsize) > fontTol*fontsize {
|
||||
continue
|
||||
fontRatio1 := math.Abs(word.fontsize-fontsize) / fontsize
|
||||
fontRatio2 := word.fontsize / fontsize
|
||||
|
||||
fontRatio := math.Min(fontRatio1, fontRatio2)
|
||||
if fontTol > 0 {
|
||||
if fontRatio > fontTol {
|
||||
continue
|
||||
}
|
||||
}
|
||||
if fontTol <= 0 {
|
||||
panic(fontTol)
|
||||
}
|
||||
if !detectOnly {
|
||||
// if !para.isHomogenous(word) {
|
||||
// panic(fmt.Errorf("not homogeneous fontTol=%.2f ratio=%.2f (%.2f->%.2f)\n\tpara=%s\n\tword=%s",
|
||||
// fontTol, fontRatio, fontsize, word.fontsize,
|
||||
// para.String(), word.String()))
|
||||
// }
|
||||
moveWord(depthIdx, s, para, word)
|
||||
}
|
||||
newWords = append(newWords, word)
|
||||
@ -155,11 +171,11 @@ func (s *textStrata) scanBand(title string, para *textStrata,
|
||||
}
|
||||
if verbose {
|
||||
if len(title) > 0 {
|
||||
common.Log.Info("scanBand: %s [%.2f %.2f]->[%.2f %.2f] para=%.2f",
|
||||
common.Log.Info("scanBand: %s [%.2f %.2f]->[%.2f %.2f] para=%.2f fontsize=%.2f",
|
||||
title,
|
||||
minDepth0, maxDepth0,
|
||||
minDepth, maxDepth,
|
||||
para.PdfRectangle)
|
||||
para.PdfRectangle, para.fontsize)
|
||||
for i, word := range newWords {
|
||||
fmt.Printf("%4d: %s\n", i, word)
|
||||
}
|
||||
@ -271,6 +287,36 @@ func moveWord(depthIdx int, page, para *textStrata, word *textWord) {
|
||||
page.removeWord(depthIdx, word)
|
||||
}
|
||||
|
||||
func (s *textStrata) allWords() []*textWord {
|
||||
var wordList []*textWord
|
||||
for _, words := range s.bins {
|
||||
wordList = append(wordList, words...)
|
||||
}
|
||||
return wordList
|
||||
}
|
||||
|
||||
func (s *textStrata) isHomogenous(w *textWord) bool {
|
||||
words := s.allWords()
|
||||
words = append(words, w)
|
||||
if len(words) == 0 {
|
||||
return true
|
||||
}
|
||||
minFont := words[0].fontsize
|
||||
maxFont := minFont
|
||||
for _, w := range words {
|
||||
if w.fontsize < minFont {
|
||||
minFont = w.fontsize
|
||||
} else if w.fontsize > maxFont {
|
||||
maxFont = w.fontsize
|
||||
}
|
||||
}
|
||||
if maxFont/minFont > 1.3 {
|
||||
common.Log.Error("font size range: %.2f - %.2f = %.1fx", minFont, maxFont, maxFont/minFont)
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// removeWord removes `word`from `s`.bins[`depthIdx`].
|
||||
// NOTE: We delete bins as soon as they become empty to save code that calls other textStrata
|
||||
// functions from having to check for empty bins.
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -175,7 +175,7 @@ func TestTermMarksFiles(t *testing.T) {
|
||||
if !doStress {
|
||||
t.Skip("skipping stress test")
|
||||
}
|
||||
common.Log.Info("Running text stress tests. go test --short to skip these.")
|
||||
common.Log.Info("Running text stress tests.")
|
||||
if len(corpusFolder) == 0 && !forceTest {
|
||||
t.Log("Corpus folder not set - skipping")
|
||||
return
|
||||
@ -736,6 +736,11 @@ func testTermMarks(t *testing.T, text string, textMarks *TextMarkArray, n int) {
|
||||
ofs1d = len(text)
|
||||
}
|
||||
show := fmt.Sprintf("<%s|%s|%s>", text[ofs0d:ofs0], text[ofs0:ofs1], text[ofs1:ofs1d])
|
||||
{
|
||||
show = fmt.Sprintf("%q", show)
|
||||
runes := []rune(show)
|
||||
show = string(runes[1 : len(runes)-1])
|
||||
}
|
||||
|
||||
// Get TextMarks spanning `term` with RangeOffset().
|
||||
spanArray, err := textMarks.RangeOffset(ofs0, ofs1)
|
||||
@ -783,6 +788,7 @@ func startWith(str, sub string) bool {
|
||||
if strings.HasPrefix(str, sub[n:]) {
|
||||
return true
|
||||
}
|
||||
// common.Log.Error("!startsWith: str=%q sub=%q sub[%d:]=%q", str, sub, n, sub[n:])
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
@ -170,6 +170,19 @@ func (w *textWord) text() string {
|
||||
return strings.Join(texts, "")
|
||||
}
|
||||
|
||||
// toTextMarks returns the TextMarks contained in `w`.text().
|
||||
// `offset` is used to give the TextMarks the correct Offset values.
|
||||
func (w *textWord) toTextMarks(offset *int) []TextMark {
|
||||
var marks []TextMark
|
||||
for _, tm := range w.marks {
|
||||
marks = appendTextMark(marks, offset, tm.ToTextMark())
|
||||
}
|
||||
if len(w.text()) > 0 && len(marks) == 0 {
|
||||
panic(w.text())
|
||||
}
|
||||
return marks
|
||||
}
|
||||
|
||||
// font returns the fontID of the `idx`th rune in text.
|
||||
// compute on creation? !@#$
|
||||
func (w *textWord) font(idx int) string {
|
||||
|
@ -22,7 +22,7 @@ const (
|
||||
// MissingCodeRune replaces runes that can't be decoded. '\ufffd' = <20>. Was '?'.
|
||||
MissingCodeRune = '\ufffd' // <20>
|
||||
|
||||
// MissingCodeRune replaces strings that can't be decoded.
|
||||
// MissingCodeString replaces strings that can't be decoded.
|
||||
MissingCodeString = string(MissingCodeRune)
|
||||
)
|
||||
|
||||
@ -44,7 +44,7 @@ type charRange struct {
|
||||
type fbRange struct {
|
||||
code0 CharCode
|
||||
code1 CharCode
|
||||
r0 rune // TODO (peterwilliams97): Change to string for compound codes.
|
||||
r0 string
|
||||
}
|
||||
|
||||
// CIDSystemInfo contains information for identifying the character collection
|
||||
@ -110,8 +110,7 @@ type CMap struct {
|
||||
|
||||
// Used by ctype 2 CMaps.
|
||||
codeToUnicode map[CharCode]string // CID -> Unicode string
|
||||
// XXXX(peterwilliams97): Should unicodeToCode be the inverse of codeToUnicode?
|
||||
unicodeToCode map[rune]CharCode // Unicode rune -> CID
|
||||
unicodeToCode map[string]CharCode // Unicode rune -> CID
|
||||
|
||||
// cached contains the raw CMap data. It is used by the Bytes method in
|
||||
// order to avoid generating the data for every call.
|
||||
@ -137,10 +136,10 @@ func NewToUnicodeCMap(codeToRune map[CharCode]rune) *CMap {
|
||||
Supplement: 0,
|
||||
},
|
||||
codespaces: []Codespace{{Low: 0, High: 0xffff}},
|
||||
codeToCID: make(map[CharCode]CharCode),
|
||||
cidToCode: make(map[CharCode]CharCode),
|
||||
codeToUnicode: codeToUnicode,
|
||||
unicodeToCode: make(map[rune]CharCode),
|
||||
unicodeToCode: make(map[string]CharCode, len(codeToRune)),
|
||||
codeToCID: make(map[CharCode]CharCode, len(codeToRune)),
|
||||
cidToCode: make(map[CharCode]CharCode, len(codeToRune)),
|
||||
}
|
||||
|
||||
cmap.computeInverseMappings()
|
||||
@ -159,7 +158,7 @@ func newCMap(isSimple bool) *CMap {
|
||||
codeToCID: make(map[CharCode]CharCode),
|
||||
cidToCode: make(map[CharCode]CharCode),
|
||||
codeToUnicode: make(map[CharCode]string),
|
||||
unicodeToCode: make(map[rune]CharCode),
|
||||
unicodeToCode: make(map[string]CharCode),
|
||||
}
|
||||
}
|
||||
|
||||
@ -265,13 +264,8 @@ func (cmap *CMap) computeInverseMappings() {
|
||||
|
||||
// Generate Unicode -> CID map.
|
||||
for cid, s := range cmap.codeToUnicode {
|
||||
// The CMap entries can be empty e.g. dobe_supplement_iso32000_1.pdf
|
||||
if len(s) == 0 {
|
||||
continue
|
||||
}
|
||||
r := rune0(s)
|
||||
if c, ok := cmap.unicodeToCode[r]; !ok || (ok && c > cid) {
|
||||
cmap.unicodeToCode[r] = cid
|
||||
if c, ok := cmap.unicodeToCode[s]; !ok || (ok && c > cid) {
|
||||
cmap.unicodeToCode[s] = cid
|
||||
}
|
||||
}
|
||||
|
||||
@ -326,10 +320,10 @@ func (cmap *CMap) CharcodeToUnicode(code CharCode) (string, bool) {
|
||||
return MissingCodeString, false
|
||||
}
|
||||
|
||||
// RuneToCID maps the specified rune to a character identifier. If the provided
|
||||
// rune has no available mapping, the second return value is false.
|
||||
func (cmap *CMap) RuneToCID(r rune) (CharCode, bool) {
|
||||
cid, ok := cmap.unicodeToCode[r]
|
||||
// StringToCID maps the specified string to a character identifier. If the provided
|
||||
// string has no available mapping, the bool return value is false.
|
||||
func (cmap *CMap) StringToCID(s string) (CharCode, bool) {
|
||||
cid, ok := cmap.unicodeToCode[s]
|
||||
return cid, ok
|
||||
}
|
||||
|
||||
@ -484,10 +478,10 @@ func (cmap *CMap) toBfData() string {
|
||||
// character codes have been mapped to code ranges.
|
||||
var charRanges []charRange
|
||||
currCharRange := charRange{codes[0], codes[0]}
|
||||
prevRune := rune0(cmap.codeToUnicode[codes[0]])
|
||||
prevRune := cmap.codeToUnicode[codes[0]]
|
||||
for _, c := range codes[1:] {
|
||||
currRune := rune0(cmap.codeToUnicode[c])
|
||||
if c == currCharRange.code1+1 && currRune == prevRune+1 {
|
||||
currRune := cmap.codeToUnicode[c]
|
||||
if c == currCharRange.code1+1 && lastRune(currRune) == lastRune(prevRune)+1 {
|
||||
currCharRange.code1 = c
|
||||
} else {
|
||||
charRanges = append(charRanges, currCharRange)
|
||||
@ -507,7 +501,7 @@ func (cmap *CMap) toBfData() string {
|
||||
fbRanges = append(fbRanges, fbRange{
|
||||
code0: cr.code0,
|
||||
code1: cr.code1,
|
||||
r0: rune0(cmap.codeToUnicode[cr.code0]),
|
||||
r0: cmap.codeToUnicode[cr.code0],
|
||||
})
|
||||
}
|
||||
}
|
||||
@ -522,8 +516,8 @@ func (cmap *CMap) toBfData() string {
|
||||
lines = append(lines, fmt.Sprintf("%d beginbfchar", n))
|
||||
for j := 0; j < n; j++ {
|
||||
code := fbChars[i*maxBfEntries+j]
|
||||
r := rune0(cmap.codeToUnicode[code])
|
||||
lines = append(lines, fmt.Sprintf("<%04x> <%04x>", code, r))
|
||||
s := cmap.codeToUnicode[code]
|
||||
lines = append(lines, fmt.Sprintf("<%04x> %s", code, hexCode(s)))
|
||||
}
|
||||
lines = append(lines, "endbfchar")
|
||||
}
|
||||
@ -535,8 +529,8 @@ func (cmap *CMap) toBfData() string {
|
||||
lines = append(lines, fmt.Sprintf("%d beginbfrange", n))
|
||||
for j := 0; j < n; j++ {
|
||||
rng := fbRanges[i*maxBfEntries+j]
|
||||
r := rng.r0
|
||||
lines = append(lines, fmt.Sprintf("<%04x><%04x> <%04x>", rng.code0, rng.code1, r))
|
||||
lines = append(lines, fmt.Sprintf("<%04x><%04x> %s",
|
||||
rng.code0, rng.code1, hexCode(rng.r0)))
|
||||
}
|
||||
lines = append(lines, "endbfrange")
|
||||
}
|
||||
@ -544,6 +538,22 @@ func (cmap *CMap) toBfData() string {
|
||||
return strings.Join(lines, "\n")
|
||||
}
|
||||
|
||||
// lastRune returns the last rune in `s`, or -1 if `s` is empty.
// -1 is not a valid rune, so an empty string is never mistaken for the successor of a real
// rune. Returning it instead of indexing into an empty slice keeps empty CMap entries from
// causing a panic.
func lastRune(s string) rune {
	last := rune(-1)
	// Ranging over a string decodes it rune by rune, so no []rune copy is needed.
	for _, r := range s {
		last = r
	}
	return last
}
|
||||
|
||||
// hexCode returns the CMap hex string for `s`: the UTF-16BE code units of `s`, each written as
// four hex digits, wrapped in angle brackets.
// Runes above U+FFFF are written as a surrogate pair (two code units), because a single
// five- or six-digit value is not a valid UTF-16BE hex string.
func hexCode(s string) string {
	var b strings.Builder
	b.WriteByte('<')
	for _, r := range s {
		if r > 0xffff {
			// Split the supplementary-plane rune into high and low surrogates.
			r -= 0x10000
			fmt.Fprintf(&b, "%04x%04x", 0xd800+(r>>10), 0xdc00+(r&0x3ff))
			continue
		}
		fmt.Fprintf(&b, "%04x", r)
	}
	b.WriteByte('>')
	return b.String()
}
|
||||
|
||||
const (
|
||||
maxBfEntries = 100 // Maximum number of entries in a bfchar or bfrange section.
|
||||
cmapHeader = `
|
||||
@ -563,9 +573,3 @@ end
|
||||
end
|
||||
`
|
||||
)
|
||||
|
||||
// rune0 is a convenience function that returns the first rune in `s`.
|
||||
// Caller must check that `s` is not empty.
|
||||
func rune0(s string) rune {
|
||||
return ([]rune(s))[0]
|
||||
}
|
||||
|
@ -105,7 +105,7 @@ func (cmap *CMap) parse() error {
|
||||
func (cmap *CMap) parseName() error {
|
||||
name := ""
|
||||
done := false
|
||||
// /Users/peter/testdata/programming/pdf_text/columns/Berg.pdf
|
||||
// NOTE(peterwilliams97): We need up to 20 iterations of this loop for some PDFs I have seen.
|
||||
for i := 0; i < 20 && !done; i++ {
|
||||
o, err := cmap.parseObject()
|
||||
if err != nil {
|
||||
|
@ -67,7 +67,7 @@ func (enc CMapEncoder) RuneToCharcode(r rune) (CharCode, bool) {
|
||||
}
|
||||
|
||||
// Map rune to CID.
|
||||
cid, ok := enc.cidToUnicode.RuneToCID(r)
|
||||
cid, ok := enc.cidToUnicode.StringToCID(string(r))
|
||||
if !ok {
|
||||
return 0, false
|
||||
}
|
||||
|
@ -23,7 +23,7 @@ const (
|
||||
// MissingCodeRune replaces runes that can't be decoded. .
|
||||
MissingCodeRune = '\ufffd' // <20>
|
||||
|
||||
// MissingCodeRune replaces strings that can't be decoded.
|
||||
// MissingCodeString replaces strings that can't be decoded.
|
||||
MissingCodeString = string(MissingCodeRune)
|
||||
)
|
||||
|
||||
|
@ -421,31 +421,26 @@ func (font *PdfFont) BytesToCharcodes(data []byte) []textencoding.CharCode {
|
||||
return charcodes
|
||||
}
|
||||
|
||||
// CharcodesToUnicodeWithStats is identical to CharcodesToUnicode except returns more statistical
|
||||
// CharcodesToUnicodeWithStats is identical to CharcodesToUnicode except it returns more statistical
|
||||
// information about hits and misses from the reverse mapping process.
|
||||
// NOTE: The number of runes returned may be greater than the number of charcodes.
|
||||
// TODO(peterwilliams97): Deprecate?
|
||||
// TODO(peterwilliams97): Deprecate in v4 and use only CharcodesToStrings()
|
||||
func (font *PdfFont) CharcodesToUnicodeWithStats(charcodes []textencoding.CharCode) (runelist []rune, numHits, numMisses int) {
|
||||
runeSlices, numHits, numMisses := font.CharcodesToRuneSlices(charcodes)
|
||||
var runes []rune
|
||||
for _, r := range runeSlices {
|
||||
runes = append(runes, r...)
|
||||
}
|
||||
return runes, numHits, numMisses
|
||||
texts, numHits, numMisses := font.CharcodesToStrings(charcodes)
|
||||
return []rune(strings.Join(texts, "")), numHits, numMisses
|
||||
}
|
||||
|
||||
// CharcodesToRuneSlices returns the unicode strings corresponding to `charcodes` as rune slices.
|
||||
// The int return is the number of unconvereted codes.
|
||||
// NOTE: The number of rune slices returned is equal to the number of charcodes
|
||||
func (font *PdfFont) CharcodesToRuneSlices(charcodes []textencoding.CharCode) ([][]rune, int, int) {
|
||||
// CharcodesToStrings returns the unicode strings corresponding to `charcodes`.
|
||||
// The int returns are the number of strings and the number of unconverted codes.
|
||||
// NOTE: The number of strings returned is equal to the number of charcodes
|
||||
func (font *PdfFont) CharcodesToStrings(charcodes []textencoding.CharCode) ([]string, int, int) {
|
||||
fontBase := font.baseFields()
|
||||
runeSlices := make([][]rune, 0, len(charcodes))
|
||||
texts := make([]string, 0, len(charcodes))
|
||||
numMisses := 0
|
||||
for _, code := range charcodes {
|
||||
if fontBase.toUnicodeCmap != nil {
|
||||
if s, ok := fontBase.toUnicodeCmap.CharcodeToUnicode(cmap.CharCode(code)); ok {
|
||||
runeSlices = append(runeSlices, []rune(s))
|
||||
// common.Log.Info("CharcodesToRuneSlices1: code=%d s=`%s`", code, s)
|
||||
texts = append(texts, s)
|
||||
continue
|
||||
}
|
||||
}
|
||||
@ -454,9 +449,7 @@ func (font *PdfFont) CharcodesToRuneSlices(charcodes []textencoding.CharCode) ([
|
||||
encoder := font.Encoder()
|
||||
if encoder != nil {
|
||||
if r, ok := encoder.CharcodeToRune(code); ok {
|
||||
runeSlices = append(runeSlices, []rune{r})
|
||||
// common.Log.Info("CharcodesToRuneSlices2: code=%d s=%q encoder=%s",
|
||||
// code, string(r), encoder.String())
|
||||
texts = append(texts, string(r))
|
||||
continue
|
||||
}
|
||||
}
|
||||
@ -465,7 +458,7 @@ func (font *PdfFont) CharcodesToRuneSlices(charcodes []textencoding.CharCode) ([
|
||||
"\tfont=%s\n\tencoding=%s",
|
||||
code, charcodes, fontBase.isCIDFont(), font, encoder)
|
||||
numMisses++
|
||||
runeSlices = append(runeSlices, []rune{cmap.MissingCodeRune})
|
||||
texts = append(texts, cmap.MissingCodeString)
|
||||
}
|
||||
|
||||
if numMisses != 0 {
|
||||
@ -475,7 +468,7 @@ func (font *PdfFont) CharcodesToRuneSlices(charcodes []textencoding.CharCode) ([
|
||||
len(charcodes), numMisses, font)
|
||||
}
|
||||
|
||||
return runeSlices, len(runeSlices), numMisses
|
||||
return texts, len(texts), numMisses
|
||||
}
|
||||
|
||||
// CharcodeBytesToUnicode converts PDF character codes `data` to a Go unicode string.
|
||||
@ -499,8 +492,8 @@ func (font *PdfFont) CharcodeBytesToUnicode(data []byte) (string, int, int) {
|
||||
// 1) Use the ToUnicode CMap if there is one.
|
||||
// 2) Use the underlying font's encoding.
|
||||
func (font *PdfFont) CharcodesToUnicode(charcodes []textencoding.CharCode) []rune {
|
||||
strlist, _, _ := font.CharcodesToUnicodeWithStats(charcodes)
|
||||
return strlist
|
||||
runes, _, _ := font.CharcodesToUnicodeWithStats(charcodes)
|
||||
return runes
|
||||
}
|
||||
|
||||
// RunesToCharcodeBytes maps the provided runes to charcode bytes and it
|
||||
|
Loading…
x
Reference in New Issue
Block a user