Merge branch 'development' of https://github.com/unidoc/unipdf into columns

This commit is contained in:
Peter Williams 2020-06-05 11:43:04 +10:00
commit 29f2d9b8cf
18 changed files with 1413 additions and 780 deletions

View File

@ -13,6 +13,7 @@ import (
"github.com/unidoc/unipdf/v3/common"
"github.com/unidoc/unipdf/v3/contentstream"
"github.com/unidoc/unipdf/v3/contentstream/draw"
"github.com/unidoc/unipdf/v3/core"
"github.com/unidoc/unipdf/v3/internal/textencoding"
"github.com/unidoc/unipdf/v3/model"
@ -175,12 +176,14 @@ func genFieldTextAppearance(wa *model.PdfAnnotationWidget, ftxt *model.PdfFieldT
width := rect.Width()
height := rect.Height()
var rotation float64
if mkDict, has := core.GetDict(wa.MK); has {
bsDict, _ := core.GetDict(wa.BS)
err := style.applyAppearanceCharacteristics(mkDict, bsDict, nil)
if err != nil {
return nil, err
}
rotation, _ = core.GetNumberAsFloat(mkDict.Get("R"))
}
// Get and process the default appearance string (DA) operands.
@ -192,6 +195,7 @@ func genFieldTextAppearance(wa *model.PdfAnnotationWidget, ftxt *model.PdfFieldT
}
cc := contentstream.NewContentCreator()
if style.BorderSize > 0 {
drawRect(cc, style, width, height)
}
@ -205,6 +209,28 @@ func genFieldTextAppearance(wa *model.PdfAnnotationWidget, ftxt *model.PdfFieldT
cc.Add_BMC("Tx")
cc.Add_q()
bboxWidth, bboxHeight := width, height
if rotation != 0 {
// Calculate bounding box before rotation.
revRotation := -rotation
bbox := draw.Path{Points: []draw.Point{
draw.NewPoint(0, 0).Rotate(revRotation),
draw.NewPoint(width, 0).Rotate(revRotation),
draw.NewPoint(0, height).Rotate(revRotation),
draw.NewPoint(width, height).Rotate(revRotation),
}}.GetBoundingBox()
// Update width and height, as the appearance is generated based on
// the bounding of the annotation with no rotation.
width = bbox.Width
height = bbox.Height
// Apply rotation.
cc.RotateDeg(rotation)
cc.Translate(bbox.X, bbox.Y)
}
// Graphic state changes.
cc.Add_BT()
@ -461,7 +487,7 @@ func genFieldTextAppearance(wa *model.PdfAnnotationWidget, ftxt *model.PdfFieldT
xform := model.NewXObjectForm()
xform.Resources = resources
xform.BBox = core.MakeArrayFromFloats([]float64{0, 0, width, height})
xform.BBox = core.MakeArrayFromFloats([]float64{0, 0, bboxWidth, bboxHeight})
xform.SetContentStream(cc.Bytes(), defStreamEncoder())
apDict := core.MakeDict()

View File

@ -62,3 +62,54 @@ bruce.pdf for char spacing save/restore.
challenging-modified.pdf
transitions_test.pdf
Code Restructure?
-----------------
```
type textPara struct {
serial int // Sequence number for debugging.
model.PdfRectangle // Bounding box.
w, h int
cells []textCell
}
type textCell struct {
serial int // Sequence number for debugging.
model.PdfRectangle // Bounding box.
eBBox model.PdfRectangle // Extended bounding box needed to compute reading order.
lines []*textLine // Paragraph text gets broken into lines.
}
```
x x x x x x
x
x x
x
x x x
x
x
1. Compute all row candidates
alignedY No intervening paras
2. Compute all column candidates
alignedX No intervening paras
Table candidate
1. Top row fully populated
2. Left column fully populated
3. All cells in table are aligned with 1 top row element and 1 left column candidate
4. Minimum number of cells must be filled
Computation time
1. Row candidates O(N)
Sort top to bottom, left to right
Search
2. Column candidates O(N)
Sort left to right, top to bottom
Search
3. Find intersections O(N^2)
For each row
Find columns that start at row -> table candidates
Sort table candidates by w x h descending
4. Test each candidate O(N^4)

View File

@ -22,8 +22,6 @@ import (
"github.com/unidoc/unipdf/v3/model"
)
const verbose = false
// maxFormStack is the maximum form stack recursion depth. It has to be low enough to avoid a stack
// overflow and high enough to accommodate customers' PDFs.
const maxFormStack = 10
@ -49,7 +47,7 @@ func (e *Extractor) ExtractTextWithStats() (extracted string, numChars int, numM
// ExtractPageText returns the text contents of `e` (an Extractor for a page) as a PageText.
func (e *Extractor) ExtractPageText() (*PageText, int, int, error) {
pt, numChars, numMisses, err := e.extractPageText(e.contents, e.resources, 0)
pt, numChars, numMisses, err := e.extractPageText(e.contents, e.resources, transform.IdentityMatrix(), 0)
if err != nil {
return nil, numChars, numMisses, err
}
@ -62,7 +60,8 @@ func (e *Extractor) ExtractPageText() (*PageText, int, int, error) {
// extractPageText returns the text contents of content stream `e` and resources `resources` as a
// PageText.
// This can be called on a page or a form XObject.
func (e *Extractor) extractPageText(contents string, resources *model.PdfPageResources, level int) (
func (e *Extractor) extractPageText(contents string, resources *model.PdfPageResources,
parentCTM transform.Matrix, level int) (
*PageText, int, int, error) {
common.Log.Trace("extractPageText: level=%d", level)
pageText := &PageText{pageSize: e.mediaBox}
@ -97,7 +96,7 @@ func (e *Extractor) extractPageText(contents string, resources *model.PdfPageRes
operand := op.Operand
if verbose {
if verboseGeom {
common.Log.Info("&&& op=%s", op)
}
@ -106,7 +105,7 @@ func (e *Extractor) extractPageText(contents string, resources *model.PdfPageRes
savedStates.push(&state)
// common.Log.Info("Save state: stack=%d\n %s", len(savedStates), state.String())
case "Q":
if verbose {
if verboseGeom {
common.Log.Info("Restore state: %s", savedStates.String())
}
if !savedStates.empty() {
@ -129,7 +128,10 @@ func (e *Extractor) extractPageText(contents string, resources *model.PdfPageRes
pageText.marks = append(pageText.marks, to.marks...)
}
inTextObj = true
to = newTextObject(e, resources, gs, &state, &savedStates)
graphicsState := gs
graphicsState.CTM = parentCTM.Mult(graphicsState.CTM)
to = newTextObject(e, resources, graphicsState, &state, &savedStates)
case "ET": // End Text
// End text object, discarding text matrix. If the current
// text object contains text marks, they are added to the
@ -343,8 +345,9 @@ func (e *Extractor) extractPageText(contents string, resources *model.PdfPageRes
if formResources == nil {
formResources = resources
}
tList, numChars, numMisses, err := e.extractPageText(string(formContent),
formResources, level+1)
formResources, parentCTM.Mult(gs.CTM), level+1)
if err != nil {
common.Log.Debug("ERROR: %v", err)
return err
@ -489,8 +492,8 @@ func (to *textObject) setCharSpacing(x float64) {
return
}
to.state.tc = x
if verbose {
common.Log.Info("setCharSpacing: %.2f state=%s", to.state.String())
if verboseGeom {
common.Log.Info("setCharSpacing: %.2f state=%s", x, to.state.String())
}
}
@ -758,7 +761,7 @@ func (to *textObject) renderText(data []byte) error {
}
font := to.getCurrentFont()
charcodes := font.BytesToCharcodes(data)
runeSlices, numChars, numMisses := font.CharcodesToRuneSlices(charcodes)
texts, numChars, numMisses := font.CharcodesToStrings(charcodes)
if numMisses > 0 {
common.Log.Debug("renderText: numChars=%d numMisses=%d", numChars, numMisses)
}
@ -777,17 +780,20 @@ func (to *textObject) renderText(data []byte) error {
spaceMetrics, _ = model.DefaultFont().GetRuneMetrics(' ')
}
spaceWidth := spaceMetrics.Wx * glyphTextRatio
common.Log.Trace("spaceWidth=%.2f text=%q font=%s fontSize=%.1f", spaceWidth, runeSlices, font, tfs)
common.Log.Trace("spaceWidth=%.2f text=%q font=%s fontSize=%.2f", spaceWidth, texts, font, tfs)
stateMatrix := transform.NewMatrix(
tfs*th, 0,
0, tfs,
0, state.trise)
if verbose {
common.Log.Info("renderText: %d codes=%+v runes=%q", len(charcodes), charcodes, runeSlices)
if verboseGeom {
common.Log.Info("renderText: %d codes=%+v texts=%q", len(charcodes), charcodes, texts)
}
for i, r := range runeSlices {
common.Log.Trace("renderText: %d codes=%+v runes=%q", len(charcodes), charcodes, len(texts))
for i, text := range texts {
r := []rune(text)
if len(r) == 1 && r[0] == '\x00' {
continue
}
@ -819,7 +825,7 @@ func (to *textObject) renderText(data []byte) error {
// t is the displacement of the text cursor when the character is rendered.
t0 := transform.Point{X: (c.X*tfs + w) * th}
t := transform.Point{X: (c.X*tfs + state.tc + w) * th}
if verbose {
if verboseGeom {
common.Log.Info("tfs=%.2f tc=%.2f tw=%.2f th=%.2f", tfs, state.tc, state.tw, th)
common.Log.Info("dx,dy=%.3f t0=%.2f t=%.2f", c, t0, t)
}
@ -830,7 +836,7 @@ func (to *textObject) renderText(data []byte) error {
td := translationMatrix(t)
end := to.gs.CTM.Mult(to.tm).Mult(td0)
if verbose {
if verboseGeom {
common.Log.Info("end:\n\tCTM=%s\n\t tm=%s\n"+
"\t td=%s xlat=%s\n"+
"\ttd0=%s\n\t → %s xlat=%s",
@ -865,7 +871,7 @@ func (to *textObject) renderText(data []byte) error {
// update the text matrix by the displacement of the text location.
to.tm.Concat(td)
if i != len(runeSlices)-1 {
if i != len(texts)-1 {
to.logCursor()
}
}
@ -908,10 +914,11 @@ func isTextSpace(text string) bool {
// PageText represents the layout of text on a device page.
type PageText struct {
	marks      []*textMark        // Texts and their positions on a PDF page.
	viewText   string             // Extracted page text.
	viewMarks  []TextMark         // Public view of the text marks.
	viewTables []TextTable        // Public view of the text tables.
	pageSize   model.PdfRectangle // Page size. Used to calculate depth.
}
// String returns a string describing `pt`.
@ -942,6 +949,11 @@ func (pt PageText) Marks() *TextMarkArray {
return &TextMarkArray{marks: pt.viewMarks}
}
// Tables returns the tables extracted from the page.
// It returns the `viewTables` field of `pt`, which computeViews populates.
func (pt PageText) Tables() []TextTable {
return pt.viewTables
}
// computeViews processes the page TextMarks sorting by position and populates `pt.viewText` and
// `pt.viewMarks` which represent the text and marks in the order which it is read on the page.
// The comments above the TextMark definition describe how to use the []TextMark to
@ -953,6 +965,7 @@ func (pt *PageText) computeViews() {
paras.writeText(b)
pt.viewText = b.String()
pt.viewMarks = paras.toTextMarks()
pt.viewTables = paras.toTables()
}
// TextMarkArray is a collection of TextMarks.
@ -1119,6 +1132,13 @@ var spaceMark = TextMark{
Meta: true,
}
// TextTable represents a table.
// Cells are ordered top-to-bottom, left-to-right.
type TextTable struct {
// W and H are presumably the table width and height in cells — TODO confirm.
W, H int
// Cells holds the cell texts; presumably indexed Cells[row][column] — TODO confirm.
Cells [][]string
}
// getCurrentFont returns the font on top of the font stack, or DefaultFont if the font stack is
// empty.
func (to *textObject) getCurrentFont() *model.PdfFont {

View File

@ -19,11 +19,11 @@ import (
var serial serialState
// serialState holds the counters used to give objects sequence numbers for debugging.
// There is one counter per kind of object.
type serialState struct {
	mark   int // Next textMark serial number. newTextMark assigns and increments it.
	word   int // Counter for words.
	strata int // Counter for strata.
	line   int // Counter for lines.
	para   int // Counter for paras.
}
func (serial *serialState) reset() {
@ -65,15 +65,25 @@ func diffReading(a, b bounded) float64 {
return a.bbox().Llx - b.bbox().Llx
}
// boundedUnion returns the union, as computed by rectUnion, of the bounding boxes of `objs`.
// `objs` must contain at least one element.
func boundedUnion(objs ...bounded) model.PdfRectangle {
	union := objs[0].bbox()
	for i := 1; i < len(objs); i++ {
		union = rectUnion(union, objs[i].bbox())
	}
	return union
}
// rectContainsBounded returns true if `a` contains the bounding box of `b`.
func rectContainsBounded(a model.PdfRectangle, b bounded) bool {
	inner := b.bbox()
	return rectContainsRect(a, inner)
}
// rectContainsRect returns true if `a` contains `b`. Coinciding edges count as contained.
func rectContainsRect(a, b model.PdfRectangle) bool {
	xInside := a.Llx <= b.Llx && b.Urx <= a.Urx
	if !xInside {
		return false
	}
	return a.Lly <= b.Lly && b.Ury <= a.Ury
}
// diffDepth returns the depth of `a` minus the depth of `b`, where depths are given by bboxDepth.
func diffDepth(a, b bounded) float64 {
	depthA := bboxDepth(a)
	depthB := bboxDepth(b)
	return depthA - depthB
}
@ -151,3 +161,19 @@ func overlappedXRect(r0, r1 model.PdfRectangle) bool {
// overlappedYRect returns true if the lower or upper y edge of `r1` lies within the y interval
// [Lly, Ury] of `r0`.
// NOTE(review): the test is not symmetric. If `r1` strictly encloses `r0` in y, neither edge of
// `r1` falls inside `r0` and this returns false. Confirm that callers do not need that case.
func overlappedYRect(r0, r1 model.PdfRectangle) bool {
	lowerInside := r0.Lly <= r1.Lly && r1.Lly <= r0.Ury
	upperInside := r0.Lly <= r1.Ury && r1.Ury <= r0.Ury
	return lowerInside || upperInside
}
// minInt returns the lesser of `a` and `b`.
func minInt(a, b int) int {
	if b <= a {
		return b
	}
	return a
}
// maxInt returns the greater of `a` and `b`.
func maxInt(a, b int) int {
	if b >= a {
		return b
	}
	return a
}

View File

@ -5,8 +5,24 @@
package extractor
// The following constants configure debugging output.
const (
// verbose gates general Info-level logging, e.g. in computeEBBoxes.
verbose = false
// verboseGeom gates logging of operators, text state and geometry during text extraction.
verboseGeom = false
// verbosePage gates logging of page division and reading-order computation, e.g. printAdj.
verbosePage = false
// verbosePara and verboseTable presumably gate paragraph and table logging — TODO confirm.
verbosePara = false
verboseTable = false
)
// The following constants control the approaches used in the code.
const (
// useTables: when true, makeTextPage runs table extraction on the paras.
useTables = true
// doHyphens: when true, the hyphen that ends a hyphenated line is removed from the extracted
// text, except on the last line of a para.
doHyphens = true
// useEBBox: when true, computeEBBoxes replaces each para's bounding box with its extended
// bounding box.
useEBBox = false
)
// The following constants are the tuning parameters for text extraction.
const (
// Size of depth bins in points
depthBinPoints = 6

View File

@ -20,10 +20,12 @@ type textLine struct {
model.PdfRectangle // Bounding box (union of `marks` bounding boxes).
depth float64 // Distance from bottom of line to top of page.
words []*textWord // Words in this line.
fontsize float64
hyphenated bool
fontsize float64 // Largest word font size.
hyphenated bool // Does line have at least minHyphenation runes and end in a hyphen.
}
// minHyphenation is the minimum number of runes a text must contain for isHyphenated to treat it
// as hyphenated.
const minHyphenation = 4
// newTextLine creates a line with font and bbox size of `w`, removes `w` from p.bins[bestWordDepthIdx] and adds it to the line
func newTextLine(p *textStrata, depthIdx int) *textLine {
words := p.getStratum(depthIdx)
@ -60,31 +62,22 @@ func (l *textLine) text() string {
}
}
return strings.Join(words, "")
}
// toTextMarks returns the TextMarks contained in `l`.text().
// `offset` is used to give the TextMarks the correct Offset values. It is advanced past every mark
// that is appended, including the space mark that follows a word with spaceAfter set.
func (l *textLine) toTextMarks(offset *int) []TextMark {
	var marks []TextMark
	for _, word := range l.words {
		// Each word is emitted exactly once, followed by at most one space mark.
		wordMarks := word.toTextMarks(offset)
		marks = append(marks, wordMarks...)
		if word.spaceAfter {
			marks = appendSpaceMark(marks, offset, " ")
		}
	}
	// Invariant check: a line with text must produce at least one mark.
	if len(l.text()) > 0 && len(marks) == 0 {
		panic(l.text())
	}
	return marks
}
@ -130,16 +123,13 @@ func (l *textLine) mergeWordFragments() {
}
// check for hyphen at end of line
runes := []rune(l.text())
l.hyphenated = len(runes) >= 4 &&
l.hyphenated = isHyphenated(l.text())
}
// isHyphenated returns true if `text` is a hyphenated word: it has at least minHyphenation runes,
// its last rune is a hyphen and the rune before the hyphen is not a space.
func isHyphenated(text string) bool {
	runes := []rune(text)
	n := len(runes)
	if n < minHyphenation {
		return false
	}
	last, prev := runes[n-1], runes[n-2]
	if !unicode.Is(unicode.Hyphen, last) {
		return false
	}
	return !unicode.IsSpace(prev)
}

View File

@ -21,11 +21,6 @@ type textMark struct {
model.PdfRectangle // Bounding box.
text string // The text (decoded via ToUnicode).
original string // Original text (decoded).
orient int // The text orientation in degrees. This is the current TRM rounded to 10°.
orientedStart transform.Point // Left of text in orientation where text is horizontal.
orientedEnd transform.Point // Right of text in orientation where text is horizontal.
height float64 // Text height.
spaceWidth float64 // Best guess at the width of a space in the font the text was rendered with.
font *model.PdfFont // The font the mark was drawn with.
fontsize float64 // The font size the mark was drawn with.
charspacing float64 // TODO (peterwilliams97: Should this be exposed in TextMark?
@ -74,25 +69,20 @@ func (to *textObject) newTextMark(text string, trm transform.Matrix, end transfo
bbox = clipped
tm := textMark{
text: text,
orient: orient,
PdfRectangle: bbox,
orientedStart: start.Rotate(theta),
orientedEnd: end.Rotate(theta),
height: math.Abs(height),
spaceWidth: spaceWidth,
font: font,
fontsize: height,
charspacing: charspacing,
trm: trm,
end: end,
serial: serial.mark,
text: text,
PdfRectangle: bbox,
font: font,
fontsize: height,
charspacing: charspacing,
trm: trm,
end: end,
serial: serial.mark,
}
serial.mark++
if !isTextSpace(tm.text) && tm.Width() == 0.0 {
common.Log.Debug("ERROR: Zero width text. tm=%s", tm.String())
}
if verbose {
if verboseGeom {
common.Log.Info("newTextMark: start=%.2f end=%.2f %s", start, end, tm.String())
}
@ -110,11 +100,6 @@ func (tm *textMark) bbox() model.PdfRectangle {
return tm.PdfRectangle
}
// Width returns the width of `tm`.text in the text direction.
// It is the absolute difference between the X coordinates of `orientedStart` and `orientedEnd`.
func (tm *textMark) Width() float64 {
return math.Abs(tm.orientedStart.X - tm.orientedEnd.X)
}
// ToTextMark returns the public view of `tm`.
func (tm *textMark) ToTextMark() TextMark {
return TextMark{
@ -127,6 +112,23 @@ func (tm *textMark) ToTextMark() TextMark {
}
}
// appendTextMark sets the Offset of `mark` to `*offset`, appends the mark to `marks` and advances
// `*offset` by the byte length of the mark's text. It returns the extended slice.
func appendTextMark(marks []TextMark, offset *int, mark TextMark) []TextMark {
	start := *offset
	mark.Offset = start
	marks = append(marks, mark)
	*offset = start + len(mark.Text)
	return marks
}
// appendSpaceMark appends a copy of spaceMark whose text is `spaceChar` to `marks` and updates
// `offset`, the offset of the new mark in the extracted text.
func appendSpaceMark(marks []TextMark, offset *int, spaceChar string) []TextMark {
	space := spaceMark // Work on a copy so the package-level spaceMark is left untouched.
	space.Text = spaceChar
	return appendTextMark(marks, offset, space)
}
// nearestMultiple return the integer multiple of `m` that is closest to `x`.
func nearestMultiple(x float64, m int) int {
if m == 0 {

View File

@ -9,16 +9,12 @@ import (
"fmt"
"io"
"math"
"unicode"
"sort"
"github.com/unidoc/unipdf/v3/common"
"github.com/unidoc/unipdf/v3/model"
)
// paraList is a sequence of textPara. We use it so often that it is convenient to have its own
// type so we can have methods on it.
type paraList []*textPara
// makeTextPage builds a paraList from `marks`, the textMarks on a page.
func makeTextPage(marks []*textMark, pageSize model.PdfRectangle, rot int) paraList {
common.Log.Trace("makeTextPage: %d elements pageSize=%.2f", len(marks), pageSize)
@ -35,28 +31,21 @@ func makeTextPage(marks []*textMark, pageSize model.PdfRectangle, rot int) paraL
for i, para := range paraStratas {
paras[i] = composePara(para)
}
if verbose || true {
common.Log.Info("unsorted=========----------=====")
for i, para := range paras {
common.Log.Info("paras[%d]=%.2f%q", i, para.PdfRectangle, truncate(paras[i].text(), 200))
}
}
paras.log("unsorted")
// paras.computeEBBoxes()
if useTables {
paras = paras.extractTables()
}
// paras.log("tables extracted")
paras.computeEBBoxes()
paras = paras.extractTables()
paras.log("EBBoxes 2")
// Sort the paras into reading order.
paras.sortReadingOrder()
if verbose || true {
common.Log.Info("para sorted in reading order -----------=========")
for i, para := range paras {
tab := ""
if para.table != nil {
tab = fmt.Sprintf("[%dx%d]", para.table.w, para.table.h)
}
fmt.Printf("%4d: %6.2f %s %q\n", i, para.PdfRectangle, tab, truncate(para.text(), 50))
}
}
paras.log("sorted in reading order")
return paras
}
@ -72,7 +61,7 @@ func dividePage(page *textStrata, pageHeight float64) []*textStrata {
// Some bins are emptied before they iterated to (seee "surving bin" above).
// If a `page` survives until it is iterated to then at least one `para` will be built around it.
if verbose {
if verbosePage {
common.Log.Info("dividePage")
}
cnt := 0
@ -89,7 +78,7 @@ func dividePage(page *textStrata, pageHeight float64) []*textStrata {
firstReadingIdx := page.firstReadingIndex(depthIdx)
words := page.getStratum(firstReadingIdx)
moveWord(firstReadingIdx, page, para, words[0])
if verbose {
if verbosePage {
common.Log.Info("words[0]=%s", words[0].String())
}
@ -105,7 +94,7 @@ func dividePage(page *textStrata, pageHeight float64) []*textStrata {
// Add words that are within maxIntraDepthGap of `para` in the depth direction.
// i.e. Stretch para in the depth direction, vertically for English text.
if verbose {
if verbosePage {
common.Log.Info("para depth %.2f - %.2f maxIntraDepthGap=%.2f ",
para.minDepth(), para.maxDepth(), maxIntraDepthGap)
}
@ -159,6 +148,9 @@ func dividePage(page *textStrata, pageHeight float64) []*textStrata {
// Sort the words in `para`'s bins in the reading direction.
para.sort()
if verbosePage {
common.Log.Info("para=%s", para.String())
}
paraStratas = append(paraStratas, para)
}
}
@ -166,40 +158,11 @@ func dividePage(page *textStrata, pageHeight float64) []*textStrata {
return paraStratas
}
const doHyphens = true
const useTables = true
// writeText writes the text in `paras` to `w`.
func (paras paraList) writeText(w io.Writer) {
for ip, para := range paras {
if useTables {
para.writeText(w)
} else {
for il, line := range para.lines {
s := line.text()
reduced := false
if doHyphens {
if line.hyphenated && (il != len(para.lines)-1 || ip != len(paras)-1) {
// Line ending with hyphen. Remove it.
runes := []rune(s)
s = string(runes[:len(runes)-1])
reduced = true
}
}
w.Write([]byte(s))
if reduced {
// We removed the hyphen from the end of the line so we don't need a line ending.
continue
}
if il < len(para.lines)-1 && isZero(line.depth-para.lines[il+1].depth) {
// Next line is the same depth so it's the same line as this one in the extracted text
w.Write([]byte(" "))
continue
}
w.Write([]byte("\n"))
}
w.Write([]byte("\n"))
}
for _, para := range paras {
para.writeText(w)
w.Write([]byte("\n"))
}
}
@ -208,69 +171,35 @@ func (paras paraList) writeText(w io.Writer) {
func (paras paraList) toTextMarks() []TextMark {
offset := 0
var marks []TextMark
addMark := func(mark TextMark) {
mark.Offset = offset
marks = append(marks, mark)
offset += len(mark.Text)
}
addSpaceMark := func(spaceChar string) {
mark := spaceMark
mark.Text = spaceChar
addMark(mark)
}
for ip, para := range paras {
if useTables {
paraMarks := para.toTextMarks(&offset)
marks = append(marks, paraMarks...)
} else {
for il, line := range para.lines {
lineMarks := line.toTextMarks(&offset)
marks = append(marks, lineMarks...)
reduced := false
if doHyphens {
if line.hyphenated && (il != len(para.lines)-1 || ip != len(paras)-1) {
tm := marks[len(marks)-1]
r := []rune(tm.Text)
if unicode.IsSpace(r[len(r)-1]) {
panic(tm)
}
if len(r) == 1 {
marks = marks[:len(marks)-1]
offset = marks[len(marks)-1].Offset + len(marks[len(marks)-1].Text)
} else {
s := string(r[:len(r)-1])
offset += len(s) - len(tm.Text)
tm.Text = s
}
reduced = true
}
}
if reduced {
continue
}
if il != len(para.lines)-1 && isZero(line.depth-para.lines[il+1].depth) {
// Next line is the same depth so it's the same line as this one in the extracted text
addSpaceMark(" ")
continue
}
addSpaceMark("\n")
}
if ip != len(paras)-1 {
addSpaceMark("\n")
}
}
for _, para := range paras {
paraMarks := para.toTextMarks(&offset)
marks = append(marks, paraMarks...)
marks = appendSpaceMark(marks, &offset, "\n")
}
return marks
}
// toTables returns the public TextTable of every para in `paras` that holds a table, in the order
// of `paras`. Paras without a table are skipped.
func (paras paraList) toTables() []TextTable {
	var tables []TextTable
	for i := range paras {
		table := paras[i].table
		if table == nil {
			continue
		}
		tables = append(tables, table.toTextTable())
	}
	return tables
}
// sortReadingOrder sorts `paras` in reading order.
// The paras are first sorted with diffDepthReading, then reordered according to a topological
// sort of the adjacency matrix returned by adjMatrix.
func (paras paraList) sortReadingOrder() {
	common.Log.Debug("sortReadingOrder: paras=%d ===========x=============", len(paras))
	// Zero or one paras are already in order.
	if len(paras) <= 1 {
		return
	}
	// The less function must be a strict ordering: it must report false for equal elements.
	sort.Slice(paras, func(i, j int) bool { return diffDepthReading(paras[i], paras[j]) < 0 })
	paras.log("diffReadingDepth")
	adj := paras.adjMatrix()
	order := topoOrder(adj)
	// Debug output only. printAdj prints nothing unless verbosePage is set.
	printAdj(adj)
	paras.reorder(order)
}
@ -290,22 +219,23 @@ func (paras paraList) adjMatrix() [][]bool {
adj[i][j], reasons[i][j] = paras.before(i, j)
}
}
if verbose && false {
if verbosePage {
show := func(a *textPara) string {
return fmt.Sprintf("%6.2f %q", a.eBBox, truncate(a.text(), 70))
}
common.Log.Info("adjMatrix =======")
for i := 0; i < n; i++ {
a := paras[i]
fmt.Printf("%4d: %q %.2f\n", i, truncate(a.text(), 50), a.PdfRectangle)
fmt.Printf("%4d: %s\n", i, show(a))
for j := 0; j < n; j++ {
if i == j {
continue
}
if !adj[i][j] {
if !adj[i][j] && i != 16 {
continue
}
b := paras[j]
fmt.Printf("%8d: %10s %q %.2f\n", j,
reasons[i][j], truncate(b.text(), 40), b.PdfRectangle)
fmt.Printf("%8d: %t %10s %s\n", j, adj[i][j], reasons[i][j], show(b))
}
}
}
@ -344,7 +274,7 @@ func (paras paraList) before(i, j int) (bool, string) {
continue
}
if overlappedXPara(a, c) && overlappedXPara(c, b) {
return false, "Y intervening"
return false, fmt.Sprintf("Y intervening: %d: %s", k, c)
}
}
return true, "TO LEFT"
@ -358,13 +288,21 @@ func overlappedXPara(r0, r1 *textPara) bool {
// computeEBBoxes computes the eBBox fields in the elements of `paras`.
func (paras paraList) computeEBBoxes() {
common.Log.Trace("computeEBBoxes:")
if verbose {
common.Log.Info("computeEBBoxes:")
}
for i, a := range paras {
// [llx, urx] is the reading direction interval for which no paras overlap `a`
for _, para := range paras {
para.eBBox = para.PdfRectangle
}
for i, aa := range paras {
a := aa.eBBox
// [llx, urx] is the reading direction interval for which no paras overlap `a`.
llx := -1.0e9
urx := +1.0e9
for j, b := range paras {
for j, bb := range paras {
b := bb.eBBox
if i == j || !(a.Lly <= b.Ury && b.Lly <= a.Ury) {
continue
}
@ -385,27 +323,65 @@ func (paras paraList) computeEBBoxes() {
// Go through all paras below `a` within interval [llx, urx] in the reading direction and
// expand `a` as far as possible to left and right without overlapping any of them.
a.eBBox = a.PdfRectangle
for j, b := range paras {
for j, bb := range paras {
b := bb.eBBox
if i == j || b.Ury > a.Lly {
continue
}
// If `b` is completely to right of `llx`, extend `a` left to `b`.
if llx <= b.Llx {
a.eBBox.Llx = math.Min(a.eBBox.Llx, b.Llx)
a.Llx = math.Min(a.Llx, b.Llx)
}
// If `b` is completely to left of `urx`, extend `a` right to `b`.
if b.Urx <= urx {
a.eBBox.Urx = math.Max(a.eBBox.Urx, b.Urx)
a.Urx = math.Max(a.Urx, b.Urx)
}
}
if verbose {
fmt.Printf("%4d: %6.2f->%6.2f %q\n", i, aa.eBBox, a, truncate(aa.text(), 50))
}
aa.eBBox = a
}
if useEBBox {
for _, para := range paras {
para.PdfRectangle = para.eBBox
}
}
}
// printAdj prints `adj` to stdout as a grid with a header row of column indexes and one row per
// node, marking each true entry with "X". It does nothing unless verbosePage is set.
func printAdj(adj [][]bool) {
	if !verbosePage {
		return
	}
	common.Log.Info("printAdj:")
	n := len(adj)

	// Header row.
	fmt.Printf("%3s:", "")
	for col := 0; col < n; col++ {
		fmt.Printf("%3d", col)
	}
	fmt.Println()

	// One row per node.
	for row := 0; row < n; row++ {
		fmt.Printf("%3d:", row)
		for col := 0; col < n; col++ {
			if adj[row][col] {
				fmt.Printf("%3s", "X")
			} else {
				fmt.Printf("%3s", "")
			}
		}
		fmt.Println()
	}
}
// topoOrder returns the ordering of the topological sort of the nodes with adjacency matrix `adj`.
func topoOrder(adj [][]bool) []int {
if verbosePage {
common.Log.Info("topoOrder:")
}
n := len(adj)
visited := make([]bool, n)
var order []int
@ -427,11 +403,16 @@ func topoOrder(adj [][]bool) []int {
sortNode(idx)
}
}
// Order is currently reversed so change it to forward order.
for i := 0; i < n/2; i++ {
order[i], order[n-1-i] = order[n-1-i], order[i]
return reversed(order)
}
// reversed returns a new slice containing the elements of `order` in reverse order.
// `order` itself is not modified.
func reversed(order []int) []int {
	rev := make([]int, len(order))
	for i, v := range order {
		rev[len(order)-1-i] = v
	}
	return rev
}
// reorder reorders `para` to the order in `order`.

View File

@ -12,9 +12,14 @@ import (
"sort"
"unicode"
"github.com/unidoc/unipdf/v3/common"
"github.com/unidoc/unipdf/v3/model"
)
// paraList is a sequence of textPara. We use it so often that it is convenient to have its own
// type so we can have methods on it.
type paraList []*textPara
// textPara is a group of words in a rectangular region of a page that get read together.
// A paragraph in a document might span multiple pages. This is the paragraph fragment on one page.
// We start by finding paragraph regions on a page, then we break the words into the textPara into
@ -22,7 +27,7 @@ import (
type textPara struct {
	serial             int                // Sequence number for debugging.
	model.PdfRectangle                    // Bounding box.
	eBBox              model.PdfRectangle // Extended bounding box needed to compute reading order.
	lines              []*textLine        // Paragraph text gets broken into lines.
	table              *textTable         // Table held by this para. nil if the para is not a table.
}
@ -39,8 +44,8 @@ func newTextPara(strata *textStrata) *textPara {
// String returns a description of `p`: its serial number, bounding box, number of lines and the
// first part of its text (truncated with truncate(..., 50)), on a single line.
func (p *textPara) String() string {
	return fmt.Sprintf("serial=%d %.2f %d lines %q",
		p.serial, p.PdfRectangle, len(p.lines), truncate(p.text(), 50))
}
// text returns the text of the lines in `p`.
@ -52,47 +57,21 @@ func (p *textPara) text() string {
// writeText writes the text of `p` including tables to `w`.
func (p *textPara) writeText(w io.Writer) {
if p.table != nil {
for y := 0; y < p.table.h; y++ {
for x := 0; x < p.table.w; x++ {
cell := p.table.cells[y*p.table.w+x]
cell.writeCellText(w)
w.Write([]byte(" "))
}
w.Write([]byte("\n"))
}
} else {
if p.table == nil {
p.writeCellText(w)
w.Write([]byte("\n"))
return
}
}
// writeCellText writes the text of `p` not including tables to `w`.
func (p *textPara) writeCellText(w io.Writer) {
// w := new(bytes.Buffer)
para := p
for il, line := range para.lines {
s := line.text()
reduced := false
if doHyphens {
if line.hyphenated && il != len(para.lines)-1 {
// Line ending with hyphen. Remove it.
runes := []rune(s)
s = string(runes[:len(runes)-1])
reduced = true
for y := 0; y < p.table.h; y++ {
for x := 0; x < p.table.w; x++ {
cell := p.table.get(x, y)
if cell == nil {
w.Write([]byte("\t"))
} else {
cell.writeCellText(w)
}
}
w.Write([]byte(s))
if reduced {
// We removed the hyphen from the end of the line so we don't need a line ending.
continue
}
if il < len(para.lines)-1 && isZero(line.depth-para.lines[il+1].depth) {
// Next line is the same depth so it's the same line as this one in the extracted text
w.Write([]byte(" "))
continue
}
if il < len(para.lines)-1 {
if y < p.table.h-1 {
w.Write([]byte("\n"))
}
}
@ -101,90 +80,103 @@ func (p *textPara) writeCellText(w io.Writer) {
// toTextMarks creates the TextMarkArray corresponding to the extracted text created by
// paras `p`.writeText().
func (p *textPara) toTextMarks(offset *int) []TextMark {
if p.table == nil {
return p.toCellTextMarks(offset)
}
var marks []TextMark
addMark := func(mark TextMark) {
mark.Offset = *offset
marks = append(marks, mark)
*offset += len(mark.Text)
}
addSpaceMark := func(spaceChar string) {
mark := spaceMark
mark.Text = spaceChar
addMark(mark)
}
if p.table != nil {
for y := 0; y < p.table.h; y++ {
for x := 0; x < p.table.w; x++ {
cell := p.table.cells[y*p.table.w+x]
for y := 0; y < p.table.h; y++ {
for x := 0; x < p.table.w; x++ {
cell := p.table.get(x, y)
if cell == nil {
marks = appendSpaceMark(marks, offset, "\t")
} else {
cellMarks := cell.toCellTextMarks(offset)
marks = append(marks, cellMarks...)
addSpaceMark(" ")
}
addSpaceMark("\n")
marks = appendSpaceMark(marks, offset, " ")
}
if y < p.table.h-1 {
marks = appendSpaceMark(marks, offset, "\n")
}
} else {
marks = p.toCellTextMarks(offset)
addSpaceMark("\n")
}
return marks
}
// writeCellText writes the text of `p` not including tables to `w`.
// When doHyphens is set, the trailing hyphen of a hyphenated line that is not the last line is
// removed and no separator is written after it. Otherwise consecutive lines are separated by
// getSpace() of their depths. Nothing is written after the last line.
func (p *textPara) writeCellText(w io.Writer) {
	last := len(p.lines) - 1
	for il, line := range p.lines {
		lineText := line.text()
		isLast := il == last
		dehyphenate := doHyphens && line.hyphenated && !isLast
		if dehyphenate {
			lineText = removeLastRune(lineText)
		}
		w.Write([]byte(lineText))
		if dehyphenate || isLast {
			continue
		}
		w.Write([]byte(getSpace(line.depth, p.lines[il+1].depth)))
	}
}
// toCellTextMarks creates the TextMarkArray corresponding to the extracted text created by
// paras `paras`.writeCellText().
func (p *textPara) toCellTextMarks(offset *int) []TextMark {
var marks []TextMark
addMark := func(mark TextMark) {
mark.Offset = *offset
marks = append(marks, mark)
*offset += len(mark.Text)
}
addSpaceMark := func(spaceChar string) {
mark := spaceMark
mark.Text = spaceChar
addMark(mark)
}
para := p
for il, line := range para.lines {
for il, line := range p.lines {
lineMarks := line.toTextMarks(offset)
marks = append(marks, lineMarks...)
reduced := false
if doHyphens {
if line.hyphenated && il != len(para.lines)-1 {
tm := marks[len(marks)-1]
r := []rune(tm.Text)
if unicode.IsSpace(r[len(r)-1]) {
panic(tm)
}
if len(r) == 1 {
marks = marks[:len(marks)-1]
*offset = marks[len(marks)-1].Offset + len(marks[len(marks)-1].Text)
} else {
s := string(r[:len(r)-1])
*offset += len(s) - len(tm.Text)
tm.Text = s
}
reduced = true
reduced := doHyphens && line.hyphenated && il != len(p.lines)-1
if reduced { // Line ending with hyphen. Remove it.
if len([]rune(line.text())) < minHyphenation {
panic(line.text())
}
if len(lineMarks) < 1 {
panic(line.text())
}
lineMarks = removeLastTextMarkRune(lineMarks, offset)
}
if reduced {
continue
}
if il < len(para.lines)-1 && isZero(line.depth-para.lines[il+1].depth) {
// Next line is the same depth so it's the same line as this one in the extracted text
addSpaceMark(" ")
continue
}
if il < len(para.lines)-1 {
addSpaceMark("\n")
marks = append(marks, lineMarks...)
if !(reduced || il == len(p.lines)-1) {
marks = appendSpaceMark(marks, offset, getSpace(line.depth, p.lines[il+1].depth))
}
}
addSpaceMark("\n")
return marks
}
// removeLastTextMarkRune removes the last rune from the text covered by `marks` and returns the
// updated slice. `*offset` is moved back so that it again points just past the end of the
// returned marks.
//
// If the last mark holds a single rune, that mark is dropped entirely. Otherwise the mark's text
// is shortened by one rune and the shortened mark is stored back in the slice.
// An empty `marks` is returned unchanged. The function panics if the last mark's text is empty or
// if the rune to be removed is white space, as callers only remove line-ending hyphens.
func removeLastTextMarkRune(marks []TextMark, offset *int) []TextMark {
	if len(marks) == 0 {
		return marks
	}
	last := len(marks) - 1
	tm := marks[last] // A copy of the slice element; changes must be written back.
	runes := []rune(tm.Text)
	if unicode.IsSpace(runes[len(runes)-1]) {
		panic(tm)
	}
	if len(runes) == 1 {
		// The whole mark disappears with its only rune.
		marks = marks[:last]
		if len(marks) == 0 {
			// No earlier mark to measure from: rewind to where the removed mark started.
			*offset = tm.Offset
			return marks
		}
		tm1 := marks[len(marks)-1]
		*offset = tm1.Offset + len(tm1.Text)
		return marks
	}
	text := removeLastRune(tm.Text)
	*offset += len(text) - len(tm.Text)
	tm.Text = text
	// Store the shortened mark, otherwise the returned marks still contain the removed rune.
	marks[last] = tm
	return marks
}
// removeLastRune returns `text` with its final rune removed.
// It panics if `text` contains fewer than 2 runes.
func removeLastRune(text string) string {
	rs := []rune(text)
	n := len(rs)
	if n >= 2 {
		return string(rs[:n-1])
	}
	panic(text)
}
// getSpace returns the separator to insert between consecutive lines of depth `depth1` and
// `depth2`. Lines whose depths differ by zero (as judged by isZero) are on the same line of the
// extracted text and are separated by a space; all other lines are separated by a line feed.
func getSpace(depth1, depth2 float64) string {
	if isZero(depth1 - depth2) {
		return " "
	}
	return "\n"
}
// bbox makes textPara implement the `bounded` interface.
func (p *textPara) bbox() model.PdfRectangle {
return p.PdfRectangle
@ -271,5 +263,42 @@ func composePara(strata *textStrata) *textPara {
if len(para.lines) == 0 {
panic(para)
}
if verbosePara {
common.Log.Info("!!! para=%s", para.String())
for i, line := range para.lines {
fmt.Printf("%4d: %s\n", i, line)
for j, word := range line.words {
fmt.Printf("%8d: %s\n", j, word)
for k, mark := range word.marks {
fmt.Printf("%12d: %s\n", k, mark)
}
}
}
}
return para
}
// log logs the contents of `paras` under the heading `title`.
// It does nothing unless verbosePage is set. nil paras are skipped. After printing a para, it
// panics if the para's text is empty or if the para has a table with no cells.
func (paras paraList) log(title string) {
	if !verbosePage {
		return
	}
	common.Log.Info("%8s: %d paras =======-------=======", title, len(paras))
	for idx, p := range paras {
		if p == nil {
			continue
		}
		contents := p.text()
		// Table dimensions, or a blank placeholder for paras without a table.
		shape := " "
		if tbl := p.table; tbl != nil {
			shape = fmt.Sprintf("[%dx%d]", tbl.w, tbl.h)
		}
		fmt.Printf("%4d: %6.2f %s %q\n", idx, p.PdfRectangle, shape, truncate(contents, 50))
		switch {
		case len(contents) == 0:
			panic("empty")
		case p.table != nil && len(p.table.cells) == 0:
			panic(p)
		}
	}
}

View File

@ -38,14 +38,14 @@ func makeTextStrata(words []*textWord, pageHeight float64) *textStrata {
// newTextStrata returns an empty textStrata with page height `pageHeight`.
func newTextStrata(pageHeight float64) *textStrata {
bins := textStrata{
serial: serial.bins,
strata := textStrata{
serial: serial.strata,
bins: map[int][]*textWord{},
PdfRectangle: model.PdfRectangle{Urx: -1.0, Ury: -1.0},
pageHeight: pageHeight,
}
serial.bins++
return &bins
serial.strata++
return &strata
}
// String returns a description of `s`.
@ -57,7 +57,9 @@ func (s *textStrata) String() string {
texts = append(texts, w.text())
}
}
return fmt.Sprintf("serial=%d %d %q", s.serial, len(texts), texts)
// return fmt.Sprintf("serial=%d %d %q", s.serial, )
return fmt.Sprintf("serial=%d %.2f fontsize=%.2f %d %q",
s.serial, s.PdfRectangle, s.fontsize, len(texts), texts)
}
// sort sorts the words in each bin in `s` in the reading direction.
@ -129,10 +131,24 @@ func (s *textStrata) scanBand(title string, para *textStrata,
if !readingOverlap(para, word) {
continue
}
if fontTol > 0 && math.Abs(word.fontsize-fontsize) > fontTol*fontsize {
continue
fontRatio1 := math.Abs(word.fontsize-fontsize) / fontsize
fontRatio2 := word.fontsize / fontsize
fontRatio := math.Min(fontRatio1, fontRatio2)
if fontTol > 0 {
if fontRatio > fontTol {
continue
}
}
if fontTol <= 0 {
panic(fontTol)
}
if !detectOnly {
// if !para.isHomogenous(word) {
// panic(fmt.Errorf("not homogeneous fontTol=%.2f ratio=%.2f (%.2f->%.2f)\n\tpara=%s\n\tword=%s",
// fontTol, fontRatio, fontsize, word.fontsize,
// para.String(), word.String()))
// }
moveWord(depthIdx, s, para, word)
}
newWords = append(newWords, word)
@ -155,11 +171,11 @@ func (s *textStrata) scanBand(title string, para *textStrata,
}
if verbose {
if len(title) > 0 {
common.Log.Info("scanBand: %s [%.2f %.2f]->[%.2f %.2f] para=%.2f",
common.Log.Info("scanBand: %s [%.2f %.2f]->[%.2f %.2f] para=%.2f fontsize=%.2f",
title,
minDepth0, maxDepth0,
minDepth, maxDepth,
para.PdfRectangle)
para.PdfRectangle, para.fontsize)
for i, word := range newWords {
fmt.Printf("%4d: %s\n", i, word)
}
@ -271,6 +287,36 @@ func moveWord(depthIdx int, page, para *textStrata, word *textWord) {
page.removeWord(depthIdx, word)
}
// allWords returns the words from every bin of `s` in a single slice.
// Bins are visited in map iteration order, so the order of words from different bins is not
// defined. The result is nil when `s` has no words.
func (s *textStrata) allWords() []*textWord {
	var all []*textWord
	for depthIdx := range s.bins {
		all = append(all, s.bins[depthIdx]...)
	}
	return all
}
// isHomogenous returns true if the font sizes of the words in `s` together with `w` are
// similar, i.e. the largest font size is no more than 1.3 times the smallest.
// An error is logged when the sizes are too far apart.
func (s *textStrata) isHomogenous(w *textWord) bool {
	// The candidate list always contains at least `w`, so indexing element 0 is safe.
	candidates := append(s.allWords(), w)
	smallest := candidates[0].fontsize
	largest := smallest
	for _, word := range candidates {
		switch size := word.fontsize; {
		case size < smallest:
			smallest = size
		case size > largest:
			largest = size
		}
	}
	if ratio := largest / smallest; ratio > 1.3 {
		common.Log.Error("font size range: %.2f - %.2f = %.1fx", smallest, largest, ratio)
		return false
	}
	return true
}
// removeWord removes `word` from `s`.bins[`depthIdx`].
// NOTE: We delete bins as soon as they become empty to save code that calls other textStrata
// functions from having to check for empty bins.

File diff suppressed because it is too large Load Diff

View File

@ -175,7 +175,7 @@ func TestTermMarksFiles(t *testing.T) {
if !doStress {
t.Skip("skipping stress test")
}
common.Log.Info("Running text stress tests. go test --short to skip these.")
common.Log.Info("Running text stress tests.")
if len(corpusFolder) == 0 && !forceTest {
t.Log("Corpus folder not set - skipping")
return
@ -736,6 +736,11 @@ func testTermMarks(t *testing.T, text string, textMarks *TextMarkArray, n int) {
ofs1d = len(text)
}
show := fmt.Sprintf("<%s|%s|%s>", text[ofs0d:ofs0], text[ofs0:ofs1], text[ofs1:ofs1d])
{
show = fmt.Sprintf("%q", show)
runes := []rune(show)
show = string(runes[1 : len(runes)-1])
}
// Get TextMarks spanning `term` with RangeOffset().
spanArray, err := textMarks.RangeOffset(ofs0, ofs1)
@ -783,6 +788,7 @@ func startWith(str, sub string) bool {
if strings.HasPrefix(str, sub[n:]) {
return true
}
// common.Log.Error("!startsWith: str=%q sub=%q sub[%d:]=%q", str, sub, n, sub[n:])
}
return false
}

View File

@ -170,6 +170,19 @@ func (w *textWord) text() string {
return strings.Join(texts, "")
}
// toTextMarks returns the TextMarks for the marks that make up `w`.
// `offset` is passed to appendTextMark for every mark so that the TextMarks get the correct
// Offset values. It panics if `w` has text but produced no TextMarks.
func (w *textWord) toTextMarks(offset *int) []TextMark {
	var out []TextMark
	for _, m := range w.marks {
		out = appendTextMark(out, offset, m.ToTextMark())
	}
	if text := w.text(); len(text) > 0 && len(out) == 0 {
		panic(text)
	}
	return out
}
// font returns the fontID of the `idx`th rune in text.
// compute on creation? !@#$
func (w *textWord) font(idx int) string {

View File

@ -22,7 +22,7 @@ const (
// MissingCodeRune replaces runes that can't be decoded. '\ufffd' = <20>. Was '?'.
MissingCodeRune = '\ufffd' // <20>
// MissingCodeRune replaces strings that can't be decoded.
// MissingCodeString replaces strings that can't be decoded.
MissingCodeString = string(MissingCodeRune)
)
@ -44,7 +44,7 @@ type charRange struct {
type fbRange struct {
code0 CharCode
code1 CharCode
r0 rune // TODO (peterwilliams97): Change to string for compound codes.
r0 string
}
// CIDSystemInfo contains information for identifying the character collection
@ -110,8 +110,7 @@ type CMap struct {
// Used by ctype 2 CMaps.
codeToUnicode map[CharCode]string // CID -> Unicode string
// XXXX(peterwilliams97): Should unicodeToCode be the inverse of codeToUnicode?
unicodeToCode map[rune]CharCode // Unicode rune -> CID
unicodeToCode map[string]CharCode // Unicode rune -> CID
// cached contains the raw CMap data. It is used by the Bytes method in
// order to avoid generating the data for every call.
@ -137,10 +136,10 @@ func NewToUnicodeCMap(codeToRune map[CharCode]rune) *CMap {
Supplement: 0,
},
codespaces: []Codespace{{Low: 0, High: 0xffff}},
codeToCID: make(map[CharCode]CharCode),
cidToCode: make(map[CharCode]CharCode),
codeToUnicode: codeToUnicode,
unicodeToCode: make(map[rune]CharCode),
unicodeToCode: make(map[string]CharCode, len(codeToRune)),
codeToCID: make(map[CharCode]CharCode, len(codeToRune)),
cidToCode: make(map[CharCode]CharCode, len(codeToRune)),
}
cmap.computeInverseMappings()
@ -159,7 +158,7 @@ func newCMap(isSimple bool) *CMap {
codeToCID: make(map[CharCode]CharCode),
cidToCode: make(map[CharCode]CharCode),
codeToUnicode: make(map[CharCode]string),
unicodeToCode: make(map[rune]CharCode),
unicodeToCode: make(map[string]CharCode),
}
}
@ -265,13 +264,8 @@ func (cmap *CMap) computeInverseMappings() {
// Generate Unicode -> CID map.
for cid, s := range cmap.codeToUnicode {
// The CMap entries can be empty e.g. dobe_supplement_iso32000_1.pdf
if len(s) == 0 {
continue
}
r := rune0(s)
if c, ok := cmap.unicodeToCode[r]; !ok || (ok && c > cid) {
cmap.unicodeToCode[r] = cid
if c, ok := cmap.unicodeToCode[s]; !ok || (ok && c > cid) {
cmap.unicodeToCode[s] = cid
}
}
@ -326,10 +320,10 @@ func (cmap *CMap) CharcodeToUnicode(code CharCode) (string, bool) {
return MissingCodeString, false
}
// RuneToCID maps the specified rune to a character identifier. If the provided
// rune has no available mapping, the second return value is false.
func (cmap *CMap) RuneToCID(r rune) (CharCode, bool) {
cid, ok := cmap.unicodeToCode[r]
// StringToCID looks up the character identifier that `s` maps to in the unicode to code table of
// `cmap`. The bool return value is false, and the returned identifier is 0, if `s` has no
// mapping.
func (cmap *CMap) StringToCID(s string) (CharCode, bool) {
	if code, found := cmap.unicodeToCode[s]; found {
		return code, true
	}
	return 0, false
}
@ -484,10 +478,10 @@ func (cmap *CMap) toBfData() string {
// character codes have been mapped to code ranges.
var charRanges []charRange
currCharRange := charRange{codes[0], codes[0]}
prevRune := rune0(cmap.codeToUnicode[codes[0]])
prevRune := cmap.codeToUnicode[codes[0]]
for _, c := range codes[1:] {
currRune := rune0(cmap.codeToUnicode[c])
if c == currCharRange.code1+1 && currRune == prevRune+1 {
currRune := cmap.codeToUnicode[c]
if c == currCharRange.code1+1 && lastRune(currRune) == lastRune(prevRune)+1 {
currCharRange.code1 = c
} else {
charRanges = append(charRanges, currCharRange)
@ -507,7 +501,7 @@ func (cmap *CMap) toBfData() string {
fbRanges = append(fbRanges, fbRange{
code0: cr.code0,
code1: cr.code1,
r0: rune0(cmap.codeToUnicode[cr.code0]),
r0: cmap.codeToUnicode[cr.code0],
})
}
}
@ -522,8 +516,8 @@ func (cmap *CMap) toBfData() string {
lines = append(lines, fmt.Sprintf("%d beginbfchar", n))
for j := 0; j < n; j++ {
code := fbChars[i*maxBfEntries+j]
r := rune0(cmap.codeToUnicode[code])
lines = append(lines, fmt.Sprintf("<%04x> <%04x>", code, r))
s := cmap.codeToUnicode[code]
lines = append(lines, fmt.Sprintf("<%04x> %s", code, hexCode(s)))
}
lines = append(lines, "endbfchar")
}
@ -535,8 +529,8 @@ func (cmap *CMap) toBfData() string {
lines = append(lines, fmt.Sprintf("%d beginbfrange", n))
for j := 0; j < n; j++ {
rng := fbRanges[i*maxBfEntries+j]
r := rng.r0
lines = append(lines, fmt.Sprintf("<%04x><%04x> <%04x>", rng.code0, rng.code1, r))
lines = append(lines, fmt.Sprintf("<%04x><%04x> %s",
rng.code0, rng.code1, hexCode(rng.r0)))
}
lines = append(lines, "endbfrange")
}
@ -544,6 +538,22 @@ func (cmap *CMap) toBfData() string {
return strings.Join(lines, "\n")
}
// lastRune returns the last rune in `s`.
// If `s` is empty it returns the Unicode replacement character U+FFFD instead of panicking,
// because the strings passed in come from CMap entries, which can be empty.
// Invalid UTF-8 bytes in `s` decode to U+FFFD, as they do in a []rune conversion.
func lastRune(s string) rune {
	last := rune(0xfffd)
	// Ranging over the string decodes it rune by rune without allocating a rune slice.
	for _, r := range s {
		last = r
	}
	return last
}
// hexCode returns the CMap hex code for `s`, e.g. "<00660069>" for "fi".
// Every rune is written as big-endian UTF-16 code units of 4 hex digits each. A rune above
// U+FFFF is written as a surrogate pair (8 hex digits) rather than as a single 5 or 6 digit
// number, which would give a hex string that does not split into 2 byte units.
func hexCode(s string) string {
	var b strings.Builder
	b.WriteByte('<')
	for _, r := range s {
		if r > 0xffff {
			// Split the supplementary plane rune into its high and low surrogates.
			r -= 0x10000
			fmt.Fprintf(&b, "%04x%04x", 0xd800+(r>>10), 0xdc00+(r&0x3ff))
			continue
		}
		fmt.Fprintf(&b, "%04x", r)
	}
	b.WriteByte('>')
	return b.String()
}
const (
maxBfEntries = 100 // Maximum number of entries in a bfchar or bfrange section.
cmapHeader = `
@ -563,9 +573,3 @@ end
end
`
)
// rune0 is a convenience function that returns the first rune in `s`.
// Caller must check that `s` is not empty.
func rune0(s string) rune {
return ([]rune(s))[0]
}

View File

@ -105,7 +105,7 @@ func (cmap *CMap) parse() error {
func (cmap *CMap) parseName() error {
name := ""
done := false
// /Users/peter/testdata/programming/pdf_text/columns/Berg.pdf
// NOTE(peterwilliams97): We need up to 20 iterations of this loop for some PDFs I have seen.
for i := 0; i < 20 && !done; i++ {
o, err := cmap.parseObject()
if err != nil {

View File

@ -67,7 +67,7 @@ func (enc CMapEncoder) RuneToCharcode(r rune) (CharCode, bool) {
}
// Map rune to CID.
cid, ok := enc.cidToUnicode.RuneToCID(r)
cid, ok := enc.cidToUnicode.StringToCID(string(r))
if !ok {
return 0, false
}

View File

@ -23,7 +23,7 @@ const (
// MissingCodeRune replaces runes that can't be decoded.
MissingCodeRune = '\ufffd' // <20>
// MissingCodeRune replaces strings that can't be decoded.
// MissingCodeString replaces strings that can't be decoded.
MissingCodeString = string(MissingCodeRune)
)

View File

@ -421,31 +421,26 @@ func (font *PdfFont) BytesToCharcodes(data []byte) []textencoding.CharCode {
return charcodes
}
// CharcodesToUnicodeWithStats is identical to CharcodesToUnicode except returns more statistical
// CharcodesToUnicodeWithStats is identical to CharcodesToUnicode except it returns more statistical
// information about hits and misses from the reverse mapping process.
// NOTE: The number of runes returned may be greater than the number of charcodes.
// TODO(peterwilliams97): Deprecate?
// TODO(peterwilliams97): Deprecate in v4 and use only CharcodesToStrings()
func (font *PdfFont) CharcodesToUnicodeWithStats(charcodes []textencoding.CharCode) (runelist []rune, numHits, numMisses int) {
runeSlices, numHits, numMisses := font.CharcodesToRuneSlices(charcodes)
var runes []rune
for _, r := range runeSlices {
runes = append(runes, r...)
}
return runes, numHits, numMisses
texts, numHits, numMisses := font.CharcodesToStrings(charcodes)
return []rune(strings.Join(texts, "")), numHits, numMisses
}
// CharcodesToRuneSlices returns the unicode strings corresponding to `charcodes` as rune slices.
// The int return is the number of unconvereted codes.
// NOTE: The number of rune slices returned is equal to the number of charcodes
func (font *PdfFont) CharcodesToRuneSlices(charcodes []textencoding.CharCode) ([][]rune, int, int) {
// CharcodesToStrings returns the unicode strings corresponding to `charcodes`.
// The int returns are the number of strings and the number of unconverted codes.
// NOTE: The number of strings returned is equal to the number of charcodes
func (font *PdfFont) CharcodesToStrings(charcodes []textencoding.CharCode) ([]string, int, int) {
fontBase := font.baseFields()
runeSlices := make([][]rune, 0, len(charcodes))
texts := make([]string, 0, len(charcodes))
numMisses := 0
for _, code := range charcodes {
if fontBase.toUnicodeCmap != nil {
if s, ok := fontBase.toUnicodeCmap.CharcodeToUnicode(cmap.CharCode(code)); ok {
runeSlices = append(runeSlices, []rune(s))
// common.Log.Info("CharcodesToRuneSlices1: code=%d s=`%s`", code, s)
texts = append(texts, s)
continue
}
}
@ -454,9 +449,7 @@ func (font *PdfFont) CharcodesToRuneSlices(charcodes []textencoding.CharCode) ([
encoder := font.Encoder()
if encoder != nil {
if r, ok := encoder.CharcodeToRune(code); ok {
runeSlices = append(runeSlices, []rune{r})
// common.Log.Info("CharcodesToRuneSlices2: code=%d s=%q encoder=%s",
// code, string(r), encoder.String())
texts = append(texts, string(r))
continue
}
}
@ -465,7 +458,7 @@ func (font *PdfFont) CharcodesToRuneSlices(charcodes []textencoding.CharCode) ([
"\tfont=%s\n\tencoding=%s",
code, charcodes, fontBase.isCIDFont(), font, encoder)
numMisses++
runeSlices = append(runeSlices, []rune{cmap.MissingCodeRune})
texts = append(texts, cmap.MissingCodeString)
}
if numMisses != 0 {
@ -475,7 +468,7 @@ func (font *PdfFont) CharcodesToRuneSlices(charcodes []textencoding.CharCode) ([
len(charcodes), numMisses, font)
}
return runeSlices, len(runeSlices), numMisses
return texts, len(texts), numMisses
}
// CharcodeBytesToUnicode converts PDF character codes `data` to a Go unicode string.
@ -499,8 +492,8 @@ func (font *PdfFont) CharcodeBytesToUnicode(data []byte) (string, int, int) {
// 1) Use the ToUnicode CMap if there is one.
// 2) Use the underlying font's encoding.
func (font *PdfFont) CharcodesToUnicode(charcodes []textencoding.CharCode) []rune {
strlist, _, _ := font.CharcodesToUnicodeWithStats(charcodes)
return strlist
runes, _, _ := font.CharcodesToUnicodeWithStats(charcodes)
return runes
}
// RunesToCharcodeBytes maps the provided runes to charcode bytes and it