Merge branch 'development' of https://github.com/unidoc/unipdf into columns

2025-05-13 19:29:10 +08:00 · 2020-06-05 11:43:04 +10:00 · 2020-06-05 11:43:04 +10:00 · 29f2d9b8cf
commit 29f2d9b8cf
parent 40806d7f96 5777ee1394
18 changed files with 1413 additions and 780 deletions
--- a/annotator/field_appearance.go
+++ b/annotator/field_appearance.go
@ -13,6 +13,7 @@ import (

 	"github.com/unidoc/unipdf/v3/common"
 	"github.com/unidoc/unipdf/v3/contentstream"
+	"github.com/unidoc/unipdf/v3/contentstream/draw"
 	"github.com/unidoc/unipdf/v3/core"
 	"github.com/unidoc/unipdf/v3/internal/textencoding"
 	"github.com/unidoc/unipdf/v3/model"
@ -175,12 +176,14 @@ func genFieldTextAppearance(wa *model.PdfAnnotationWidget, ftxt *model.PdfFieldT
 	width := rect.Width()
 	height := rect.Height()

+	var rotation float64
 	if mkDict, has := core.GetDict(wa.MK); has {
 		bsDict, _ := core.GetDict(wa.BS)
 		err := style.applyAppearanceCharacteristics(mkDict, bsDict, nil)
 		if err != nil {
 			return nil, err
 		}
+		rotation, _ = core.GetNumberAsFloat(mkDict.Get("R"))
 	}

 	// Get and process the default appearance string (DA) operands.
@ -192,6 +195,7 @@ func genFieldTextAppearance(wa *model.PdfAnnotationWidget, ftxt *model.PdfFieldT
 	}

 	cc := contentstream.NewContentCreator()
+
 	if style.BorderSize > 0 {
 		drawRect(cc, style, width, height)
 	}
@ -205,6 +209,28 @@ func genFieldTextAppearance(wa *model.PdfAnnotationWidget, ftxt *model.PdfFieldT

 	cc.Add_BMC("Tx")
 	cc.Add_q()
+
+	bboxWidth, bboxHeight := width, height
+	if rotation != 0 {
+		// Calculate bounding box before rotation.
+		revRotation := -rotation
+		bbox := draw.Path{Points: []draw.Point{
+			draw.NewPoint(0, 0).Rotate(revRotation),
+			draw.NewPoint(width, 0).Rotate(revRotation),
+			draw.NewPoint(0, height).Rotate(revRotation),
+			draw.NewPoint(width, height).Rotate(revRotation),
+		}}.GetBoundingBox()
+
+		// Update width and height, as the appearance is generated based on
+		// the bounding box of the annotation with no rotation.
+		width = bbox.Width
+		height = bbox.Height
+
+		// Apply rotation.
+		cc.RotateDeg(rotation)
+		cc.Translate(bbox.X, bbox.Y)
+	}
+
 	// Graphic state changes.
 	cc.Add_BT()

@ -461,7 +487,7 @@ func genFieldTextAppearance(wa *model.PdfAnnotationWidget, ftxt *model.PdfFieldT

 	xform := model.NewXObjectForm()
 	xform.Resources = resources
-	xform.BBox = core.MakeArrayFromFloats([]float64{0, 0, width, height})
+	xform.BBox = core.MakeArrayFromFloats([]float64{0, 0, bboxWidth, bboxHeight})
 	xform.SetContentStream(cc.Bytes(), defStreamEncoder())

 	apDict := core.MakeDict()
--- a/extractor/README.md
+++ b/extractor/README.md
@ -62,3 +62,54 @@ bruce.pdf for char spacing save/restore.

 challenging-modified.pdf
 transitions_test.pdf
+
+
+Code Restructure?
+-----------------
+```
+	type textPara struct {
+		serial             int                // Sequence number for debugging.
+		model.PdfRectangle                    // Bounding box.
+		w, h   int
+		cells []textCell
+	}
+
+	type textCell struct {
+		serial             int                // Sequence number for debugging.
+		model.PdfRectangle                    // Bounding box.
+		eBBox              model.PdfRectangle // Extended bounding box needed to compute reading order.
+		lines              []*textLine        // Paragraph text gets broken into lines.
+	}
+```
+
+  x     x    x      x     x     x
+  x
+  x     x
+  x
+  x     x           x
+  x
+  x
+
+1. Compute all row candidates
+     alignedY  No intervening paras
+2. Compute all column candidates
+     alignedX  No intervening paras
+
+Table candidate
+1. Top row fully populated
+2. Left column fully populated
+3. All cells in table are aligned with 1 top row element and 1 left column candidate
+4. Minimum number of cells must be filled
+
+Computation time
+1. Row candidates  O(N)
+   Sort top to bottom, left to right
+   Search
+2. Column candidates O(N)
+   Sort left to right, top to bottom
+   Search
+3. Find intersections  O(N^2)
+   For each row
+      Find columns that start at row -> table candidates
+   Sort table candidates by w x h descending
+4. Test each candidate O(N^4)
--- a/extractor/text.go
+++ b/extractor/text.go
@ -22,8 +22,6 @@ import (
 	"github.com/unidoc/unipdf/v3/model"
 )

-const verbose = false
-
 // maxFormStack is the maximum form stack recursion depth. It has to be low enough to avoid a stack
 // overflow and high enough to accomodate customers' PDFs
 const maxFormStack = 10
@ -49,7 +47,7 @@ func (e *Extractor) ExtractTextWithStats() (extracted string, numChars int, numM

 // ExtractPageText returns the text contents of `e` (an Extractor for a page) as a PageText.
 func (e *Extractor) ExtractPageText() (*PageText, int, int, error) {
-	pt, numChars, numMisses, err := e.extractPageText(e.contents, e.resources, 0)
+	pt, numChars, numMisses, err := e.extractPageText(e.contents, e.resources, transform.IdentityMatrix(), 0)
 	if err != nil {
 		return nil, numChars, numMisses, err
 	}
@ -62,7 +60,8 @@ func (e *Extractor) ExtractPageText() (*PageText, int, int, error) {
 // extractPageText returns the text contents of content stream `e` and resouces `resources` as a
 // PageText.
 // This can be called on a page or a form XObject.
-func (e *Extractor) extractPageText(contents string, resources *model.PdfPageResources, level int) (
+func (e *Extractor) extractPageText(contents string, resources *model.PdfPageResources,
+	parentCTM transform.Matrix, level int) (
 	*PageText, int, int, error) {
 	common.Log.Trace("extractPageText: level=%d", level)
 	pageText := &PageText{pageSize: e.mediaBox}
@ -97,7 +96,7 @@ func (e *Extractor) extractPageText(contents string, resources *model.PdfPageRes

 			operand := op.Operand

-			if verbose {
+			if verboseGeom {
 				common.Log.Info("&&& op=%s", op)
 			}

@ -106,7 +105,7 @@ func (e *Extractor) extractPageText(contents string, resources *model.PdfPageRes
 				savedStates.push(&state)
 				// common.Log.Info("Save state: stack=%d\n %s", len(savedStates), state.String())
 			case "Q":
-				if verbose {
+				if verboseGeom {
 					common.Log.Info("Restore state: %s", savedStates.String())
 				}
 				if !savedStates.empty() {
@ -129,7 +128,10 @@ func (e *Extractor) extractPageText(contents string, resources *model.PdfPageRes
 					pageText.marks = append(pageText.marks, to.marks...)
 				}
 				inTextObj = true
-				to = newTextObject(e, resources, gs, &state, &savedStates)
+				graphicsState := gs
+				graphicsState.CTM = parentCTM.Mult(graphicsState.CTM)
+				to = newTextObject(e, resources, graphicsState, &state, &savedStates)
+
 			case "ET": // End Text
 				// End text object, discarding text matrix. If the current
 				// text object contains text marks, they are added to the
@ -343,8 +345,9 @@ func (e *Extractor) extractPageText(contents string, resources *model.PdfPageRes
 					if formResources == nil {
 						formResources = resources
 					}
+
 					tList, numChars, numMisses, err := e.extractPageText(string(formContent),
-						formResources, level+1)
+						formResources, parentCTM.Mult(gs.CTM), level+1)
 					if err != nil {
 						common.Log.Debug("ERROR: %v", err)
 						return err
@ -489,8 +492,8 @@ func (to *textObject) setCharSpacing(x float64) {
 		return
 	}
 	to.state.tc = x
-	if verbose {
-		common.Log.Info("setCharSpacing: %.2f state=%s", to.state.String())
+	if verboseGeom {
+		common.Log.Info("setCharSpacing: %.2f state=%s", x, to.state.String())
 	}
 }

@ -758,7 +761,7 @@ func (to *textObject) renderText(data []byte) error {
 	}
 	font := to.getCurrentFont()
 	charcodes := font.BytesToCharcodes(data)
-	runeSlices, numChars, numMisses := font.CharcodesToRuneSlices(charcodes)
+	texts, numChars, numMisses := font.CharcodesToStrings(charcodes)
 	if numMisses > 0 {
 		common.Log.Debug("renderText: numChars=%d numMisses=%d", numChars, numMisses)
 	}
@ -777,17 +780,20 @@ func (to *textObject) renderText(data []byte) error {
 		spaceMetrics, _ = model.DefaultFont().GetRuneMetrics(' ')
 	}
 	spaceWidth := spaceMetrics.Wx * glyphTextRatio
-	common.Log.Trace("spaceWidth=%.2f text=%q font=%s fontSize=%.1f", spaceWidth, runeSlices, font, tfs)
+	common.Log.Trace("spaceWidth=%.2f text=%q font=%s fontSize=%.2f", spaceWidth, texts, font, tfs)

 	stateMatrix := transform.NewMatrix(
 		tfs*th, 0,
 		0, tfs,
 		0, state.trise)
-	if verbose {
-		common.Log.Info("renderText: %d codes=%+v runes=%q", len(charcodes), charcodes, runeSlices)
+	if verboseGeom {
+		common.Log.Info("renderText: %d codes=%+v texts=%q", len(charcodes), charcodes, texts)
 	}

-	for i, r := range runeSlices {
+	common.Log.Trace("renderText: %d codes=%+v runes=%q", len(charcodes), charcodes, len(texts))
+
+	for i, text := range texts {
+		r := []rune(text)
 		if len(r) == 1 && r[0] == '\x00' {
 			continue
 		}
@ -819,7 +825,7 @@ func (to *textObject) renderText(data []byte) error {
 		// t is the displacement of the text cursor when the character is rendered.
 		t0 := transform.Point{X: (c.X*tfs + w) * th}
 		t := transform.Point{X: (c.X*tfs + state.tc + w) * th}
-		if verbose {
+		if verboseGeom {
 			common.Log.Info("tfs=%.2f tc=%.2f tw=%.2f th=%.2f", tfs, state.tc, state.tw, th)
 			common.Log.Info("dx,dy=%.3f t0=%.2f t=%.2f", c, t0, t)
 		}
@ -830,7 +836,7 @@ func (to *textObject) renderText(data []byte) error {
 		td := translationMatrix(t)
 		end := to.gs.CTM.Mult(to.tm).Mult(td0)

-		if verbose {
+		if verboseGeom {
 			common.Log.Info("end:\n\tCTM=%s\n\t tm=%s\n"+
 				"\t td=%s xlat=%s\n"+
 				"\ttd0=%s\n\t → %s xlat=%s",
@ -865,7 +871,7 @@ func (to *textObject) renderText(data []byte) error {

 		// update the text matrix by the displacement of the text location.
 		to.tm.Concat(td)
-		if i != len(runeSlices)-1 {
+		if i != len(texts)-1 {
 			to.logCursor()
 		}
 	}
@ -908,10 +914,11 @@ func isTextSpace(text string) bool {

 // PageText represents the layout of text on a device page.
 type PageText struct {
-	marks     []*textMark // Texts and their positions on a PDF page.
-	viewText  string      // Extracted page text.
-	viewMarks []TextMark  // Public view of `marks`.
-	pageSize  model.PdfRectangle
+	marks      []*textMark        // Texts and their positions on a PDF page.
+	viewText   string             // Extracted page text.
+	viewMarks  []TextMark         // Public view of text marks.
+	viewTables []TextTable        // Public view of text tables.
+	pageSize   model.PdfRectangle // Page size. Used to calculate depth.
 }

 // String returns a string describing `pt`.
@ -942,6 +949,11 @@ func (pt PageText) Marks() *TextMarkArray {
 	return &TextMarkArray{marks: pt.viewMarks}
 }

+// Tables returns the tables extracted from the page.
+func (pt PageText) Tables() []TextTable {
+	return pt.viewTables
+}
+
 // computeViews processes the page TextMarks sorting by position and populates `pt.viewText` and
 // `pt.viewMarks` which represent the text and marks in the order which it is read on the page.
 // The comments above the TextMark definition describe how to use the []TextMark to
@ -953,6 +965,7 @@ func (pt *PageText) computeViews() {
 	paras.writeText(b)
 	pt.viewText = b.String()
 	pt.viewMarks = paras.toTextMarks()
+	pt.viewTables = paras.toTables()
 }

 // TextMarkArray is a collection of TextMarks.
@ -1119,6 +1132,13 @@ var spaceMark = TextMark{
 	Meta:     true,
 }

+// TextTable represents a table.
+// Cells are ordered top-to-bottom, left-to-right.
+type TextTable struct {
+	W, H  int
+	Cells [][]string
+}
+
 // getCurrentFont returns the font on top of the font stack, or DefaultFont if the font stack is
 // empty.
 func (to *textObject) getCurrentFont() *model.PdfFont {
--- a/extractor/text_bound.go
+++ b/extractor/text_bound.go
@ -19,11 +19,11 @@ import (
 var serial serialState

 type serialState struct {
-	mark int
-	word int
-	bins int
-	line int
-	para int
+	mark   int
+	word   int
+	strata int
+	line   int
+	para   int
 }

 func (serial *serialState) reset() {
@ -65,15 +65,25 @@ func diffReading(a, b bounded) float64 {
 	return a.bbox().Llx - b.bbox().Llx
 }

-// func boundedUnion(objs ...bounded) model.PdfRectangle {
-// 	rect := objs[0].bbox()
-// 	for _, r := range objs[1:] {
-// 		rect = rectUnion(rect, r.bbox())
-// 	}
-// 	return rect
-// }
+func boundedUnion(objs ...bounded) model.PdfRectangle {
+	rect := objs[0].bbox()
+	for _, r := range objs[1:] {
+		rect = rectUnion(rect, r.bbox())
+	}
+	return rect
+}

-// diffDepth returns `a` - `b` in the depth direction..
+// rectContainsBounded returns true if `a` contains `b`.
+func rectContainsBounded(a model.PdfRectangle, b bounded) bool {
+	return rectContainsRect(a, b.bbox())
+}
+
+// rectContainsRect returns true if `a` contains `b`.
+func rectContainsRect(a, b model.PdfRectangle) bool {
+	return a.Llx <= b.Llx && b.Urx <= a.Urx && a.Lly <= b.Lly && b.Ury <= a.Ury
+}
+
+// diffDepth returns `a` - `b` in the depth direction.
 func diffDepth(a, b bounded) float64 {
 	return bboxDepth(a) - bboxDepth(b)
 }
@ -151,3 +161,19 @@ func overlappedXRect(r0, r1 model.PdfRectangle) bool {
 func overlappedYRect(r0, r1 model.PdfRectangle) bool {
 	return (r0.Lly <= r1.Lly && r1.Lly <= r0.Ury) || (r0.Lly <= r1.Ury && r1.Ury <= r0.Ury)
 }
+
+// minInt returns the lesser of `a` and `b`.
+func minInt(a, b int) int {
+	if a < b {
+		return a
+	}
+	return b
+}
+
+// maxInt returns the greater of `a` and `b`.
+func maxInt(a, b int) int {
+	if a > b {
+		return a
+	}
+	return b
+}
--- a/extractor/text_const.go
+++ b/extractor/text_const.go
@ -5,8 +5,24 @@

 package extractor

+// The following constants configure debugging.
 const (
+	verbose      = false
+	verboseGeom  = false
+	verbosePage  = false
+	verbosePara  = false
+	verboseTable = false
+)

+// The following constants control the approaches used in the code.
+const (
+	useTables = true
+	doHyphens = true
+	useEBBox  = false
+)
+
+// The following constants are the tuning parameters for text extraction.
+const (
 	// Size of depth bins in points
 	depthBinPoints = 6

--- a/extractor/text_line.go
+++ b/extractor/text_line.go
@ -20,10 +20,12 @@ type textLine struct {
 	model.PdfRectangle             // Bounding box (union of `marks` bounding boxes).
 	depth              float64     // Distance from bottom of line to top of page.
 	words              []*textWord // Words in this line.
-	fontsize           float64
-	hyphenated         bool
+	fontsize           float64     // Largest word font size.
+	hyphenated         bool        // Does line have at least minHyphenation runes and end in a hyphen.
 }

+const minHyphenation = 4
+
 // newTextLine creates a line with font and bbox size of `w`, removes `w` from p.bins[bestWordDepthIdx] and adds it to the line
 func newTextLine(p *textStrata, depthIdx int) *textLine {
 	words := p.getStratum(depthIdx)
@ -60,31 +62,22 @@ func (l *textLine) text() string {
 		}
 	}
 	return strings.Join(words, "")
-
 }

 // toTextMarks returns the TextMarks contained in `l`.text().
 // `offset` is used to give the TextMarks the correct Offset values.
 func (l *textLine) toTextMarks(offset *int) []TextMark {
 	var marks []TextMark
-	addMark := func(mark TextMark) {
-		mark.Offset = *offset
-		marks = append(marks, mark)
-		*offset += len(mark.Text)
-	}
-	addSpaceMark := func(spaceChar string) {
-		mark := spaceMark
-		mark.Text = spaceChar
-		addMark(mark)
-	}
 	for _, word := range l.words {
-		for _, tm := range word.marks {
-			addMark(tm.ToTextMark())
-		}
+		wordMarks := word.toTextMarks(offset)
+		marks = append(marks, wordMarks...)
 		if word.spaceAfter {
-			addSpaceMark(" ")
+			marks = appendSpaceMark(marks, offset, " ")
 		}
 	}
+	if len(l.text()) > 0 && len(marks) == 0 {
+		panic(l.text())
+	}
 	return marks
 }

@ -130,16 +123,13 @@ func (l *textLine) mergeWordFragments() {
 	}

 	// check for hyphen at end of line
-	runes := []rune(l.text())
-	l.hyphenated = len(runes) >= 4 &&
+	l.hyphenated = isHyphenated(l.text())
+}
+
+// isHyphenated returns true if `text` has at least minHyphenation runes and ends in a hyphen that is not preceded by a space.
+func isHyphenated(text string) bool {
+	runes := []rune(text)
+	return len(runes) >= minHyphenation &&
 		unicode.Is(unicode.Hyphen, runes[len(runes)-1]) &&
 		!unicode.IsSpace(runes[len(runes)-2])
-	// if l.hyphenated {
-	// 	// fmt.Fprintf(os.Stderr, "\n%q ", l.text())
-	// 	common.Log.Info("### %d %q\n\t%q:%t\n\t%q:%t",
-	// 		len(runes), l.text(),
-	// 		runes[len(runes)-1], unicode.Is(unicode.Hyphen, runes[len(runes)-1]),
-	// 		runes[len(runes)-2], !unicode.IsSpace(runes[len(runes)-2]),
-	// 	)
-	// }
 }
--- a/extractor/text_mark.go
+++ b/extractor/text_mark.go
@ -21,11 +21,6 @@ type textMark struct {
 	model.PdfRectangle                  // Bounding box.
 	text               string           // The text (decoded via ToUnicode).
 	original           string           // Original text (decoded).
-	orient             int              // The text orientation in degrees. This is the current TRM rounded to 10°.
-	orientedStart      transform.Point  // Left of text in orientation where text is horizontal.
-	orientedEnd        transform.Point  // Right of text in orientation where text is horizontal.
-	height             float64          // Text height.
-	spaceWidth         float64          // Best guess at the width of a space in the font the text was rendered with.
 	font               *model.PdfFont   // The font the mark was drawn with.
 	fontsize           float64          // The font size the mark was drawn with.
 	charspacing        float64          // TODO (peterwilliams97: Should this be exposed in TextMark?
@ -74,25 +69,20 @@ func (to *textObject) newTextMark(text string, trm transform.Matrix, end transfo
 	bbox = clipped

 	tm := textMark{
-		text:          text,
-		orient:        orient,
-		PdfRectangle:  bbox,
-		orientedStart: start.Rotate(theta),
-		orientedEnd:   end.Rotate(theta),
-		height:        math.Abs(height),
-		spaceWidth:    spaceWidth,
-		font:          font,
-		fontsize:      height,
-		charspacing:   charspacing,
-		trm:           trm,
-		end:           end,
-		serial:        serial.mark,
+		text:         text,
+		PdfRectangle: bbox,
+		font:         font,
+		fontsize:     height,
+		charspacing:  charspacing,
+		trm:          trm,
+		end:          end,
+		serial:       serial.mark,
 	}
 	serial.mark++
 	if !isTextSpace(tm.text) && tm.Width() == 0.0 {
 		common.Log.Debug("ERROR: Zero width text. tm=%s", tm.String())
 	}
-	if verbose {
+	if verboseGeom {
 		common.Log.Info("newTextMark: start=%.2f end=%.2f %s", start, end, tm.String())
 	}

@ -110,11 +100,6 @@ func (tm *textMark) bbox() model.PdfRectangle {
 	return tm.PdfRectangle
 }

-// Width returns the width of `tm`.text in the text direction.
-func (tm *textMark) Width() float64 {
-	return math.Abs(tm.orientedStart.X - tm.orientedEnd.X)
-}
-
 // ToTextMark returns the public view of `tm`.
 func (tm *textMark) ToTextMark() TextMark {
 	return TextMark{
@ -127,6 +112,23 @@ func (tm *textMark) ToTextMark() TextMark {
 	}
 }

+// appendTextMark appends `mark` to `marks` and updates `offset`, the offset of `mark` in the extracted
+// text.
+func appendTextMark(marks []TextMark, offset *int, mark TextMark) []TextMark {
+	mark.Offset = *offset
+	marks = append(marks, mark)
+	*offset += len(mark.Text)
+	return marks
+}
+
+// appendSpaceMark appends a spaceMark with text `spaceChar` to `marks` and updates `offset`,
+// the offset of the appended mark in the extracted text.
+func appendSpaceMark(marks []TextMark, offset *int, spaceChar string) []TextMark {
+	mark := spaceMark
+	mark.Text = spaceChar
+	return appendTextMark(marks, offset, mark)
+}
+
 // nearestMultiple return the integer multiple of `m` that is closest to `x`.
 func nearestMultiple(x float64, m int) int {
 	if m == 0 {
--- a/extractor/text_page.go
+++ b/extractor/text_page.go
@ -9,16 +9,12 @@ import (
 	"fmt"
 	"io"
 	"math"
-	"unicode"
+	"sort"

 	"github.com/unidoc/unipdf/v3/common"
 	"github.com/unidoc/unipdf/v3/model"
 )

-// paraList is a sequence of textPara. We use it so often that it is convenient to have its own
-// type so we can have methods on it.
-type paraList []*textPara
-
 // makeTextPage builds a paraList from `marks`, the textMarks on a page.
 func makeTextPage(marks []*textMark, pageSize model.PdfRectangle, rot int) paraList {
 	common.Log.Trace("makeTextPage: %d elements pageSize=%.2f", len(marks), pageSize)
@ -35,28 +31,21 @@ func makeTextPage(marks []*textMark, pageSize model.PdfRectangle, rot int) paraL
 	for i, para := range paraStratas {
 		paras[i] = composePara(para)
 	}
-	if verbose || true {
-		common.Log.Info("unsorted=========----------=====")
-		for i, para := range paras {
-			common.Log.Info("paras[%d]=%.2f%q", i, para.PdfRectangle, truncate(paras[i].text(), 200))
-		}
-	}

+	paras.log("unsorted")
+	// paras.computeEBBoxes()
+
+	if useTables {
+		paras = paras.extractTables()
+	}
+	// paras.log("tables extracted")
 	paras.computeEBBoxes()
-	paras = paras.extractTables()
+	paras.log("EBBoxes 2")

 	// Sort the paras into reading order.
 	paras.sortReadingOrder()
-	if verbose || true {
-		common.Log.Info("para sorted in reading order -----------=========")
-		for i, para := range paras {
-			tab := ""
-			if para.table != nil {
-				tab = fmt.Sprintf("[%dx%d]", para.table.w, para.table.h)
-			}
-			fmt.Printf("%4d: %6.2f %s %q\n", i, para.PdfRectangle, tab, truncate(para.text(), 50))
-		}
-	}
+	paras.log("sorted in reading order")
+
 	return paras
 }

@ -72,7 +61,7 @@ func dividePage(page *textStrata, pageHeight float64) []*textStrata {
 	// Some bins are emptied before they iterated to (seee "surving bin" above).
 	// If a `page` survives until it is iterated to then at least one `para` will be built around it.

-	if verbose {
+	if verbosePage {
 		common.Log.Info("dividePage")
 	}
 	cnt := 0
@ -89,7 +78,7 @@ func dividePage(page *textStrata, pageHeight float64) []*textStrata {
 			firstReadingIdx := page.firstReadingIndex(depthIdx)
 			words := page.getStratum(firstReadingIdx)
 			moveWord(firstReadingIdx, page, para, words[0])
-			if verbose {
+			if verbosePage {
 				common.Log.Info("words[0]=%s", words[0].String())
 			}

@ -105,7 +94,7 @@ func dividePage(page *textStrata, pageHeight float64) []*textStrata {

 				// Add words that are within maxIntraDepthGap of `para` in the depth direction.
 				// i.e. Stretch para in the depth direction, vertically for English text.
-				if verbose {
+				if verbosePage {
 					common.Log.Info("para depth %.2f - %.2f maxIntraDepthGap=%.2f ",
 						para.minDepth(), para.maxDepth(), maxIntraDepthGap)
 				}
@ -159,6 +148,9 @@ func dividePage(page *textStrata, pageHeight float64) []*textStrata {

 			// Sort the words in `para`'s bins in the reading direction.
 			para.sort()
+			if verbosePage {
+				common.Log.Info("para=%s", para.String())
+			}
 			paraStratas = append(paraStratas, para)
 		}
 	}
@ -166,40 +158,11 @@ func dividePage(page *textStrata, pageHeight float64) []*textStrata {
 	return paraStratas
 }

-const doHyphens = true
-const useTables = true
-
 // writeText writes the text in `paras` to `w`.
 func (paras paraList) writeText(w io.Writer) {
-	for ip, para := range paras {
-		if useTables {
-			para.writeText(w)
-		} else {
-			for il, line := range para.lines {
-				s := line.text()
-				reduced := false
-				if doHyphens {
-					if line.hyphenated && (il != len(para.lines)-1 || ip != len(paras)-1) {
-						// Line ending with hyphen. Remove it.
-						runes := []rune(s)
-						s = string(runes[:len(runes)-1])
-						reduced = true
-					}
-				}
-				w.Write([]byte(s))
-				if reduced {
-					// We removed the hyphen from the end of the line so we don't need a line ending.
-					continue
-				}
-				if il < len(para.lines)-1 && isZero(line.depth-para.lines[il+1].depth) {
-					// Next line is the same depth so it's the same line as this one in the extracted text
-					w.Write([]byte(" "))
-					continue
-				}
-				w.Write([]byte("\n"))
-			}
-			w.Write([]byte("\n"))
-		}
+	for _, para := range paras {
+		para.writeText(w)
+		w.Write([]byte("\n"))
 	}
 }

@ -208,69 +171,35 @@ func (paras paraList) writeText(w io.Writer) {
 func (paras paraList) toTextMarks() []TextMark {
 	offset := 0
 	var marks []TextMark
-	addMark := func(mark TextMark) {
-		mark.Offset = offset
-		marks = append(marks, mark)
-		offset += len(mark.Text)
-	}
-	addSpaceMark := func(spaceChar string) {
-		mark := spaceMark
-		mark.Text = spaceChar
-		addMark(mark)
-	}
-	for ip, para := range paras {
-		if useTables {
-			paraMarks := para.toTextMarks(&offset)
-			marks = append(marks, paraMarks...)
-		} else {
-			for il, line := range para.lines {
-				lineMarks := line.toTextMarks(&offset)
-				marks = append(marks, lineMarks...)
-				reduced := false
-				if doHyphens {
-					if line.hyphenated && (il != len(para.lines)-1 || ip != len(paras)-1) {
-						tm := marks[len(marks)-1]
-						r := []rune(tm.Text)
-						if unicode.IsSpace(r[len(r)-1]) {
-							panic(tm)
-						}
-						if len(r) == 1 {
-							marks = marks[:len(marks)-1]
-							offset = marks[len(marks)-1].Offset + len(marks[len(marks)-1].Text)
-						} else {
-							s := string(r[:len(r)-1])
-							offset += len(s) - len(tm.Text)
-							tm.Text = s
-						}
-						reduced = true
-					}
-				}
-				if reduced {
-					continue
-				}
-				if il != len(para.lines)-1 && isZero(line.depth-para.lines[il+1].depth) {
-					// Next line is the same depth so it's the same line as this one in the extracted text
-					addSpaceMark(" ")
-					continue
-				}
-				addSpaceMark("\n")
-			}
-			if ip != len(paras)-1 {
-				addSpaceMark("\n")
-			}
-		}
+	for _, para := range paras {
+		paraMarks := para.toTextMarks(&offset)
+		marks = append(marks, paraMarks...)
+		marks = appendSpaceMark(marks, &offset, "\n")
 	}
 	return marks
 }

+func (paras paraList) toTables() []TextTable {
+	var tables []TextTable
+	for _, para := range paras {
+		if para.table != nil {
+			tables = append(tables, para.table.toTextTable())
+		}
+	}
+	return tables
+}
+
 // sortReadingOrder sorts `paras` in reading order.
 func (paras paraList) sortReadingOrder() {
 	common.Log.Debug("sortReadingOrder: paras=%d ===========x=============", len(paras))
 	if len(paras) <= 1 {
 		return
 	}
+	sort.Slice(paras, func(i, j int) bool { return diffDepthReading(paras[i], paras[j]) <= 0 })
+	paras.log("diffReadingDepth")
 	adj := paras.adjMatrix()
 	order := topoOrder(adj)
+	printAdj(adj)
 	paras.reorder(order)
 }

@ -290,22 +219,23 @@ func (paras paraList) adjMatrix() [][]bool {
 			adj[i][j], reasons[i][j] = paras.before(i, j)
 		}
 	}
-	if verbose && false {
+	if verbosePage {
+		show := func(a *textPara) string {
+			return fmt.Sprintf("%6.2f %q", a.eBBox, truncate(a.text(), 70))
+		}
 		common.Log.Info("adjMatrix =======")
 		for i := 0; i < n; i++ {
 			a := paras[i]
-			fmt.Printf("%4d: %q %.2f\n", i, truncate(a.text(), 50), a.PdfRectangle)
+			fmt.Printf("%4d: %s\n", i, show(a))
 			for j := 0; j < n; j++ {
 				if i == j {
 					continue
 				}
-				if !adj[i][j] {
+				if !adj[i][j] && i != 16 {
 					continue
 				}
 				b := paras[j]
-				fmt.Printf("%8d: %10s %q %.2f\n", j,
-					reasons[i][j], truncate(b.text(), 40), b.PdfRectangle)
-
+				fmt.Printf("%8d: %t %10s %s\n", j, adj[i][j], reasons[i][j], show(b))
 			}
 		}
 	}
@ -344,7 +274,7 @@ func (paras paraList) before(i, j int) (bool, string) {
 			continue
 		}
 		if overlappedXPara(a, c) && overlappedXPara(c, b) {
-			return false, "Y intervening"
+			return false, fmt.Sprintf("Y intervening: %d: %s", k, c)
 		}
 	}
 	return true, "TO LEFT"
@ -358,13 +288,21 @@ func overlappedXPara(r0, r1 *textPara) bool {

 // computeEBBoxes computes the eBBox fields in the elements of `paras`.
 func (paras paraList) computeEBBoxes() {
-	common.Log.Trace("computeEBBoxes:")
+	if verbose {
+		common.Log.Info("computeEBBoxes:")
+	}

-	for i, a := range paras {
-		// [llx, urx] is the reading direction interval for which no paras overlap `a`
+	for _, para := range paras {
+		para.eBBox = para.PdfRectangle
+	}
+
+	for i, aa := range paras {
+		a := aa.eBBox
+		// [llx, urx] is the reading direction interval for which no paras overlap `a`.
 		llx := -1.0e9
 		urx := +1.0e9
-		for j, b := range paras {
+		for j, bb := range paras {
+			b := bb.eBBox
 			if i == j || !(a.Lly <= b.Ury && b.Lly <= a.Ury) {
 				continue
 			}
@ -385,27 +323,65 @@ func (paras paraList) computeEBBoxes() {

 		// Go through all paras below `a` within interval [llx, urx] in the reading direction and
 		// expand `a` as far as possible to left and right without overlapping any of them.
-		a.eBBox = a.PdfRectangle
-		for j, b := range paras {
+
+		for j, bb := range paras {
+			b := bb.eBBox
 			if i == j || b.Ury > a.Lly {
 				continue
 			}

 			// If `b` is completely to right of `llx`, extend `a` left to `b`.
 			if llx <= b.Llx {
-				a.eBBox.Llx = math.Min(a.eBBox.Llx, b.Llx)
+				a.Llx = math.Min(a.Llx, b.Llx)
 			}

 			// If `b` is completely to left of `urx`, extend `a` right to `b`.
 			if b.Urx <= urx {
-				a.eBBox.Urx = math.Max(a.eBBox.Urx, b.Urx)
+				a.Urx = math.Max(a.Urx, b.Urx)
 			}
 		}
+		if verbose {
+			fmt.Printf("%4d: %6.2f->%6.2f %q\n", i, aa.eBBox, a, truncate(aa.text(), 50))
+		}
+		aa.eBBox = a
+	}
+	if useEBBox {
+		for _, para := range paras {
+			para.PdfRectangle = para.eBBox
+		}
+	}
+}
+
+// printAdj prints `adj` to stdout.
+func printAdj(adj [][]bool) {
+	if !verbosePage {
+		return
+	}
+	common.Log.Info("printAdj:")
+	n := len(adj)
+	fmt.Printf("%3s:", "")
+	for x := 0; x < n; x++ {
+		fmt.Printf("%3d", x)
+	}
+	fmt.Println()
+	for y := 0; y < n; y++ {
+		fmt.Printf("%3d:", y)
+		for x := 0; x < n; x++ {
+			s := ""
+			if adj[y][x] {
+				s = "X"
+			}
+			fmt.Printf("%3s", s)
+		}
+		fmt.Println()
 	}
 }

 // topoOrder returns the ordering of the topological sort of the nodes with adjacency matrix `adj`.
 func topoOrder(adj [][]bool) []int {
+	if verbosePage {
+		common.Log.Info("topoOrder:")
+	}
 	n := len(adj)
 	visited := make([]bool, n)
 	var order []int
@ -427,11 +403,16 @@ func topoOrder(adj [][]bool) []int {
 			sortNode(idx)
 		}
 	}
-	// Order is currently reversed so change it to forward order.
-	for i := 0; i < n/2; i++ {
-		order[i], order[n-1-i] = order[n-1-i], order[i]
+	return reversed(order)
+}
+
+// reversed returns a copy of `order` with its elements in reverse order.
+func reversed(order []int) []int {
+	rev := make([]int, len(order))
+	for i, v := range order {
+		rev[len(order)-1-i] = v
 	}
-	return order
+	return rev
 }

 // reorder reorders `para` to the order in `order`.
--- a/extractor/text_para.go
+++ b/extractor/text_para.go
@ -12,9 +12,14 @@ import (
 	"sort"
 	"unicode"

+	"github.com/unidoc/unipdf/v3/common"
 	"github.com/unidoc/unipdf/v3/model"
 )

+// paraList is a sequence of textPara. We use it so often that it is convenient to have its own
+// type so we can have methods on it.
+type paraList []*textPara
+
 // textPara is a group of words in a rectangular region of a page that get read together.
 // An peragraph in a document might span multiple pages. This is the paragraph framgent on one page.
 // We start by finding paragraph regions on a page, then we break the words into the textPara into
@ -22,7 +27,7 @@ import (
 type textPara struct {
 	serial             int                // Sequence number for debugging.
 	model.PdfRectangle                    // Bounding box.
-	eBBox              model.PdfRectangle // Extented ounding box needed to compute reading order.
+	eBBox              model.PdfRectangle // Extended bounding box needed to compute reading order.
 	lines              []*textLine        // Paragraph text gets broken into lines.
 	table              *textTable
 }
@ -39,8 +44,8 @@ func newTextPara(strata *textStrata) *textPara {

 // String returns a description of `p`.
 func (p *textPara) String() string {
-	return fmt.Sprintf("serial=%d %.2f %d lines\n%s\n-------------",
-		p.serial, p.PdfRectangle, len(p.lines), p.text())
+	return fmt.Sprintf("serial=%d %.2f %d lines %q",
+		p.serial, p.PdfRectangle, len(p.lines), truncate(p.text(), 50))
 }

 // text returns the text  of the lines in `p`.
@ -52,47 +57,21 @@ func (p *textPara) text() string {

 // writeText writes the text of `p` including tables to `w`.
 func (p *textPara) writeText(w io.Writer) {
-	if p.table != nil {
-		for y := 0; y < p.table.h; y++ {
-			for x := 0; x < p.table.w; x++ {
-				cell := p.table.cells[y*p.table.w+x]
-				cell.writeCellText(w)
-				w.Write([]byte(" "))
-			}
-			w.Write([]byte("\n"))
-		}
-	} else {
+	if p.table == nil {
 		p.writeCellText(w)
-		w.Write([]byte("\n"))
+		return
 	}
-}
-
-// writeCellText writes the text of `p` not including tables to `w`.
-func (p *textPara) writeCellText(w io.Writer) {
-	// w := new(bytes.Buffer)
-	para := p
-	for il, line := range para.lines {
-		s := line.text()
-		reduced := false
-		if doHyphens {
-			if line.hyphenated && il != len(para.lines)-1 {
-				// Line ending with hyphen. Remove it.
-				runes := []rune(s)
-				s = string(runes[:len(runes)-1])
-				reduced = true
+	for y := 0; y < p.table.h; y++ {
+		for x := 0; x < p.table.w; x++ {
+			cell := p.table.get(x, y)
+			if cell == nil {
+				w.Write([]byte("\t"))
+			} else {
+				cell.writeCellText(w)
 			}
-		}
-		w.Write([]byte(s))
-		if reduced {
-			// We removed the hyphen from the end of the line so we don't need a line ending.
-			continue
-		}
-		if il < len(para.lines)-1 && isZero(line.depth-para.lines[il+1].depth) {
-			// Next line is the same depth so it's the same line as this one in the extracted text
 			w.Write([]byte(" "))
-			continue
 		}
-		if il < len(para.lines)-1 {
+		if y < p.table.h-1 {
 			w.Write([]byte("\n"))
 		}
 	}
@ -101,90 +80,103 @@ func (p *textPara) writeCellText(w io.Writer) {
 // toTextMarks creates the TextMarkArray corresponding to the extracted text created by
 // paras `p`.writeText().
 func (p *textPara) toTextMarks(offset *int) []TextMark {
+	if p.table == nil {
+		return p.toCellTextMarks(offset)
+	}
 	var marks []TextMark
-	addMark := func(mark TextMark) {
-		mark.Offset = *offset
-		marks = append(marks, mark)
-		*offset += len(mark.Text)
-	}
-	addSpaceMark := func(spaceChar string) {
-		mark := spaceMark
-		mark.Text = spaceChar
-		addMark(mark)
-	}
-	if p.table != nil {
-		for y := 0; y < p.table.h; y++ {
-			for x := 0; x < p.table.w; x++ {
-				cell := p.table.cells[y*p.table.w+x]
+	for y := 0; y < p.table.h; y++ {
+		for x := 0; x < p.table.w; x++ {
+			cell := p.table.get(x, y)
+			if cell == nil {
+				marks = appendSpaceMark(marks, offset, "\t")
+			} else {
 				cellMarks := cell.toCellTextMarks(offset)
 				marks = append(marks, cellMarks...)
-				addSpaceMark(" ")
 			}
-			addSpaceMark("\n")
+			marks = appendSpaceMark(marks, offset, " ")
+		}
+		if y < p.table.h-1 {
+			marks = appendSpaceMark(marks, offset, "\n")
 		}
-	} else {
-		marks = p.toCellTextMarks(offset)
-		addSpaceMark("\n")
 	}
 	return marks
 }

-// toTextMarks creates the TextMarkArray corresponding to the extracted text created by
+// writeCellText writes the text of the lines of `p` to `w`, ignoring any table in `p`.
+// When hyphenation handling is enabled, a hyphenated line that is not the last line has its final
+// rune removed and is joined directly to the following line. Every other pair of adjacent lines
+// is separated by the string returned by getSpace() for their depths.
+func (p *textPara) writeCellText(w io.Writer) {
+	lastIdx := len(p.lines) - 1
+	for il, line := range p.lines {
+		lineText := line.text()
+		dehyphenate := doHyphens && line.hyphenated && il != lastIdx
+		if dehyphenate {
+			// Line ending with hyphen. Remove it.
+			lineText = removeLastRune(lineText)
+		}
+		w.Write([]byte(lineText))
+		if dehyphenate || il == lastIdx {
+			// No separator after a de-hyphenated line or after the final line.
+			continue
+		}
+		separator := getSpace(line.depth, p.lines[il+1].depth)
+		w.Write([]byte(separator))
+	}
+}
+
+// toCellTextMarks creates the TextMarkArray corresponding to the extracted text created by
 // paras `paras`.writeCellText().
 func (p *textPara) toCellTextMarks(offset *int) []TextMark {
 	var marks []TextMark
-	addMark := func(mark TextMark) {
-		mark.Offset = *offset
-		marks = append(marks, mark)
-		*offset += len(mark.Text)
-	}
-	addSpaceMark := func(spaceChar string) {
-		mark := spaceMark
-		mark.Text = spaceChar
-		addMark(mark)
-	}
-	para := p
-
-	for il, line := range para.lines {
+	for il, line := range p.lines {
 		lineMarks := line.toTextMarks(offset)
-		marks = append(marks, lineMarks...)
-		reduced := false
-		if doHyphens {
-			if line.hyphenated && il != len(para.lines)-1 {
-				tm := marks[len(marks)-1]
-				r := []rune(tm.Text)
-				if unicode.IsSpace(r[len(r)-1]) {
-					panic(tm)
-				}
-				if len(r) == 1 {
-					marks = marks[:len(marks)-1]
-					*offset = marks[len(marks)-1].Offset + len(marks[len(marks)-1].Text)
-				} else {
-					s := string(r[:len(r)-1])
-					*offset += len(s) - len(tm.Text)
-					tm.Text = s
-				}
-				reduced = true
+		reduced := doHyphens && line.hyphenated && il != len(p.lines)-1
+		if reduced { // Line ending with hyphen. Remove it.
+			if len([]rune(line.text())) < minHyphenation {
+				panic(line.text())
 			}
+			if len(lineMarks) < 1 {
+				panic(line.text())
+			}
+			lineMarks = removeLastTextMarkRune(lineMarks, offset)
 		}
-		if reduced {
-			continue
-		}
-		if il < len(para.lines)-1 && isZero(line.depth-para.lines[il+1].depth) {
-			// Next line is the same depth so it's the same line as this one in the extracted text
-			addSpaceMark(" ")
-			continue
-		}
-		if il < len(para.lines)-1 {
-			addSpaceMark("\n")
+		marks = append(marks, lineMarks...)
+		if !(reduced || il == len(p.lines)-1) {
+			marks = appendSpaceMark(marks, offset, getSpace(line.depth, p.lines[il+1].depth))
 		}
 	}
-
-	addSpaceMark("\n")
-
 	return marks
 }

+// removeLastTextMarkRune removes the last rune from the text of the last TextMark in `marks` and
+// returns the updated slice. If that TextMark holds a single rune, the TextMark itself is removed.
+// `*offset` is moved back to account for the removed text.
+// `marks` must not be empty and its last TextMark must have non-empty text.
+// It panics if the rune to be removed is whitespace.
+func removeLastTextMarkRune(marks []TextMark, offset *int) []TextMark {
+	last := len(marks) - 1
+	tm := marks[last]
+	runes := []rune(tm.Text)
+	if unicode.IsSpace(runes[len(runes)-1]) {
+		panic(tm)
+	}
+	if len(runes) == 1 {
+		// The mark consists only of the rune being removed, so drop the whole mark.
+		marks = marks[:last]
+		if len(marks) == 0 {
+			// There is no preceding mark to take the offset from.
+			*offset -= len(tm.Text)
+			return marks
+		}
+		prev := marks[len(marks)-1]
+		*offset = prev.Offset + len(prev.Text)
+		return marks
+	}
+	text := removeLastRune(tm.Text)
+	*offset += len(text) - len(tm.Text)
+	tm.Text = text
+	// `tm` is a copy of the slice element, so store it back; otherwise the shortened text is lost
+	// and the mark no longer matches the offsets.
+	marks[last] = tm
+	return marks
+}
+
+// removeLastRune returns `text` without its last rune.
+// It panics if `text` contains fewer than 2 runes.
+func removeLastRune(text string) string {
+	runes := []rune(text)
+	n := len(runes)
+	if n >= 2 {
+		return string(runes[:n-1])
+	}
+	panic(text)
+}
+
+// getSpace returns the separator to insert between two consecutive lines of depth `depth1` and
+// `depth2`. It is " " when isZero() reports no difference between the depths, in which case the
+// lines are treated as the same line of extracted text, and "\n" otherwise.
+func getSpace(depth1, depth2 float64) string {
+	if sameDepth := isZero(depth1 - depth2); sameDepth {
+		return " "
+	}
+	return "\n"
+}
+
 // bbox makes textPara implement the `bounded` interface.
 func (p *textPara) bbox() model.PdfRectangle {
 	return p.PdfRectangle
@ -271,5 +263,42 @@ func composePara(strata *textStrata) *textPara {
 	if len(para.lines) == 0 {
 		panic(para)
 	}
+	if verbosePara {
+		common.Log.Info("!!! para=%s", para.String())
+		for i, line := range para.lines {
+			fmt.Printf("%4d: %s\n", i, line)
+			for j, word := range line.words {
+				fmt.Printf("%8d: %s\n", j, word)
+				for k, mark := range word.marks {
+					fmt.Printf("%12d: %s\n", k, mark)
+				}
+			}
+		}
+	}
 	return para
 }
+
+// log writes a debug listing of `paras` under the heading `title`. It does nothing unless
+// verbosePage is set.
+// Each non-nil para is printed with its index, bounding box, table dimensions (when it has a
+// table) and truncated text. After printing a para, log panics if the para's text is empty or if
+// it has a table without cells.
+func (paras paraList) log(title string) {
+	if !verbosePage {
+		return
+	}
+	common.Log.Info("%8s: %d paras =======-------=======", title, len(paras))
+	for i, para := range paras {
+		if para == nil {
+			continue
+		}
+		text := para.text()
+		hasTable := para.table != nil
+		dims := "  "
+		if hasTable {
+			dims = fmt.Sprintf("[%dx%d]", para.table.w, para.table.h)
+		}
+		fmt.Printf("%4d: %6.2f %s %q\n", i, para.PdfRectangle, dims, truncate(text, 50))
+		switch {
+		case len(text) == 0:
+			panic("empty")
+		case hasTable && len(para.table.cells) == 0:
+			panic(para)
+		}
+	}
+}
--- a/extractor/text_strata.go
+++ b/extractor/text_strata.go
@ -38,14 +38,14 @@ func makeTextStrata(words []*textWord, pageHeight float64) *textStrata {

 // newTextStrata returns an empty textStrata with page height `pageHeight`.
 func newTextStrata(pageHeight float64) *textStrata {
-	bins := textStrata{
-		serial:       serial.bins,
+	strata := textStrata{
+		serial:       serial.strata,
 		bins:         map[int][]*textWord{},
 		PdfRectangle: model.PdfRectangle{Urx: -1.0, Ury: -1.0},
 		pageHeight:   pageHeight,
 	}
-	serial.bins++
-	return &bins
+	serial.strata++
+	return &strata
 }

 // String returns a description of `s`.
@ -57,7 +57,9 @@ func (s *textStrata) String() string {
 			texts = append(texts, w.text())
 		}
 	}
-	return fmt.Sprintf("serial=%d %d %q", s.serial, len(texts), texts)
+	// return fmt.Sprintf("serial=%d %d %q", s.serial, )
+	return fmt.Sprintf("serial=%d %.2f fontsize=%.2f %d %q",
+		s.serial, s.PdfRectangle, s.fontsize, len(texts), texts)
 }

 // sort sorts the words in each bin in `s` in the reading direction.
@ -129,10 +131,24 @@ func (s *textStrata) scanBand(title string, para *textStrata,
 			if !readingOverlap(para, word) {
 				continue
 			}
-			if fontTol > 0 && math.Abs(word.fontsize-fontsize) > fontTol*fontsize {
-				continue
+			fontRatio1 := math.Abs(word.fontsize-fontsize) / fontsize
+			fontRatio2 := word.fontsize / fontsize
+
+			fontRatio := math.Min(fontRatio1, fontRatio2)
+			if fontTol > 0 {
+				if fontRatio > fontTol {
+					continue
+				}
+			}
+			if fontTol <= 0 {
+				panic(fontTol)
 			}
 			if !detectOnly {
+				// if !para.isHomogenous(word) {
+				// 	panic(fmt.Errorf("not homogeneous fontTol=%.2f ratio=%.2f (%.2f->%.2f)\n\tpara=%s\n\tword=%s",
+				// 		fontTol, fontRatio, fontsize, word.fontsize,
+				// 		para.String(), word.String()))
+				// }
 				moveWord(depthIdx, s, para, word)
 			}
 			newWords = append(newWords, word)
@ -155,11 +171,11 @@ func (s *textStrata) scanBand(title string, para *textStrata,
 	}
 	if verbose {
 		if len(title) > 0 {
-			common.Log.Info("scanBand: %s [%.2f %.2f]->[%.2f %.2f]  para=%.2f",
+			common.Log.Info("scanBand: %s [%.2f %.2f]->[%.2f %.2f] para=%.2f fontsize=%.2f",
 				title,
 				minDepth0, maxDepth0,
 				minDepth, maxDepth,
-				para.PdfRectangle)
+				para.PdfRectangle, para.fontsize)
 			for i, word := range newWords {
 				fmt.Printf("%4d: %s\n", i, word)
 			}
@ -271,6 +287,36 @@ func moveWord(depthIdx int, page, para *textStrata, word *textWord) {
 	page.removeWord(depthIdx, word)
 }

+// allWords returns the words from all the bins of `s` in a single slice.
+// The bins are visited in map iteration order, so the order of the returned words is unspecified.
+func (s *textStrata) allWords() []*textWord {
+	var all []*textWord
+	for _, binWords := range s.bins {
+		all = append(all, binWords...)
+	}
+	return all
+}
+
+// isHomogenous returns true if the words in `s` together with `w` have similar font sizes, i.e.
+// the largest font size is at most 1.3 times the smallest.
+// Otherwise it logs the font size range as an error and returns false.
+func (s *textStrata) isHomogenous(w *textWord) bool {
+	// `words` always contains at least `w`, so words[0] exists.
+	words := append(s.allWords(), w)
+	minFont := words[0].fontsize
+	maxFont := minFont
+	for _, word := range words {
+		if word.fontsize < minFont {
+			minFont = word.fontsize
+		} else if word.fontsize > maxFont {
+			maxFont = word.fontsize
+		}
+	}
+	if maxFont/minFont > 1.3 {
+		common.Log.Error("font size range: %.2f - %.2f = %.1fx", minFont, maxFont, maxFont/minFont)
+		return false
+	}
+	return true
+}
+
 // removeWord removes `word`from `s`.bins[`depthIdx`].
 // NOTE: We delete bins as soon as they become empty to save code that calls other textStrata
 // functions from having to check for empty bins.
--- a/extractor/text_table.go
+++ b/extractor/text_table.go
--- a/extractor/text_test.go
+++ b/extractor/text_test.go
@ -175,7 +175,7 @@ func TestTermMarksFiles(t *testing.T) {
 	if !doStress {
 		t.Skip("skipping stress test")
 	}
-	common.Log.Info("Running text stress tests. go test --short to skip these.")
+	common.Log.Info("Running text stress tests.")
 	if len(corpusFolder) == 0 && !forceTest {
 		t.Log("Corpus folder not set - skipping")
 		return
@ -736,6 +736,11 @@ func testTermMarks(t *testing.T, text string, textMarks *TextMarkArray, n int) {
 			ofs1d = len(text)
 		}
 		show := fmt.Sprintf("<%s|%s|%s>", text[ofs0d:ofs0], text[ofs0:ofs1], text[ofs1:ofs1d])
+		{
+			show = fmt.Sprintf("%q", show)
+			runes := []rune(show)
+			show = string(runes[1 : len(runes)-1])
+		}

 		// Get TextMarks spanning `term` with RangeOffset().
 		spanArray, err := textMarks.RangeOffset(ofs0, ofs1)
@ -783,6 +788,7 @@ func startWith(str, sub string) bool {
 		if strings.HasPrefix(str, sub[n:]) {
 			return true
 		}
+		// common.Log.Error("!startsWith: str=%q sub=%q sub[%d:]=%q", str, sub, n, sub[n:])
 	}
 	return false
 }
--- a/extractor/text_word.go
+++ b/extractor/text_word.go
@ -170,6 +170,19 @@ func (w *textWord) text() string {
 	return strings.Join(texts, "")
 }

+// toTextMarks returns one TextMark for each mark in `w`, in the order of `w`.marks.
+// `offset` is used to give the TextMarks the correct Offset values.
+// It panics if `w` has non-empty text but no TextMarks were produced.
+func (w *textWord) toTextMarks(offset *int) []TextMark {
+	var textMarks []TextMark
+	for _, mark := range w.marks {
+		textMarks = appendTextMark(textMarks, offset, mark.ToTextMark())
+	}
+	if text := w.text(); len(text) > 0 && len(textMarks) == 0 {
+		panic(text)
+	}
+	return textMarks
+}
+
 // font returns the fontID of the `idx`th rune in text.
 // compute on creation? !@#$
 func (w *textWord) font(idx int) string {
--- a/internal/cmap/cmap.go
+++ b/internal/cmap/cmap.go
@ -22,7 +22,7 @@ const (
 	// MissingCodeRune replaces runes that can't be decoded. '\ufffd' = <20>. Was '?'.
 	MissingCodeRune = '\ufffd' // <20>

-	// MissingCodeRune replaces strings that can't be decoded.
+	// MissingCodeString replaces strings that can't be decoded.
 	MissingCodeString = string(MissingCodeRune)
 )

@ -44,7 +44,7 @@ type charRange struct {
 type fbRange struct {
 	code0 CharCode
 	code1 CharCode
-	r0    rune // TODO (peterwilliams97): Change to string for compound codes.
+	r0    string
 }

 // CIDSystemInfo contains information for identifying the character collection
@ -110,8 +110,7 @@ type CMap struct {

 	// Used by ctype 2 CMaps.
 	codeToUnicode map[CharCode]string // CID -> Unicode string
-	// XXXX(peterwilliams97): Should unicodeToCode be the inverse of codeToUnicode?
-	unicodeToCode map[rune]CharCode // Unicode rune -> CID
+	unicodeToCode map[string]CharCode // Unicode string -> CID

 	// cached contains the raw CMap data. It is used by the Bytes method in
 	// order to avoid generating the data for every call.
@ -137,10 +136,10 @@ func NewToUnicodeCMap(codeToRune map[CharCode]rune) *CMap {
 			Supplement: 0,
 		},
 		codespaces:    []Codespace{{Low: 0, High: 0xffff}},
-		codeToCID:     make(map[CharCode]CharCode),
-		cidToCode:     make(map[CharCode]CharCode),
 		codeToUnicode: codeToUnicode,
-		unicodeToCode: make(map[rune]CharCode),
+		unicodeToCode: make(map[string]CharCode, len(codeToRune)),
+		codeToCID:     make(map[CharCode]CharCode, len(codeToRune)),
+		cidToCode:     make(map[CharCode]CharCode, len(codeToRune)),
 	}

 	cmap.computeInverseMappings()
@ -159,7 +158,7 @@ func newCMap(isSimple bool) *CMap {
 		codeToCID:     make(map[CharCode]CharCode),
 		cidToCode:     make(map[CharCode]CharCode),
 		codeToUnicode: make(map[CharCode]string),
-		unicodeToCode: make(map[rune]CharCode),
+		unicodeToCode: make(map[string]CharCode),
 	}
 }

@ -265,13 +264,8 @@ func (cmap *CMap) computeInverseMappings() {

 	// Generate Unicode -> CID map.
 	for cid, s := range cmap.codeToUnicode {
-		// The CMap entries can be empty e.g. dobe_supplement_iso32000_1.pdf
-		if len(s) == 0 {
-			continue
-		}
-		r := rune0(s)
-		if c, ok := cmap.unicodeToCode[r]; !ok || (ok && c > cid) {
-			cmap.unicodeToCode[r] = cid
+		if c, ok := cmap.unicodeToCode[s]; !ok || (ok && c > cid) {
+			cmap.unicodeToCode[s] = cid
 		}
 	}

@ -326,10 +320,10 @@ func (cmap *CMap) CharcodeToUnicode(code CharCode) (string, bool) {
 	return MissingCodeString, false
 }

-// RuneToCID maps the specified rune to a character identifier. If the provided
-// rune has no available mapping, the second return value is false.
-func (cmap *CMap) RuneToCID(r rune) (CharCode, bool) {
-	cid, ok := cmap.unicodeToCode[r]
+// StringToCID maps the specified string to a character identifier. If the provided
+// string has no available mapping, the bool return value is false.
+func (cmap *CMap) StringToCID(s string) (CharCode, bool) {
+	cid, ok := cmap.unicodeToCode[s]
 	return cid, ok
 }

@ -484,10 +478,10 @@ func (cmap *CMap) toBfData() string {
 	// character codes have been mapped to code ranges.
 	var charRanges []charRange
 	currCharRange := charRange{codes[0], codes[0]}
-	prevRune := rune0(cmap.codeToUnicode[codes[0]])
+	prevRune := cmap.codeToUnicode[codes[0]]
 	for _, c := range codes[1:] {
-		currRune := rune0(cmap.codeToUnicode[c])
-		if c == currCharRange.code1+1 && currRune == prevRune+1 {
+		currRune := cmap.codeToUnicode[c]
+		if c == currCharRange.code1+1 && lastRune(currRune) == lastRune(prevRune)+1 {
 			currCharRange.code1 = c
 		} else {
 			charRanges = append(charRanges, currCharRange)
@ -507,7 +501,7 @@ func (cmap *CMap) toBfData() string {
 			fbRanges = append(fbRanges, fbRange{
 				code0: cr.code0,
 				code1: cr.code1,
-				r0:    rune0(cmap.codeToUnicode[cr.code0]),
+				r0:    cmap.codeToUnicode[cr.code0],
 			})
 		}
 	}
@ -522,8 +516,8 @@ func (cmap *CMap) toBfData() string {
 			lines = append(lines, fmt.Sprintf("%d beginbfchar", n))
 			for j := 0; j < n; j++ {
 				code := fbChars[i*maxBfEntries+j]
-				r := rune0(cmap.codeToUnicode[code])
-				lines = append(lines, fmt.Sprintf("<%04x> <%04x>", code, r))
+				s := cmap.codeToUnicode[code]
+				lines = append(lines, fmt.Sprintf("<%04x> %s", code, hexCode(s)))
 			}
 			lines = append(lines, "endbfchar")
 		}
@ -535,8 +529,8 @@ func (cmap *CMap) toBfData() string {
 			lines = append(lines, fmt.Sprintf("%d beginbfrange", n))
 			for j := 0; j < n; j++ {
 				rng := fbRanges[i*maxBfEntries+j]
-				r := rng.r0
-				lines = append(lines, fmt.Sprintf("<%04x><%04x> <%04x>", rng.code0, rng.code1, r))
+				lines = append(lines, fmt.Sprintf("<%04x><%04x> %s",
+					rng.code0, rng.code1, hexCode(rng.r0)))
 			}
 			lines = append(lines, "endbfrange")
 		}
@ -544,6 +538,22 @@ func (cmap *CMap) toBfData() string {
 	return strings.Join(lines, "\n")
 }

+// lastRune returns the last rune in `s`, or MissingCodeRune if `s` is empty.
+// CMap entries can map a code to an empty string, so empty input must not cause a panic.
+func lastRune(s string) rune {
+	runes := []rune(s)
+	if len(runes) == 0 {
+		return MissingCodeRune
+	}
+	return runes[len(runes)-1]
+}
+
+// hexCode returns the CMap hex code for `s`: the runes of `s` written as big-endian UTF-16 code
+// units of four hex digits each, enclosed in angle brackets.
+// Runes above U+FFFF are written as a surrogate pair, e.g. U+1F600 becomes <d83dde00>, so that
+// every code unit keeps the fixed four digit width.
+func hexCode(s string) string {
+	var b strings.Builder
+	b.WriteByte('<')
+	for _, r := range s {
+		if r > 0xffff {
+			// Split a supplementary plane rune into its high and low surrogates.
+			r -= 0x10000
+			fmt.Fprintf(&b, "%04x%04x", 0xd800+(r>>10), 0xdc00+(r&0x3ff))
+			continue
+		}
+		fmt.Fprintf(&b, "%04x", r)
+	}
+	b.WriteByte('>')
+	return b.String()
+}
+
 const (
 	maxBfEntries = 100 // Maximum number of entries in a bfchar or bfrange section.
 	cmapHeader   = `
@ -563,9 +573,3 @@ end
 end
 `
 )
-
-// rune0 is a convenience function that returns the first rune in `s`.
-// Caller must check that `s` is not empty.
-func rune0(s string) rune {
-	return ([]rune(s))[0]
-}
--- a/internal/cmap/cmap_parser.go
+++ b/internal/cmap/cmap_parser.go
@ -105,7 +105,7 @@ func (cmap *CMap) parse() error {
 func (cmap *CMap) parseName() error {
 	name := ""
 	done := false
-	// /Users/peter/testdata/programming/pdf_text/columns/Berg.pdf
+	// NOTE(peterwilliams97): We need up to 20 iterations of this loop for some PDFs I have seen.
 	for i := 0; i < 20 && !done; i++ {
 		o, err := cmap.parseObject()
 		if err != nil {
--- a/internal/textencoding/cmap.go
+++ b/internal/textencoding/cmap.go
@ -67,7 +67,7 @@ func (enc CMapEncoder) RuneToCharcode(r rune) (CharCode, bool) {
 	}

 	// Map rune to CID.
-	cid, ok := enc.cidToUnicode.RuneToCID(r)
+	cid, ok := enc.cidToUnicode.StringToCID(string(r))
 	if !ok {
 		return 0, false
 	}
--- a/internal/textencoding/glyphs_glyphlist.go
+++ b/internal/textencoding/glyphs_glyphlist.go
@ -23,7 +23,7 @@ const (
 	// MissingCodeRune replaces runes that can't be decoded. .
 	MissingCodeRune = '\ufffd' // <20>

-	// MissingCodeRune replaces strings that can't be decoded.
+	// MissingCodeString replaces strings that can't be decoded.
 	MissingCodeString = string(MissingCodeRune)
 )

--- a/model/font.go
+++ b/model/font.go
@ -421,31 +421,26 @@ func (font *PdfFont) BytesToCharcodes(data []byte) []textencoding.CharCode {
 	return charcodes
 }

-// CharcodesToUnicodeWithStats is identical to CharcodesToUnicode except returns more statistical
+// CharcodesToUnicodeWithStats is identical to CharcodesToUnicode except it returns more statistical
 // information about hits and misses from the reverse mapping process.
 // NOTE: The number of runes returned may be greater than the number of charcodes.
-// TODO(peterwilliams97): Deprecate?
+// TODO(peterwilliams97): Deprecate in v4 and use only CharcodesToStrings()
 func (font *PdfFont) CharcodesToUnicodeWithStats(charcodes []textencoding.CharCode) (runelist []rune, numHits, numMisses int) {
-	runeSlices, numHits, numMisses := font.CharcodesToRuneSlices(charcodes)
-	var runes []rune
-	for _, r := range runeSlices {
-		runes = append(runes, r...)
-	}
-	return runes, numHits, numMisses
+	texts, numHits, numMisses := font.CharcodesToStrings(charcodes)
+	return []rune(strings.Join(texts, "")), numHits, numMisses
 }

-// CharcodesToRuneSlices returns the unicode strings corresponding to `charcodes` as rune slices.
-// The int return is the number of unconvereted codes.
-// NOTE: The number of rune slices returned is equal to the number of charcodes
-func (font *PdfFont) CharcodesToRuneSlices(charcodes []textencoding.CharCode) ([][]rune, int, int) {
+// CharcodesToStrings returns the unicode strings corresponding to `charcodes`.
+// The int returns are the number of strings and the number of unconverted codes.
+// NOTE: The number of strings returned is equal to the number of charcodes.
+func (font *PdfFont) CharcodesToStrings(charcodes []textencoding.CharCode) ([]string, int, int) {
 	fontBase := font.baseFields()
-	runeSlices := make([][]rune, 0, len(charcodes))
+	texts := make([]string, 0, len(charcodes))
 	numMisses := 0
 	for _, code := range charcodes {
 		if fontBase.toUnicodeCmap != nil {
 			if s, ok := fontBase.toUnicodeCmap.CharcodeToUnicode(cmap.CharCode(code)); ok {
-				runeSlices = append(runeSlices, []rune(s))
-				// common.Log.Info("CharcodesToRuneSlices1: code=%d s=`%s`", code, s)
+				texts = append(texts, s)
 				continue
 			}
 		}
@ -454,9 +449,7 @@ func (font *PdfFont) CharcodesToRuneSlices(charcodes []textencoding.CharCode) ([
 		encoder := font.Encoder()
 		if encoder != nil {
 			if r, ok := encoder.CharcodeToRune(code); ok {
-				runeSlices = append(runeSlices, []rune{r})
-				// common.Log.Info("CharcodesToRuneSlices2: code=%d s=%q encoder=%s",
-				// 	code, string(r), encoder.String())
+				texts = append(texts, string(r))
 				continue
 			}
 		}
@ -465,7 +458,7 @@ func (font *PdfFont) CharcodesToRuneSlices(charcodes []textencoding.CharCode) ([
 			"\tfont=%s\n\tencoding=%s",
 			code, charcodes, fontBase.isCIDFont(), font, encoder)
 		numMisses++
-		runeSlices = append(runeSlices, []rune{cmap.MissingCodeRune})
+		texts = append(texts, cmap.MissingCodeString)
 	}

 	if numMisses != 0 {
@ -475,7 +468,7 @@ func (font *PdfFont) CharcodesToRuneSlices(charcodes []textencoding.CharCode) ([
 			len(charcodes), numMisses, font)
 	}

-	return runeSlices, len(runeSlices), numMisses
+	return texts, len(texts), numMisses
 }

 // CharcodeBytesToUnicode converts PDF character codes `data` to a Go unicode string.
@ -499,8 +492,8 @@ func (font *PdfFont) CharcodeBytesToUnicode(data []byte) (string, int, int) {
 //  1) Use the ToUnicode CMap if there is one.
 //  2) Use the underlying font's encoding.
 func (font *PdfFont) CharcodesToUnicode(charcodes []textencoding.CharCode) []rune {
-	strlist, _, _ := font.CharcodesToUnicodeWithStats(charcodes)
-	return strlist
+	runes, _, _ := font.CharcodesToUnicodeWithStats(charcodes)
+	return runes
 }

 // RunesToCharcodeBytes maps the provided runes to charcode bytes and it