diff --git a/annotator/field_appearance.go b/annotator/field_appearance.go index 07b2d7ef..1a4c12ea 100644 --- a/annotator/field_appearance.go +++ b/annotator/field_appearance.go @@ -13,6 +13,7 @@ import ( "github.com/unidoc/unipdf/v3/common" "github.com/unidoc/unipdf/v3/contentstream" + "github.com/unidoc/unipdf/v3/contentstream/draw" "github.com/unidoc/unipdf/v3/core" "github.com/unidoc/unipdf/v3/internal/textencoding" "github.com/unidoc/unipdf/v3/model" @@ -175,12 +176,14 @@ func genFieldTextAppearance(wa *model.PdfAnnotationWidget, ftxt *model.PdfFieldT width := rect.Width() height := rect.Height() + var rotation float64 if mkDict, has := core.GetDict(wa.MK); has { bsDict, _ := core.GetDict(wa.BS) err := style.applyAppearanceCharacteristics(mkDict, bsDict, nil) if err != nil { return nil, err } + rotation, _ = core.GetNumberAsFloat(mkDict.Get("R")) } // Get and process the default appearance string (DA) operands. @@ -192,6 +195,7 @@ func genFieldTextAppearance(wa *model.PdfAnnotationWidget, ftxt *model.PdfFieldT } cc := contentstream.NewContentCreator() + if style.BorderSize > 0 { drawRect(cc, style, width, height) } @@ -205,6 +209,28 @@ func genFieldTextAppearance(wa *model.PdfAnnotationWidget, ftxt *model.PdfFieldT cc.Add_BMC("Tx") cc.Add_q() + + bboxWidth, bboxHeight := width, height + if rotation != 0 { + // Calculate bounding box before rotation. + revRotation := -rotation + bbox := draw.Path{Points: []draw.Point{ + draw.NewPoint(0, 0).Rotate(revRotation), + draw.NewPoint(width, 0).Rotate(revRotation), + draw.NewPoint(0, height).Rotate(revRotation), + draw.NewPoint(width, height).Rotate(revRotation), + }}.GetBoundingBox() + + // Update width and height, as the appearance is generated based on + // the bounding of the annotation with no rotation. + width = bbox.Width + height = bbox.Height + + // Apply rotation. + cc.RotateDeg(rotation) + cc.Translate(bbox.X, bbox.Y) + } + // Graphic state changes. 
cc.Add_BT() @@ -461,7 +487,7 @@ func genFieldTextAppearance(wa *model.PdfAnnotationWidget, ftxt *model.PdfFieldT xform := model.NewXObjectForm() xform.Resources = resources - xform.BBox = core.MakeArrayFromFloats([]float64{0, 0, width, height}) + xform.BBox = core.MakeArrayFromFloats([]float64{0, 0, bboxWidth, bboxHeight}) xform.SetContentStream(cc.Bytes(), defStreamEncoder()) apDict := core.MakeDict() diff --git a/extractor/README.md b/extractor/README.md index 0e303708..0f7204ca 100644 --- a/extractor/README.md +++ b/extractor/README.md @@ -62,3 +62,54 @@ bruce.pdf for char spacing save/restore. challenging-modified.pdf transitions_test.pdf + + +Code Restructure? +----------------- +``` + type textPara struct { + serial int // Sequence number for debugging. + model.PdfRectangle // Bounding box. + w, h int + cells []textCell + } + + type textCell struct { + serial int // Sequence number for debugging. + model.PdfRectangle // Bounding box. + eBBox model.PdfRectangle // Extended bounding box needed to compute reading order. + lines []*textLine // Paragraph text gets broken into lines. + } +``` + + x x x x x x + x + x x + x + x x x + x + x + +1. Compute all row candidates + alignedY No intervening paras +2. Compute all column candidates + alignedX No intervening paras + +Table candidate +1. Top row fully populated +2. Left column fully populated +3. All cells in table are aligned with 1 top row element and 1 left column candidate +4. Minimum number of cells must be filled + +Computation time +1. Row candidates O(N) + Sort top to bottom, left to right + Search +2. Column candidates O(N) + Sort left to right, top to bottom + Search +3. Find intersections O(N^2) + For each row + Find columns that start at row -> table candidates + Sort table candidates by w x h descending +4. 
Test each candidate O(N^4) diff --git a/extractor/text.go b/extractor/text.go index 29638b12..ef607d61 100644 --- a/extractor/text.go +++ b/extractor/text.go @@ -22,8 +22,6 @@ import ( "github.com/unidoc/unipdf/v3/model" ) -const verbose = false - // maxFormStack is the maximum form stack recursion depth. It has to be low enough to avoid a stack // overflow and high enough to accomodate customers' PDFs const maxFormStack = 10 @@ -49,7 +47,7 @@ func (e *Extractor) ExtractTextWithStats() (extracted string, numChars int, numM // ExtractPageText returns the text contents of `e` (an Extractor for a page) as a PageText. func (e *Extractor) ExtractPageText() (*PageText, int, int, error) { - pt, numChars, numMisses, err := e.extractPageText(e.contents, e.resources, 0) + pt, numChars, numMisses, err := e.extractPageText(e.contents, e.resources, transform.IdentityMatrix(), 0) if err != nil { return nil, numChars, numMisses, err } @@ -62,7 +60,8 @@ func (e *Extractor) ExtractPageText() (*PageText, int, int, error) { // extractPageText returns the text contents of content stream `e` and resouces `resources` as a // PageText. // This can be called on a page or a form XObject. 
-func (e *Extractor) extractPageText(contents string, resources *model.PdfPageResources, level int) ( +func (e *Extractor) extractPageText(contents string, resources *model.PdfPageResources, + parentCTM transform.Matrix, level int) ( *PageText, int, int, error) { common.Log.Trace("extractPageText: level=%d", level) pageText := &PageText{pageSize: e.mediaBox} @@ -97,7 +96,7 @@ func (e *Extractor) extractPageText(contents string, resources *model.PdfPageRes operand := op.Operand - if verbose { + if verboseGeom { common.Log.Info("&&& op=%s", op) } @@ -106,7 +105,7 @@ func (e *Extractor) extractPageText(contents string, resources *model.PdfPageRes savedStates.push(&state) // common.Log.Info("Save state: stack=%d\n %s", len(savedStates), state.String()) case "Q": - if verbose { + if verboseGeom { common.Log.Info("Restore state: %s", savedStates.String()) } if !savedStates.empty() { @@ -129,7 +128,10 @@ func (e *Extractor) extractPageText(contents string, resources *model.PdfPageRes pageText.marks = append(pageText.marks, to.marks...) } inTextObj = true - to = newTextObject(e, resources, gs, &state, &savedStates) + graphicsState := gs + graphicsState.CTM = parentCTM.Mult(graphicsState.CTM) + to = newTextObject(e, resources, graphicsState, &state, &savedStates) + case "ET": // End Text // End text object, discarding text matrix. 
If the current // text object contains text marks, they are added to the @@ -343,8 +345,9 @@ func (e *Extractor) extractPageText(contents string, resources *model.PdfPageRes if formResources == nil { formResources = resources } + tList, numChars, numMisses, err := e.extractPageText(string(formContent), - formResources, level+1) + formResources, parentCTM.Mult(gs.CTM), level+1) if err != nil { common.Log.Debug("ERROR: %v", err) return err @@ -489,8 +492,8 @@ func (to *textObject) setCharSpacing(x float64) { return } to.state.tc = x - if verbose { - common.Log.Info("setCharSpacing: %.2f state=%s", to.state.String()) + if verboseGeom { + common.Log.Info("setCharSpacing: %.2f state=%s", x, to.state.String()) } } @@ -758,7 +761,7 @@ func (to *textObject) renderText(data []byte) error { } font := to.getCurrentFont() charcodes := font.BytesToCharcodes(data) - runeSlices, numChars, numMisses := font.CharcodesToRuneSlices(charcodes) + texts, numChars, numMisses := font.CharcodesToStrings(charcodes) if numMisses > 0 { common.Log.Debug("renderText: numChars=%d numMisses=%d", numChars, numMisses) } @@ -777,17 +780,20 @@ func (to *textObject) renderText(data []byte) error { spaceMetrics, _ = model.DefaultFont().GetRuneMetrics(' ') } spaceWidth := spaceMetrics.Wx * glyphTextRatio - common.Log.Trace("spaceWidth=%.2f text=%q font=%s fontSize=%.1f", spaceWidth, runeSlices, font, tfs) + common.Log.Trace("spaceWidth=%.2f text=%q font=%s fontSize=%.2f", spaceWidth, texts, font, tfs) stateMatrix := transform.NewMatrix( tfs*th, 0, 0, tfs, 0, state.trise) - if verbose { - common.Log.Info("renderText: %d codes=%+v runes=%q", len(charcodes), charcodes, runeSlices) + if verboseGeom { + common.Log.Info("renderText: %d codes=%+v texts=%q", len(charcodes), charcodes, texts) } - for i, r := range runeSlices { + common.Log.Trace("renderText: %d codes=%+v runes=%q", len(charcodes), charcodes, len(texts)) + + for i, text := range texts { + r := []rune(text) if len(r) == 1 && r[0] == '\x00' { 
continue } @@ -819,7 +825,7 @@ func (to *textObject) renderText(data []byte) error { // t is the displacement of the text cursor when the character is rendered. t0 := transform.Point{X: (c.X*tfs + w) * th} t := transform.Point{X: (c.X*tfs + state.tc + w) * th} - if verbose { + if verboseGeom { common.Log.Info("tfs=%.2f tc=%.2f tw=%.2f th=%.2f", tfs, state.tc, state.tw, th) common.Log.Info("dx,dy=%.3f t0=%.2f t=%.2f", c, t0, t) } @@ -830,7 +836,7 @@ func (to *textObject) renderText(data []byte) error { td := translationMatrix(t) end := to.gs.CTM.Mult(to.tm).Mult(td0) - if verbose { + if verboseGeom { common.Log.Info("end:\n\tCTM=%s\n\t tm=%s\n"+ "\t td=%s xlat=%s\n"+ "\ttd0=%s\n\t → %s xlat=%s", @@ -865,7 +871,7 @@ func (to *textObject) renderText(data []byte) error { // update the text matrix by the displacement of the text location. to.tm.Concat(td) - if i != len(runeSlices)-1 { + if i != len(texts)-1 { to.logCursor() } } @@ -908,10 +914,11 @@ func isTextSpace(text string) bool { // PageText represents the layout of text on a device page. type PageText struct { - marks []*textMark // Texts and their positions on a PDF page. - viewText string // Extracted page text. - viewMarks []TextMark // Public view of `marks`. - pageSize model.PdfRectangle + marks []*textMark // Texts and their positions on a PDF page. + viewText string // Extracted page text. + viewMarks []TextMark // Public view of text marks`. + viewTables []TextTable // Public view of text table`. + pageSize model.PdfRectangle // Page size. Used to calculate depth. } // String returns a string describing `pt`. @@ -942,6 +949,11 @@ func (pt PageText) Marks() *TextMarkArray { return &TextMarkArray{marks: pt.viewMarks} } +// Tables returns the tables extracted from the page. 
+func (pt PageText) Tables() []TextTable { + return pt.viewTables +} + // computeViews processes the page TextMarks sorting by position and populates `pt.viewText` and // `pt.viewMarks` which represent the text and marks in the order which it is read on the page. // The comments above the TextMark definition describe how to use the []TextMark to @@ -953,6 +965,7 @@ func (pt *PageText) computeViews() { paras.writeText(b) pt.viewText = b.String() pt.viewMarks = paras.toTextMarks() + pt.viewTables = paras.toTables() } // TextMarkArray is a collection of TextMarks. @@ -1119,6 +1132,13 @@ var spaceMark = TextMark{ Meta: true, } +// TextTable represents a table. +// Cells are ordered top-to-bottom, left-to-right. +type TextTable struct { + W, H int + Cells [][]string +} + // getCurrentFont returns the font on top of the font stack, or DefaultFont if the font stack is // empty. func (to *textObject) getCurrentFont() *model.PdfFont { diff --git a/extractor/text_bound.go b/extractor/text_bound.go index 52b13c0b..16afae4e 100644 --- a/extractor/text_bound.go +++ b/extractor/text_bound.go @@ -19,11 +19,11 @@ import ( var serial serialState type serialState struct { - mark int - word int - bins int - line int - para int + mark int + word int + strata int + line int + para int } func (serial *serialState) reset() { @@ -65,15 +65,25 @@ func diffReading(a, b bounded) float64 { return a.bbox().Llx - b.bbox().Llx } -// func boundedUnion(objs ...bounded) model.PdfRectangle { -// rect := objs[0].bbox() -// for _, r := range objs[1:] { -// rect = rectUnion(rect, r.bbox()) -// } -// return rect -// } +func boundedUnion(objs ...bounded) model.PdfRectangle { + rect := objs[0].bbox() + for _, r := range objs[1:] { + rect = rectUnion(rect, r.bbox()) + } + return rect +} -// diffDepth returns `a` - `b` in the depth direction.. +// rectContainsBounded returns true if `a` contains `b`. 
+func rectContainsBounded(a model.PdfRectangle, b bounded) bool { + return rectContainsRect(a, b.bbox()) +} + +// rectContainsRect returns true if `a` contains `b`. +func rectContainsRect(a, b model.PdfRectangle) bool { + return a.Llx <= b.Llx && b.Urx <= a.Urx && a.Lly <= b.Lly && b.Ury <= a.Ury +} + +// diffDepth returns `a` - `b` in the depth direction. func diffDepth(a, b bounded) float64 { return bboxDepth(a) - bboxDepth(b) } @@ -151,3 +161,19 @@ func overlappedXRect(r0, r1 model.PdfRectangle) bool { func overlappedYRect(r0, r1 model.PdfRectangle) bool { return (r0.Lly <= r1.Lly && r1.Lly <= r0.Ury) || (r0.Lly <= r1.Ury && r1.Ury <= r0.Ury) } + +// minInt returns the lesser of `a` and `b`. +func minInt(a, b int) int { + if a < b { + return a + } + return b +} + +// maxInt returns the greater of `a` and `b`. +func maxInt(a, b int) int { + if a > b { + return a + } + return b +} diff --git a/extractor/text_const.go b/extractor/text_const.go index 4f964e1b..c1df77f7 100644 --- a/extractor/text_const.go +++ b/extractor/text_const.go @@ -5,8 +5,24 @@ package extractor +// The following constants configure debugging. const ( + verbose = false + verboseGeom = false + verbosePage = false + verbosePara = false + verboseTable = false +) +// The following constants control the approaches used in the code. +const ( + useTables = true + doHyphens = true + useEBBox = false +) + +// The following constants are the tuning parameters for text extraction. +const ( // Size of depth bins in points depthBinPoints = 6 diff --git a/extractor/text_line.go b/extractor/text_line.go index 69bf98ed..cb315d66 100644 --- a/extractor/text_line.go +++ b/extractor/text_line.go @@ -20,10 +20,12 @@ type textLine struct { model.PdfRectangle // Bounding box (union of `marks` bounding boxes). depth float64 // Distance from bottom of line to top of page. words []*textWord // Words in this line. - fontsize float64 - hyphenated bool + fontsize float64 // Largest word font size. 
+ hyphenated bool // Does line have at least minHyphenation runes and end in a hyphen. } +const minHyphenation = 4 + // newTextLine creates a line with font and bbox size of `w`, removes `w` from p.bins[bestWordDepthIdx] and adds it to the line func newTextLine(p *textStrata, depthIdx int) *textLine { words := p.getStratum(depthIdx) @@ -60,31 +62,22 @@ func (l *textLine) text() string { } } return strings.Join(words, "") - } // toTextMarks returns the TextMarks contained in `l`.text(). // `offset` is used to give the TextMarks the correct Offset values. func (l *textLine) toTextMarks(offset *int) []TextMark { var marks []TextMark - addMark := func(mark TextMark) { - mark.Offset = *offset - marks = append(marks, mark) - *offset += len(mark.Text) - } - addSpaceMark := func(spaceChar string) { - mark := spaceMark - mark.Text = spaceChar - addMark(mark) - } for _, word := range l.words { - for _, tm := range word.marks { - addMark(tm.ToTextMark()) - } + wordMarks := word.toTextMarks(offset) + marks = append(marks, wordMarks...) if word.spaceAfter { - addSpaceMark(" ") + marks = appendSpaceMark(marks, offset, " ") } } + if len(l.text()) > 0 && len(marks) == 0 { + panic(l.text()) + } return marks } @@ -130,16 +123,13 @@ func (l *textLine) mergeWordFragments() { } // check for hyphen at end of line - runes := []rune(l.text()) - l.hyphenated = len(runes) >= 4 && + l.hyphenated = isHyphenated(l.text()) +} + +// isHyphenated returns true if `text` is a hyphenated word. 
+func isHyphenated(text string) bool { + runes := []rune(text) + return len(runes) >= minHyphenation && unicode.Is(unicode.Hyphen, runes[len(runes)-1]) && !unicode.IsSpace(runes[len(runes)-2]) - // if l.hyphenated { - // // fmt.Fprintf(os.Stderr, "\n%q ", l.text()) - // common.Log.Info("### %d %q\n\t%q:%t\n\t%q:%t", - // len(runes), l.text(), - // runes[len(runes)-1], unicode.Is(unicode.Hyphen, runes[len(runes)-1]), - // runes[len(runes)-2], !unicode.IsSpace(runes[len(runes)-2]), - // ) - // } } diff --git a/extractor/text_mark.go b/extractor/text_mark.go index b7d9fcf8..f23d3a77 100644 --- a/extractor/text_mark.go +++ b/extractor/text_mark.go @@ -21,11 +21,6 @@ type textMark struct { model.PdfRectangle // Bounding box. text string // The text (decoded via ToUnicode). original string // Original text (decoded). - orient int // The text orientation in degrees. This is the current TRM rounded to 10°. - orientedStart transform.Point // Left of text in orientation where text is horizontal. - orientedEnd transform.Point // Right of text in orientation where text is horizontal. - height float64 // Text height. - spaceWidth float64 // Best guess at the width of a space in the font the text was rendered with. font *model.PdfFont // The font the mark was drawn with. fontsize float64 // The font size the mark was drawn with. charspacing float64 // TODO (peterwilliams97: Should this be exposed in TextMark? 
@@ -74,25 +69,20 @@ func (to *textObject) newTextMark(text string, trm transform.Matrix, end transfo bbox = clipped tm := textMark{ - text: text, - orient: orient, - PdfRectangle: bbox, - orientedStart: start.Rotate(theta), - orientedEnd: end.Rotate(theta), - height: math.Abs(height), - spaceWidth: spaceWidth, - font: font, - fontsize: height, - charspacing: charspacing, - trm: trm, - end: end, - serial: serial.mark, + text: text, + PdfRectangle: bbox, + font: font, + fontsize: height, + charspacing: charspacing, + trm: trm, + end: end, + serial: serial.mark, } serial.mark++ if !isTextSpace(tm.text) && tm.Width() == 0.0 { common.Log.Debug("ERROR: Zero width text. tm=%s", tm.String()) } - if verbose { + if verboseGeom { common.Log.Info("newTextMark: start=%.2f end=%.2f %s", start, end, tm.String()) } @@ -110,11 +100,6 @@ func (tm *textMark) bbox() model.PdfRectangle { return tm.PdfRectangle } -// Width returns the width of `tm`.text in the text direction. -func (tm *textMark) Width() float64 { - return math.Abs(tm.orientedStart.X - tm.orientedEnd.X) -} - // ToTextMark returns the public view of `tm`. func (tm *textMark) ToTextMark() TextMark { return TextMark{ @@ -127,6 +112,23 @@ func (tm *textMark) ToTextMark() TextMark { } } +// appendTextMark appends `mark` to `marks` and updates `offset`, the offset of `mark` in the extracted +// text. +func appendTextMark(marks []TextMark, offset *int, mark TextMark) []TextMark { + mark.Offset = *offset + marks = append(marks, mark) + *offset += len(mark.Text) + return marks +} + +// appendSpaceMark appends a spaceMark with space character `space` to `marks` and updates `offset`, +// the offset of `mark` in the extracted text. +func appendSpaceMark(marks []TextMark, offset *int, spaceChar string) []TextMark { + mark := spaceMark + mark.Text = spaceChar + return appendTextMark(marks, offset, mark) +} + // nearestMultiple return the integer multiple of `m` that is closest to `x`. 
func nearestMultiple(x float64, m int) int { if m == 0 { diff --git a/extractor/text_page.go b/extractor/text_page.go index 2b8d2679..1830dabd 100644 --- a/extractor/text_page.go +++ b/extractor/text_page.go @@ -9,16 +9,12 @@ import ( "fmt" "io" "math" - "unicode" + "sort" "github.com/unidoc/unipdf/v3/common" "github.com/unidoc/unipdf/v3/model" ) -// paraList is a sequence of textPara. We use it so often that it is convenient to have its own -// type so we can have methods on it. -type paraList []*textPara - // makeTextPage builds a paraList from `marks`, the textMarks on a page. func makeTextPage(marks []*textMark, pageSize model.PdfRectangle, rot int) paraList { common.Log.Trace("makeTextPage: %d elements pageSize=%.2f", len(marks), pageSize) @@ -35,28 +31,21 @@ func makeTextPage(marks []*textMark, pageSize model.PdfRectangle, rot int) paraL for i, para := range paraStratas { paras[i] = composePara(para) } - if verbose || true { - common.Log.Info("unsorted=========----------=====") - for i, para := range paras { - common.Log.Info("paras[%d]=%.2f%q", i, para.PdfRectangle, truncate(paras[i].text(), 200)) - } - } + paras.log("unsorted") + // paras.computeEBBoxes() + + if useTables { + paras = paras.extractTables() + } + // paras.log("tables extracted") paras.computeEBBoxes() - paras = paras.extractTables() + paras.log("EBBoxes 2") // Sort the paras into reading order. paras.sortReadingOrder() - if verbose || true { - common.Log.Info("para sorted in reading order -----------=========") - for i, para := range paras { - tab := "" - if para.table != nil { - tab = fmt.Sprintf("[%dx%d]", para.table.w, para.table.h) - } - fmt.Printf("%4d: %6.2f %s %q\n", i, para.PdfRectangle, tab, truncate(para.text(), 50)) - } - } + paras.log("sorted in reading order") + return paras } @@ -72,7 +61,7 @@ func dividePage(page *textStrata, pageHeight float64) []*textStrata { // Some bins are emptied before they iterated to (seee "surving bin" above). 
// If a `page` survives until it is iterated to then at least one `para` will be built around it. - if verbose { + if verbosePage { common.Log.Info("dividePage") } cnt := 0 @@ -89,7 +78,7 @@ func dividePage(page *textStrata, pageHeight float64) []*textStrata { firstReadingIdx := page.firstReadingIndex(depthIdx) words := page.getStratum(firstReadingIdx) moveWord(firstReadingIdx, page, para, words[0]) - if verbose { + if verbosePage { common.Log.Info("words[0]=%s", words[0].String()) } @@ -105,7 +94,7 @@ func dividePage(page *textStrata, pageHeight float64) []*textStrata { // Add words that are within maxIntraDepthGap of `para` in the depth direction. // i.e. Stretch para in the depth direction, vertically for English text. - if verbose { + if verbosePage { common.Log.Info("para depth %.2f - %.2f maxIntraDepthGap=%.2f ", para.minDepth(), para.maxDepth(), maxIntraDepthGap) } @@ -159,6 +148,9 @@ func dividePage(page *textStrata, pageHeight float64) []*textStrata { // Sort the words in `para`'s bins in the reading direction. para.sort() + if verbosePage { + common.Log.Info("para=%s", para.String()) + } paraStratas = append(paraStratas, para) } } @@ -166,40 +158,11 @@ func dividePage(page *textStrata, pageHeight float64) []*textStrata { return paraStratas } -const doHyphens = true -const useTables = true - // writeText writes the text in `paras` to `w`. func (paras paraList) writeText(w io.Writer) { - for ip, para := range paras { - if useTables { - para.writeText(w) - } else { - for il, line := range para.lines { - s := line.text() - reduced := false - if doHyphens { - if line.hyphenated && (il != len(para.lines)-1 || ip != len(paras)-1) { - // Line ending with hyphen. Remove it. - runes := []rune(s) - s = string(runes[:len(runes)-1]) - reduced = true - } - } - w.Write([]byte(s)) - if reduced { - // We removed the hyphen from the end of the line so we don't need a line ending. 
- continue - } - if il < len(para.lines)-1 && isZero(line.depth-para.lines[il+1].depth) { - // Next line is the same depth so it's the same line as this one in the extracted text - w.Write([]byte(" ")) - continue - } - w.Write([]byte("\n")) - } - w.Write([]byte("\n")) - } + for _, para := range paras { + para.writeText(w) + w.Write([]byte("\n")) } } @@ -208,69 +171,35 @@ func (paras paraList) writeText(w io.Writer) { func (paras paraList) toTextMarks() []TextMark { offset := 0 var marks []TextMark - addMark := func(mark TextMark) { - mark.Offset = offset - marks = append(marks, mark) - offset += len(mark.Text) - } - addSpaceMark := func(spaceChar string) { - mark := spaceMark - mark.Text = spaceChar - addMark(mark) - } - for ip, para := range paras { - if useTables { - paraMarks := para.toTextMarks(&offset) - marks = append(marks, paraMarks...) - } else { - for il, line := range para.lines { - lineMarks := line.toTextMarks(&offset) - marks = append(marks, lineMarks...) - reduced := false - if doHyphens { - if line.hyphenated && (il != len(para.lines)-1 || ip != len(paras)-1) { - tm := marks[len(marks)-1] - r := []rune(tm.Text) - if unicode.IsSpace(r[len(r)-1]) { - panic(tm) - } - if len(r) == 1 { - marks = marks[:len(marks)-1] - offset = marks[len(marks)-1].Offset + len(marks[len(marks)-1].Text) - } else { - s := string(r[:len(r)-1]) - offset += len(s) - len(tm.Text) - tm.Text = s - } - reduced = true - } - } - if reduced { - continue - } - if il != len(para.lines)-1 && isZero(line.depth-para.lines[il+1].depth) { - // Next line is the same depth so it's the same line as this one in the extracted text - addSpaceMark(" ") - continue - } - addSpaceMark("\n") - } - if ip != len(paras)-1 { - addSpaceMark("\n") - } - } + for _, para := range paras { + paraMarks := para.toTextMarks(&offset) + marks = append(marks, paraMarks...) 
+ marks = appendSpaceMark(marks, &offset, "\n") } return marks } +func (paras paraList) toTables() []TextTable { + var tables []TextTable + for _, para := range paras { + if para.table != nil { + tables = append(tables, para.table.toTextTable()) + } + } + return tables +} + // sortReadingOrder sorts `paras` in reading order. func (paras paraList) sortReadingOrder() { common.Log.Debug("sortReadingOrder: paras=%d ===========x=============", len(paras)) if len(paras) <= 1 { return } + sort.Slice(paras, func(i, j int) bool { return diffDepthReading(paras[i], paras[j]) <= 0 }) + paras.log("diffReadingDepth") adj := paras.adjMatrix() order := topoOrder(adj) + printAdj(adj) paras.reorder(order) } @@ -290,22 +219,23 @@ func (paras paraList) adjMatrix() [][]bool { adj[i][j], reasons[i][j] = paras.before(i, j) } } - if verbose && false { + if verbosePage { + show := func(a *textPara) string { + return fmt.Sprintf("%6.2f %q", a.eBBox, truncate(a.text(), 70)) + } common.Log.Info("adjMatrix =======") for i := 0; i < n; i++ { a := paras[i] - fmt.Printf("%4d: %q %.2f\n", i, truncate(a.text(), 50), a.PdfRectangle) + fmt.Printf("%4d: %s\n", i, show(a)) for j := 0; j < n; j++ { if i == j { continue } - if !adj[i][j] { + if !adj[i][j] && i != 16 { continue } b := paras[j] - fmt.Printf("%8d: %10s %q %.2f\n", j, - reasons[i][j], truncate(b.text(), 40), b.PdfRectangle) - + fmt.Printf("%8d: %t %10s %s\n", j, adj[i][j], reasons[i][j], show(b)) } } } @@ -344,7 +274,7 @@ func (paras paraList) before(i, j int) (bool, string) { continue } if overlappedXPara(a, c) && overlappedXPara(c, b) { - return false, "Y intervening" + return false, fmt.Sprintf("Y intervening: %d: %s", k, c) } } return true, "TO LEFT" @@ -358,13 +288,21 @@ func overlappedXPara(r0, r1 *textPara) bool { // computeEBBoxes computes the eBBox fields in the elements of `paras`. 
func (paras paraList) computeEBBoxes() { - common.Log.Trace("computeEBBoxes:") + if verbose { + common.Log.Info("computeEBBoxes:") + } - for i, a := range paras { - // [llx, urx] is the reading direction interval for which no paras overlap `a` + for _, para := range paras { + para.eBBox = para.PdfRectangle + } + + for i, aa := range paras { + a := aa.eBBox + // [llx, urx] is the reading direction interval for which no paras overlap `a`. llx := -1.0e9 urx := +1.0e9 - for j, b := range paras { + for j, bb := range paras { + b := bb.eBBox if i == j || !(a.Lly <= b.Ury && b.Lly <= a.Ury) { continue } @@ -385,27 +323,65 @@ func (paras paraList) computeEBBoxes() { // Go through all paras below `a` within interval [llx, urx] in the reading direction and // expand `a` as far as possible to left and right without overlapping any of them. - a.eBBox = a.PdfRectangle - for j, b := range paras { + + for j, bb := range paras { + b := bb.eBBox if i == j || b.Ury > a.Lly { continue } // If `b` is completely to right of `llx`, extend `a` left to `b`. if llx <= b.Llx { - a.eBBox.Llx = math.Min(a.eBBox.Llx, b.Llx) + a.Llx = math.Min(a.Llx, b.Llx) } // If `b` is completely to left of `urx`, extend `a` right to `b`. if b.Urx <= urx { - a.eBBox.Urx = math.Max(a.eBBox.Urx, b.Urx) + a.Urx = math.Max(a.Urx, b.Urx) } } + if verbose { + fmt.Printf("%4d: %6.2f->%6.2f %q\n", i, aa.eBBox, a, truncate(aa.text(), 50)) + } + aa.eBBox = a + } + if useEBBox { + for _, para := range paras { + para.PdfRectangle = para.eBBox + } + } +} + +// printAdj prints `adj` to stdout. 
+func printAdj(adj [][]bool) { + if !verbosePage { + return + } + common.Log.Info("printAdj:") + n := len(adj) + fmt.Printf("%3s:", "") + for x := 0; x < n; x++ { + fmt.Printf("%3d", x) + } + fmt.Println() + for y := 0; y < n; y++ { + fmt.Printf("%3d:", y) + for x := 0; x < n; x++ { + s := "" + if adj[y][x] { + s = "X" + } + fmt.Printf("%3s", s) + } + fmt.Println() } } // topoOrder returns the ordering of the topological sort of the nodes with adjacency matrix `adj`. func topoOrder(adj [][]bool) []int { + if verbosePage { + common.Log.Info("topoOrder:") + } n := len(adj) visited := make([]bool, n) var order []int @@ -427,11 +403,16 @@ func topoOrder(adj [][]bool) []int { sortNode(idx) } } - // Order is currently reversed so change it to forward order. - for i := 0; i < n/2; i++ { - order[i], order[n-1-i] = order[n-1-i], order[i] + return reversed(order) +} + +// reversed return `order` reversed. +func reversed(order []int) []int { + rev := make([]int, len(order)) + for i, v := range order { + rev[len(order)-1-i] = v } - return order + return rev } // reorder reorders `para` to the order in `order`. diff --git a/extractor/text_para.go b/extractor/text_para.go index a7d4549c..1384dd67 100644 --- a/extractor/text_para.go +++ b/extractor/text_para.go @@ -12,9 +12,14 @@ import ( "sort" "unicode" + "github.com/unidoc/unipdf/v3/common" "github.com/unidoc/unipdf/v3/model" ) +// paraList is a sequence of textPara. We use it so often that it is convenient to have its own +// type so we can have methods on it. +type paraList []*textPara + // textPara is a group of words in a rectangular region of a page that get read together. // An peragraph in a document might span multiple pages. This is the paragraph framgent on one page. // We start by finding paragraph regions on a page, then we break the words into the textPara into @@ -22,7 +27,7 @@ import ( type textPara struct { serial int // Sequence number for debugging. model.PdfRectangle // Bounding box. 
- eBBox model.PdfRectangle // Extented ounding box needed to compute reading order. + eBBox model.PdfRectangle // Extended bounding box needed to compute reading order. lines []*textLine // Paragraph text gets broken into lines. table *textTable } @@ -39,8 +44,8 @@ func newTextPara(strata *textStrata) *textPara { // String returns a description of `p`. func (p *textPara) String() string { - return fmt.Sprintf("serial=%d %.2f %d lines\n%s\n-------------", - p.serial, p.PdfRectangle, len(p.lines), p.text()) + return fmt.Sprintf("serial=%d %.2f %d lines %q", + p.serial, p.PdfRectangle, len(p.lines), truncate(p.text(), 50)) } // text returns the text of the lines in `p`. @@ -52,47 +57,21 @@ func (p *textPara) text() string { // writeText writes the text of `p` including tables to `w`. func (p *textPara) writeText(w io.Writer) { - if p.table != nil { - for y := 0; y < p.table.h; y++ { - for x := 0; x < p.table.w; x++ { - cell := p.table.cells[y*p.table.w+x] - cell.writeCellText(w) - w.Write([]byte(" ")) - } - w.Write([]byte("\n")) - } - } else { + if p.table == nil { p.writeCellText(w) - w.Write([]byte("\n")) + return } -} - -// writeCellText writes the text of `p` not including tables to `w`. -func (p *textPara) writeCellText(w io.Writer) { - // w := new(bytes.Buffer) - para := p - for il, line := range para.lines { - s := line.text() - reduced := false - if doHyphens { - if line.hyphenated && il != len(para.lines)-1 { - // Line ending with hyphen. Remove it. - runes := []rune(s) - s = string(runes[:len(runes)-1]) - reduced = true + for y := 0; y < p.table.h; y++ { + for x := 0; x < p.table.w; x++ { + cell := p.table.get(x, y) + if cell == nil { + w.Write([]byte("\t")) + } else { + cell.writeCellText(w) } - } - w.Write([]byte(s)) - if reduced { - // We removed the hyphen from the end of the line so we don't need a line ending. 
- continue - } - if il < len(para.lines)-1 && isZero(line.depth-para.lines[il+1].depth) { - // Next line is the same depth so it's the same line as this one in the extracted text w.Write([]byte(" ")) - continue } - if il < len(para.lines)-1 { + if y < p.table.h-1 { w.Write([]byte("\n")) } } @@ -101,90 +80,103 @@ func (p *textPara) writeCellText(w io.Writer) { // toTextMarks creates the TextMarkArray corresponding to the extracted text created by // paras `p`.writeText(). func (p *textPara) toTextMarks(offset *int) []TextMark { + if p.table == nil { + return p.toCellTextMarks(offset) + } var marks []TextMark - addMark := func(mark TextMark) { - mark.Offset = *offset - marks = append(marks, mark) - *offset += len(mark.Text) - } - addSpaceMark := func(spaceChar string) { - mark := spaceMark - mark.Text = spaceChar - addMark(mark) - } - if p.table != nil { - for y := 0; y < p.table.h; y++ { - for x := 0; x < p.table.w; x++ { - cell := p.table.cells[y*p.table.w+x] + for y := 0; y < p.table.h; y++ { + for x := 0; x < p.table.w; x++ { + cell := p.table.get(x, y) + if cell == nil { + marks = appendSpaceMark(marks, offset, "\t") + } else { cellMarks := cell.toCellTextMarks(offset) marks = append(marks, cellMarks...) - addSpaceMark(" ") } - addSpaceMark("\n") + marks = appendSpaceMark(marks, offset, " ") + } + if y < p.table.h-1 { + marks = appendSpaceMark(marks, offset, "\n") } - } else { - marks = p.toCellTextMarks(offset) - addSpaceMark("\n") } return marks } -// toTextMarks creates the TextMarkArray corresponding to the extracted text created by +// writeCellText writes the text of `p` not including tables to `w`. +func (p *textPara) writeCellText(w io.Writer) { + for il, line := range p.lines { + lineText := line.text() + reduced := doHyphens && line.hyphenated && il != len(p.lines)-1 + if reduced { // Line ending with hyphen. Remove it. 
+ lineText = removeLastRune(lineText) + } + w.Write([]byte(lineText)) + if !(reduced || il == len(p.lines)-1) { + w.Write([]byte(getSpace(line.depth, p.lines[il+1].depth))) + } + } +} + +// toCellTextMarks creates the TextMarkArray corresponding to the extracted text created by // paras `paras`.writeCellText(). func (p *textPara) toCellTextMarks(offset *int) []TextMark { var marks []TextMark - addMark := func(mark TextMark) { - mark.Offset = *offset - marks = append(marks, mark) - *offset += len(mark.Text) - } - addSpaceMark := func(spaceChar string) { - mark := spaceMark - mark.Text = spaceChar - addMark(mark) - } - para := p - - for il, line := range para.lines { + for il, line := range p.lines { lineMarks := line.toTextMarks(offset) - marks = append(marks, lineMarks...) - reduced := false - if doHyphens { - if line.hyphenated && il != len(para.lines)-1 { - tm := marks[len(marks)-1] - r := []rune(tm.Text) - if unicode.IsSpace(r[len(r)-1]) { - panic(tm) - } - if len(r) == 1 { - marks = marks[:len(marks)-1] - *offset = marks[len(marks)-1].Offset + len(marks[len(marks)-1].Text) - } else { - s := string(r[:len(r)-1]) - *offset += len(s) - len(tm.Text) - tm.Text = s - } - reduced = true + reduced := doHyphens && line.hyphenated && il != len(p.lines)-1 + if reduced { // Line ending with hyphen. Remove it. + if len([]rune(line.text())) < minHyphenation { + panic(line.text()) } + if len(lineMarks) < 1 { + panic(line.text()) + } + lineMarks = removeLastTextMarkRune(lineMarks, offset) } - if reduced { - continue - } - if il < len(para.lines)-1 && isZero(line.depth-para.lines[il+1].depth) { - // Next line is the same depth so it's the same line as this one in the extracted text - addSpaceMark(" ") - continue - } - if il < len(para.lines)-1 { - addSpaceMark("\n") + marks = append(marks, lineMarks...) 
+ if !(reduced || il == len(p.lines)-1) { + marks = appendSpaceMark(marks, offset, getSpace(line.depth, p.lines[il+1].depth)) } } - - addSpaceMark("\n") - return marks } +func removeLastTextMarkRune(marks []TextMark, offset *int) []TextMark { + tm := marks[len(marks)-1] + runes := []rune(tm.Text) + if unicode.IsSpace(runes[len(runes)-1]) { + panic(tm) + } + if len(runes) == 1 { + marks = marks[:len(marks)-1] + tm1 := marks[len(marks)-1] + *offset = tm1.Offset + len(tm1.Text) + } else { + text := removeLastRune(tm.Text) + *offset += len(text) - len(tm.Text) + tm.Text = text + } + return marks +} + +func removeLastRune(text string) string { + runes := []rune(text) + if len(runes) < 2 { + panic(text) + } + return string(runes[:len(runes)-1]) +} + +// getSpace returns the space to insert between lines of depth `depth1` and `depth2`. +// Next line is the same depth so it's the same line as this one in the extracted text +func getSpace(depth1, depth2 float64) string { + eol := !isZero(depth1 - depth2) + if eol { + return "\n" + } + return " " +} + // bbox makes textPara implement the `bounded` interface. func (p *textPara) bbox() model.PdfRectangle { return p.PdfRectangle @@ -271,5 +263,42 @@ func composePara(strata *textStrata) *textPara { if len(para.lines) == 0 { panic(para) } + if verbosePara { + common.Log.Info("!!! para=%s", para.String()) + for i, line := range para.lines { + fmt.Printf("%4d: %s\n", i, line) + for j, word := range line.words { + fmt.Printf("%8d: %s\n", j, word) + for k, mark := range word.marks { + fmt.Printf("%12d: %s\n", k, mark) + } + } + } + } return para } + +// log logs the contents of `paras`. 
+func (paras paraList) log(title string) { + if !verbosePage { + return + } + common.Log.Info("%8s: %d paras =======-------=======", title, len(paras)) + for i, para := range paras { + if para == nil { + continue + } + text := para.text() + tabl := " " + if para.table != nil { + tabl = fmt.Sprintf("[%dx%d]", para.table.w, para.table.h) + } + fmt.Printf("%4d: %6.2f %s %q\n", i, para.PdfRectangle, tabl, truncate(text, 50)) + if len(text) == 0 { + panic("empty") + } + if para.table != nil && len(para.table.cells) == 0 { + panic(para) + } + } +} diff --git a/extractor/text_strata.go b/extractor/text_strata.go index f24070d4..05afa833 100644 --- a/extractor/text_strata.go +++ b/extractor/text_strata.go @@ -38,14 +38,14 @@ func makeTextStrata(words []*textWord, pageHeight float64) *textStrata { // newTextStrata returns an empty textStrata with page height `pageHeight`. func newTextStrata(pageHeight float64) *textStrata { - bins := textStrata{ - serial: serial.bins, + strata := textStrata{ + serial: serial.strata, bins: map[int][]*textWord{}, PdfRectangle: model.PdfRectangle{Urx: -1.0, Ury: -1.0}, pageHeight: pageHeight, } - serial.bins++ - return &bins + serial.strata++ + return &strata } // String returns a description of `s`. @@ -57,7 +57,9 @@ func (s *textStrata) String() string { texts = append(texts, w.text()) } } - return fmt.Sprintf("serial=%d %d %q", s.serial, len(texts), texts) + // return fmt.Sprintf("serial=%d %d %q", s.serial, ) + return fmt.Sprintf("serial=%d %.2f fontsize=%.2f %d %q", + s.serial, s.PdfRectangle, s.fontsize, len(texts), texts) } // sort sorts the words in each bin in `s` in the reading direction. 
@@ -129,10 +131,24 @@ func (s *textStrata) scanBand(title string, para *textStrata, if !readingOverlap(para, word) { continue } - if fontTol > 0 && math.Abs(word.fontsize-fontsize) > fontTol*fontsize { - continue + fontRatio1 := math.Abs(word.fontsize-fontsize) / fontsize + fontRatio2 := word.fontsize / fontsize + + fontRatio := math.Min(fontRatio1, fontRatio2) + if fontTol > 0 { + if fontRatio > fontTol { + continue + } + } + if fontTol <= 0 { + panic(fontTol) } if !detectOnly { + // if !para.isHomogenous(word) { + // panic(fmt.Errorf("not homogeneous fontTol=%.2f ratio=%.2f (%.2f->%.2f)\n\tpara=%s\n\tword=%s", + // fontTol, fontRatio, fontsize, word.fontsize, + // para.String(), word.String())) + // } moveWord(depthIdx, s, para, word) } newWords = append(newWords, word) @@ -155,11 +171,11 @@ func (s *textStrata) scanBand(title string, para *textStrata, } if verbose { if len(title) > 0 { - common.Log.Info("scanBand: %s [%.2f %.2f]->[%.2f %.2f] para=%.2f", + common.Log.Info("scanBand: %s [%.2f %.2f]->[%.2f %.2f] para=%.2f fontsize=%.2f", title, minDepth0, maxDepth0, minDepth, maxDepth, - para.PdfRectangle) + para.PdfRectangle, para.fontsize) for i, word := range newWords { fmt.Printf("%4d: %s\n", i, word) } @@ -271,6 +287,36 @@ func moveWord(depthIdx int, page, para *textStrata, word *textWord) { page.removeWord(depthIdx, word) } +func (s *textStrata) allWords() []*textWord { + var wordList []*textWord + for _, words := range s.bins { + wordList = append(wordList, words...) 
+ } + return wordList +} + +func (s *textStrata) isHomogenous(w *textWord) bool { + words := s.allWords() + words = append(words, w) + if len(words) == 0 { + return true + } + minFont := words[0].fontsize + maxFont := minFont + for _, w := range words { + if w.fontsize < minFont { + minFont = w.fontsize + } else if w.fontsize > maxFont { + maxFont = w.fontsize + } + } + if maxFont/minFont > 1.3 { + common.Log.Error("font size range: %.2f - %.2f = %.1fx", minFont, maxFont, maxFont/minFont) + return false + } + return true +} + // removeWord removes `word`from `s`.bins[`depthIdx`]. // NOTE: We delete bins as soon as they become empty to save code that calls other textStrata // functions from having to check for empty bins. diff --git a/extractor/text_table.go b/extractor/text_table.go index b04459a6..722fc3d5 100644 --- a/extractor/text_table.go +++ b/extractor/text_table.go @@ -17,52 +17,136 @@ import ( type textTable struct { model.PdfRectangle w, h int - cells cellList + cells cellMap } -func (t textTable) bbox() model.PdfRectangle { - return t.PdfRectangle +func newTextTable(w, h int) *textTable { + return &textTable{w: w, h: h, cells: cellMap{}} } +func (t *textTable) String() string { + return fmt.Sprintf("[%dx%d] %6.2f", t.w, t.h, t.PdfRectangle) +} + +func (t *textTable) bbox() model.PdfRectangle { + rect := model.PdfRectangle{Urx: -1, Ury: -1} + for _, cell := range t.cells { + if rect.Urx < rect.Llx { + rect = cell.PdfRectangle + } else { + rect = rectUnion(rect, cell.PdfRectangle) + } + } + return rect +} + +func (t *textTable) get(x, y int) *textPara { + t.validate(x, y) + return t.cells[cellIndex{x, y}] +} +func (t *textTable) put(x, y int, cell *textPara) { + t.validate(x, y) + t.cells[cellIndex{x, y}] = cell +} +func (t *textTable) del(x, y int) { + t.validate(x, y) + delete(t.cells, cellIndex{x, y}) +} + +func (t *textTable) validate(x, y int) { + if !(0 <= x && x < t.w) { + panic(fmt.Errorf("bad x=%d t=%s", x, t)) + } + if !(0 <= y && y < t.h) { + 
panic(fmt.Errorf("bad y=%d t=%s", y, t)) + } +} + +// fontsize for a table is the minimum font size of the cells. +func (t *textTable) fontsize() float64 { + size := -1.0 + for _, p := range t.cells { + if p != nil { + if size < 0 { + size = p.fontsize() + } else { + size = math.Min(size, p.fontsize()) + } + } + } + return size +} + +func (t *textTable) expand(w, h int) { + if w < t.w { + panic(w) + } + if h < t.h { + panic(h) + } + t.w = w + t.h = h +} + +// !@#$% +// w := combo.w +// h := combo.h + t2.h - 1 +// common.Log.Info("COMBINE! %dx%d i1=%d i2=%d", w, h, i1, i2) +// combined := make(cellList, w*h) +// for y := 0; y < t1.h; y++ { +// for x := 0; x < w; x++ { +// combined[y*w+x] = combo.cells[y*w+x] +// } +// } +// for y := 1; y < t2.h; y++ { +// yy := y + combo.h - 1 +// for x := 0; x < w; x++ { +// combined[yy*w+x] = t2.cells[y*w+x] +// } +// } +// combo.cells = combined + +type cellIndex struct{ x, y int } + +type cellMap map[cellIndex]*textPara type cellList paraList +func (cells cellList) String() string { + return fmt.Sprintf("%d %q", len(cells), cells.asStrings()) +} + +// bbox returns the union of the bounds of `cells`. +func (cells cellList) bbox() model.PdfRectangle { + rect := cells[0].PdfRectangle + for _, r := range cells[1:] { + rect = rectUnion(rect, r.PdfRectangle) + } + return rect +} + const DBL_MIN, DBL_MAX = -1.0e10, +1.0e10 // extractTables converts the`paras` that are table cells to tables containing those cells. 
func (paras paraList) extractTables() paraList { common.Log.Debug("extractTables=%d ===========x=============", len(paras)) if len(paras) < 4 { - return nil + return paras } - show := func(title string) { - common.Log.Info("%8s: %d=========----------=====", title, len(paras)) - for i, para := range paras { - text := para.text() - tabl := " " - if para.table != nil { - tabl = fmt.Sprintf("[%dx%d]", para.table.w, para.table.h) - } - fmt.Printf("%4d: %6.2f %s %q\n", i, para.PdfRectangle, tabl, truncate(text, 50)) - if len(text) == 0 { - panic("empty") - } - if para.table != nil && len(para.table.cells) == 0 { - panic(para) - } - } - } - tables := paras.extractTableAtoms() - tables = combineTables(tables) - common.Log.Info("combined tables %d ================", len(tables)) - for i, t := range tables { - t.log(fmt.Sprintf("combined %d", i)) - } - // if len(tables) == 0 {panic("NO TABLES")} - show("tables extracted") + + cells := cellList(paras) + tables := cells.findTables() + logTables(tables, "find tables") + + // tables := paras.extractTableAtoms() + // logTables(tables, "table atoms") + // tables = combineTables(tables) + // logTables(tables, "table molecules") + // // if len(tables) == 0 {panic("NO TABLES")} + // showParas("tables extracted") paras = paras.applyTables(tables) - show("tables applied") + paras.log("tables applied") paras = paras.trimTables() - show("tables trimmed") + paras.log("tables trimmed") return paras } @@ -71,22 +155,28 @@ func (paras paraList) trimTables() paraList { var recycledParas paraList seen := map[*textPara]bool{} for _, para := range paras { + table := para.table + if table == nil { + continue + } for _, p := range paras { if p == para { continue } - table := para.table - if table != nil && overlapped(table, p) { - table.log("REMOVE") - for _, cell := range table.cells { - if _, ok := seen[cell]; ok { - continue - } - recycledParas = append(recycledParas, cell) - seen[cell] = true - } - para.table.cells = nil + if 
!overlapped(table, p) { + continue } + // common.Log.Info("overlap REMOVE:\n\ttable=%s\n\t p=%s", table.String(), p.String()) + table.log("REMOVE") + for _, cell := range table.cells { + if _, ok := seen[cell]; ok { + continue + } + recycledParas = append(recycledParas, cell) + seen[cell] = true + } + para.table.cells = nil + } } @@ -99,7 +189,7 @@ func (paras paraList) trimTables() paraList { return recycledParas } -func (paras paraList) applyTables(tables []textTable) paraList { +func (paras paraList) applyTables(tables []*textTable) paraList { // if len(tables) == 0 {panic("no tables")} consumed := map[*textPara]bool{} for _, table := range tables { @@ -124,278 +214,12 @@ func (paras paraList) applyTables(tables []textTable) paraList { tabled = append(tabled, para) } } + if verboseTable { + common.Log.Info("applyTables: %d->%d tables=%d", len(paras), len(tabled), len(tables)) + } return tabled } -// extractTableAtome returns all the 2x2 table candidateds in `paras`. -func (paras paraList) extractTableAtoms() []textTable { - // Pre-sort by reading direction then depth - sort.Slice(paras, func(i, j int) bool { - return diffReadingDepth(paras[i], paras[j]) < 0 - }) - - var llx0, lly0, llx1, lly1 float64 - var tables []textTable - - for _, para1 := range paras { - llx0, lly0 = DBL_MAX, DBL_MIN - llx1, lly1 = DBL_MAX, DBL_MIN - - // Build a table fragment of 4 cells - // 0 1 - // 2 3 - // where - // 0 is `para1` - // 1 is on the right of 0 and overlaps with 0 in y axis - // 2 is under 0 and overlaps with 0 in x axis - // 3 is under 1 and on the right of 1 and closest to 0 - cells := make(cellList, 4) - cells[0] = para1 - - for _, para2 := range paras { - if para1 == para2 { - continue - } - if yOverlap(para1, para2) && toRight(para2, para1) && para2.Llx < llx0 { - llx0 = para2.Llx - cells[1] = para2 - } else if xOverlap(para1, para2) && below(para2, para1) && para2.Ury > lly0 { - lly0 = para2.Ury - cells[2] = para2 - } else if toRight(para2, para1) && para2.Llx < 
llx1 && below(para2, para1) && para2.Ury > lly1 { - llx1 = para2.Llx - lly1 = para2.Ury - cells[3] = para2 - } - } - // if we found any then look whether they form a table !@#$ - if !(cells[1] != nil && cells[2] != nil && cells[3] != nil) { - continue - } - // 1 cannot overlap with 2 in x and y - // 3 cannot overlap with 2 in x and with 1 in y - // 3 has to overlap with 2 in y and with 1 in x - - if (xOverlap(cells[2], cells[3]) || yOverlap(cells[1], cells[3]) || - xOverlap(cells[1], cells[2]) || yOverlap(cells[1], cells[2])) || - !(xOverlap(cells[1], cells[3]) && yOverlap(cells[2], cells[3])) { - continue - } - - // common.Log.Info("@@10 ip=%d %s", ip, truncate(para1.text(), 40)) - - deltaX := cells.fontsize() - deltaY := deltaX - // deltaX *= minColSpacing1; !@#$ - // deltaY *= maxIntraLineDelta; - deltaX *= maxIntraReadingGapR - deltaY *= lineDepthR - - correspondenceX := cells.alignedX(cells.fontsize() * maxIntraReadingGapR) - correspondenceY := cells.alignedY(cells.fontsize() * lineDepthR) - - // are blocks aligned in x and y ? 
- if correspondenceX > 0 && correspondenceY > 0 { - table := newTable(cells, 2, 2) - tables = append(tables, table) - table.log("New textTable") - // common.Log.Info("New textTable\n %6.2f", table.PdfRectangle) - // for i, p := range cells { - // fmt.Printf("%4d: %6.2f %q\n", i, p.PdfRectangle, truncate(p.text(), 50)) - // } - } - } - return tables -} - -func (table textTable) log(title string) { - common.Log.Info("~~~ %s: %s: %d x %d\n %6.2f", title, fileLine(1, false), - table.w, table.h, table.PdfRectangle) - for i, p := range table.cells { - fmt.Printf("%4d: %6.2f %q\n", i, p.PdfRectangle, truncate(p.text(), 50)) - } -} - -// 0 1 -// 2 3 -// A B -// C -// Extensions: -// A[1] == B[0] right -// A[2] == C[0] down -func combineTables(tables []textTable) []textTable { - // if len(tables) == 0 {panic("tables")} - tablesY := combineTablesY(tables) - // if len(tablesY) == 0 { panic("tablesY")} - heightTables := map[int][]textTable{} - for _, table := range tablesY { - heightTables[table.h] = append(heightTables[table.h], table) - } - // if len(heightTables) == 0 {panic("heightTables")} - var heights []int - for h := range heightTables { - heights = append(heights, h) - } - // Try to extend tallest tables to the right - sort.Slice(heights, func(i, j int) bool { return heights[i] > heights[j] }) - // for _, h := range heights { - // columns := heightTables[h] - // if len(columns) < 2 { - // continue - // } - // heightTables[h] = combineTablesX(columns) - // } - - var combined []textTable - for _, h := range heights { - combined = append(combined, heightTables[h]...) 
- } - for i, table := range combined { - table.log(fmt.Sprintf("Combined %d", i)) - } - return combined -} - -func combineTablesY(tables []textTable) []textTable { - sort.Slice(tables, func(i, j int) bool { return tables[i].Ury > tables[j].Ury }) - removed := map[int]bool{} - - var combinedTables []textTable - common.Log.Info("combineTablesY ------------------\n\t ------------------") - for i1, t1 := range tables { - if _, ok := removed[i1]; ok { - continue - } - fontsize := t1.cells.fontsize() - c1 := t1.corners() - var combo *textTable - for i2, t2 := range tables { - if _, ok := removed[i2]; ok { - continue - } - if t1.w != t2.w { - continue - } - c2 := t2.corners() - if c1[2] != c2[0] { - continue - } - // common.Log.Info("Comparing i1=%d i2=%d", i1, i2) - // t1.log("t1") - // t2.log("t2") - cells := cellList{ - c1[0], c1[1], - c2[2], c2[3], - } - alX := cells.alignedX(fontsize * maxIntraReadingGapR) - alY := cells.alignedY(fontsize * lineDepthR) - common.Log.Info("alX=%d alY=%d", alX, alY) - if !(alX > 0 && alY > 0) { - if combo != nil { - combinedTables = append(combinedTables, *combo) - } - combo = nil - continue - } - if combo == nil { - combo = &t1 - removed[i1] = true - } - - w := combo.w - h := combo.h + t2.h - 1 - common.Log.Info("COMBINE! 
%dx%d", w, h) - combined := make(cellList, w*h) - for y := 0; y < t1.h; y++ { - for x := 0; x < w; x++ { - combined[y*w+x] = combo.cells[y*w+x] - } - } - for y := 1; y < t2.h; y++ { - yy := y + combo.h - 1 - for x := 0; x < w; x++ { - combined[yy*w+x] = t2.cells[y*w+x] - } - } - combo.cells = combined - combo.h = h - combo.log("combo") - removed[i2] = true - fontsize = combo.cells.fontsize() - c1 = combo.corners() - } - if combo != nil { - combinedTables = append(combinedTables, *combo) - } - } - - common.Log.Info("combineTablesY a: combinedTables=%d", len(combinedTables)) - for i, t := range tables { - if _, ok := removed[i]; ok { - continue - } - combinedTables = append(combinedTables, t) - } - common.Log.Info("combineTablesY b: combinedTables=%d", len(combinedTables)) - - return combinedTables -} - -func combineTablesX(tables []textTable) []textTable { - sort.Slice(tables, func(i, j int) bool { return tables[i].Llx < tables[j].Llx }) - removed := map[int]bool{} - for i1, t1 := range tables { - if _, ok := removed[i1]; ok { - continue - } - fontsize := t1.cells.fontsize() - c1 := t1.corners() - for i2, t2 := range tables { - if _, ok := removed[i2]; ok { - continue - } - if t1.w != t2.w { - continue - } - c2 := t2.corners() - if c1[1] != c2[0] { - continue - } - cells := cellList{ - c1[0], c2[1], - c1[2], c2[3], - } - if !(cells.alignedX(fontsize*maxIntraReadingGapR) > 0 && - cells.alignedY(fontsize*lineDepthR) > 0) { - continue - } - w := t1.w + t2.w - h := t1.h - combined := make(cellList, w*h) - for y := 0; y < h; y++ { - for x := 0; x < t1.w; x++ { - combined[y*w+x] = t1.cells[y*w+x] - } - for x := 0; x < t2.w; x++ { - xx := x + t1.w - combined[y*w+xx] = t1.cells[y*w+x] - } - } - removed[i2] = true - fontsize = t1.cells.fontsize() - c1 = t1.corners() - } - } - var reduced []textTable - for i, t := range tables { - if _, ok := removed[i]; ok { - continue - } - reduced = append(reduced, t) - } - return reduced -} - func yOverlap(para1, para2 *textPara) bool { 
// blk2->yMin <= blk1->yMax &&blk2->yMax >= blk1->yMin return para2.Lly <= para1.Ury && para1.Lly <= para2.Ury @@ -413,46 +237,46 @@ func below(para2, para1 *textPara) bool { return para2.Ury < para1.Lly } -func (paras cellList) cellDepths() []float64 { - topF := func(p *textPara) float64 { return p.Ury } - botF := func(p *textPara) float64 { return p.Lly } - top := paras.calcCellDepths(topF) - bottom := paras.calcCellDepths(botF) - if len(bottom) < len(top) { - return bottom - } - return top -} +// func (paras cellList) cellDepths() []float64 { +// topF := func(p *textPara) float64 { return p.Ury } +// botF := func(p *textPara) float64 { return p.Lly } +// top := paras.calcCellDepths(topF) +// bottom := paras.calcCellDepths(botF) +// if len(bottom) < len(top) { +// return bottom +// } +// return top +// } -func (paras cellList) calcCellDepths(getY func(*textPara) float64) []float64 { - depths := []float64{getY(paras[0])} - delta := paras.fontsize() * maxIntraDepthGapR - for _, para := range paras { - newDepth := true - y := getY(para) - for _, d := range depths { - if math.Abs(d-getY(para)) < delta { - newDepth = false - break - } - } - if newDepth { - depths = append(depths, y) - } - } - return depths -} +// func (paras cellList) calcCellDepths(getY func(*textPara) float64) []float64 { +// depths := []float64{getY(paras[0])} +// delta := paras.fontsize() * maxIntraDepthGapR +// for _, para := range paras { +// newDepth := true +// y := getY(para) +// for _, d := range depths { +// if math.Abs(d-getY(para)) < delta { +// newDepth = false +// break +// } +// } +// if newDepth { +// depths = append(depths, y) +// } +// } +// return depths +// } -func (c *textTable) corners() paraList { - w, h := c.w, c.h +func (t *textTable) __corners() paraList { + w, h := t.w, t.h if w == 0 || h == 0 { - panic(c) + panic(t) } cnrs := paraList{ - c.cells[0], - c.cells[w-1], - c.cells[w*(h-1)], - c.cells[w*h-1], + t.get(0, 0), + t.get(w-1, 0), + t.get(0, h-1), + t.get(w-1, h-1), } 
for i0, c0 := range cnrs { for _, c1 := range cnrs[:i0] { @@ -464,38 +288,44 @@ func (c *textTable) corners() paraList { return cnrs } -func newTable(cells cellList, w, h int) textTable { - if w == 0 || h == 0 { - panic("emprty") - } - for i0, c0 := range cells { - for _, c1 := range cells[:i0] { - if c0.serial == c1.serial { - panic("dup") - } - } - } - rect := cells[0].PdfRectangle - for _, c := range cells[1:] { - rect = rectUnion(rect, c.PdfRectangle) - } - return textTable{ - PdfRectangle: rect, - w: w, - h: h, - cells: cells, - } -} +// func newTable(cells cellList, w, h int) textTable { +// if w == 0 || h == 0 { +// panic("emprty") +// } +// for i0, c0 := range cells { +// for _, c1 := range cells[:i0] { +// if c0.serial == c1.serial { +// panic("dup") +// } +// } +// } +// rect := cells[0].PdfRectangle +// for _, c := range cells[1:] { +// rect = rectUnion(rect, c.PdfRectangle) +// } +// return textTable{ +// PdfRectangle: rect, +// w: w, +// h: h, +// cells: cells, +// } +// } -func (table textTable) newTablePara() *textPara { - cells := table.cells - sort.Slice(cells, func(i, j int) bool { return diffDepthReading(cells[i], cells[j]) < 0 }) - table.cells = cells +func (table *textTable) newTablePara() *textPara { + // var cells cellList + // for _, cell := range table.cells { + // if cell != nil { + // cells = append(cells, cell) + // } + // } + // sort.Slice(cells, func(i, j int) bool { return diffDepthReading(cells[i], cells[j]) < 0 }) + // table.cells = cells + bbox := table.bbox() para := textPara{ serial: serial.para, - PdfRectangle: table.PdfRectangle, - eBBox: table.PdfRectangle, - table: &table, + PdfRectangle: bbox, + eBBox: bbox, + table: table, } table.log(fmt.Sprintf("newTablePara: serial=%d", para.serial)) @@ -503,10 +333,28 @@ func (table textTable) newTablePara() *textPara { return ¶ } -func (cells cellList) alignedX(delta float64) int { +// aligned2x2X return an X alignment score for the 2x2 table atom `cells`. 
+func (cells cellList) aligned2x2X(delta float64) int { + if len(cells) != 4 { + panic(fmt.Errorf("cells=%d", len(cells))) + } matches := 0 for _, get := range gettersX { - if cells.aligned(0, 2, delta, get) && cells.aligned(1, 3, delta, get) { + if cells.aligned(get, delta, 0, 2) && cells.aligned(get, delta, 1, 3) { + matches++ + } + } + return matches +} + +// aligned2x2Y return a Y alignment score for the 2x2 table atom `cells`. +func (cells cellList) aligned2x2Y(delta float64) int { + if len(cells) != 4 { + panic(fmt.Errorf("cells=%d", len(cells))) + } + matches := 0 + for _, get := range gettersY { + if cells.aligned(get, delta, 0, 1) && cells.aligned(get, delta, 2, 3) { matches++ } } @@ -514,23 +362,568 @@ func (cells cellList) alignedX(delta float64) int { } func (cells cellList) alignedY(delta float64) int { - matches := 0 - for _, get := range gettersY { - if cells.aligned(0, 1, delta, get) && cells.aligned(2, 3, delta, get) { - matches++ + worstMatches := 100 + for i := 1; i < len(cells); i++ { + matches := 0 + for _, get := range gettersY { + if cells.aligned(get, delta, i-1, i) { + matches++ + } + } + if matches < worstMatches { + worstMatches = matches } } - return matches + return worstMatches } -func (cells cellList) aligned(i, j int, delta float64, get getter) bool { - return parasAligned(cells[i], cells[j], delta, get) +// aligned returns true if `cells` are aligned on attribute `get` for indexes `i` and 'j`. +func (cells cellList) aligned(get getter, delta float64, i, j int) bool { + if !(0 <= i && i < len(cells) && 0 <= j && j < len(cells)) { + panic(fmt.Errorf("i=%d j=%d cells=%d", i, j, len(cells))) + } + return parasAligned(get, delta, cells[i], cells[j]) +} + +// parasAligned returns true if `para1` and `para2` are aligned within `delta` for attribute `get`. 
+func parasAligned(get getter, delta float64, para1, para2 *textPara) bool { + z1 := get(para1) + z2 := get(para2) + return math.Abs(z1-z2) <= delta +} + +// fontsize for a paraList is the minimum font size of the paras. +func (paras cellList) fontsize() float64 { + size := -1.0 + for _, p := range paras { + if p != nil { + if size < 0 { + size = p.fontsize() + } else { + size = math.Min(size, p.fontsize()) + } + } + } + return size +} + +// insertAt inserts `table` in `t` at `x`, `y`. +func (t *textTable) insertAt(x, y int, table *textTable) { + if !(0 <= x && x < t.w) { + panic(fmt.Errorf("x=%d is an invalid insertion for %s", x, t)) + } + if !(0 <= y && y < t.h) { + panic(fmt.Errorf("y=%d is an invalid insertion for %s", y, t)) + } + if t.w < x+table.w { + panic(fmt.Errorf("x=%d is an invalid insertion for %s", x, t)) + } + if t.h < y+table.h { + panic(fmt.Errorf("y=%d is an invalid insertion for %s", y, t)) + } + for idx, cell := range table.cells { + idx.x += x + idx.y += y + t.cells[idx] = cell + t.PdfRectangle = rectUnion(t.PdfRectangle, cell.PdfRectangle) + } +} + +// subTable returns the `w` x `h` subtable of `t` at 0,0. +func (t *textTable) subTable(w, h int) *textTable { + if !(1 <= w && w <= t.w) { + panic(fmt.Errorf("w=%d is an invalid sub-width for %s", w, t)) + } + if !(1 <= h && h <= t.h) { + panic(fmt.Errorf("h=%d is an invalid sub-height for %s", h, t)) + } + table := newTextTable(w, h) + for y := 0; y < h; y++ { + for x := 0; x < w; x++ { + cell := t.get(x, y) + if cell == nil { + continue + } + table.put(x, y, cell) + table.PdfRectangle = rectUnion(table.PdfRectangle, cell.PdfRectangle) + } + } + return table +} + +// row returns the (0-offset) `y`th row in `t`. 
+func (t textTable) row(y int) cellList { + if !(0 <= y && y < t.h) { + panic(fmt.Errorf("y=%d is an invalid row for %s", y, t.String())) + } + cells := make(cellList, t.w) + for x := 0; x < t.w; x++ { + cells[x] = t.get(x, y) + } + return cells +} + +// column returns the (0-offset) `x`th column in `t`. +func (t textTable) column(x int) cellList { + if !(0 <= x && x < t.w) { + panic(fmt.Errorf("x=%d is an invalid column for %s", x, t.String())) + } + cells := make(cellList, t.h) + for y := 0; y < t.h; y++ { + cells[y] = t.get(x, y) + } + return cells +} + +// cellSet returns `cells` as a set. +func (cells cellList) cellSet() map[*textPara]bool { + set := map[*textPara]bool{} + for _, cell := range cells { + set[cell] = true + } + return set +} + +// overlapRange returns i0, i1 where cells[i0,i1] is the maximum overlap with `other`. +func (cells cellList) overlapRange(other cellList) (int, int) { + i0, i1 := -1, len(cells) + for i, c := range cells { + if i0 < 0 { + if c == other[0] { + i0 = i + } + continue + } + if i-i0 >= len(other) || c != other[i-i0] { + i1 = i + break + } + } + if i0 < 0 { + panic("no match") + } + return i0, i1 +} + +// toTextTable returns the TextTable corresponding to `t`. +func (t textTable) toTextTable() TextTable { + cells := make([][]string, t.h) + for y := 0; y < t.h; y++ { + cells[y] = make([]string, t.w) + for x := 0; x < t.w; x++ { + cell := t.get(x, y) + if cell != nil { + cells[y][x] = cell.text() + } + } + } + return TextTable{W: t.w, H: t.h, Cells: cells} +} + +// +// Cell sorting +// +// x x x x x x +// x +// x x +// x +// x x x +// x +// x + +// 1. Compute all row candidates +// alignedY No intervening paras +// 2. Compute all column candidates +// alignedX No intervening paras + +// Table candidate +// 1. Top row fully populated +// 2. Left column fully populated +// 3. All cells in table are aligned with 1 top row element and 1 left column candidate +// 4. Mininum number of cells must be filled + +// Computation time +// 1. 
Row candidates O(N) +// Sort top to bottom, left to right +// Search +// 2. Column candidates O(N) +// Sort left to right, top to bottom +// Search +// 3. Find intersections O(N^2) +// For each row +// Find columns that start at row -> table candiates +// Sort table candidates by w x h descending +// 4. Test each candidate O(N^4) + +func (cells cellList) findTables() []*textTable { + if verboseTable { + common.Log.Info("findTables @@1: cells=%d", len(cells)) + } + + cols := cells.findGetterCandidates(getXLl, maxIntraReadingGapR, false) + rows := cells.findGetterCandidates(getYUr, lineDepthR, true) + sortContents(getYUr, true, cols) + sortContents(getXLl, false, rows) + if verboseTable { + common.Log.Info("findTables @@2: cols=%d rows=%d", len(cols), len(rows)) + } + if len(cols) == 0 || len(rows) == 0 { + return nil + } + + tables := cells.findTableCandidates(cols, rows) + logTables(tables, "candidates") + tables = removeDuplicateTables((tables)) + logTables(tables, "distinct") + return tables +} + +func removeDuplicateTables(tables []*textTable) []*textTable { + if len(tables) == 0 { + return nil + } + sort.Slice(tables, func(i, j int) bool { + ti, tj := tables[i], tables[j] + ai, aj := ti.w*ti.h, tj.w*tj.h + if ai != aj { + return ai > aj + } + return ti.Ury > tj.Ury + }) + distinct := []*textTable{tables[0]} + tables[0].log("removeDuplicateTables 0") +outer: + for _, t := range tables[1:] { + for _, d := range distinct { + if overlapped(t, d) { + continue outer + } + } + t.log("removeDuplicateTables x") + distinct = append(distinct, t) + } + return distinct +} + +func (cells cellList) findTableCandidates(cols, rows []cellList) []*textTable { + if verboseTable { + common.Log.Info("findTableCandidates: cols=%d rows=%d\n\tcols=%s\n\trows=%s", + len(cols), len(rows), cols[0].String(), rows[0].String()) + } + + var candidates [][2]cellList + for _, col := range cols { + for _, row := range rows { + col2, row2 := makeCandidate(col, row) + if col2 != nil && len(col2) 
>= 2 && len(row2) >= 2 { + candidates = append(candidates, [2]cellList{col2, row2}) + } + } + } + sort.Slice(candidates, func(i, j int) bool { + ci, cj := candidates[i], candidates[j] + ai := len(ci[0]) * len(ci[1]) + aj := len(cj[0]) * len(cj[1]) + if ai == 0 || aj == 0 { + panic("emprty") + } + if ai != aj { + return ai > aj + } + return i < j + }) + var tables []*textTable + for i, cand := range candidates { + col, row := cand[0], cand[1] + if verboseTable { + fmt.Printf("%8d: findTableCandidates: col=%2d %6.2f row=%2d %6.2f\n\tcol=%s\n\trow=%s\n", + i, len(col), col.bbox(), len(row), row.bbox(), col.asStrings(), row.asStrings()) + } + + if col.equals(row) { + // panic(fmt.Errorf("columns can't be rows\n\tcol=%6.2f %q\n\trow=%6.2f %q", + // col.bbox(), col.asStrings(), row.bbox(), row.asStrings())) + // common.Log.Error("columns can't be rows\n\tcol=%6.2f %q\n\trow=%6.2f %q", + // col.bbox(), col.asStrings(), row.bbox(), row.asStrings()) + continue + } + if len(col) == 0 || len(row) == 0 { + panic("emmmpty") + } + boundary := append(row, col...).bbox() + + subset := cells.within(boundary) + table := subset.validTable(col, row) + // fmt.Printf("%12s boundary=%6.2f subset=%3d=%6.2f valid=%t\n", "", + // boundary, len(subset), subset.bbox(), table != nil) + if table != nil { + table.log("VALID!!") + tables = append(tables, table) + } + } + return tables +} + +// within returns the elements of `cells` that are within `boundary`. 
+func (cells cellList) within(boundary model.PdfRectangle) cellList { + var subset cellList + for _, cell := range cells { + if rectContainsBounded(boundary, cell) { + subset = append(subset, cell) + } + } + return subset +} + +func makeCandidate(col, row cellList) (cellList, cellList) { + var col1, row1 cellList + for i, c := range col { + if c == row[0] { + col1 = col[i:] + row1 = row + break + } + } + var col2, row2 cellList + for i, c := range row { + if c == col[0] { + col2 = col + row2 = row[i:] + break + } + } + if col1 != nil && col2 != nil { + if len(col1)*len(row1) >= len(col2)*len(row2) { + return col1, row1 + } + return col2, row2 + } + if col1 != nil { + return col1, row1 + } + return col2, row2 +} + +// validTable returns a sparse table containing `cells`if `cells` make up a valid table with `col` +// on its left and `row` on its top. +// nil is returned if there is no valid table +func (cells cellList) validTable(col, row cellList) *textTable { + w, h := len(row), len(col) + if col.equals(row) { + panic("columns can't be rows") + } + if col[0] != row[0] { + panic("bad intersection") + } + if verboseTable { + common.Log.Info("validTable: w=%d h=%d cells=%d", w, h, len(cells)) + } + + table := newTextTable(w, h) + for x, cell := range row { + table.put(x, 0, cell) + } + for y, cell := range col { + table.put(0, y, cell) + } + fontsize := table.fontsize() + for i, cell := range cells { + y := col.getAlignedIndex(getYUr, fontsize*lineDepthR, cell) + x := row.getAlignedIndex(getXLl, fontsize*maxIntraReadingGapR, cell) + if x < 0 || y < 0 { + if verboseTable { + common.Log.Error("bad element: x=%d y=%d cell=%s", x, y, cell.String()) + } + return nil + } + if verboseTable { + fmt.Printf("%4d: y=%d x=%d %q\n", i, y, x, truncate(cell.text(), 50)) + } + table.put(x, y, cell) + fontsize = table.fontsize() + } + + w, h = table.maxDense() + if verboseTable { + common.Log.Info("maxDense: w=%d h=%d", w, h) + } + if w < 0 { + return nil + } + return 
table.subTable(w, h) +} + +func (t *textTable) maxDense() (int, int) { + var product [][2]int + for h := 2; h <= t.h; h++ { + for w := 2; w <= t.w; w++ { + product = append(product, [2]int{w, h}) + } + } + if len(product) == 0 { + return -1, -1 + } + sort.Slice(product, func(i, j int) bool { + pi, pj := product[i], product[j] + ai := pi[0] * pi[1] + aj := pj[0] * pj[1] + if ai != aj { + return ai > aj + } + if pi[1] != pj[1] { + return pi[1] > pj[1] + } + return i < j + }) + for i, p := range product { + w, h := p[0], p[1] + dense, reason := t.isDense(w, h) + if verboseTable { + fmt.Printf("%d: isDense w=%d h=%d dense=%5t %s\n", i, w, h, dense, reason) + } + if dense { + return w, h + } + } + return -1, -1 +} + +func (t *textTable) isDense(w, h int) (bool, string) { + minOccRow := 2 + minOccCol := 2 + minOccR := 0.3 + + count := 0 + for x := 0; x < w; x++ { + n := t.column(x).count() + if n < minOccCol { + // common.Log.Error("col %d has %d entries", x, n, t.column(x).asStrings()) + return false, fmt.Sprintf("col %d has %d entries %s", x, n, t.column(x).asStrings()) + } + count += n + } + for y := 0; y < h; y++ { + n := t.row(y).count() + if n < minOccRow { + // common.Log.Error("row %d has %d entries %s", y, n, t.row(y).asStrings()) + return false, fmt.Sprintf("row %d has %d entries %s", y, n, t.row(y).asStrings()) + } + } + occupancy := float64(count) / float64(w*h) + if occupancy < minOccR { + // common.Log.Error("table has %d of %d = %.2f entries", count, t.w*t.h, occupancy) + return false, fmt.Sprintf("table has %d of %d = %.2f entries", count, w*h, occupancy) + } + return true, "" +} + +func (cells cellList) count() int { + n := 0 + for _, c := range cells { + if c != nil { + n++ + } + } + return n +} + +func (cells cellList) getAlignedIndex(get getter, delta float64, targetCell *textPara) int { + for i, cell := range cells { + if parasAligned(get, delta, targetCell, cell) { + return i + } + } + return -1 +} + +func sortContents(get getter, reverse bool, cols 
[]cellList) { + for _, cells := range cols { + sort.Slice(cells, func(i, j int) bool { + ci, cj := cells[i], cells[j] + if reverse { + return get(ci) > get(cj) + } + return get(ci) < get(cj) + }) + } +} + +// findGetterCandidates returns list of elements of `cells` that are within `delta` for attribute `get`. +func (cells cellList) findGetterCandidates(get getter, deltaR float64, reverse bool) []cellList { + delta := cells.fontsize() * deltaR + xIndex := cells.makeIndex(getXLl) + var columns []cellList + addCol := func(col cellList) { + if len(col) > 1 { + columns = append(columns, col) + } + } + for i0, idx0 := range xIndex[:len(xIndex)-1] { + cell0 := cells[idx0] + col := cellList{cell0} + for _, idx := range xIndex[i0+1:] { + cell := cells[idx] + if getXLl(cell) > get(cell0)+delta { + addCol(col) + col = cellList{cell} + } else if parasAligned(get, delta, cell0, cell) { + col = append(col, cell) + } + } + addCol(col) + } + sort.Slice(columns, func(i, j int) bool { + ci, cj := columns[i], columns[j] + if len(ci) != len(cj) { + return len(ci) > len(cj) + } + if reverse { + return get(ci[0]) > get(cj[0]) + } + return get(ci[0]) < get(cj[0]) + }) + return columns +} + +func (cells cellList) equals(other cellList) bool { + if len(cells) != len(other) { + return false + } + for i, cell := range cells { + if other[i] != cell { + return false + } + } + return true +} + +// makeIndex returns an indexes over cells on the `Llx` and `Ury `attributes. +func (cells cellList) xyIndexes() ([]int, []int) { + xIndex := cells.makeIndex(getXLl) + yIndex := cells.makeIndex(getYUr) + return xIndex, yIndex +} + +// makeIndex returns an index over cells on the `get` attributes. 
+func (cells cellList) makeIndex(get getter) []int { + index := make([]int, len(cells)) + for i := range cells { + index[i] = i + } + sort.Slice(index, func(i, j int) bool { + zi := get(cells[index[i]]) + zj := get(cells[index[j]]) + return zi < zj + }) + return index } type getter func(*textPara) float64 var ( + // gettersX get the x-center, left and right of cells. gettersX = []getter{getXCe, getXLl, getXUr} + // gettersX get the y-center, bottom and top of cells. gettersY = []getter{getYCe, getYLl, getYUr} ) @@ -540,18 +933,55 @@ func getXUr(para *textPara) float64 { return para.Urx } func getYCe(para *textPara) float64 { return 0.5 * (para.Lly + para.Ury) } func getYLl(para *textPara) float64 { return para.Lly } func getYUr(para *textPara) float64 { return para.Ury } +func getTop(para *textPara) float64 { return -para.Ury } -func parasAligned(para1, para2 *textPara, delta float64, get func(*textPara) float64) bool { - z1 := get(para1) - z2 := get(para2) - return math.Abs(z1-z2) <= delta +func (cells cellList) log(title string) { + paraList(cells).log(title) } -// fontsize for a paraList is the minimum font size of the paras. -func (paras cellList) fontsize() float64 { - size := paras[0].fontsize() - for _, p := range paras[1:] { - size = math.Min(size, p.fontsize()) +// logTables logs the contents of `tables`. +func logTables(tables []*textTable, title string) { + if !verboseTable { + return + } + common.Log.Info("%8s: %d tables =======!!!!!!!!=====", title, len(tables)) + for i, t := range tables { + t.log(fmt.Sprintf("%s-%02d", title, i)) } - return size +} + +// log logs the contents of `table`. 
+func (t *textTable) log(title string) { + if !verboseTable { + return + } + fmt.Printf("%4s[%dx%d] %s ++++++++++\n", "", t.w, t.h, title) + if t.w == 0 || t.h == 0 { + return + } + top := t.row(0) + left := t.column(0) + fmt.Printf("%8s top=%q\n", "", top.asStrings()) + fmt.Printf("%8sleft=%q\n", "", left.asStrings()) + // return + // common.Log.Info("%8s: %s: %2d x %2d %6.2f =======//////////=====\n"+ + // " %6.2f", title, fileLine(1, false), + // table.w, table.h, table.PdfRectangle, table.PdfRectangle) + // for i, p := range table.cells { + // if p == nil { + // continue + // } + // fmt.Printf("%4d: %6.2f %q\n", i, p.PdfRectangle, truncate(p.text(), 50)) + // } +} + +func (cells cellList) asStrings() []string { + n := minInt(5, len(cells)) + parts := make([]string, n) + for i, cell := range cells[:n] { + if cell != nil { + parts[i] = truncate(cell.text(), 20) + } + } + return parts } diff --git a/extractor/text_test.go b/extractor/text_test.go index 20a9038f..131216f3 100644 --- a/extractor/text_test.go +++ b/extractor/text_test.go @@ -175,7 +175,7 @@ func TestTermMarksFiles(t *testing.T) { if !doStress { t.Skip("skipping stress test") } - common.Log.Info("Running text stress tests. go test --short to skip these.") + common.Log.Info("Running text stress tests.") if len(corpusFolder) == 0 && !forceTest { t.Log("Corpus folder not set - skipping") return @@ -736,6 +736,11 @@ func testTermMarks(t *testing.T, text string, textMarks *TextMarkArray, n int) { ofs1d = len(text) } show := fmt.Sprintf("<%s|%s|%s>", text[ofs0d:ofs0], text[ofs0:ofs1], text[ofs1:ofs1d]) + { + show = fmt.Sprintf("%q", show) + runes := []rune(show) + show = string(runes[1 : len(runes)-1]) + } // Get TextMarks spanning `term` with RangeOffset(). 
spanArray, err := textMarks.RangeOffset(ofs0, ofs1) @@ -783,6 +788,7 @@ func startWith(str, sub string) bool { if strings.HasPrefix(str, sub[n:]) { return true } + // common.Log.Error("!startsWith: str=%q sub=%q sub[%d:]=%q", str, sub, n, sub[n:]) } return false } diff --git a/extractor/text_word.go b/extractor/text_word.go index 2f61ded6..20db6d78 100644 --- a/extractor/text_word.go +++ b/extractor/text_word.go @@ -170,6 +170,19 @@ func (w *textWord) text() string { return strings.Join(texts, "") } +// toTextMarks returns the TextMarks contained in `w`.text(). +// `offset` is used to give the TextMarks the correct Offset values. +func (w *textWord) toTextMarks(offset *int) []TextMark { + var marks []TextMark + for _, tm := range w.marks { + marks = appendTextMark(marks, offset, tm.ToTextMark()) + } + if len(w.text()) > 0 && len(marks) == 0 { + panic(w.text()) + } + return marks +} + // font returns the fontID of the `idx`th rune in text. // compute on creation? !@#$ func (w *textWord) font(idx int) string { diff --git a/internal/cmap/cmap.go b/internal/cmap/cmap.go index 11b2c634..2729f934 100644 --- a/internal/cmap/cmap.go +++ b/internal/cmap/cmap.go @@ -22,7 +22,7 @@ const ( // MissingCodeRune replaces runes that can't be decoded. '\ufffd' = �. Was '?'. MissingCodeRune = '\ufffd' // � - // MissingCodeRune replaces strings that can't be decoded. + // MissingCodeString replaces strings that can't be decoded. MissingCodeString = string(MissingCodeRune) ) @@ -44,7 +44,7 @@ type charRange struct { type fbRange struct { code0 CharCode code1 CharCode - r0 rune // TODO (peterwilliams97): Change to string for compound codes. + r0 string } // CIDSystemInfo contains information for identifying the character collection @@ -110,8 +110,7 @@ type CMap struct { // Used by ctype 2 CMaps. codeToUnicode map[CharCode]string // CID -> Unicode string - // XXXX(peterwilliams97): Should unicodeToCode be the inverse of codeToUnicode? 
- unicodeToCode map[rune]CharCode // Unicode rune -> CID + unicodeToCode map[string]CharCode // Unicode rune -> CID // cached contains the raw CMap data. It is used by the Bytes method in // order to avoid generating the data for every call. @@ -137,10 +136,10 @@ func NewToUnicodeCMap(codeToRune map[CharCode]rune) *CMap { Supplement: 0, }, codespaces: []Codespace{{Low: 0, High: 0xffff}}, - codeToCID: make(map[CharCode]CharCode), - cidToCode: make(map[CharCode]CharCode), codeToUnicode: codeToUnicode, - unicodeToCode: make(map[rune]CharCode), + unicodeToCode: make(map[string]CharCode, len(codeToRune)), + codeToCID: make(map[CharCode]CharCode, len(codeToRune)), + cidToCode: make(map[CharCode]CharCode, len(codeToRune)), } cmap.computeInverseMappings() @@ -159,7 +158,7 @@ func newCMap(isSimple bool) *CMap { codeToCID: make(map[CharCode]CharCode), cidToCode: make(map[CharCode]CharCode), codeToUnicode: make(map[CharCode]string), - unicodeToCode: make(map[rune]CharCode), + unicodeToCode: make(map[string]CharCode), } } @@ -265,13 +264,8 @@ func (cmap *CMap) computeInverseMappings() { // Generate Unicode -> CID map. for cid, s := range cmap.codeToUnicode { - // The CMap entries can be empty e.g. dobe_supplement_iso32000_1.pdf - if len(s) == 0 { - continue - } - r := rune0(s) - if c, ok := cmap.unicodeToCode[r]; !ok || (ok && c > cid) { - cmap.unicodeToCode[r] = cid + if c, ok := cmap.unicodeToCode[s]; !ok || (ok && c > cid) { + cmap.unicodeToCode[s] = cid } } @@ -326,10 +320,10 @@ func (cmap *CMap) CharcodeToUnicode(code CharCode) (string, bool) { return MissingCodeString, false } -// RuneToCID maps the specified rune to a character identifier. If the provided -// rune has no available mapping, the second return value is false. -func (cmap *CMap) RuneToCID(r rune) (CharCode, bool) { - cid, ok := cmap.unicodeToCode[r] +// StringToCID maps the specified string to a character identifier. If the provided +// string has no available mapping, the bool return value is false. 
+func (cmap *CMap) StringToCID(s string) (CharCode, bool) { + cid, ok := cmap.unicodeToCode[s] return cid, ok } @@ -484,10 +478,10 @@ func (cmap *CMap) toBfData() string { // character codes have been mapped to code ranges. var charRanges []charRange currCharRange := charRange{codes[0], codes[0]} - prevRune := rune0(cmap.codeToUnicode[codes[0]]) + prevRune := cmap.codeToUnicode[codes[0]] for _, c := range codes[1:] { - currRune := rune0(cmap.codeToUnicode[c]) - if c == currCharRange.code1+1 && currRune == prevRune+1 { + currRune := cmap.codeToUnicode[c] + if c == currCharRange.code1+1 && lastRune(currRune) == lastRune(prevRune)+1 { currCharRange.code1 = c } else { charRanges = append(charRanges, currCharRange) @@ -507,7 +501,7 @@ func (cmap *CMap) toBfData() string { fbRanges = append(fbRanges, fbRange{ code0: cr.code0, code1: cr.code1, - r0: rune0(cmap.codeToUnicode[cr.code0]), + r0: cmap.codeToUnicode[cr.code0], }) } } @@ -522,8 +516,8 @@ func (cmap *CMap) toBfData() string { lines = append(lines, fmt.Sprintf("%d beginbfchar", n)) for j := 0; j < n; j++ { code := fbChars[i*maxBfEntries+j] - r := rune0(cmap.codeToUnicode[code]) - lines = append(lines, fmt.Sprintf("<%04x> <%04x>", code, r)) + s := cmap.codeToUnicode[code] + lines = append(lines, fmt.Sprintf("<%04x> %s", code, hexCode(s))) } lines = append(lines, "endbfchar") } @@ -535,8 +529,8 @@ func (cmap *CMap) toBfData() string { lines = append(lines, fmt.Sprintf("%d beginbfrange", n)) for j := 0; j < n; j++ { rng := fbRanges[i*maxBfEntries+j] - r := rng.r0 - lines = append(lines, fmt.Sprintf("<%04x><%04x> <%04x>", rng.code0, rng.code1, r)) + lines = append(lines, fmt.Sprintf("<%04x><%04x> %s", + rng.code0, rng.code1, hexCode(rng.r0))) } lines = append(lines, "endbfrange") } @@ -544,6 +538,22 @@ func (cmap *CMap) toBfData() string { return strings.Join(lines, "\n") } +// lastRune returns the last rune in `s`. 
// CMap entries can map to the empty string. For an empty `s`, lastRune returns -2 instead of
// panicking. No valid rune is adjacent to -2, so the `lastRune(curr) == lastRune(prev)+1`
// contiguity test used when building bfrange entries can never succeed when either string is empty.
func lastRune(s string) rune {
	runes := []rune(s)
	if len(runes) == 0 {
		return -2
	}
	return runes[len(runes)-1]
}

// hexCode returns the CMap hex code for `s`: the UTF-16BE code units of `s` as 4-digit lower-case
// hexadecimal numbers, enclosed in angle brackets.
// Runes above U+FFFF are written as a surrogate pair, so every code unit takes exactly 4 digits.
// An empty `s` gives "<>".
func hexCode(s string) string {
	var b strings.Builder
	b.WriteByte('<')
	for _, r := range s {
		if r > 0xffff {
			// Split the rune into UTF-16 high and low surrogates.
			r -= 0x10000
			fmt.Fprintf(&b, "%04x%04x", 0xd800+(r>>10), 0xdc00+(r&0x3ff))
			continue
		}
		fmt.Fprintf(&b, "%04x", r)
	}
	b.WriteByte('>')
	return b.String()
}
MissingCodeRune = '\ufffd' // � - // MissingCodeRune replaces strings that can't be decoded. + // MissingCodeString replaces strings that can't be decoded. MissingCodeString = string(MissingCodeRune) ) diff --git a/model/font.go b/model/font.go index c1a9b609..a676845d 100644 --- a/model/font.go +++ b/model/font.go @@ -421,31 +421,26 @@ func (font *PdfFont) BytesToCharcodes(data []byte) []textencoding.CharCode { return charcodes } -// CharcodesToUnicodeWithStats is identical to CharcodesToUnicode except returns more statistical +// CharcodesToUnicodeWithStats is identical to CharcodesToUnicode except it returns more statistical // information about hits and misses from the reverse mapping process. // NOTE: The number of runes returned may be greater than the number of charcodes. -// TODO(peterwilliams97): Deprecate? +// TODO(peterwilliams97): Deprecate in v4 and use only CharcodesToStrings() func (font *PdfFont) CharcodesToUnicodeWithStats(charcodes []textencoding.CharCode) (runelist []rune, numHits, numMisses int) { - runeSlices, numHits, numMisses := font.CharcodesToRuneSlices(charcodes) - var runes []rune - for _, r := range runeSlices { - runes = append(runes, r...) - } - return runes, numHits, numMisses + texts, numHits, numMisses := font.CharcodesToStrings(charcodes) + return []rune(strings.Join(texts, "")), numHits, numMisses } -// CharcodesToRuneSlices returns the unicode strings corresponding to `charcodes` as rune slices. -// The int return is the number of unconvereted codes. -// NOTE: The number of rune slices returned is equal to the number of charcodes -func (font *PdfFont) CharcodesToRuneSlices(charcodes []textencoding.CharCode) ([][]rune, int, int) { +// CharcodesToStrings returns the unicode strings corresponding to `charcodes`. +// The int returns are the number of strings and the number of unconvereted codes. 
+// NOTE: The number of strings returned is equal to the number of charcodes +func (font *PdfFont) CharcodesToStrings(charcodes []textencoding.CharCode) ([]string, int, int) { fontBase := font.baseFields() - runeSlices := make([][]rune, 0, len(charcodes)) + texts := make([]string, 0, len(charcodes)) numMisses := 0 for _, code := range charcodes { if fontBase.toUnicodeCmap != nil { if s, ok := fontBase.toUnicodeCmap.CharcodeToUnicode(cmap.CharCode(code)); ok { - runeSlices = append(runeSlices, []rune(s)) - // common.Log.Info("CharcodesToRuneSlices1: code=%d s=`%s`", code, s) + texts = append(texts, s) continue } } @@ -454,9 +449,7 @@ func (font *PdfFont) CharcodesToRuneSlices(charcodes []textencoding.CharCode) ([ encoder := font.Encoder() if encoder != nil { if r, ok := encoder.CharcodeToRune(code); ok { - runeSlices = append(runeSlices, []rune{r}) - // common.Log.Info("CharcodesToRuneSlices2: code=%d s=%q encoder=%s", - // code, string(r), encoder.String()) + texts = append(texts, string(r)) continue } } @@ -465,7 +458,7 @@ func (font *PdfFont) CharcodesToRuneSlices(charcodes []textencoding.CharCode) ([ "\tfont=%s\n\tencoding=%s", code, charcodes, fontBase.isCIDFont(), font, encoder) numMisses++ - runeSlices = append(runeSlices, []rune{cmap.MissingCodeRune}) + texts = append(texts, cmap.MissingCodeString) } if numMisses != 0 { @@ -475,7 +468,7 @@ func (font *PdfFont) CharcodesToRuneSlices(charcodes []textencoding.CharCode) ([ len(charcodes), numMisses, font) } - return runeSlices, len(runeSlices), numMisses + return texts, len(texts), numMisses } // CharcodeBytesToUnicode converts PDF character codes `data` to a Go unicode string. @@ -499,8 +492,8 @@ func (font *PdfFont) CharcodeBytesToUnicode(data []byte) (string, int, int) { // 1) Use the ToUnicode CMap if there is one. // 2) Use the underlying font's encoding. 
func (font *PdfFont) CharcodesToUnicode(charcodes []textencoding.CharCode) []rune { - strlist, _, _ := font.CharcodesToUnicodeWithStats(charcodes) - return strlist + runes, _, _ := font.CharcodesToUnicodeWithStats(charcodes) + return runes } // RunesToCharcodeBytes maps the provided runes to charcode bytes and it