From f7b5ffa9545e7dd22f0f32c1312d40fb1211c9b0 Mon Sep 17 00:00:00 2001 From: Adrian-George Bostan Date: Wed, 30 Oct 2019 22:36:35 +0200 Subject: [PATCH] Prevent extractor panic for invalid PDF text objects (#196) * Prevent extractor panic for invalid PDF text objects * Document text extraction behavior of invalid text objects --- extractor/text.go | 42 +++++++++++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/extractor/text.go b/extractor/text.go index 53758825..6009bd1f 100644 --- a/extractor/text.go +++ b/extractor/text.go @@ -61,7 +61,8 @@ func (e *Extractor) extractPageText(contents string, resources *model.PdfPageRes pageText := &PageText{} state := newTextState() fontStack := fontStacker{} - var to *textObject + to := newTextObject(e, resources, contentstream.GraphicsState{}, &state, &fontStack) + var inTextObj bool cstreamParser := contentstream.NewContentStreamParser(contents) operations, err := cstreamParser.Parse() @@ -102,16 +103,31 @@ func (e *Extractor) extractPageText(contents string, resources *model.PdfPageRes state.tfont = fontStack.pop() } case "BT": // Begin text - // Begin a text object, initializing the text matrix, Tm, and the text line matrix, - // Tlm, to the identity matrix. Text objects shall not be nested; a second BT shall - // not appear before an ET. - if to != nil { + // Begin a text object, initializing the text matrix, Tm, and + // the text line matrix, Tlm, to the identity matrix. Text + // objects shall not be nested. A second BT shall not appear + // before an ET. However, if that happens, all existing marks + // are added to the page marks, in order to avoid losing content. + if inTextObj { common.Log.Debug("BT called while in a text object") + pageText.marks = append(pageText.marks, to.marks...) } + inTextObj = true to = newTextObject(e, resources, gs, &state, &fontStack) case "ET": // End Text + // End text object, discarding text matrix. If the current + // text object contains text marks, they are added to the + // page text marks collection. + // The ET operator should always have a matching BT operator. + // However, if ET appears outside of a text object, the behavior + // does not change: the text matrices are discarded and all + // existing marks in the text object are added to the page marks. + if !inTextObj { + common.Log.Debug("ET called outside of a text object") + } + inTextObj = false pageText.marks = append(pageText.marks, to.marks...) - to = nil + to.reset() case "T*": // Move to start of next text line to.nextLine() case "Td": // Move text location @@ -202,10 +218,6 @@ func (e *Extractor) extractPageText(contents string, resources *model.PdfPageRes } to.setCharSpacing(y) case "Tf": // Set font. - if to == nil { - // This is needed for 26-Hazard-Thermal-environment.pdf - to = newTextObject(e, resources, gs, &state, &fontStack) - } if ok, err := to.checkOp(op, 2, true); !ok { common.Log.Debug("ERROR: Tf err=%v", err) return err @@ -659,6 +671,14 @@ func newTextObject(e *Extractor, resources *model.PdfPageResources, gs contentst } } +// reset sets the text matrix `Tm` and the text line matrix `Tlm` of the text +// object to the identity matrix. In addition, the marks collection is cleared. +func (to *textObject) reset() { + to.tm = transform.IdentityMatrix() + to.tlm = transform.IdentityMatrix() + to.marks = nil +} + // renderText processes and renders byte array `data` for extraction purposes. func (to *textObject) renderText(data []byte) error { font := to.getCurrentFont() @@ -1205,7 +1225,7 @@ func (pt *PageText) sortPosition(tol float64) { if pt.marks[i-1].orient != pt.marks[i].orient { cluster++ } else { - if pt.marks[i-1].orientedStart.Y - pt.marks[i].orientedStart.Y > tol { + if pt.marks[i-1].orientedStart.Y-pt.marks[i].orientedStart.Y > tol { cluster++ } }