Prevent extractor panic for invalid PDF text objects (#196)

* Prevent extractor panic for invalid PDF text objects
* Document text extraction behavior of invalid text objects
This commit is contained in:
Adrian-George Bostan 2019-10-30 22:36:35 +02:00 committed by Gunnsteinn Hall
parent 362ba7349d
commit f7b5ffa954

View File

@ -61,7 +61,8 @@ func (e *Extractor) extractPageText(contents string, resources *model.PdfPageRes
pageText := &PageText{}
state := newTextState()
fontStack := fontStacker{}
var to *textObject
to := newTextObject(e, resources, contentstream.GraphicsState{}, &state, &fontStack)
var inTextObj bool
cstreamParser := contentstream.NewContentStreamParser(contents)
operations, err := cstreamParser.Parse()
@ -102,16 +103,31 @@ func (e *Extractor) extractPageText(contents string, resources *model.PdfPageRes
state.tfont = fontStack.pop()
}
case "BT": // Begin text
// Begin a text object, initializing the text matrix, Tm, and the text line matrix,
// Tlm, to the identity matrix. Text objects shall not be nested; a second BT shall
// not appear before an ET.
if to != nil {
// Begin a text object, initializing the text matrix, Tm, and
// the text line matrix, Tlm, to the identity matrix. Text
// objects shall not be nested. A second BT shall not appear
// before an ET. However, if that happens, all existing marks
// are added to the page marks, in order to avoid losing content.
if inTextObj {
common.Log.Debug("BT called while in a text object")
pageText.marks = append(pageText.marks, to.marks...)
}
inTextObj = true
to = newTextObject(e, resources, gs, &state, &fontStack)
case "ET": // End Text
// End text object, discarding text matrix. If the current
// text object contains text marks, they are added to the
// page text marks collection.
// The ET operator should always have a matching BT operator.
// However, if ET appears outside of a text object, the behavior
// does not change: the text matrices are discarded and all
// existing marks in the text object are added to the page marks.
if !inTextObj {
common.Log.Debug("ET called outside of a text object")
}
inTextObj = false
pageText.marks = append(pageText.marks, to.marks...)
to = nil
to.reset()
case "T*": // Move to start of next text line
to.nextLine()
case "Td": // Move text location
@ -202,10 +218,6 @@ func (e *Extractor) extractPageText(contents string, resources *model.PdfPageRes
}
to.setCharSpacing(y)
case "Tf": // Set font.
if to == nil {
// This is needed for 26-Hazard-Thermal-environment.pdf
to = newTextObject(e, resources, gs, &state, &fontStack)
}
if ok, err := to.checkOp(op, 2, true); !ok {
common.Log.Debug("ERROR: Tf err=%v", err)
return err
@ -659,6 +671,14 @@ func newTextObject(e *Extractor, resources *model.PdfPageResources, gs contentst
}
}
// reset sets the text matrix `Tm` and the text line matrix `Tlm` of the text
// object to the identity matrix. In addition, the marks collection is cleared.
func (to *textObject) reset() {
to.tm = transform.IdentityMatrix()
to.tlm = transform.IdentityMatrix()
to.marks = nil
}
// renderText processes and renders byte array `data` for extraction purposes.
func (to *textObject) renderText(data []byte) error {
font := to.getCurrentFont()
@ -1205,7 +1225,7 @@ func (pt *PageText) sortPosition(tol float64) {
if pt.marks[i-1].orient != pt.marks[i].orient {
cluster++
} else {
if pt.marks[i-1].orientedStart.Y - pt.marks[i].orientedStart.Y > tol {
if pt.marks[i-1].orientedStart.Y-pt.marks[i].orientedStart.Y > tol {
cluster++
}
}