mirror of
https://github.com/unidoc/unipdf.git
synced 2025-04-26 13:48:55 +08:00
Prevent extractor panic for invalid PDF text objects (#196)
* Prevent extractor panic for invalid PDF text objects * Document text extraction behavior of invalid text objects
This commit is contained in:
parent
362ba7349d
commit
f7b5ffa954
@ -61,7 +61,8 @@ func (e *Extractor) extractPageText(contents string, resources *model.PdfPageRes
|
||||
pageText := &PageText{}
|
||||
state := newTextState()
|
||||
fontStack := fontStacker{}
|
||||
var to *textObject
|
||||
to := newTextObject(e, resources, contentstream.GraphicsState{}, &state, &fontStack)
|
||||
var inTextObj bool
|
||||
|
||||
cstreamParser := contentstream.NewContentStreamParser(contents)
|
||||
operations, err := cstreamParser.Parse()
|
||||
@ -102,16 +103,31 @@ func (e *Extractor) extractPageText(contents string, resources *model.PdfPageRes
|
||||
state.tfont = fontStack.pop()
|
||||
}
|
||||
case "BT": // Begin text
|
||||
// Begin a text object, initializing the text matrix, Tm, and the text line matrix,
|
||||
// Tlm, to the identity matrix. Text objects shall not be nested; a second BT shall
|
||||
// not appear before an ET.
|
||||
if to != nil {
|
||||
// Begin a text object, initializing the text matrix, Tm, and
|
||||
// the text line matrix, Tlm, to the identity matrix. Text
|
||||
// objects shall not be nested. A second BT shall not appear
|
||||
// before an ET. However, if that happens, all existing marks
|
||||
// are added to the page marks, in order to avoid losing content.
|
||||
if inTextObj {
|
||||
common.Log.Debug("BT called while in a text object")
|
||||
pageText.marks = append(pageText.marks, to.marks...)
|
||||
}
|
||||
inTextObj = true
|
||||
to = newTextObject(e, resources, gs, &state, &fontStack)
|
||||
case "ET": // End Text
|
||||
// End text object, discarding text matrix. If the current
|
||||
// text object contains text marks, they are added to the
|
||||
// page text marks collection.
|
||||
// The ET operator should always have a matching BT operator.
|
||||
// However, if ET appears outside of a text object, the behavior
|
||||
// does not change: the text matrices are discarded and all
|
||||
// existing marks in the text object are added to the page marks.
|
||||
if !inTextObj {
|
||||
common.Log.Debug("ET called outside of a text object")
|
||||
}
|
||||
inTextObj = false
|
||||
pageText.marks = append(pageText.marks, to.marks...)
|
||||
to = nil
|
||||
to.reset()
|
||||
case "T*": // Move to start of next text line
|
||||
to.nextLine()
|
||||
case "Td": // Move text location
|
||||
@ -202,10 +218,6 @@ func (e *Extractor) extractPageText(contents string, resources *model.PdfPageRes
|
||||
}
|
||||
to.setCharSpacing(y)
|
||||
case "Tf": // Set font.
|
||||
if to == nil {
|
||||
// This is needed for 26-Hazard-Thermal-environment.pdf
|
||||
to = newTextObject(e, resources, gs, &state, &fontStack)
|
||||
}
|
||||
if ok, err := to.checkOp(op, 2, true); !ok {
|
||||
common.Log.Debug("ERROR: Tf err=%v", err)
|
||||
return err
|
||||
@ -659,6 +671,14 @@ func newTextObject(e *Extractor, resources *model.PdfPageResources, gs contentst
|
||||
}
|
||||
}
|
||||
|
||||
// reset sets the text matrix `Tm` and the text line matrix `Tlm` of the text
|
||||
// object to the identity matrix. In addition, the marks collection is cleared.
|
||||
func (to *textObject) reset() {
|
||||
to.tm = transform.IdentityMatrix()
|
||||
to.tlm = transform.IdentityMatrix()
|
||||
to.marks = nil
|
||||
}
|
||||
|
||||
// renderText processes and renders byte array `data` for extraction purposes.
|
||||
func (to *textObject) renderText(data []byte) error {
|
||||
font := to.getCurrentFont()
|
||||
@ -1205,7 +1225,7 @@ func (pt *PageText) sortPosition(tol float64) {
|
||||
if pt.marks[i-1].orient != pt.marks[i].orient {
|
||||
cluster++
|
||||
} else {
|
||||
if pt.marks[i-1].orientedStart.Y - pt.marks[i].orientedStart.Y > tol {
|
||||
if pt.marks[i-1].orientedStart.Y-pt.marks[i].orientedStart.Y > tol {
|
||||
cluster++
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user