Prevent extractor panic for invalid PDF text objects (#196)

* Prevent extractor panic for invalid PDF text objects
* Document text extraction behavior of invalid text objects
2025-04-26 13:48:55 +08:00 · 2019-10-30 22:36:35 +02:00 · 2019-10-30 22:36:35 +02:00 · f7b5ffa954
commit f7b5ffa954
parent 362ba7349d
1 changed files with 31 additions and 11 deletions
--- a/extractor/text.go
+++ b/extractor/text.go
@ -61,7 +61,8 @@ func (e *Extractor) extractPageText(contents string, resources *model.PdfPageRes
 	pageText := &PageText{}
 	state := newTextState()
 	fontStack := fontStacker{}
-	var to *textObject
+	to := newTextObject(e, resources, contentstream.GraphicsState{}, &state, &fontStack)
+	var inTextObj bool

 	cstreamParser := contentstream.NewContentStreamParser(contents)
 	operations, err := cstreamParser.Parse()
@ -102,16 +103,31 @@ func (e *Extractor) extractPageText(contents string, resources *model.PdfPageRes
 					state.tfont = fontStack.pop()
 				}
 			case "BT": // Begin text
-				// Begin a text object, initializing the text matrix, Tm, and the text line matrix,
-				// Tlm, to the identity matrix. Text objects shall not be nested; a second BT shall
-				// not appear before an ET.
-				if to != nil {
+				// Begin a text object, initializing the text matrix, Tm, and
+				// the text line matrix, Tlm, to the identity matrix. Text
+				// objects shall not be nested. A second BT shall not appear
+				// before an ET. However, if that happens, all existing marks
+				// are added to the page marks, in order to avoid losing content.
+				if inTextObj {
 					common.Log.Debug("BT called while in a text object")
+					pageText.marks = append(pageText.marks, to.marks...)
 				}
+				inTextObj = true
 				to = newTextObject(e, resources, gs, &state, &fontStack)
 			case "ET": // End Text
+				// End text object, discarding text matrix. If the current
+				// text object contains text marks, they are added to the
+				// page text marks collection.
+				// The ET operator should always have a matching BT operator.
+				// However, if ET appears outside of a text object, the behavior
+				// does not change: the text matrices are discarded and all
+				// existing marks in the text object are added to the page marks.
+				if !inTextObj {
+					common.Log.Debug("ET called outside of a text object")
+				}
+				inTextObj = false
 				pageText.marks = append(pageText.marks, to.marks...)
-				to = nil
+				to.reset()
 			case "T*": // Move to start of next text line
 				to.nextLine()
 			case "Td": // Move text location
@ -202,10 +218,6 @@ func (e *Extractor) extractPageText(contents string, resources *model.PdfPageRes
 				}
 				to.setCharSpacing(y)
 			case "Tf": // Set font.
-				if to == nil {
-					// This is needed for 26-Hazard-Thermal-environment.pdf
-					to = newTextObject(e, resources, gs, &state, &fontStack)
-				}
 				if ok, err := to.checkOp(op, 2, true); !ok {
 					common.Log.Debug("ERROR: Tf err=%v", err)
 					return err
@ -659,6 +671,14 @@ func newTextObject(e *Extractor, resources *model.PdfPageResources, gs contentst
 	}
 }

+// reset sets the text matrix `Tm` and the text line matrix `Tlm` of the text
+// object to the identity matrix. In addition, the marks collection is cleared.
+func (to *textObject) reset() {
+	to.tm = transform.IdentityMatrix()
+	to.tlm = transform.IdentityMatrix()
+	to.marks = nil
+}
+
 // renderText processes and renders byte array `data` for extraction purposes.
 func (to *textObject) renderText(data []byte) error {
 	font := to.getCurrentFont()
@ -1205,7 +1225,7 @@ func (pt *PageText) sortPosition(tol float64) {
 		if pt.marks[i-1].orient != pt.marks[i].orient {
 			cluster++
 		} else {
-			if pt.marks[i-1].orientedStart.Y - pt.marks[i].orientedStart.Y > tol {
+			if pt.marks[i-1].orientedStart.Y-pt.marks[i].orientedStart.Y > tol {
 				cluster++
 			}
 		}