Insert a space when a TJ string is offset by more than a threshold

The threshold is hard-coded as -100 (is this font-specific?)
This commit is contained in:
Nat Wilson 2017-10-29 10:34:40 -07:00
parent bdf676e045
commit 30ff60a44e
2 changed files with 36 additions and 2 deletions

View File

@ -131,8 +131,17 @@ func (this *ContentStreamParser) ExtractText() (string, error) {
return "", fmt.Errorf("Invalid parameter type, no array (%T)", op.Params[0])
}
for _, obj := range *paramList {
if strObj, ok := obj.(*PdfObjectString); ok {
txt += string(*strObj)
switch v := obj.(type) {
case *PdfObjectString:
txt += string(*v)
case *PdfObjectFloat:
if *v < -100 {
txt += " "
}
case *PdfObjectInteger:
if *v < -100 {
txt += " "
}
}
}
} else if inText && op.Operand == "Tj" {

View File

@ -0,0 +1,25 @@
package contentstream
import (
"testing"
)
// TestOperandTJSpacing checks the text extracted from a single TJ array.
// The strings in the array are separated by large negative adjustments
// (-327/-328), which are expected to come out as single spaces, while the
// small positive adjustment (5) between "h" and "ypothesized" is expected
// to produce no space.
func TestOperandTJSpacing(t *testing.T) {
	content := `BT
[(are)-328(h)5(ypothesized)-328(to)-327(in\003uence)-328(the)-328(stability)-328(of)-328(the)-328(upstream)-327(glaciers,)-328(and)-328(thus)-328(of)-328(the)-328(entire)-327(ice)-328(sheet)]TJ
ET`
	referenceText := "are hypothesized to in\003uence the stability of the upstream glaciers, and thus of the entire ice sheet"

	cStreamParser := NewContentStreamParser(content)
	text, err := cStreamParser.ExtractText()
	if err != nil {
		// Stop here: comparing the text of a failed extraction would only
		// add a second, misleading failure.
		t.Fatalf("ExtractText returned an error: %v", err)
	}
	if text != referenceText {
		// %q makes whitespace and control bytes (such as \x03) visible.
		t.Errorf("extracted text mismatch:\n got: %q\nwant: %q", text, referenceText)
	}
}