From 30ff60a44e45b6782a1db46063ca8c4bf0b49f94 Mon Sep 17 00:00:00 2001 From: Nat Wilson Date: Sun, 29 Oct 2017 10:34:40 -0700 Subject: [PATCH] Insert a space when TJ string is offset by more than a threshold That threshold is hard-coded as -100 (is this font specific?) --- pdf/contentstream/contentstream.go | 13 +++++++++++-- pdf/contentstream/contentstream_test.go | 25 +++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 2 deletions(-) create mode 100644 pdf/contentstream/contentstream_test.go diff --git a/pdf/contentstream/contentstream.go b/pdf/contentstream/contentstream.go index f694451f..71fe51e8 100644 --- a/pdf/contentstream/contentstream.go +++ b/pdf/contentstream/contentstream.go @@ -131,8 +131,17 @@ func (this *ContentStreamParser) ExtractText() (string, error) { return "", fmt.Errorf("Invalid parameter type, no array (%T)", op.Params[0]) } for _, obj := range *paramList { - if strObj, ok := obj.(*PdfObjectString); ok { - txt += string(*strObj) + switch v := obj.(type) { + case *PdfObjectString: + txt += string(*v) + case *PdfObjectFloat: + if *v < -100 { + txt += " " + } + case *PdfObjectInteger: + if *v < -100 { + txt += " " + } } } } else if inText && op.Operand == "Tj" { diff --git a/pdf/contentstream/contentstream_test.go b/pdf/contentstream/contentstream_test.go new file mode 100644 index 00000000..5237eb10 --- /dev/null +++ b/pdf/contentstream/contentstream_test.go @@ -0,0 +1,25 @@ +package contentstream + +import ( + "testing" +) + +func TestOperandTJSpacing(t *testing.T) { + + content := `BT + [(are)-328(h)5(ypothesized)-328(to)-327(in\003uence)-328(the)-328(stability)-328(of)-328(the)-328(upstream)-327(glaciers,)-328(and)-328(thus)-328(of)-328(the)-328(entire)-327(ice)-328(sheet)]TJ + ET` + referenceText := "are hypothesized to in\003uence the stability of the upstream glaciers, and thus of the entire ice sheet" + + cStreamParser := NewContentStreamParser(content) + + text, err := cStreamParser.ExtractText() + if err != nil { + 
+ t.Fatalf("ExtractText() returned error: %v", err) + } + + if text != referenceText { + t.Errorf("ExtractText() = %q, want %q", text, referenceText) + } + +}