Changes after pull request review

This commit is contained in:
Peter Williams 2018-07-24 21:32:02 +10:00
parent e2b4f908bd
commit e886846c6a
20 changed files with 146 additions and 187 deletions

View File

@ -20,6 +20,7 @@ func New(page *model.PdfPage) (*Extractor, error) {
return nil, err
}
// Uncomment these lines to see the contents of the page. For debugging.
// fmt.Println("========================= +++ =========================")
// fmt.Printf("%s\n", contents)
// fmt.Println("========================= ::: =========================")

View File

@ -54,9 +54,6 @@ func (e *Extractor) ExtractXYText() (*TextList, int, int, error) {
return textList, state.numChars, state.numMisses, err
}
// fmt.Println("========================= xxx =========================")
// fmt.Printf("%s\n", e.contents)
// fmt.Println("========================= ||| =========================")
processor := contentstream.NewContentStreamProcessor(*operations)
processor.AddHandler(contentstream.HandlerConditionEnumAllOperands, "",
@ -64,7 +61,6 @@ func (e *Extractor) ExtractXYText() (*TextList, int, int, error) {
resources *model.PdfPageResources) error {
operand := op.Operand
// common.Log.Debug("++Operand: %s", op.String())
switch operand {
case "q":
@ -383,7 +379,7 @@ func (to *TextObject) setFont(name string, size float64) error {
(*to.fontStack)[len(*to.fontStack)-1] = font
}
} else if err == model.ErrFontNotSupported {
// XXX: !@#$ Do we need to handle this case in a special way?
// XXX: Do we need to handle this case in a special way?
return err
} else {
return err

View File

@ -38,14 +38,8 @@ type CIDSystemInfo struct {
// CMap represents a character code to unicode mapping used in PDF files.
//
// 9.7.5 CMaps (Page 272)
//
// Page 278
// c) The beginbfchar and endbfchar shall not appear in a CMap that is used as the Encoding entry of
// a Type 0 font; however, they may appear in the definition of a ToUnicode CMap
//
// https://www.adobe.com/content/dam/acom/en/devnet/acrobat/pdfs/5411.ToUnicode.pdf
// https://github.com/adobe-type-tools/cmap-resources/releases ***
// https://github.com/adobe-type-tools/cmap-resources/releases
type CMap struct {
*cMapParser
@ -59,15 +53,12 @@ type CMap struct {
// For regular cmaps
codespaces []Codespace
// Text encoder to look up runes from input glyph names. !@#$ Not used
// encoder textencoding.TextEncoder
// For ToUnicode (ctype 2) cmaps
codeToUnicode map[CharCode]string
toUnicodeIdentity bool
}
// String returns a human readable description of `cmap`
// String returns a human readable description of `cmap`.
func (cmap *CMap) String() string {
si := cmap.systemInfo
parts := []string{
@ -103,7 +94,7 @@ func newCMap(isSimple bool) *CMap {
return cmap
}
// String returns a human readable description of `info`
// String returns a human readable description of `info`.
// It looks like "Adobe-Japan2-000".
func (info *CIDSystemInfo) String() string {
return fmt.Sprintf("%s-%s-%03d", info.Registry, info.Ordering, info.Supplement)
@ -111,8 +102,11 @@ func (info *CIDSystemInfo) String() string {
// NewCIDSystemInfo returns the CIDSystemInfo encoded in PDFObject `obj`.
func NewCIDSystemInfo(obj core.PdfObject) (info CIDSystemInfo, err error) {
obj = core.TraceToDirectObject(obj)
d := *obj.(*core.PdfObjectDictionary)
d, ok := core.GetDict(obj)
if !ok {
err = core.ErrTypeError
return
}
registry, ok := core.GetStringVal(d.Get("Registry"))
if !ok {
err = core.ErrTypeError
@ -154,7 +148,7 @@ var MissingCodeString = string(MissingCodeRune)
// CharcodeBytesToUnicode converts a byte array of charcodes to a unicode string representation.
// It also returns the number of charcodes that could not be converted.
// NOTE: This only works for ToUnicode cmaps
// NOTE: This only works for ToUnicode cmaps.
func (cmap *CMap) CharcodeBytesToUnicode(data []byte) (string, int) {
charcodes, matched := cmap.bytesToCharcodes(data)
if !matched {
@ -186,17 +180,10 @@ func (cmap *CMap) CharcodeBytesToUnicode(data []byte) (string, int) {
return unicode, len(missing)
}
// CharcodeToUnicode converts the single character code `code` to a unicode string.
// It delegates to CharcodeToUnicode2 and drops the found/not-found flag, so the string returned
// for a code that is not in the unicode map is whatever CharcodeToUnicode2 yields for it.
// NOTE: CharcodeBytesToUnicode is typically more efficient.
func (cmap *CMap) CharcodeToUnicode(code CharCode) string {
	text, _ := cmap.CharcodeToUnicode2(code)
	return text
}
// CharcodeToUnicode2 converts a single character code `code` to a unicode string.
// The bool value is set to true if `code` is in the unicode map,
func (cmap *CMap) CharcodeToUnicode2(code CharCode) (string, bool) {
// CharcodeToUnicode converts a single character code `code` to a unicode string.
// If `code` is not in the unicode map, MissingCodeString ("\ufffd") is returned.
// NOTE: CharcodeBytesToUnicode is typically more efficient.
func (cmap *CMap) CharcodeToUnicode(code CharCode) (string, bool) {
if s, ok := cmap.codeToUnicode[code]; ok {
return s, true
}
@ -264,7 +251,7 @@ func (cmap *CMap) inCodespace(code CharCode, numBytes int) bool {
}
// LoadCmapFromDataCID parses the in-memory cmap `data` and returns the resulting CMap.
// It is a convenience wrapper that calls LoadCmapFromData with isSimple set to false.
func LoadCmapFromDataCID(data []byte) (*CMap, error) {
	const isSimple = false
	return LoadCmapFromData(data, isSimple)
}
@ -273,12 +260,6 @@ func LoadCmapFromDataCID(data []byte) (*CMap, error) {
// If `isSimple` is true then it uses 1-byte encodings, otherwise it uses the codespaces in the cmap.
//
// 9.10.3 ToUnicode CMaps (page 293)
// The CMap defined in the ToUnicode entry of the font dictionary shall follow the syntax for CMaps
// • The CMap file shall contain begincodespacerange and endcodespacerange operators that are
// consistent with the encoding that the font uses. In particular, for a simple font, the
// codespace shall be one byte long.
// • It shall use the beginbfchar, endbfchar, beginbfrange, and endbfrange operators to define the
// mapping from character codes to Unicode character sequences expressed in UTF-16BE encoding
func LoadCmapFromData(data []byte, isSimple bool) (*CMap, error) {
common.Log.Trace("LoadCmapFromData: isSimple=%t", isSimple)

View File

@ -15,7 +15,6 @@ import (
// parse parses the CMap file and loads into the CMap structure.
func (cmap *CMap) parse() error {
inCmap := false
var prev cmapObject
for {
o, err := cmap.parseObject()
@ -26,17 +25,11 @@ func (cmap *CMap) parse() error {
common.Log.Debug("ERROR: parsing CMap: %v", err)
return err
}
// fmt.Printf("-- %#v\n", o)
switch t := o.(type) {
case cmapOperand:
op := t
switch op.Operand {
case begincmap:
inCmap = true
case endcmap:
inCmap = false
case begincodespacerange:
err := cmap.parseCodespaceRange()
if err != nil {
@ -95,13 +88,7 @@ func (cmap *CMap) parse() error {
return err
}
}
case cmapInt:
default:
if inCmap {
// Don't log this noise for now
// common.Log.Trace("Unhandled object: %#v", o)
}
}
prev = o
}
@ -119,7 +106,6 @@ func (cmap *CMap) parseName() error {
if err != nil {
return err
}
// fmt.Printf("^^ %d %#v\n", i, o)
switch t := o.(type) {
case cmapOperand:
switch t.Operand {
@ -159,7 +145,6 @@ func (cmap *CMap) parseType() error {
if err != nil {
return err
}
// fmt.Printf("^^ %d %#v\n", i, o)
switch t := o.(type) {
case cmapOperand:
switch t.Operand {
@ -190,7 +175,6 @@ func (cmap *CMap) parseVersion() error {
if err != nil {
return err
}
// fmt.Printf("^^ %d %#v\n", i, o)
switch t := o.(type) {
case cmapOperand:
switch t.Operand {
@ -228,9 +212,9 @@ func (cmap *CMap) parseSystemInfo() error {
done := false
systemInfo := CIDSystemInfo{}
// 50 is a generous but arbitrary limit to prevent an endless loop on badly formed cmap files.
for i := 0; i < 50 && !done; i++ {
o, err := cmap.parseObject()
// fmt.Printf("%2d: %#v\n", i, o)
if err != nil {
return err
}
@ -385,7 +369,6 @@ func (cmap *CMap) parseBfchar() error {
}
return err
}
// fmt.Printf("--- %#v\n", o)
var code CharCode
switch v := o.(type) {
@ -449,7 +432,6 @@ func (cmap *CMap) parseBfrange() error {
}
return err
}
// fmt.Printf("-== %#v\n", o)
switch v := o.(type) {
case cmapOperand:
if v.Operand == endbfrange {

View File

@ -105,13 +105,13 @@ func TestCMapParser1(t *testing.T) {
}
for k, expected := range expectedMappings {
if v := cmap.CharcodeToUnicode(k); v != string(expected) {
if v, ok := cmap.CharcodeToUnicode(k); !ok || v != string(expected) {
t.Errorf("incorrect mapping, expecting 0x%X ➞ 0x%X (%#v)", k, expected, v)
return
}
}
v := cmap.CharcodeToUnicode(0x99)
v, _ := cmap.CharcodeToUnicode(0x99)
if v != MissingCodeString { //!= "notdef" {
t.Errorf("Unmapped code, expected to map to undefined")
return
@ -191,7 +191,7 @@ func TestCMapParser2(t *testing.T) {
}
for k, expected := range expectedMappings {
if v := cmap.CharcodeToUnicode(k); v != string(expected) {
if v, ok := cmap.CharcodeToUnicode(k); !ok || v != string(expected) {
t.Errorf("incorrect mapping, expecting 0x%X ➞ 0x%X (got 0x%X)", k, expected, v)
return
}
@ -302,7 +302,7 @@ func TestCMapParser3(t *testing.T) {
0xd140: 0xa000,
}
for k, expected := range expectedMappings {
if v := cmap.CharcodeToUnicode(k); v != string(expected) {
if v, ok := cmap.CharcodeToUnicode(k); !ok || v != string(expected) {
t.Errorf("incorrect mapping: expecting 0x%02X ➞ 0x%02X (got 0x%02X)", k, expected, v)
return
}
@ -414,7 +414,7 @@ func TestCMapParser4(t *testing.T) {
}
for k, expected := range expectedMappings {
if v := cmap.CharcodeToUnicode(k); v != expected {
if v, ok := cmap.CharcodeToUnicode(k); !ok || v != expected {
t.Errorf("incorrect mapping, expecting 0x%04X ➞ %+q (got %+q)", k, expected, v)
return
}

View File

@ -255,6 +255,10 @@ func (this *PdfColorspaceDeviceGray) ColorFromFloats(vals []float64) (PdfColor,
val := vals[0]
if val < 0.0 || val > 1.0 {
common.Log.Debug("Incompatibility: Range outside [0,1]")
}
// Needed for ~/testdata/acl2017_hllz.pdf
if val < 0.0 {
val = 0.0

View File

@ -97,7 +97,7 @@ func newPdfFontFromPdfObject(fontObj core.PdfObject, allowType0 bool) (*PdfFont,
return nil, err
}
font.context = type0font
case "Type1", "Type3", "MMType1", "TrueType": // !@#$
case "Type1", "Type3", "MMType1", "TrueType":
var simplefont *pdfFontSimple
if std, ok := standard14Fonts[base.basefont]; ok && base.subtype == "Type1" {
font.context = &std
@ -182,7 +182,7 @@ func (font PdfFont) CharcodeBytesToUnicode(data []byte) (string, int, int) {
numMisses := 0
for _, code := range charcodes {
if font.baseFields().toUnicodeCmap != nil {
r, ok := font.baseFields().toUnicodeCmap.CharcodeToUnicode2(cmap.CharCode(code))
r, ok := font.baseFields().toUnicodeCmap.CharcodeToUnicode(cmap.CharCode(code))
if ok {
charstrings = append(charstrings, r)
continue
@ -237,7 +237,6 @@ func (font PdfFont) Encoder() textencoding.TextEncoder {
}
// SetEncoder sets the encoding for the underlying font.
// !@#$ Is this only possible for simple fonts?
func (font PdfFont) SetEncoder(encoder textencoding.TextEncoder) {
t := font.actualFont()
if t == nil {
@ -383,15 +382,9 @@ func newFontBaseFieldsFromPdfObject(fontObj core.PdfObject) (*core.PdfObjectDict
font.objectNumber = obj.ObjectNumber
}
dictObj := core.TraceToDirectObject(fontObj)
d, ok := dictObj.(*core.PdfObjectDictionary)
d, ok := core.GetDict(fontObj)
if !ok {
if ref, ok := dictObj.(*core.PdfObjectReference); ok {
common.Log.Debug("ERROR: Font is reference %s", ref)
} else {
common.Log.Debug("ERROR: Font not given by a dictionary (%T)", fontObj)
}
common.Log.Debug("ERROR: Font not given by a dictionary (%T)", fontObj)
return nil, nil, ErrFontNotSupported
}
@ -405,7 +398,7 @@ func newFontBaseFieldsFromPdfObject(fontObj core.PdfObject) (*core.PdfObjectDict
return nil, nil, core.ErrTypeError
}
subtype, ok := core.GetNameVal(core.TraceToDirectObject(d.Get("Subtype")))
subtype, ok := core.GetNameVal(d.Get("Subtype"))
if !ok {
common.Log.Debug("ERROR: Font Incompatibility. Subtype (Required) missing")
return nil, nil, ErrRequiredAttributeMissing
@ -417,7 +410,7 @@ func newFontBaseFieldsFromPdfObject(fontObj core.PdfObject) (*core.PdfObjectDict
return nil, nil, ErrFontNotSupported
}
basefont, ok := core.GetNameVal(core.TraceToDirectObject(d.Get("BaseFont")))
basefont, ok := core.GetNameVal(d.Get("BaseFont"))
if !ok {
common.Log.Debug("ERROR: Font Incompatibility. BaseFont (Required) missing")
return nil, nil, ErrRequiredAttributeMissing
@ -467,18 +460,17 @@ func toUnicodeToCmap(toUnicode core.PdfObject, font *fontCommon) (*cmap.CMap, er
return cm, err
}
// 9.8.2 Font Descriptor Flags (page 283)
const (
fontFlagFixedPitch = 1 << iota
fontFlagSerif
fontFlagSymbolic
fontFlagScript
// Bit position 5 is not defined
fontFlagNonsymbolic = 1 << (iota + 1)
fontFlagItalic
// Bit position 8 - 16 are not defined
fontFlagAllCap = 1 << (iota + 10)
fontFlagSmallCap
fontFlagForceBold
fontFlagFixedPitch = 0x00001
fontFlagSerif = 0x00002
fontFlagSymbolic = 0x00004
fontFlagScript = 0x00008
fontFlagNonsymbolic = 0x00020
fontFlagItalic = 0x00040
fontFlagAllCap = 0x10000
fontFlagSmallCap = 0x20000
fontFlagForceBold = 0x40000
)
// PdfFontDescriptor specifies metrics and other attributes of a font and can refer to a FontFile

View File

@ -1,3 +1,8 @@
/*
* This file is subject to the terms and conditions defined in
* file 'LICENSE.md', which is part of this source code package.
*/
package model
import (
@ -170,16 +175,18 @@ func newPdfFontType0FromPdfObject(d *core.PdfObjectDictionary, base *fontCommon)
font := pdfFontType0FromSkeleton(base)
font.DescendantFont = df
encoderName, ok := core.GetNameVal(core.TraceToDirectObject(d.Get("Encoding")))
// XXX: FIXME This is not valid if encoder is not Identity-H !@#$
if ok /*&& encoderName == "Identity-H"*/ {
font.encoder = textencoding.NewIdentityTextEncoder(encoderName)
encoderName, ok := core.GetNameVal(d.Get("Encoding"))
if ok {
if encoderName == "Identity-H" || encoderName == "Identity-V" {
font.encoder = textencoding.NewIdentityTextEncoder(encoderName)
} else {
common.Log.Debug("Unhandled cmap %q", encoderName)
}
}
return font, nil
}
// pdfCIDFontType0 represents a CIDFont Type0 font dictionary.
// XXX: This is a stub.
type pdfCIDFontType0 struct {
container *core.PdfIndirectObject
fontCommon
@ -216,20 +223,17 @@ func (font pdfCIDFontType0) SetEncoder(encoder textencoding.TextEncoder) {
// GetGlyphCharMetrics returns the character metrics for the specified glyph. A bool flag is
// returned to indicate whether or not the entry was found in the glyph to charcode mapping.
// NOTE: The current implementation does not look `glyph` up; it always returns zero-valued
// metrics together with true.
func (font pdfCIDFontType0) GetGlyphCharMetrics(glyph string) (fonts.CharMetrics, bool) {
	var metrics fonts.CharMetrics
	return metrics, true
}
// ToPdfObject converts the pdfCIDFontType0 to a PDF representation.
// NOTE: The current implementation ignores the font's fields and always returns the object
// produced by core.MakeNull.
func (font *pdfCIDFontType0) ToPdfObject() core.PdfObject {
	var obj core.PdfObject = core.MakeNull()
	return obj
}
// newPdfCIDFontType0FromPdfObject creates a pdfCIDFontType0 object from a dictionary (either direct
// or via indirect object). If a problem occurs with loading an error is returned.
// XXX: This is a stub.
func newPdfCIDFontType0FromPdfObject(d *core.PdfObjectDictionary, base *fontCommon) (*pdfCIDFontType0, error) {
if base.subtype != "CIDFontType0" {
common.Log.Debug("ERROR: Font SubType != CIDFontType0. font=%s", base)
@ -255,7 +259,7 @@ type pdfCIDFontType2 struct {
fontCommon
// These fields are specific to Type 0 fonts.
encoder textencoding.TextEncoder // !@#$ In base?
encoder textencoding.TextEncoder
ttfParser *fonts.TtfType
CIDSystemInfo core.PdfObject

View File

@ -1,3 +1,8 @@
/*
* This file is subject to the terms and conditions defined in
* file 'LICENSE.md', which is part of this source code package.
*/
package model
import (
@ -110,17 +115,13 @@ func (font pdfFontSimple) GetGlyphCharMetrics(glyph string) (fonts.CharMetrics,
// newSimpleFontFromPdfObject creates a pdfFontSimple from dictionary `d`. Elements of `d` that
// are already parsed are contained in `base`.
// An error is returned if there is a problem with loading.
// !@#$ Just return a base 14 font, if obj is a base 14 font
//
// The value of Encoding is subject to limitations that are described in 9.6.6, "Character Encoding".
// • The value of BaseFont is derived differently.
//
// !@#$ 9.6.6.4 Encodings for TrueType Fonts (page 265)
// Need to get TrueType font's cmap
func newSimpleFontFromPdfObject(d *core.PdfObjectDictionary, base *fontCommon, std14 bool) (*pdfFontSimple, error) {
font := pdfFontSimpleFromSkeleton(base)
// !@#$ Failing on ~/testdata/The-Byzantine-Generals-Problem.pdf
// FirstChar is not defined in ~/testdata/shamirturing.pdf
if !std14 {
obj := d.Get("FirstChar")
@ -129,31 +130,31 @@ func newSimpleFontFromPdfObject(d *core.PdfObjectDictionary, base *fontCommon, s
}
font.FirstChar = obj
intVal, ok := core.TraceToDirectObject(obj).(*core.PdfObjectInteger)
intVal, ok := core.GetIntVal(obj)
if !ok {
common.Log.Debug("ERROR: Invalid FirstChar type (%T)", obj)
return nil, core.ErrTypeError
}
font.firstChar = int(*intVal)
font.firstChar = int(intVal)
obj = d.Get("LastChar")
if obj == nil {
obj = core.PdfObject(core.MakeInteger(255))
obj = core.MakeInteger(255)
}
font.LastChar = obj
intVal, ok = core.TraceToDirectObject(obj).(*core.PdfObjectInteger)
intVal, ok = core.GetIntVal(obj)
if !ok {
common.Log.Debug("ERROR: Invalid LastChar type (%T)", obj)
return nil, core.ErrTypeError
}
font.lastChar = int(*intVal)
font.lastChar = int(intVal)
font.charWidths = []float64{}
obj = d.Get("Widths")
if obj != nil {
font.Widths = obj
arr, ok := core.TraceToDirectObject(obj).(*core.PdfObjectArray)
arr, ok := core.GetArray(obj)
if !ok {
common.Log.Debug("ERROR: Widths attribute != array (%T)", obj)
return nil, core.ErrTypeError
@ -186,7 +187,7 @@ func (font *pdfFontSimple) addEncoding() error {
var err error
if font.Encoding != nil {
// !@#$ Stop setting default encoding in getFontEncoding XXX
// XXX: TODO Stop setting default encoding in getFontEncoding
baseEncoder, differences, err = getFontEncoding(font.Encoding)
if err != nil {
common.Log.Debug("ERROR: BaseFont=%q Subtype=%q Encoding=%s (%T) err=%v", font.basefont,
@ -209,7 +210,6 @@ func (font *pdfFontSimple) addEncoding() error {
if descriptor != nil {
switch font.subtype {
case "Type1":
// XXX: !@#$ Is this the right order? Do the /Differences need to be reapplied?
if descriptor.fontFile != nil && descriptor.fontFile.encoder != nil {
common.Log.Debug("Using fontFile")
font.SetEncoder(descriptor.fontFile.encoder)
@ -247,8 +247,8 @@ func (font *pdfFontSimple) addEncoding() error {
// Except for Type 3 fonts, every font program shall have a built-in encoding. Under certain
// circumstances, a PDF font dictionary may change the encoding used with the font program to match
// the requirements of the conforming writer generating the text being shown.
func getFontEncoding(obj core.PdfObject) (string, map[byte]string, error) {
baseName := "StandardEncoding"
func getFontEncoding(obj core.PdfObject) (baseName string, differences map[byte]string, err error) {
baseName = "StandardEncoding"
if obj == nil {
// Fall back to StandardEncoding
@ -259,9 +259,9 @@ func getFontEncoding(obj core.PdfObject) (string, map[byte]string, error) {
case *core.PdfObjectName:
return string(*encoding), nil, nil
case *core.PdfObjectDictionary:
typ, ok := core.GetNameVal(core.TraceToDirectObject(encoding.Get("Type")))
typ, ok := core.GetNameVal(encoding.Get("Type"))
if ok && typ == "Encoding" {
base, ok := core.GetNameVal(core.TraceToDirectObject(encoding.Get("BaseEncoding")))
base, ok := core.GetNameVal(encoding.Get("BaseEncoding"))
if ok {
baseName = base
}
@ -272,7 +272,7 @@ func getFontEncoding(obj core.PdfObject) (string, map[byte]string, error) {
return "", nil, core.ErrTypeError
}
differences, err := textencoding.FromFontDifferences(diffList)
differences, err = textencoding.FromFontDifferences(diffList)
return baseName, differences, err
default:
common.Log.Debug("ERROR: Encoding not a name or dict (%T) %s", obj, obj.String())

View File

@ -1,3 +1,8 @@
/*
* This file is subject to the terms and conditions defined in
* file 'LICENSE.md', which is part of this source code package.
*/
package model_test
import (
@ -14,7 +19,7 @@ import (
)
// init installs a console logger at debug level for the tests in this package.
func init() {
	debugLogger := common.NewConsoleLogger(common.LogLevelDebug)
	common.SetLogger(debugLogger)
}
var simpleFontDicts = []string{

View File

@ -1,3 +1,17 @@
/*
* This file is subject to the terms and conditions defined in
* file 'LICENSE.md', which is part of this source code package.
*/
/*
* A font file is a stream containing a Type 1 font program. It appears in PDF files as a
* /FontFile entry in a /FontDescriptor dictionary.
*
* 9.9 Embedded Font Programs (page 289)
*
* TODO: Add Type1C support
*/
package model
import (
@ -13,13 +27,16 @@ import (
"github.com/unidoc/unidoc/pdf/model/textencoding"
)
// fontFile represents a font file.
// Currently this is just the identifying information and the text encoder created from the font
// file's encoding section.
type fontFile struct {
// name identifies the font; presumably read from the font program itself — TODO confirm.
name string
// subtype is intended to hold the /Subtype name of the font file stream dictionary.
subtype string
// encoder is the text encoder built from the font file's encoding section. It may be nil.
encoder textencoding.TextEncoder
// binary []byte
}
// String returns a human readable description of `fontfile`.
func (fontfile *fontFile) String() string {
encoding := "[None]"
if fontfile.encoder != nil {
@ -47,7 +64,7 @@ func newFontFileFromPdfObject(obj core.PdfObject) (*fontFile, error) {
return nil, err
}
subtype, ok := core.GetNameVal(core.TraceToDirectObject(d.Get("Subtype")))
subtype, ok := core.GetNameVal(d.Get("Subtype"))
if !ok {
fontfile.subtype = subtype
if subtype == "Type1C" {
@ -57,8 +74,9 @@ func newFontFileFromPdfObject(obj core.PdfObject) (*fontFile, error) {
}
}
length1 := int(*(core.TraceToDirectObject(d.Get("Length1")).(*core.PdfObjectInteger)))
length2 := int(*(core.TraceToDirectObject(d.Get("Length2")).(*core.PdfObjectInteger)))
length1, _ := core.GetIntVal(d.Get("Length1"))
length2, _ := core.GetIntVal(d.Get("Length2"))
if length1 > len(data) {
length1 = len(data)
}
@ -95,19 +113,14 @@ func (fontfile *fontFile) loadFromSegments(segment1, segment2 []byte) error {
if len(segment2) == 0 {
return nil
}
// err = fontfile.parseEexecPart(segment2)
// if err != nil {
// common.Log.Debug("err=%v", err)
// return err
// }
common.Log.Trace("fontfile=%s", fontfile)
return nil
}
// parseAsciiPart parses the ASCII part of the FontFile.
func (fontfile *fontFile) parseAsciiPart(data []byte) error {
common.Log.Trace("parseAsciiPart: %d ", len(data))
// Uncomment these lines to see the contents of the font file. For debugging.
// fmt.Println("~~~~~~~~~~~~~~~~~~~~~~~^^^~~~~~~~~~~~~~~~~~~~~~~~")
// fmt.Printf("data=%s\n", string(data))
// fmt.Println("~~~~~~~~~~~~~~~~~~~~~~~!!!~~~~~~~~~~~~~~~~~~~~~~~")
@ -133,15 +146,6 @@ func (fontfile *fontFile) parseAsciiPart(data []byte) error {
return ErrRequiredAttributeMissing
}
// encodingName, ok := keyValues["Encoding"]
// !@#$ I am not sure why we don't do this
// if ok {
// encoder, err := textencoding.NewSimpleTextEncoder(encodingName, nil)
// if err != nil {
// return err
// }
// fontfile.encoder = encoder
// }
if encodingSection != "" {
encodings, err := getEncodings(encodingSection)
if err != nil {
@ -149,7 +153,7 @@ func (fontfile *fontFile) parseAsciiPart(data []byte) error {
}
encoder, err := textencoding.NewCustomSimpleTextEncoder(encodings, nil)
if err != nil {
// XXX: !@#$ We need to fix all these errors
// XXX: Logging an error because we need to fix all these misses.
common.Log.Error("UNKNOWN GLYPH: err=%v", err)
return nil
}
@ -158,23 +162,6 @@ func (fontfile *fontFile) parseAsciiPart(data []byte) error {
return nil
}
// // parseEexecPart parses the binary encrypted part of the FontFile.
// func (fontfile *fontFile) parseEexecPart(data []byte) error {
// // Sometimes, fonts use hex format
// if !isBinary(data) {
// decoded, err := hex.DecodeString(string(data))
// if err != nil {
// return err
// }
// data = decoded
// }
// decoded := decodeEexec(data)
// fmt.Println(":::::::::::::::::::::<<>>:::::::::::::::::::::")
// fmt.Printf("%s\n", string(decoded))
// fmt.Println(":::::::::::::::::::::<><>:::::::::::::::::::::")
// return nil
// }
var (
reDictBegin = regexp.MustCompile(`\d+ dict\s+(dup\s+)?begin`)
reKeyVal = regexp.MustCompile(`^\s*/(\S+?)\s+(.+?)\s+def\s*$`)
@ -216,12 +203,11 @@ func getAsciiSections(data []byte) (keySection, encodingSection string, err erro
return
}
// reEndline matches a run of one or more line-ending characters (\n or \r), so splitting on it
// handles \n, \r and \r\n line endings and never yields empty strings between consecutive
// line-ending characters.
// ~/testdata/private/invoice61781040.pdf has \r line endings
var reEndline = regexp.MustCompile(`[\n\r]+`)
// getKeyValues returns the map encoded in `data`.
func getKeyValues(data string) map[string]string {
// lines := strings.Split(data, "\n")
lines := reEndline.Split(data, -1)
keyValues := map[string]string{}
for _, line := range lines {
@ -250,10 +236,6 @@ func getEncodings(data string) (map[uint16]string, error) {
common.Log.Debug("ERROR: Bad encoding line. %q", line)
return nil, core.ErrTypeError
}
// if !textencoding.KnownGlyph(glyph) {
// common.Log.Debug("ERROR: Unknown glyph %q. line=%q", glyph, line)
// return nil, ErrTypeCheck
// }
keyValues[uint16(code)] = glyph
}
common.Log.Trace("getEncodings: keyValues=%#v", keyValues)

View File

@ -115,6 +115,7 @@ func NewFontFile2FromPdfObject(obj core.PdfObject) (rec TtfType, err error) {
return
}
// Uncomment these lines to see the contents of the font file. For debugging.
// fmt.Println("===============&&&&===============")
// fmt.Printf("%#q", string(data))
// fmt.Println("===============####===============")
@ -148,7 +149,6 @@ func (t *ttfParser) Parse() (TtfRec TtfType, err error) {
err = errors.New("fonts based on PostScript outlines are not supported")
return
}
// XXX: !@#$ Not sure what to do here. Have seen version="true"
if version != "\x00\x01\x00\x00" {
common.Log.Debug("ERROR: Unrecognized TrueType file format. version=%q", version)
}
@ -416,7 +416,7 @@ func (t *ttfParser) ParseCmap() (err error) {
return
}
common.Log.Debug("ParseCmap")
/* version := */ t.ReadUShort()
t.ReadUShort() // version is ignored.
numTables := int(t.ReadUShort())
offset10 := int64(0)
offset31 := int64(0)
@ -428,7 +428,6 @@ func (t *ttfParser) ParseCmap() (err error) {
// (3,1) subtable. Windows Unicode.
offset31 = offset
}
//fmt.Printf("(%d,%d) subtable @ %d\n", platformID, encodingID, offset)
}
// Latin font support based on (3,1) table encoding.
@ -440,9 +439,7 @@ func (t *ttfParser) ParseCmap() (err error) {
}
// Many non-Latin fonts (including asian fonts) use subtable (1,0).
if offset10 != 0 {
// fmt.Printf("Offset10: %d\n", offset10)
err = t.parseCmapVersion(offset10)
if err != nil {
return
@ -578,18 +575,16 @@ func (t *ttfParser) ParsePost() (err error) {
if err = t.Seek("post"); err != nil {
return
}
//versionUpper := t.ReadShort()
//versionFraction := t.ReadUShort()
formatType := t.Read32Fixed()
t.rec.ItalicAngle = t.Read32Fixed()
t.rec.UnderlinePosition = t.ReadShort()
t.rec.UnderlineThickness = t.ReadShort()
t.rec.IsFixedPitch = t.ReadULong() != 0
/*minMemType42 := */ t.ReadULong()
/*maxMemType42 := */ t.ReadULong()
/*mimMemType1 := */ t.ReadULong()
/*maxMemType1 := */ t.ReadULong()
t.ReadULong() // minMemType42 ignored.
t.ReadULong() // maxMemType42 ignored.
t.ReadULong() // minMemType1 ignored.
t.ReadULong() // maxMemType1 ignored.
common.Log.Trace("ParsePost: formatType=%f", formatType)
@ -628,13 +623,11 @@ func (t *ttfParser) ParsePost() (err error) {
} else if index >= len(macGlyphNames) && index <= 32767 {
t.rec.GlyphNames[i] = nameArray[index-len(macGlyphNames)]
} else {
// PDFBOX-808: Index numbers between 32768 and 65535 are
// reserved for future use, so we should just ignore them
t.rec.GlyphNames[i] = ".undefined"
}
}
case 2.5:
glyphNameIndex := make([]int, t.numGlyphs) // !@#$ Check that this is parsed first
glyphNameIndex := make([]int, t.numGlyphs)
for i := 0; i < len(glyphNameIndex); i++ {
offset := int(t.ReadSByte())
glyphNameIndex[i] = i + 1 + offset
@ -645,7 +638,7 @@ func (t *ttfParser) ParsePost() (err error) {
t.rec.GlyphNames[i] = name
}
case 3.0:
// no postscript information is provided.
// no PostScript information is provided.
common.Log.Debug("No PostScript name information is provided for the font.")
default:
common.Log.Debug("ERROR: Unknown formatType=%f", formatType)
@ -710,10 +703,13 @@ func (t *ttfParser) Seek(tag string) error {
return nil
}
// Skip moves the file pointer by `n` bytes relative to its current position.
// Whatever Seek returns (new position, error) is discarded.
func (t *ttfParser) Skip(n int) {
	offset := int64(n)
	t.f.Seek(offset, os.SEEK_CUR)
}
// ReadStr reads `length` bytes from the file and returns them as a string, or an error if there was
// a problem.
func (t *ttfParser) ReadStr(length int) (str string, err error) {
var n int
buf := make([]byte, length)
@ -729,31 +725,38 @@ func (t *ttfParser) ReadStr(length int) (str string, err error) {
return
}
// ReadByte reads a byte and returns it as unsigned.
func (t *ttfParser) ReadByte() (val uint8) {
	// The signature has no error result, so a failed or short read is not reported to the caller.
	_ = binary.Read(t.f, binary.BigEndian, &val)
	return val
}
// ReadSByte reads a byte and returns it as signed.
func (t *ttfParser) ReadSByte() (val int8) {
	// The signature has no error result, so a failed or short read is not reported to the caller.
	_ = binary.Read(t.f, binary.BigEndian, &val)
	return val
}
// ReadUShort reads 2 bytes and returns them as a big endian unsigned 16 bit integer.
func (t *ttfParser) ReadUShort() (val uint16) {
	// The signature has no error result, so a failed or short read is not reported to the caller.
	_ = binary.Read(t.f, binary.BigEndian, &val)
	return val
}
// ReadShort reads 2 bytes and returns them as a big endian signed 16 bit integer.
func (t *ttfParser) ReadShort() (val int16) {
	// The signature has no error result, so a failed or short read is not reported to the caller.
	_ = binary.Read(t.f, binary.BigEndian, &val)
	return val
}
// ReadULong reads 4 bytes and returns them as a big endian unsigned 32 bit integer.
func (t *ttfParser) ReadULong() (val uint32) {
	// The signature has no error result, so a failed or short read is not reported to the caller.
	_ = binary.Read(t.f, binary.BigEndian, &val)
	return val
}
// Read32Fixed reads 4 bytes and returns them as a float, the first 2 bytes for the whole number and
// the second 2 bytes for the fraction.
func (t *ttfParser) Read32Fixed() float64 {
whole := float64(t.ReadUShort())
frac := float64(t.ReadUShort()) / 65536.0

View File

@ -1,3 +1,8 @@
/*
* This file is subject to the terms and conditions defined in
* file 'LICENSE.md', which is part of this source code package.
*/
package model
import (

View File

@ -1,3 +1,8 @@
/*
* This file is subject to the terms and conditions defined in
* file 'LICENSE.md', which is part of this source code package.
*/
package textencoding
import "github.com/unidoc/unidoc/pdf/core"

View File

@ -1,3 +1,8 @@
/*
* This file is subject to the terms and conditions defined in
* file 'LICENSE.md', which is part of this source code package.
*/
package textencoding
import "testing"

View File

@ -7,7 +7,7 @@ package textencoding
import (
"github.com/unidoc/unidoc/common"
. "github.com/unidoc/unidoc/pdf/core"
"github.com/unidoc/unidoc/pdf/core"
)
type TextEncoder interface {
@ -44,7 +44,7 @@ type TextEncoder interface {
GlyphToRune(glyph string) (rune, bool)
// ToPdfObject returns a PDF Object that represents the encoding.
ToPdfObject() PdfObject
ToPdfObject() core.PdfObject
}
// Convenience functions

View File

@ -63,8 +63,6 @@ func charcodeToGlyphListPath(filename string) error {
line = strings.Trim(line, " \r\n")
//fmt.Printf("%s\n", line)
parts := strings.Split(line, " ")
for _, part := range parts {
index++
@ -99,8 +97,6 @@ func glyphToCharcodeListPath(filename string) error {
line = strings.Trim(line, " \r\n")
//fmt.Printf("%s\n", line)
parts := strings.Split(line, " ")
for _, part := range parts {
index++

View File

@ -237,7 +237,6 @@ func loadGlyphlist(filename string) ([]string, error) {
if part == "notdef" {
continue
}
//fmt.Printf("%d: \"%s\",\n", index, part)
glyphs = append(glyphs, part)
}
}

View File

@ -17,12 +17,6 @@ import (
// MissingCodeRune is the rune returned when there is no matching glyph. It was previously '?'.
const MissingCodeRune = '\ufffd' // U+FFFD REPLACEMENT CHARACTER
// KnownGlyph returns true if `glyph` is in our GlyphToRune mapping.
func KnownGlyph(glyph string) bool {
	_, known := GlyphToRune(glyph)
	return known
}
// GlyphToRune returns the rune corresponding to glyph `glyph` if there is one.
// XXX: TODO: Can we return a string here? e.g. When we are extracting text, we want to get "ffi"
// rather than 'ﬃ'. We only need a glyph ➞ rune map when we need to convert back to

View File

@ -198,7 +198,7 @@ func (se *SimpleEncoder) makeEncoder() {
codeToGlyph := map[uint16]string{}
glyphToCode := map[string]uint16{}
for code, r := range codeToRune {
glyph := glyphlistRuneToGlyphMap[r] // !@#$ Build out this map
glyph := glyphlistRuneToGlyphMap[r]
codeToGlyph[code] = glyph
glyphToCode[glyph] = code
if glyph == "" {
@ -207,9 +207,11 @@ func (se *SimpleEncoder) makeEncoder() {
}
se.codeToGlyph = codeToGlyph
se.glyphToCode = glyphToCode
se.codeToRune = codeToRune // XXX: !@#$ Make this a string
se.codeToRune = codeToRune
}
// FromFontDifferences converts `diffList`, a /Differences array from an /Encoding object to a map
// representing character code to glyph mappings.
func FromFontDifferences(diffList []core.PdfObject) (map[byte]string, error) {
differences := map[byte]string{}
var n byte
@ -229,6 +231,8 @@ func FromFontDifferences(diffList []core.PdfObject) (map[byte]string, error) {
return differences, nil
}
// ToFontDifferences converts `differences`, a map representing character code to glyph mappings,
// to a /Differences array for an /Encoding object.
func ToFontDifferences(differences map[byte]string) []core.PdfObject {
if len(differences) == 0 {
return []core.PdfObject{}
@ -255,6 +259,7 @@ func ToFontDifferences(differences map[byte]string) []core.PdfObject {
return diffList
}
// simpleEncodings is a map of the standard 8 bit character encodings.
var simpleEncodings = map[string]map[uint16]rune{
"MacExpertEncoding": map[uint16]rune{
0x20: '\u0020', // "space"