/* * This file is subject to the terms and conditions defined in * file 'LICENSE.md', which is part of this source code package. */ package cmap import ( "bufio" "bytes" "errors" "fmt" "io" "strconv" "encoding/hex" "github.com/unidoc/unidoc/common" "github.com/unidoc/unidoc/pdf/core" ) // cMapParser parses CMap character to unicode mapping files. type cMapParser struct { reader *bufio.Reader } // cMapParser creates a new instance of the PDF CMap parser from input data. func newCMapParser(content []byte) *cMapParser { parser := cMapParser{} buffer := bytes.NewBuffer(content) parser.reader = bufio.NewReader(buffer) return &parser } // Detect the signature at the current file position and parse // the corresponding object. func (p *cMapParser) parseObject() (cmapObject, error) { p.skipSpaces() for { bb, err := p.reader.Peek(2) if err != nil { return nil, err } if bb[0] == '%' { p.parseComment() p.skipSpaces() continue } else if bb[0] == '/' { name, err := p.parseName() return name, err } else if bb[0] == '(' { str, err := p.parseString() return str, err } else if bb[0] == '[' { arr, err := p.parseArray() return arr, err } else if (bb[0] == '<') && (bb[1] == '<') { dict, err := p.parseDict() return dict, err } else if bb[0] == '<' { shex, err := p.parseHexString() return shex, err } else if core.IsDecimalDigit(bb[0]) || (bb[0] == '-' && core.IsDecimalDigit(bb[1])) { number, err := p.parseNumber() if err != nil { return nil, err } return number, nil } else { // Operand? operand, err := p.parseOperand() if err != nil { return nil, err } return operand, nil } } } // Skip over any spaces. Returns the number of spaces skipped and // an error if any. func (p *cMapParser) skipSpaces() (int, error) { cnt := 0 for { bb, err := p.reader.Peek(1) if err != nil { return 0, err } if core.IsWhiteSpace(bb[0]) { p.reader.ReadByte() cnt++ } else { break } } return cnt, nil } // parseComment reads a comment line starting with '%'. func (p *cMapParser) parseComment() (string, error) { var r bytes.Buffer _, err := p.skipSpaces() if err != nil { return r.String(), err } isFirst := true for { bb, err := p.reader.Peek(1) if err != nil { common.Log.Debug("Error %s", err.Error()) return r.String(), err } if isFirst && bb[0] != '%' { return r.String(), errors.New("Comment should start with %") } isFirst = false if (bb[0] != '\r') && (bb[0] != '\n') { b, _ := p.reader.ReadByte() r.WriteByte(b) } else { break } } return r.String(), nil } // Parse a name starting with '/'. func (p *cMapParser) parseName() (cmapName, error) { name := "" nameStarted := false for { bb, err := p.reader.Peek(1) if err == io.EOF { break // Can happen when loading from object stream. } if err != nil { return cmapName{name}, err } if !nameStarted { // Should always start with '/', otherwise not valid. if bb[0] == '/' { nameStarted = true p.reader.ReadByte() } else { common.Log.Debug("ERROR Name starting with %s (% x)", bb, bb) return cmapName{name}, fmt.Errorf("Invalid name: (%c)", bb[0]) } } else { if core.IsWhiteSpace(bb[0]) { break } else if (bb[0] == '/') || (bb[0] == '[') || (bb[0] == '(') || (bb[0] == ']') || (bb[0] == '<') || (bb[0] == '>') { break // Looks like start of next statement. } else if bb[0] == '#' { hexcode, err := p.reader.Peek(3) if err != nil { return cmapName{name}, err } p.reader.Discard(3) code, err := hex.DecodeString(string(hexcode[1:3])) if err != nil { return cmapName{name}, err } name += string(code) } else { b, _ := p.reader.ReadByte() name += string(b) } } } return cmapName{name}, nil } // A string starts with '(' and ends with ')'. func (p *cMapParser) parseString() (cmapString, error) { p.reader.ReadByte() buf := bytes.Buffer{} count := 1 for { bb, err := p.reader.Peek(1) if err != nil { return cmapString{buf.String()}, err } if bb[0] == '\\' { // Escape sequence. p.reader.ReadByte() // Skip the escape \ byte. b, err := p.reader.ReadByte() if err != nil { return cmapString{buf.String()}, err } // Octal '\ddd' number (base 8). if core.IsOctalDigit(b) { bb, err := p.reader.Peek(2) if err != nil { return cmapString{buf.String()}, err } numeric := []byte{} numeric = append(numeric, b) for _, val := range bb { if core.IsOctalDigit(val) { numeric = append(numeric, val) } else { break } } p.reader.Discard(len(numeric) - 1) common.Log.Trace("Numeric string \"%s\"", numeric) code, err := strconv.ParseUint(string(numeric), 8, 32) if err != nil { return cmapString{buf.String()}, err } buf.WriteByte(byte(code)) continue } switch b { case 'n': buf.WriteByte('\n') case 'r': buf.WriteByte('\r') case 't': buf.WriteByte('\t') case 'b': buf.WriteByte('\b') case 'f': buf.WriteByte('\f') case '(': buf.WriteByte('(') case ')': buf.WriteByte(')') case '\\': buf.WriteByte('\\') } continue } else if bb[0] == '(' { count++ } else if bb[0] == ')' { count-- if count == 0 { p.reader.ReadByte() break } } b, _ := p.reader.ReadByte() buf.WriteByte(b) } return cmapString{buf.String()}, nil } // Starts with '<' ends with '>'. // Currently not converting the hex codes to characters. func (p *cMapParser) parseHexString() (cmapHexString, error) { p.reader.ReadByte() hextable := []byte("0123456789abcdefABCDEF") buf := bytes.Buffer{} //tmp := []byte{} for { p.skipSpaces() bb, err := p.reader.Peek(1) if err != nil { return cmapHexString{[]byte("")}, err } if bb[0] == '>' { p.reader.ReadByte() break } b, _ := p.reader.ReadByte() if bytes.IndexByte(hextable, b) >= 0 { buf.WriteByte(b) } } if buf.Len()%2 == 1 { buf.WriteByte('0') } hexb, _ := hex.DecodeString(buf.String()) return cmapHexString{hexb}, nil } // Starts with '[' ends with ']'. Can contain any kinds of direct objects. func (p *cMapParser) parseArray() (cmapArray, error) { arr := cmapArray{} arr.Array = []cmapObject{} p.reader.ReadByte() for { p.skipSpaces() bb, err := p.reader.Peek(1) if err != nil { return arr, err } if bb[0] == ']' { p.reader.ReadByte() break } obj, err := p.parseObject() if err != nil { return arr, err } arr.Array = append(arr.Array, obj) } return arr, nil } // Reads and parses a PDF dictionary object enclosed with '<<' and '>>' func (p *cMapParser) parseDict() (cmapDict, error) { common.Log.Trace("Reading PDF Dict!") dict := makeDict() // Pass the '<<' c, _ := p.reader.ReadByte() if c != '<' { return dict, errors.New("Invalid dict") } c, _ = p.reader.ReadByte() if c != '<' { return dict, errors.New("Invalid dict") } for { p.skipSpaces() bb, err := p.reader.Peek(2) if err != nil { return dict, err } if (bb[0] == '>') && (bb[1] == '>') { p.reader.ReadByte() p.reader.ReadByte() break } key, err := p.parseName() common.Log.Trace("Key: %s", key.Name) if err != nil { common.Log.Debug("ERROR Returning name err %s", err) return dict, err } p.skipSpaces() val, err := p.parseObject() if err != nil { return dict, err } dict.Dict[key.Name] = val // Skip "def" which optionally follows key value dict definitions in CMaps. p.skipSpaces() bb, err = p.reader.Peek(3) if err != nil { return dict, err } if string(bb) == "def" { p.reader.Discard(3) } } return dict, nil } func (p *cMapParser) parseNumber() (cmapObject, error) { isFloat := false allowSigns := true numStr := bytes.Buffer{} for { bb, err := p.reader.Peek(1) if err == io.EOF { break } if err != nil { return nil, err } if allowSigns && (bb[0] == '-' || bb[0] == '+') { // Only appear in the beginning, otherwise serves as a delimiter. b, _ := p.reader.ReadByte() numStr.WriteByte(b) allowSigns = false // Only allowed in beginning, and after e (exponential). } else if core.IsDecimalDigit(bb[0]) { b, _ := p.reader.ReadByte() numStr.WriteByte(b) } else if bb[0] == '.' { b, _ := p.reader.ReadByte() numStr.WriteByte(b) isFloat = true } else if bb[0] == 'e' { // Exponential number format. b, _ := p.reader.ReadByte() numStr.WriteByte(b) isFloat = true allowSigns = true } else { break } } if isFloat { fVal, err := strconv.ParseFloat(numStr.String(), 64) o := cmapFloat{fVal} return o, err } intVal, err := strconv.ParseInt(numStr.String(), 10, 64) o := cmapInt{intVal} return o, err } // An operand is a text command represented by a word. func (p *cMapParser) parseOperand() (cmapOperand, error) { op := cmapOperand{} buf := bytes.Buffer{} for { bb, err := p.reader.Peek(1) if err != nil { if err == io.EOF { break } return op, err } if core.IsDelimiter(bb[0]) { break } if core.IsWhiteSpace(bb[0]) { break } b, _ := p.reader.ReadByte() buf.WriteByte(b) } if buf.Len() == 0 { return op, fmt.Errorf("Invalid operand (empty)") } op.Operand = buf.String() return op, nil }