/* * This file is subject to the terms and conditions defined in * file 'LICENSE.md', which is part of this source code package. */ package textencoding import ( "github.com/unidoc/unidoc/common" "github.com/unidoc/unidoc/pdf/core" ) func splitWords(raw string, encoder TextEncoder) []string { runes := []rune(raw) words := []string{} startsAt := 0 for idx, code := range runes { glyph, found := encoder.RuneToGlyphName(code) if !found { common.Log.Debug("Glyph not found for code: %s\n", string(code)) continue } if glyph == "space" { word := runes[startsAt:idx] words = append(words, string(word)) startsAt = idx + 1 } } word := runes[startsAt:] if len(word) > 0 { words = append(words, string(word)) } return words } type WinAnsiEncoder struct { } func NewWinAnsiTextEncoder() WinAnsiEncoder { encoder := WinAnsiEncoder{} return encoder } func (winenc WinAnsiEncoder) ToPdfObject() core.PdfObject { return core.MakeName("WinAnsiEncoding") } // Convert utf8 runes to WinAnsiEncoded encoded string (series of char codes). func (winenc WinAnsiEncoder) Encode(raw string) string { encoded := []byte{} for _, rune := range raw { if code, has := utf8ToWinAnsiEncodingMap[rune]; has { encoded = append(encoded, code) } } return string(encoded) } func (winenc WinAnsiEncoder) RuneToGlyphName(val rune) (string, bool) { code, found := winenc.RuneToCharcode(val) if !found { return "", false } glyph, found := winenc.CharcodeToGlyphName(code) if !found { return "", false } return glyph, true } func (winenc WinAnsiEncoder) CharcodeToGlyphName(code byte) (string, bool) { glyph, has := winAnsiEncodingGlyphMap[code] if !has { return "", false } return glyph, true } func (winenc WinAnsiEncoder) GlyphNameToCharcode(glyph string) (byte, bool) { for code, name := range winAnsiEncodingGlyphMap { if name == glyph { return code, true } } // Not found. return 0, false } // Convert UTF-8 rune to character code. If applicable. func (winenc WinAnsiEncoder) RuneToCharcode(val rune) (byte, bool) { code, has := utf8ToWinAnsiEncodingMap[val] if !has { return 0, false } return code, true } func (winenc WinAnsiEncoder) CharcodeToRune(charcode byte) (rune, bool) { val, has := winAnsiEncodingToUtf8Map[charcode] if !has { return 0, false } return val, true } // WinAnsiEncoding. // Convert a UTF8 string to WinAnsiEncoding byte string. func utf8ToWinAnsiEncoding(strUtf8 string) string { encoded := []byte{} for _, rune := range strUtf8 { if code, has := utf8ToWinAnsiEncodingMap[rune]; has { encoded = append(encoded, code) } } return string(encoded) } // Maps to enable conversion of WinAnsiEncoding character codes to glyphs, utf8 and vice versa. var winAnsiEncodingGlyphMap = map[byte]string{ 32: "space", 33: "exclam", 34: "quotedbl", 35: "numbersign", 36: "dollar", 37: "percent", 38: "ampersand", 39: "quotesingle", 40: "parenleft", 41: "parenright", 42: "asterisk", 43: "plus", 44: "comma", 45: "hyphen", 46: "period", 47: "slash", 48: "zero", 49: "one", 50: "two", 51: "three", 52: "four", 53: "five", 54: "six", 55: "seven", 56: "eight", 57: "nine", 58: "colon", 59: "semicolon", 60: "less", 61: "equal", 62: "greater", 63: "question", 64: "at", 65: "A", 66: "B", 67: "C", 68: "D", 69: "E", 70: "F", 71: "G", 72: "H", 73: "I", 74: "J", 75: "K", 76: "L", 77: "M", 78: "N", 79: "O", 80: "P", 81: "Q", 82: "R", 83: "S", 84: "T", 85: "U", 86: "V", 87: "W", 88: "X", 89: "Y", 90: "Z", 91: "bracketleft", 92: "backslash", 93: "bracketright", 94: "asciicircum", 95: "underscore", 96: "grave", 97: "a", 98: "b", 99: "c", 100: "d", 101: "e", 102: "f", 103: "g", 104: "h", 105: "i", 106: "j", 107: "k", 108: "l", 109: "m", 110: "n", 111: "o", 112: "p", 113: "q", 114: "r", 115: "s", 116: "t", 117: "u", 118: "v", 119: "w", 120: "x", 121: "y", 122: "z", 123: "braceleft", 124: "bar", 125: "braceright", 126: "asciitilde", 127: "bullet", 128: "Euro", 129: "bullet", 130: "quotesinglbase", 131: "florin", 132: "quotedblbase", 133: "ellipsis", 134: "dagger", 135: "daggerdbl", 136: "circumflex", 137: "perthousand", 138: "Scaron", 139: "guilsinglleft", 140: "OE", 141: "bullet", 142: "Zcaron", 143: "bullet", 144: "bullet", 145: "quoteleft", 146: "quoteright", 147: "quotedblleft", 148: "quotedblright", 149: "bullet", 150: "endash", 151: "emdash", 152: "tilde", 153: "trademark", 154: "scaron", 155: "guilsinglright", 156: "oe", 157: "bullet", 158: "zcaron", 159: "Ydieresis", 160: "space", 161: "exclamdown", 162: "cent", 163: "sterling", 164: "currency", 165: "yen", 166: "brokenbar", 167: "section", 168: "dieresis", 169: "copyright", 170: "ordfeminine", 171: "guillemotleft", 172: "logicalnot", 173: "hyphen", 174: "registered", 175: "macron", 176: "degree", 177: "plusminus", 178: "twosuperior", 179: "threesuperior", 180: "acute", 181: "mu", 182: "paragraph", 183: "periodcentered", 184: "cedilla", 185: "onesuperior", 186: "ordmasculine", 187: "guillemotright", 188: "onequarter", 189: "onehalf", 190: "threequarters", 191: "questiondown", 192: "Agrave", 193: "Aacute", 194: "Acircumflex", 195: "Atilde", 196: "Adieresis", 197: "Aring", 198: "AE", 199: "Ccedilla", 200: "Egrave", 201: "Eacute", 202: "Ecircumflex", 203: "Edieresis", 204: "Igrave", 205: "Iacute", 206: "Icircumflex", 207: "Idieresis", 208: "Eth", 209: "Ntilde", 210: "Ograve", 211: "Oacute", 212: "Ocircumflex", 213: "Otilde", 214: "Odieresis", 215: "multiply", 216: "Oslash", 217: "Ugrave", 218: "Uacute", 219: "Ucircumflex", 220: "Udieresis", 221: "Yacute", 222: "Thorn", 223: "germandbls", 224: "agrave", 225: "aacute", 226: "acircumflex", 227: "atilde", 228: "adieresis", 229: "aring", 230: "ae", 231: "ccedilla", 232: "egrave", 233: "eacute", 234: "ecircumflex", 235: "edieresis", 236: "igrave", 237: "iacute", 238: "icircumflex", 239: "idieresis", 240: "eth", 241: "ntilde", 242: "ograve", 243: "oacute", 244: "ocircumflex", 245: "otilde", 246: "odieresis", 247: "divide", 248: "oslash", 249: "ugrave", 250: "uacute", 251: "ucircumflex", 252: "udieresis", 253: "yacute", 254: "thorn", 255: "ydieresis", } var winAnsiEncodingToUtf8Map = map[byte]rune{ 32: '\u0020', 33: '\u0021', 34: '\u0022', 35: '\u0023', 36: '\u0024', 37: '\u0025', 38: '\u0026', 39: '\u0027', 40: '\u0028', 41: '\u0029', 42: '\u002a', 43: '\u002b', 44: '\u002c', 45: '\u002d', 46: '\u002e', 47: '\u002f', 48: '\u0030', 49: '\u0031', 50: '\u0032', 51: '\u0033', 52: '\u0034', 53: '\u0035', 54: '\u0036', 55: '\u0037', 56: '\u0038', 57: '\u0039', 58: '\u003a', 59: '\u003b', 60: '\u003c', 61: '\u003d', 62: '\u003e', 63: '\u003f', 64: '\u0040', 65: '\u0041', 66: '\u0042', 67: '\u0043', 68: '\u0044', 69: '\u0045', 70: '\u0046', 71: '\u0047', 72: '\u0048', 73: '\u0049', 74: '\u004a', 75: '\u004b', 76: '\u004c', 77: '\u004d', 78: '\u004e', 79: '\u004f', 80: '\u0050', 81: '\u0051', 82: '\u0052', 83: '\u0053', 84: '\u0054', 85: '\u0055', 86: '\u0056', 87: '\u0057', 88: '\u0058', 89: '\u0059', 90: '\u005a', 91: '\u005b', 92: '\u005c', 93: '\u005d', 94: '\u005e', 95: '\u005f', 96: '\u0060', 97: '\u0061', 98: '\u0062', 99: '\u0063', 100: '\u0064', 101: '\u0065', 102: '\u0066', 103: '\u0067', 104: '\u0068', 105: '\u0069', 106: '\u006a', 107: '\u006b', 108: '\u006c', 109: '\u006d', 110: '\u006e', 111: '\u006f', 112: '\u0070', 113: '\u0071', 114: '\u0072', 115: '\u0073', 116: '\u0074', 117: '\u0075', 118: '\u0076', 119: '\u0077', 120: '\u0078', 121: '\u0079', 122: '\u007a', 123: '\u007b', 124: '\u007c', 125: '\u007d', 126: '\u007e', 127: '\u2022', 128: '\u20ac', 129: '\u2022', 130: '\u201a', 131: '\u0192', 132: '\u201e', 133: '\u2026', 134: '\u2020', 135: '\u2021', 136: '\u02c6', 137: '\u2030', 138: '\u0160', 139: '\u2039', 140: '\u0152', 141: '\u2022', 142: '\u017d', 143: '\u2022', 144: '\u2022', 145: '\u2018', 146: '\u2019', 147: '\u201c', 148: '\u201d', 149: '\u2022', 150: '\u2013', 151: '\u2014', 152: '\u02dc', 153: '\u2122', 154: '\u0161', 155: '\u203a', 156: '\u0153', 157: '\u2022', 158: '\u017e', 159: '\u0178', 160: '\u0020', 161: '\u00a1', 162: '\u00a2', 163: '\u00a3', 164: '\u00a4', 165: '\u00a5', 166: '\u00a6', 167: '\u00a7', 168: '\u00a8', 169: '\u00a9', 170: '\u00aa', 171: '\u00ab', 172: '\u00ac', 173: '\u002d', 174: '\u00ae', 175: '\u00af', 176: '\u00b0', 177: '\u00b1', 178: '\u00b2', 179: '\u00b3', 180: '\u00b4', 181: '\u00b5', 182: '\u00b6', 183: '\u00b7', 184: '\u00b8', 185: '\u00b9', 186: '\u00ba', 187: '\u00bb', 188: '\u00bc', 189: '\u00bd', 190: '\u00be', 191: '\u00bf', 192: '\u00c0', 193: '\u00c1', 194: '\u00c2', 195: '\u00c3', 196: '\u00c4', 197: '\u00c5', 198: '\u00c6', 199: '\u00c7', 200: '\u00c8', 201: '\u00c9', 202: '\u00ca', 203: '\u00cb', 204: '\u00cc', 205: '\u00cd', 206: '\u00ce', 207: '\u00cf', 208: '\u00d0', 209: '\u00d1', 210: '\u00d2', 211: '\u00d3', 212: '\u00d4', 213: '\u00d5', 214: '\u00d6', 215: '\u00d7', 216: '\u00d8', 217: '\u00d9', 218: '\u00da', 219: '\u00db', 220: '\u00dc', 221: '\u00dd', 222: '\u00de', 223: '\u00df', 224: '\u00e0', 225: '\u00e1', 226: '\u00e2', 227: '\u00e3', 228: '\u00e4', 229: '\u00e5', 230: '\u00e6', 231: '\u00e7', 232: '\u00e8', 233: '\u00e9', 234: '\u00ea', 235: '\u00eb', 236: '\u00ec', 237: '\u00ed', 238: '\u00ee', 239: '\u00ef', 240: '\u00f0', 241: '\u00f1', 242: '\u00f2', 243: '\u00f3', 244: '\u00f4', 245: '\u00f5', 246: '\u00f6', 247: '\u00f7', 248: '\u00f8', 249: '\u00f9', 250: '\u00fa', 251: '\u00fb', 252: '\u00fc', 253: '\u00fd', 254: '\u00fe', 255: '\u00ff', } var utf8ToWinAnsiEncodingMap = map[rune]byte{ '\u0020': 32, '\u0021': 33, '\u0022': 34, '\u0023': 35, '\u0024': 36, '\u0025': 37, '\u0026': 38, '\u0027': 39, '\u0028': 40, '\u0029': 41, '\u002a': 42, '\u002b': 43, '\u002c': 44, '\u002d': 45, '\u002e': 46, '\u002f': 47, '\u0030': 48, '\u0031': 49, '\u0032': 50, '\u0033': 51, '\u0034': 52, '\u0035': 53, '\u0036': 54, '\u0037': 55, '\u0038': 56, '\u0039': 57, '\u003a': 58, '\u003b': 59, '\u003c': 60, '\u003d': 61, '\u003e': 62, '\u003f': 63, '\u0040': 64, '\u0041': 65, '\u0042': 66, '\u0043': 67, '\u0044': 68, '\u0045': 69, '\u0046': 70, '\u0047': 71, '\u0048': 72, '\u0049': 73, '\u004a': 74, '\u004b': 75, '\u004c': 76, '\u004d': 77, '\u004e': 78, '\u004f': 79, '\u0050': 80, '\u0051': 81, '\u0052': 82, '\u0053': 83, '\u0054': 84, '\u0055': 85, '\u0056': 86, '\u0057': 87, '\u0058': 88, '\u0059': 89, '\u005a': 90, '\u005b': 91, '\u005c': 92, '\u005d': 93, '\u005e': 94, '\u005f': 95, '\u0060': 96, '\u0061': 97, '\u0062': 98, '\u0063': 99, '\u0064': 100, '\u0065': 101, '\u0066': 102, '\u0067': 103, '\u0068': 104, '\u0069': 105, '\u006a': 106, '\u006b': 107, '\u006c': 108, '\u006d': 109, '\u006e': 110, '\u006f': 111, '\u0070': 112, '\u0071': 113, '\u0072': 114, '\u0073': 115, '\u0074': 116, '\u0075': 117, '\u0076': 118, '\u0077': 119, '\u0078': 120, '\u0079': 121, '\u007a': 122, '\u007b': 123, '\u007c': 124, '\u007d': 125, '\u007e': 126, '\u2022': 127, '\u20ac': 128, // '\u2022': 129, // duplicate '\u201a': 130, '\u0192': 131, '\u201e': 132, '\u2026': 133, '\u2020': 134, '\u2021': 135, '\u02c6': 136, '\u2030': 137, '\u0160': 138, '\u2039': 139, '\u0152': 140, //'\u2022': 141, // duplicate '\u017d': 142, //'\u2022': 143, // duplicate // '\u2022': 144, // duplicate '\u2018': 145, '\u2019': 146, '\u201c': 147, '\u201d': 148, //'\u2022': 149, // duplicate '\u2013': 150, '\u2014': 151, '\u02dc': 152, '\u2122': 153, '\u0161': 154, '\u203a': 155, '\u0153': 156, //'\u2022': 157, // duplicate '\u017e': 158, '\u0178': 159, //'\u0020': 160, // duplicate '\u00a1': 161, '\u00a2': 162, '\u00a3': 163, '\u00a4': 164, '\u00a5': 165, '\u00a6': 166, '\u00a7': 167, '\u00a8': 168, '\u00a9': 169, '\u00aa': 170, '\u00ab': 171, '\u00ac': 172, //'\u002d': 173, // duplicate '\u00ae': 174, '\u00af': 175, '\u00b0': 176, '\u00b1': 177, '\u00b2': 178, '\u00b3': 179, '\u00b4': 180, '\u00b5': 181, '\u00b6': 182, '\u00b7': 183, '\u00b8': 184, '\u00b9': 185, '\u00ba': 186, '\u00bb': 187, '\u00bc': 188, '\u00bd': 189, '\u00be': 190, '\u00bf': 191, '\u00c0': 192, '\u00c1': 193, '\u00c2': 194, '\u00c3': 195, '\u00c4': 196, '\u00c5': 197, '\u00c6': 198, '\u00c7': 199, '\u00c8': 200, '\u00c9': 201, '\u00ca': 202, '\u00cb': 203, '\u00cc': 204, '\u00cd': 205, '\u00ce': 206, '\u00cf': 207, '\u00d0': 208, '\u00d1': 209, '\u00d2': 210, '\u00d3': 211, '\u00d4': 212, '\u00d5': 213, '\u00d6': 214, '\u00d7': 215, '\u00d8': 216, '\u00d9': 217, '\u00da': 218, '\u00db': 219, '\u00dc': 220, '\u00dd': 221, '\u00de': 222, '\u00df': 223, '\u00e0': 224, '\u00e1': 225, '\u00e2': 226, '\u00e3': 227, '\u00e4': 228, '\u00e5': 229, '\u00e6': 230, '\u00e7': 231, '\u00e8': 232, '\u00e9': 233, '\u00ea': 234, '\u00eb': 235, '\u00ec': 236, '\u00ed': 237, '\u00ee': 238, '\u00ef': 239, '\u00f0': 240, '\u00f1': 241, '\u00f2': 242, '\u00f3': 243, '\u00f4': 244, '\u00f5': 245, '\u00f6': 246, '\u00f7': 247, '\u00f8': 248, '\u00f9': 249, '\u00fa': 250, '\u00fb': 251, '\u00fc': 252, '\u00fd': 253, '\u00fe': 254, '\u00ff': 255, }