compromise
Version:
natural language processing in the browser
56 lines (54 loc) • 2.47 kB
JavaScript
;
//a hugely-ignorant, and widely subjective transliteration of latin, cryllic, greek unicode characters to english ascii.
//approximate visual (not semantic or phonetic) relationship between unicode and ascii characters
//http://en.wikipedia.org/wiki/List_of_Unicode_characters
//https://docs.google.com/spreadsheet/ccc?key=0Ah46z755j7cVdFRDM1A2YVpwa1ZYWlpJM2pQZ003M0E
let compact = {
'!': '¡',
'?': '¿Ɂ',
'a': 'ªÀÁÂÃÄÅàáâãäåĀāĂ㥹ǍǎǞǟǠǡǺǻȀȁȂȃȦȧȺΆΑΔΛάαλАДадѦѧӐӑӒӓƛɅæ',
'b': 'ßþƀƁƂƃƄƅɃΒβϐϦБВЪЬбвъьѢѣҌҍҔҕƥƾ',
'c': '¢©ÇçĆćĈĉĊċČčƆƇƈȻȼͻͼͽϲϹϽϾϿЄСсєҀҁҪҫ',
'd': 'ÐĎďĐđƉƊȡƋƌǷ',
'e': 'ÈÉÊËèéêëĒēĔĕĖėĘęĚěƎƏƐǝȄȅȆȇȨȩɆɇΈΕΞΣέεξϱϵ϶ЀЁЕЭеѐёҼҽҾҿӖӗӘәӚӛӬӭ',
'f': 'ƑƒϜϝӺӻҒғӶӷſ',
'g': 'ĜĝĞğĠġĢģƓǤǥǦǧǴǵ',
'h': 'ĤĥĦħƕǶȞȟΉΗЂЊЋНнђћҢңҤҥҺһӉӊ',
'I': 'ÌÍÎÏ',
'i': 'ìíîïĨĩĪīĬĭĮįİıƖƗȈȉȊȋΊΐΪίιϊІЇії',
'j': 'ĴĵǰȷɈɉϳЈј',
'k': 'ĶķĸƘƙǨǩΚκЌЖКжкќҚқҜҝҞҟҠҡ',
'l': 'ĹĺĻļĽľĿŀŁłƚƪǀǏǐȴȽΙӀӏ',
'm': 'ΜϺϻМмӍӎ',
'n': 'ÑñŃńŅņŇňʼnŊŋƝƞǸǹȠȵΝΠήηϞЍИЙЛПийлпѝҊҋӅӆӢӣӤӥπ',
'o': 'ÒÓÔÕÖØðòóôõöøŌōŎŏŐőƟƠơǑǒǪǫǬǭǾǿȌȍȎȏȪȫȬȭȮȯȰȱΌΘΟθοσόϕϘϙϬϭϴОФоѲѳӦӧӨөӪӫ¤ƍΏ',
'p': 'ƤƿΡρϷϸϼРрҎҏÞ',
'q': 'Ɋɋ',
'r': 'ŔŕŖŗŘřƦȐȑȒȓɌɍЃГЯгяѓҐґ',
's': 'ŚśŜŝŞşŠšƧƨȘșȿςϚϛϟϨϩЅѕ',
't': 'ŢţŤťŦŧƫƬƭƮȚțȶȾΓΤτϮϯТт҂Ҭҭ',
'u': 'µÙÚÛÜùúûüŨũŪūŬŭŮůŰűŲųƯưƱƲǓǔǕǖǗǘǙǚǛǜȔȕȖȗɄΰμυϋύϑЏЦЧцџҴҵҶҷӋӌӇӈ',
'v': 'νѴѵѶѷ',
'w': 'ŴŵƜωώϖϢϣШЩшщѡѿ',
'x': '×ΧχϗϰХхҲҳӼӽӾӿ',
'y': 'ÝýÿŶŷŸƳƴȲȳɎɏΎΥΫγψϒϓϔЎУучўѰѱҮүҰұӮӯӰӱӲӳ',
'z': 'ŹźŻżŽžƩƵƶȤȥɀΖζ'
};
//decompress data into two hashes
let unicode = {};
Object.keys(compact).forEach(function (k) {
compact[k].split('').forEach(function (s) {
unicode[s] = k;
});
});
const fixUnicode = (str) => {
let chars = str.split('');
chars.forEach((s, i) => {
if (unicode[s]) {
chars[i] = unicode[s];
}
});
return chars.join('');
};
module.exports = fixUnicode;
// console.log(fixUnicode('bjŏȒk'));