UNPKG

myanglish-translator

Version:

[Home Page](https://myanglish.bytete.com) This library was created to translate Myanglish to Burmese Unicode and Burmese to Myanglish, and vice versa. We have trained it on over 2,400 Myanglish phrases and words sourced from the internet and multiple social media platforms…

509 lines (417 loc) 16.3 kB
import MyanglishToMyanmarDict from './dict.js';

/** Punctuation that `removeSpecialCharacters` replaces with a single space. */
const specialCharacterReg = /[&\/\\#,+()$~%.'":*?<>{}]/g;

/** Longest phrase (in tokens) that the converters try to match at once. */
const MAX_PHRASE_TOKENS = 5;

/**
 * Creates a dictionary object without a prototype, so that looking up
 * user-supplied keys such as "constructor" or "__proto__" can never hit an
 * inherited property.
 * @returns {Object<string, *>} an empty null-prototype object
 */
function createLookup() {
  return Object.create(null);
}

/** Set to true once `optimizeData` has built the lookup tables. */
let dataOptimized = false;

/** Number of entries read from the dictionary by `optimizeData`. */
let TotalDataCount = 0;

/**
 * Myanglish -> Burmese lookups, bucketed by the number of space-separated
 * words in the Myanglish key. Bucket '1' additionally holds every key with
 * its spaces removed.
 */
const DataSet = createLookup();
DataSet['1'] = createLookup();

/**
 * Burmese -> Myanglish lookups, bucketed by the number of tokens that
 * `myanmarWordSpliter` produces for the Burmese text.
 */
const MyanmarToMyanglishDataSet = createLookup();
MyanmarToMyanglishDataSet['1'] = createLookup();

/** Characters that start a new token in `myanmarWordSpliter`. */
const MyanmarAlphabet = [
  'က','ခ','ဂ','ဃ','င',
  'စ','ဆ','ဇ','ဈ','ည',
  'ဋ','ဌ','ဍ','ဎ','ဏ',
  'တ','ထ','ဒ','ဓ','န',
  'ပ','ဖ','ဗ','ဘ','မ',
  'ယ','ရ','လ','ဝ','သ','ဟ',
  'ဠ','အ','ဢ','ဣ','ဤ','ဥ',
  'ဩ', 'ဿ', '၌', '၍', '၎', '၏','ၮ','႟','ဧ',
  '၁','၂','၃','၄','၅','၆','၇','၈','၉','၀'
];

/**
 * Marks which, when found as the second character of a 2-3 character token,
 * cause that token to be glued onto the previous one.
 */
const MyanmarVowelSound = [
  '်', 'း', '္', "့",
];

/**
 * Detects a Burmese token that ends with one of the two tone marks the
 * converter knows how to express in Myanglish.
 *
 * The check uses `endsWith` because the returned `originalValue` is always
 * the token minus its LAST character; testing with `includes` would strip an
 * unrelated character whenever the mark sits earlier in the token.
 *
 * @param {string} str - a single Burmese token
 * @returns {{type: string, originalValue: string, symbol: (string|null)}|false}
 *   `type` is 'double' for a trailing 'း' and '.' for a trailing '့';
 *   `false` when the token ends with neither.
 */
function isBurmeseVowel(str) {
  if (str.endsWith('း')) {
    return { type: 'double', originalValue: str.slice(0, -1), symbol: null };
  }
  if (str.endsWith('့')) {
    return { type: '.', originalValue: str.slice(0, -1), symbol: 'k' };
  }
  return false;
}

/**
 * Builds `DataSet` and `MyanmarToMyanglishDataSet` from the imported
 * dictionary and counts its entries. Called lazily by the public API.
 */
function optimizeData() {
  const byLongestKeyFirst = ([aKey], [bKey]) => bKey.length - aKey.length;

  Object.entries(MyanglishToMyanmarDict)
    .sort(byLongestKeyFirst)
    .forEach(([key, value]) => {
      const words = key.split(' ');
      const wordCount = words.length;
      if (!DataSet[wordCount]) {
        DataSet[wordCount] = createLookup();
      }
      DataSet[wordCount][key] = value;
      // Every key is also reachable with its spaces removed. Entries are
      // processed longest-first, so on a collision the shorter key wins.
      DataSet['1'][words.join('')] = value;
      TotalDataCount++;
    });

  // Invert the dictionary; when several Myanglish keys map to the same
  // Burmese text the last one in dictionary order is kept.
  const burmeseToMyanglish = createLookup();
  Object.entries(MyanglishToMyanmarDict).forEach(([key, value]) => {
    burmeseToMyanglish[value] = key;
  });

  Object.entries(burmeseToMyanglish)
    .sort(byLongestKeyFirst)
    .forEach(([burmese, myanglish]) => {
      const tokenCount = myanmarWordSpliter(burmese).length;
      if (!MyanmarToMyanglishDataSet[tokenCount]) {
        MyanmarToMyanglishDataSet[tokenCount] = createLookup();
      }
      MyanmarToMyanglishDataSet[tokenCount][burmese] = myanglish;
    });

  dataOptimized = true;
}

/**
 * Works out which Burmese tone mark, if any, a Myanglish token implies and
 * which dictionary key should be looked up for it.
 *
 * @param {string} token - a single lower-cased Myanglish token
 * @returns {{sound: string, originalValue: string}} `sound` is the mark to
 *   append after the translation ('' for none); `originalValue` is the key
 *   to look up in `DataSet['1']`.
 */
function detectVowel(token) {
  // True when the token is longer than two characters and ends with the
  // same character twice (e.g. "nn", "yy"), except for "kk".
  const endsWithDoubledLetter = () => {
    const [first, second] = token.slice(-2).split('');
    return first === second && token.length > 2 && first !== 'k';
  };

  // A token that is itself a dictionary key needs no tone handling.
  if (DataSet['1'][token]) {
    return { sound: '', originalValue: token };
  }

  if (endsWithDoubledLetter()) {
    return { sound: 'း', originalValue: token.slice(0, -1) };
  }

  const lastChar = token.slice(-1).toLowerCase();
  if (lastChar === 'k' || lastChar === 't') {
    return { sound: '့', originalValue: token.slice(0, -1) };
  }

  return { sound: '', originalValue: token };
}

/**
 * Scores how similar two texts are based on their per-character counts
 * (spaces are ignored when counting, but not in the length normalisation).
 *
 * @param {string} text1
 * @param {string} text2
 * @returns {{rate: number, input: {text1: string, text2: string}}}
 *   `rate` is 1 for identical strings.
 */
function characterSimilarity(text1, text2) {
  if (text1 === text2) {
    return { rate: 1, input: { text1, text2 } };
  }

  // Counts every non-space character of `text`.
  const countChars = (text) => {
    const counts = createLookup();
    text.replace(/ /g, '').split('').forEach((c) => {
      counts[c] = (counts[c] || 0) + 1;
    });
    return counts;
  };

  // Sums, over the characters present in both, own count / other count.
  const directionalScore = (own, other) => {
    let score = 0;
    Object.entries(own).forEach(([char, count]) => {
      const otherCount = other[char] || 0;
      if (otherCount) {
        score = score + (count / otherCount);
      }
    });
    return score;
  };

  const text1Chars = countChars(text1);
  const text2Chars = countChars(text2);

  const mean = (directionalScore(text1Chars, text2Chars) + directionalScore(text2Chars, text1Chars)) / 2;
  const lengthMean = (text1.length + text2.length) / 2;

  return { rate: mean / lengthMean, input: { text1, text2 } };
}

/**
 * Scores how similar two texts are based on the space-separated phrases they
 * share. For every shared phrase the product of its occurrence counts in
 * both texts is added up, and the sum is divided by the mean number of
 * distinct phrases in the two texts.
 *
 * @param {string} text1
 * @param {string} text2
 * @returns {{rate: number, input: {text1: string, text2: string}}}
 *   `rate` is 1 for identical strings and 0 when neither text contains a
 *   phrase.
 */
function phraseSimilarity(text1, text2) {
  if (text1 === text2) {
    return { rate: 1, input: { text1, text2 } };
  }

  // Map of phrase -> number of occurrences; a Map keeps arbitrary user text
  // (e.g. "constructor") from colliding with inherited object properties.
  const countPhrases = (text) => {
    const counts = new Map();
    text.split(' ').forEach((phrase) => {
      if (phrase !== '') {
        counts.set(phrase, (counts.get(phrase) || 0) + 1);
      }
    });
    return counts;
  };

  const counts1 = countPhrases(text1);
  const counts2 = countPhrases(text2);
  const distinctMean = (counts1.size + counts2.size) / 2;

  // Two different strings made only of spaces have nothing to compare;
  // report 0 rather than dividing zero by zero.
  if (distinctMean === 0) {
    return { rate: 0, input: { text1, text2 } };
  }

  let shared = 0;
  counts1.forEach((count1, phrase) => {
    shared = shared + count1 * (counts2.get(phrase) || 0);
  });

  return { rate: shared / distinctMean, input: { text1, text2 } };
}

/**
 * Trims `value` and replaces every character matched by
 * `specialCharacterReg` with a space.
 *
 * @param {string} value
 * @returns {string}
 */
function removeSpecialCharacters(value) {
  return value.trim().replace(specialCharacterReg, ' ');
}

/**
 * Splits Burmese text into syllable-like tokens.
 *
 * Example from the original author: the text
 * "ကိစ္စကိုစန္ဒာ" is split into ['ကိစ္စ', 'ကို', 'စန္ဒာ'].
 *
 * @param {string} text - Burmese text; punctuation and newlines are dropped
 * @returns {string[]} tokens, with whitespace-only tokens removed
 */
function myanmarWordSpliter(text) {
  const chars = removeSpecialCharacters(text).replace(/\n/g, '').split('');
  const word = [];

  // Pass 1: start a new token at every character listed in MyanmarAlphabet
  // and attach all following marks to it. Spaces become their own ' ' token
  // so later passes never merge across a word boundary.
  let tmp = '';
  for (let i = 0; i < chars.length; i++) {
    const char = chars[i];

    if (char === ' ' && i > 0) {
      word.push(tmp);
      word.push(' ');
      tmp = '';
      continue;
    }

    if (MyanmarAlphabet.includes(char) && i > 0) {
      if (tmp.length > 0) {
        word.push(tmp);
      }
      tmp = char;
    } else {
      tmp = tmp + char;
    }

    // Flush whatever is pending at the end of the input.
    if (i === chars.length - 1) {
      word.push(tmp);
      tmp = '';
    }
  }

  // Pass 2: glue a short token whose second character is one of
  // MyanmarVowelSound (e.g. a consonant carrying an asat) onto the token
  // before it: ['ကော', 'က်'] => ['ကောက်'].
  // Starting at index 1 guarantees there is a previous token to merge into.
  for (let i = 1; i < word.length; i++) {
    const wd = word[i];
    if (wd.length < 2 || wd.length > 3) continue;
    if (!MyanmarVowelSound.includes(wd[1]) || word[i - 1] === ' ') continue;

    word[i - 1] = word[i - 1] + wd;
    word.splice(i, 1);
    // The next token has shifted into slot i; step back so it is examined.
    i--;
  }

  // Pass 3: glue a token containing the stacking mark '္' onto the token
  // after it: ['စန္', 'ဒာ'] => ['စန္ဒာ'].
  // The merged token is deliberately not re-examined (it still contains the
  // mark), and a trailing token with nothing after it is left untouched.
  for (let i = 0; i < word.length; i++) {
    const wd = word[i];
    if (wd.includes('္') && i + 1 < word.length) {
      word[i + 1] = wd + word[i + 1];
      word.splice(i, 1);
    }
  }

  return word.filter((c) => c.trim().length > 0);
}

/** How the converters render a token that is not in the dictionary. */
export const ConvertMode = Object.freeze({
  ADD_BRACKET_UNKNOWN_KEYWORDS: 'add-bracket-unknown-keywords',
  LEAVE_UNKNOWN_KEYWORDS: 'leave-unknown-keywords',
});

export default {
  /**
   * Converts Myanglish text to Burmese.
   *
   * The input is lower-cased and newlines are removed. Tokens are separated
   * by single spaces; the longest run of up to five tokens that matches a
   * dictionary entry is translated first. Translations are concatenated
   * without separators.
   *
   * @param {string} string - Myanglish text
   * @param {{mode: string}} [option] - see `ConvertMode`
   * @returns {string} the converted text; an empty token yields a space
   */
  convertToBurmese: (string, option = {mode: ConvertMode.LEAVE_UNKNOWN_KEYWORDS}) => {
    if (!dataOptimized) optimizeData();

    let str = '';

    // Translates one token (with tone-mark detection) and appends it.
    const matchSingleToken = (token) => {
      const vol = detectVowel(token);
      const known = DataSet['1'][vol.originalValue];
      if (known) {
        str = str + known + vol.sound;
      } else if (token.length === 0) {
        str = str + ' ';
      } else {
        str = str + (option.mode === ConvertMode.ADD_BRACKET_UNKNOWN_KEYWORDS ? `{${token}}` : token);
      }
      return str;
    };

    const input = string.toLowerCase().replace(/\n/g, '');
    if (!input.includes(' ')) {
      return matchSingleToken(input);
    }

    const tokens = input.trim().split(' ');
    for (let i = 0; i < tokens.length; i++) {
      const maxGroup = Math.min(tokens.length - i, MAX_PHRASE_TOKENS);
      let consumed = 0;

      // Try the longest multi-token phrase first, exact form and then with
      // its spaces removed.
      for (let j = maxGroup; j > 1 && consumed === 0; j--) {
        const bucket = DataSet[j];
        if (!bucket) continue;
        const phrase = tokens.slice(i, i + j);
        const hit = bucket[phrase.join(' ')] || DataSet['1'][phrase.join('')];
        if (hit) {
          str = str + hit;
          consumed = j;
        }
      }

      if (consumed === 0) {
        matchSingleToken(tokens[i]);
      } else {
        i = i + consumed - 1;
      }
    }
    return str;
  },

  /**
   * Converts Burmese text to Myanglish.
   *
   * The text is tokenised with `myanmarWordSpliter`; the longest run of up
   * to five tokens that matches a dictionary entry is translated first.
   * Output pieces are separated by single spaces.
   *
   * @param {string} mmString - Burmese text
   * @param {{mode: string}} [option] - see `ConvertMode`
   * @returns {string} the converted text
   */
  convertToMyanglish: (mmString, option = {mode: ConvertMode.LEAVE_UNKNOWN_KEYWORDS}) => {
    if (!dataOptimized) optimizeData();

    const tokens = myanmarWordSpliter(mmString);
    let str = '';

    const appendStr = (append) => {
      str = str + (str.length > 0 ? ' ' : '') + append;
    };

    const renderUnknown = (token) => (
      option.mode === ConvertMode.ADD_BRACKET_UNKNOWN_KEYWORDS ? `{${token}}` : token
    );

    // Translates one token, retrying without a trailing tone mark.
    const matchSingleToken = (token) => {
      const special = isBurmeseVowel(token);
      const mat = MyanmarToMyanglishDataSet['1'][special ? special.originalValue : token];

      if (!mat) {
        appendStr(renderUnknown(token));
      } else if (!special) {
        appendStr(mat);
      } else if (special.type === 'double') {
        // A trailing 'း' is written by doubling the last Latin letter.
        appendStr(mat + mat.slice(-1));
      } else {
        // A trailing '့' is written as a final "k".
        appendStr(mat + 'k');
      }
      return str;
    };

    for (let i = 0; i < tokens.length; i++) {
      const maxGroup = Math.min(tokens.length - i, MAX_PHRASE_TOKENS);
      for (let j = maxGroup; j > 0; j--) {
        const phrase = tokens.slice(i, i + j).join('');
        // The dictionary may contain no entry with exactly j tokens.
        const bucket = MyanmarToMyanglishDataSet[j];
        const matched = bucket ? bucket[phrase] : undefined;
        if (matched) {
          appendStr(matched);
          i = i + j - 1;
          break;
        }
        if (j === 1) {
          matchSingleToken(phrase);
        }
      }
    }

    return str;
  },

  /**
   * @returns {number} the number of dictionary entries that were loaded
   */
  getDatasetCount: () => {
    if (!dataOptimized) optimizeData();
    return TotalDataCount;
  },

  /**
   * @param {string} text1
   * @param {string} text2
   * @returns {number} the mean of the phrase-based and character-based
   *   similarity rates
   */
  textSimilarity: (text1, text2) => {
    const charSimilar = characterSimilarity(text1, text2);
    const phraseSimilar = phraseSimilarity(text1, text2);
    return (phraseSimilar.rate + charSimilar.rate) / 2;
  },

  myanmarWordSpliter
}