UNPKG

wanakana

Version:

JS library that transliterates between japanese kana and roman letters.

840 lines (785 loc) 18.7 kB
wanakana = wanakana || {} # version is inserted from package.json by compiler wanakana.version = "%version%" # Support AMD if typeof define is "function" and define.amd define "wanakana", [], ->wanakana wanakana.LOWERCASE_START = 0x61 wanakana.LOWERCASE_END = 0x7A wanakana.UPPERCASE_START = 0x41 wanakana.UPPERCASE_END = 0x5A wanakana.HIRAGANA_START = 0x3041 wanakana.HIRAGANA_END = 0x3096 wanakana.KATAKANA_START = 0x30A1 wanakana.KATAKANA_END = 0x30FA wanakana.LOWERCASE_FULLWIDTH_START = 0xFF41 wanakana.LOWERCASE_FULLWIDTH_END = 0xFF5A wanakana.UPPERCASE_FULLWIDTH_START = 0xFF21 wanakana.UPPERCASE_FULLWIDTH_END = 0xFF3A wanakana.defaultOptions = # Transliterates wi and we to ゐ and ゑ useObseleteKana: no # Special mode for handling input from a text input that is transliterated on the fly. IMEMode: off ###* * Automatically sets up an input field to be an IME. ### wanakana.bind = (input) -> input.addEventListener('input', wanakana._onInput) wanakana.unbind = (input) -> input.removeEventListener('input', wanakana._onInput) wanakana._onInput = (event) -> input = event.target startingCursor = input.selectionStart startingLength = input.value.length normalizedInputString = wanakana._convertFullwidthCharsToASCII (input.value) newText = (wanakana.toKana(normalizedInputString, {IMEMode: true})) unless normalizedInputString is newText input.value = newText if (typeof input.selectionStart == "number") input.selectionStart = input.selectionEnd = input.value.length else if (typeof input.createTextRange != "undefined") input.focus() range = input.createTextRange() range.collapse(false) range.select() wanakana._extend = (target, source) -> if not target? return source for prop of source if not target[prop]? and source[prop]? target[prop] = source[prop] return target ###* * Takes a character and a unicode range. Returns true if the char is in the range. ### wanakana._isCharInRange = (char, start, end) -> code = char.charCodeAt 0 return start <= code <= end wanakana._isCharVowel = (char, includeY = yes) -> regexp = if includeY then /[aeiouy]/ else /[aeiou]/ return char.toLowerCase().charAt(0).search(regexp) isnt -1 wanakana._isCharConsonant = (char, includeY = yes) -> regexp = if includeY then /[bcdfghjklmnpqrstvwxyz]/ else /[bcdfghjklmnpqrstvwxz]/ return char.toLowerCase().charAt(0).search(regexp) isnt -1 wanakana._isCharKatakana = (char) -> wanakana._isCharInRange(char, wanakana.KATAKANA_START, wanakana.KATAKANA_END) wanakana._isCharHiragana = (char) -> wanakana._isCharInRange(char, wanakana.HIRAGANA_START, wanakana.HIRAGANA_END) wanakana._isCharKana = (char) -> wanakana._isCharHiragana(char) or wanakana._isCharKatakana(char) wanakana._isCharNotKana = (char) -> not wanakana._isCharHiragana(char) and not wanakana._isCharKatakana(char) wanakana._convertFullwidthCharsToASCII = (string) -> chars = string.split "" for char,i in chars code = char.charCodeAt(0) if wanakana._isCharInRange(char, wanakana.LOWERCASE_FULLWIDTH_START, wanakana.LOWERCASE_FULLWIDTH_END) chars[i] = String.fromCharCode(code - wanakana.LOWERCASE_FULLWIDTH_START + wanakana.LOWERCASE_START) if wanakana._isCharInRange(char, wanakana.UPPERCASE_FULLWIDTH_START, wanakana.UPPERCASE_FULLWIDTH_END) chars[i] String.fromCharCode(code - wanakana.UPPERCASE_FULLWIDTH_START + wanakana.UPPERCASE_START) chars.join "" wanakana._katakanaToHiragana = (kata) -> hira = [] for kataChar in kata.split "" if wanakana._isCharKatakana(kataChar) code = kataChar.charCodeAt 0 # Shift charcode. code += wanakana.HIRAGANA_START - wanakana.KATAKANA_START hiraChar = String.fromCharCode code hira.push hiraChar else # pass non katakana chars through hira.push kataChar hira.join "" wanakana._hiraganaToKatakana = (hira) -> kata = [] for hiraChar in hira.split "" if wanakana._isCharHiragana(hiraChar) code = hiraChar.charCodeAt 0 # Shift charcode. code += wanakana.KATAKANA_START - wanakana.HIRAGANA_START kataChar = String.fromCharCode code kata.push kataChar else # pass non hiragana chars through kata.push hiraChar kata.join "" wanakana._hiraganaToRomaji = (hira, options) -> # merge options with default options options = wanakana._extend(options, wanakana.defaultOptions) len = hira.length roma = [] cursor = 0 chunkSize = 0 maxChunk = 2 getChunk = () -> hira.substr(cursor, chunkSize) # Don't pick a chunk that is bigger than the remaining characters. resetChunkSize = () -> chunkSize = Math.min(maxChunk, len-cursor) while cursor < len resetChunkSize() while chunkSize > 0 chunk = getChunk() if wanakana.isKatakana(chunk) chunk = wanakana._katakanaToHiragana(chunk) # special case for small tsus if chunk.charAt(0) is "っ" and chunkSize is 1 and cursor < (len-1) nextCharIsDoubleConsonant = true romaChar = "" break romaChar = wanakana.J_to_R[chunk] if romaChar? and nextCharIsDoubleConsonant romaChar = romaChar.charAt(0).concat(romaChar) nextCharIsDoubleConsonant = false # DEBUG # console.log (cursor + "x" + chunkSize + ":" + chunk + " => " + romaChar ) break if romaChar? chunkSize-- unless romaChar? # console.log("Couldn't find " + chunk + ". Passing through.") # Passthrough undefined values romaChar = chunk # Handle special cases. roma.push romaChar cursor += chunkSize or 1 roma.join("") wanakana._romajiToHiragana = (roma, options) -> wanakana._romajiToKana(roma, options, true) wanakana._romajiToKana = (roma, options, ignoreCase = false) -> # console.log (new Date().getTime()) # merge options with default options options = wanakana._extend(options, wanakana.defaultOptions) len = roma.length # Final output array kana = [] # Position in the string that is being evaluated cursor = 0 # Maximum size of the chunk of characters to evaluate at one time maxChunk = 3 # Pulls a chunk of characters based on the cursor position and chunkSize getChunk = () -> roma.substr(cursor, chunkSize) # Checks if the character is uppercase isCharUpperCase = (char) -> wanakana._isCharInRange(char, wanakana.UPPERCASE_START, wanakana.UPPERCASE_END) # Steps through the string pulling out chunks of characters. Each chunk will be evaluated # against the romaji to kana table. If there is no match, the last character in the chunk # is dropped and the chunk is reevaluated. If nothing matches, the character is assumed # to be invalid or puncuation or other and gets passed through. while cursor < len # Don't pick a chunk that is bigger than the remaining characters. chunkSize = Math.min(maxChunk, len-cursor) while chunkSize > 0 chunk = getChunk() chunkLC = chunk.toLowerCase() # Handle super-rare edge cases with 4 char chunks (like ltsu, chya, shya) if chunkLC in wanakana.FOUR_CHARACTER_EDGE_CASES and (len-cursor) >= 4 chunkSize++ chunk = getChunk() chunkLC = chunk.toLowerCase() else # Handle edge case of n followed by consonant if chunkLC.charAt(0) is "n" if options.IMEMode and chunkLC.charAt(1) is "'" and chunkSize is 2 #convert n' to "ん" kanaChar = "ん" break # Handle edge case of n followed by n and vowel if wanakana._isCharConsonant(chunkLC.charAt(1), no) and wanakana._isCharVowel(chunkLC.charAt(2)) chunkSize = 1 chunk = getChunk() chunkLC = chunk.toLowerCase() # Handle case of double consonants if chunkLC.charAt(0) isnt "n" and wanakana._isCharConsonant(chunkLC.charAt(0)) and chunk.charAt(0) == chunk.charAt(1) chunkSize = 1 # Return katakana ッ if chunk is uppercase, otherwise return hiragana っ if wanakana._isCharInRange(chunk.charAt(0), wanakana.UPPERCASE_START, wanakana.UPPERCASE_END) chunkLC = chunk = "ッ" else chunkLC = chunk = "っ" kanaChar = wanakana.R_to_J[chunkLC] # DEBUG # console.log (cursor + "x" + chunkSize + ":" + chunk + " => " + kanaChar ) break if kanaChar? # Step down the chunk size. # If chunkSize was 4, step down twice. if chunkSize == 4 chunkSize -= 2 else chunkSize-- unless kanaChar? chunk = wanakana._convertPunctuation(chunk) # console.log("Couldn't find " + chunk + ". Passing through.") # Passthrough undefined values kanaChar = chunk # Handle special cases. if options?.useObseleteKana if chunkLC is "wi" then kanaChar = "ゐ" if chunkLC is "we" then kanaChar = "ゑ" if options.IMEMode and chunkLC.charAt(0) is "n" if roma.charAt(cursor + 1).toLowerCase() is "y" and wanakana._isCharVowel(roma.charAt(cursor + 2)) is false or cursor is (len - 1) or wanakana.isKana(roma.charAt(cursor + 1)) # Don't transliterate this yet. kanaChar = chunk.charAt(0) # Use katakana if first letter in chunk is uppercase unless ignoreCase if isCharUpperCase(chunk.charAt(0)) kanaChar = wanakana._hiraganaToKatakana(kanaChar) kana.push kanaChar cursor += chunkSize or 1 kana.join("") wanakana._convertPunctuation = (input, options) -> if input is ' ' then return ' ' if input is '-' then return 'ー' input ###* * Returns true if input is entirely hiragana. ### wanakana.isHiragana = (input) -> chars = input.split("") chars.every(wanakana._isCharHiragana) wanakana.isKatakana = (input) -> chars = input.split("") chars.every(wanakana._isCharKatakana) wanakana.isKana = (input) -> chars = input.split("") chars.every((char) -> (wanakana.isHiragana char) or (wanakana.isKatakana char)) wanakana.isRomaji = (input) -> chars = input.split("") chars.every((char) -> (not wanakana.isHiragana char) and (not wanakana.isKatakana char)) wanakana.toHiragana = (input, options) -> if wanakana.isRomaji(input) return input = wanakana._romajiToHiragana(input, options) if wanakana.isKatakana(input) return input = wanakana._katakanaToHiragana(input, options) # otherwise input wanakana.toKatakana = (input, options) -> if wanakana.isHiragana(input) return input = wanakana._hiraganaToKatakana(input, options) if wanakana.isRomaji(input) input = wanakana._romajiToHiragana(input, options) return input = wanakana._hiraganaToKatakana(input, options) #otherwise input wanakana.toKana = (input, options) -> return input = wanakana._romajiToKana(input, options) wanakana.toRomaji = (input, options) -> return input = wanakana._hiraganaToRomaji(input) wanakana.R_to_J = a: 'あ' i: 'い' u: 'う' e: 'え' o: 'お' yi: 'い' wu: 'う' whu: 'う' xa: 'ぁ' xi: 'ぃ' xu: 'ぅ' xe: 'ぇ' xo: 'ぉ' xyi: 'ぃ' xye: 'ぇ' ye: 'いぇ' wha: 'うぁ' whi: 'うぃ' whe: 'うぇ' who: 'うぉ' wi: 'うぃ' we: 'うぇ' va: 'ゔぁ' vi: 'ゔぃ' vu: 'ゔ' ve: 'ゔぇ' vo: 'ゔぉ' vya: 'ゔゃ' vyi: 'ゔぃ' vyu: 'ゔゅ' vye: 'ゔぇ' vyo: 'ゔょ' ka: 'か' ki: 'き' ku: 'く' ke: 'け' ko: 'こ' lka: 'ヵ' lke: 'ヶ' xka: 'ヵ' xke: 'ヶ' kya: 'きゃ' kyi: 'きぃ' kyu: 'きゅ' kye: 'きぇ' kyo: 'きょ' ca: 'か' ci: 'き' cu: 'く' ce: 'け' co: 'こ' lca: 'ヵ' lce: 'ヶ' xca: 'ヵ' xce: 'ヶ' qya: 'くゃ' qyu: 'くゅ' qyo: 'くょ' qwa: 'くぁ' qwi: 'くぃ' qwu: 'くぅ' qwe: 'くぇ' qwo: 'くぉ' qa: 'くぁ' qi: 'くぃ' qe: 'くぇ' qo: 'くぉ' kwa: 'くぁ' qyi: 'くぃ' qye: 'くぇ' ga: 'が' gi: 'ぎ' gu: 'ぐ' ge: 'げ' go: 'ご' gya: 'ぎゃ' gyi: 'ぎぃ' gyu: 'ぎゅ' gye: 'ぎぇ' gyo: 'ぎょ' gwa: 'ぐぁ' gwi: 'ぐぃ' gwu: 'ぐぅ' gwe: 'ぐぇ' gwo: 'ぐぉ' sa: 'さ' si: 'し' shi: 'し' su: 'す' se: 'せ' so: 'そ' za: 'ざ' zi: 'じ' zu: 'ず' ze: 'ぜ' zo: 'ぞ' ji: 'じ' sya: 'しゃ' syi: 'しぃ' syu: 'しゅ' sye: 'しぇ' syo: 'しょ' sha: 'しゃ' shu: 'しゅ' she: 'しぇ' sho: 'しょ' shya: 'しゃ' # note 4 character code shyu: 'しゅ' # note 4 character code shye: 'しぇ' # note 4 character code shyo: 'しょ' # note 4 character code swa: 'すぁ' swi: 'すぃ' swu: 'すぅ' swe: 'すぇ' swo: 'すぉ' zya: 'じゃ' zyi: 'じぃ' zyu: 'じゅ' zye: 'じぇ' zyo: 'じょ' ja: 'じゃ' ju: 'じゅ' je: 'じぇ' jo: 'じょ' jya: 'じゃ' jyi: 'じぃ' jyu: 'じゅ' jye: 'じぇ' jyo: 'じょ' ta: 'た' ti: 'ち' tu: 'つ' te: 'て' to: 'と' chi: 'ち' tsu: 'つ' ltu: 'っ' xtu: 'っ' tya: 'ちゃ' tyi: 'ちぃ' tyu: 'ちゅ' tye: 'ちぇ' tyo: 'ちょ' cha: 'ちゃ' chu: 'ちゅ' che: 'ちぇ' cho: 'ちょ' cya: 'ちゃ' cyi: 'ちぃ' cyu: 'ちゅ' cye: 'ちぇ' cyo: 'ちょ' chya: 'ちゃ' # note 4 character code chyu: 'ちゅ' # note 4 character code chye: 'ちぇ' # note 4 character code chyo: 'ちょ' # note 4 character code tsa: 'つぁ' tsi: 'つぃ' tse: 'つぇ' tso: 'つぉ' tha: 'てゃ' thi: 'てぃ' thu: 'てゅ' the: 'てぇ' tho: 'てょ' twa: 'とぁ' twi: 'とぃ' twu: 'とぅ' twe: 'とぇ' two: 'とぉ' da: 'だ' di: 'ぢ' du: 'づ' de: 'で' do: 'ど' dya: 'ぢゃ' dyi: 'ぢぃ' dyu: 'ぢゅ' dye: 'ぢぇ' dyo: 'ぢょ' dha: 'でゃ' dhi: 'でぃ' dhu: 'でゅ' dhe: 'でぇ' dho: 'でょ' dwa: 'どぁ' dwi: 'どぃ' dwu: 'どぅ' dwe: 'どぇ' dwo: 'どぉ' na: 'な' ni: 'に' nu: 'ぬ' ne: 'ね' no: 'の' nya: 'にゃ' nyi: 'にぃ' nyu: 'にゅ' nye: 'にぇ' nyo: 'にょ' ha: 'は' hi: 'ひ' hu: 'ふ' he: 'へ' ho: 'ほ' fu: 'ふ' hya: 'ひゃ' hyi: 'ひぃ' hyu: 'ひゅ' hye: 'ひぇ' hyo: 'ひょ' fya: 'ふゃ' fyu: 'ふゅ' fyo: 'ふょ' fwa: 'ふぁ' fwi: 'ふぃ' fwu: 'ふぅ' fwe: 'ふぇ' fwo: 'ふぉ' fa: 'ふぁ' fi: 'ふぃ' fe: 'ふぇ' fo: 'ふぉ' fyi: 'ふぃ' fye: 'ふぇ' ba: 'ば' bi: 'び' bu: 'ぶ' be: 'べ' bo: 'ぼ' bya: 'びゃ' byi: 'びぃ' byu: 'びゅ' bye: 'びぇ' byo: 'びょ' pa: 'ぱ' pi: 'ぴ' pu: 'ぷ' pe: 'ぺ' po: 'ぽ' pya: 'ぴゃ' pyi: 'ぴぃ' pyu: 'ぴゅ' pye: 'ぴぇ' pyo: 'ぴょ' ma: 'ま' mi: 'み' mu: 'む' me: 'め' mo: 'も' mya: 'みゃ' myi: 'みぃ' myu: 'みゅ' mye: 'みぇ' myo: 'みょ' ya: 'や' yu: 'ゆ' yo: 'よ' xya: 'ゃ' xyu: 'ゅ' xyo: 'ょ' ra: 'ら' ri: 'り' ru: 'る' re: 'れ' ro: 'ろ' rya: 'りゃ' ryi: 'りぃ' ryu: 'りゅ' rye: 'りぇ' ryo: 'りょ' la: 'ら' li: 'り' lu: 'る' le: 'れ' lo: 'ろ' lya: 'りゃ' lyi: 'りぃ' lyu: 'りゅ' lye: 'りぇ' lyo: 'りょ' wa: 'わ' wo: 'を' lwe: 'ゎ' xwa: 'ゎ' n: 'ん' nn: 'ん' 'n ': 'ん' # n + space xn: 'ん' ltsu: 'っ' # note 4 character code wanakana.FOUR_CHARACTER_EDGE_CASES = ['lts', 'chy', 'shy'] wanakana.J_to_R = あ: 'a' い: 'i' う: 'u' え: 'e' お: 'o' ゔぁ: 'va' ゔぃ: 'vi' ゔ: 'vu' ゔぇ: 've' ゔぉ: 'vo' か: 'ka' き: 'ki' きゃ: 'kya' きぃ: 'kyi' きゅ: 'kyu' く: 'ku' け: 'ke' こ: 'ko' が: 'ga' ぎ: 'gi' ぐ: 'gu' げ: 'ge' ご: 'go' ぎゃ: 'gya' ぎぃ: 'gyi' ぎゅ: 'gyu' ぎぇ: 'gye' ぎょ: 'gyo' さ: 'sa' す: 'su' せ: 'se' そ: 'so' ざ: 'za' ず: 'zu' ぜ: 'ze' ぞ: 'zo' し: 'shi' しゃ: 'sha' しゅ: 'shu' しょ: 'sho' じ: 'ji' じゃ: 'ja' じゅ: 'ju' じょ: 'jo' た: 'ta' ち: 'chi' ちゃ: 'cha' ちゅ: 'chu' ちょ: 'cho' つ: 'tsu' て: 'te' と: 'to' だ: 'da' ぢ: 'di' づ: 'du' で: 'de' ど: 'do' な: 'na' に: 'ni' にゃ: 'nya' にゅ: 'nyu' にょ: 'nyo' ぬ: 'nu' ね: 'ne' の: 'no' は: 'ha' ひ: 'hi' ふ: 'fu' へ: 'he' ほ: 'ho' ひゃ: 'hya' ひゅ: 'hyu' ひょ: 'hyo' ふぁ: 'fa' ふぃ: 'fi' ふぇ: 'fe' ふぉ: 'fo' ば: 'ba' び: 'bi' ぶ: 'bu' べ: 'be' ぼ: 'bo' びゃ: 'bya' びゅ: 'byu' びょ: 'byo' ぱ: 'pa' ぴ: 'pi' ぷ: 'pu' ぺ: 'pe' ぽ: 'po' ぴゃ: 'pya' ぴゅ: 'pyu' ぴょ: 'pyo' ま: 'ma' み: 'mi' む: 'mu' め: 'me' も: 'mo' みゃ: 'mya' みゅ: 'myu' みょ: 'myo' や: 'ya' ゆ: 'yu' よ: 'yo' ら: 'ra' り: 'ri' る: 'ru' れ: 're' ろ: 'ro' りゃ: 'rya' りゅ: 'ryu' りょ: 'ryo' わ: 'wa' を: 'wo' ん: 'n' # Archaic characters ゐ: 'wi' ゑ: 'we' # Uncommon character combos きぇ: 'kye' きょ: 'kyo' じぃ: 'jyi' じぇ: 'jye' ちぃ: 'cyi' ちぇ: 'che' ひぃ: 'hyi' ひぇ: 'hye' びぃ: 'byi' びぇ: 'bye' ぴぃ: 'pyi' ぴぇ: 'pye' みぇ: 'mye' みぃ: 'myi' りぃ: 'ryi' りぇ: 'rye' にぃ: 'nyi' にぇ: 'nye' しぃ: 'syi' しぇ: 'she' いぇ: 'ye' うぁ: 'wha' うぉ: 'who' うぃ: 'wi' うぇ: 'we' ゔゃ: 'vya' ゔゅ: 'vyu' ゔょ: 'vyo' すぁ: 'swa' すぃ: 'swi' すぅ: 'swu' すぇ: 'swe' すぉ: 'swo' くゃ: 'qya' くゅ: 'qyu' くょ: 'qyo' くぁ: 'qwa' くぃ: 'qwi' くぅ: 'qwu' くぇ: 'qwe' くぉ: 'qwo' ぐぁ: 'gwa' ぐぃ: 'gwi' ぐぅ: 'gwu' ぐぇ: 'gwe' ぐぉ: 'gwo' つぁ: 'tsa' つぃ: 'tsi' つぇ: 'tse' つぉ: 'tso' てゃ: 'tha' てぃ: 'thi' てゅ: 'thu' てぇ: 'the' てょ: 'tho' とぁ: 'twa' とぃ: 'twi' とぅ: 'twu' とぇ: 'twe' とぉ: 'two' ぢゃ: 'dya' ぢぃ: 'dyi' ぢゅ: 'dyu' ぢぇ: 'dye' ぢょ: 'dyo' でゃ: 'dha' でぃ: 'dhi' でゅ: 'dhu' でぇ: 'dhe' でょ: 'dho' どぁ: 'dwa' どぃ: 'dwi' どぅ: 'dwu' どぇ: 'dwe' どぉ: 'dwo' ふぅ: 'fwu' ふゃ: 'fya' ふゅ: 'fyu' ふょ: 'fyo' # Small Characters (normally not transliterated alone) ぁ: 'a' ぃ: 'i' ぇ: 'e' ぅ: 'u' ぉ: 'o' ゃ: 'ya' ゅ: 'yu' ょ: 'yo' っ: '' ゕ: 'ka' ゖ: 'ka' ゎ: 'wa' # Punctuation ' ': ' ' # Ambiguous consonant vowel pairs んあ: 'n\'a' んい: 'n\'i' んう: 'n\'u' んえ: 'n\'e' んお: 'n\'o' んや: 'n\'ya' んゆ: 'n\'yu' んよ: 'n\'yo'