UNPKG

kanji.js

Version:

漢字 search/lookup library for Node.js and browsers

eidoriantan/kanji.js

1,658 lines (1,498 loc) • 1.93 MB

JavaScript

(function (global, factory) { typeof exports === 'object' && typeof module !== 'undefined' ? module.exports = factory() : typeof define === 'function' && define.amd ? define(factory) : (global = typeof globalThis !== 'undefined' ? globalThis : global || self, global.Kanji = factory()); })(this, (function () { 'use strict'; /** * Returns detailed type as string (instead of just 'object' for arrays etc) * @private * @param {any} value js value * @returns {String} type of value * @example * typeOf({}); // 'object' * typeOf([]); // 'array' * typeOf(function() {}); // 'function' * typeOf(/a/); // 'regexp' * typeOf(new Date()); // 'date' * typeOf(null); // 'null' * typeOf(undefined); // 'undefined' * typeOf('a'); // 'string' * typeOf(1); // 'number' * typeOf(true); // 'boolean' * typeOf(new Map()); // 'map' * typeOf(new Set()); // 'map' */ function typeOf(value) { if (value === null) { return 'null'; } if (value !== Object(value)) { return typeof value; } return {}.toString .call(value) .slice(8, -1) .toLowerCase(); } /** * Checks if input string is empty * @param {String} input text input * @return {Boolean} true if no input */ function isEmpty(input) { if (typeOf(input) !== 'string') { return true; } return !input.length; } /** * Takes a character and a unicode range. Returns true if the char is in the range. * @param {String} char unicode character * @param {Number} start unicode start range * @param {Number} end unicode end range * @return {Boolean} */ function isCharInRange(char = '', start, end) { if (isEmpty(char)) return false; const code = char.charCodeAt(0); return start <= code && code <= end; } const TO_KANA_METHODS = { HIRAGANA: 'toHiragana', KATAKANA: 'toKatakana', }; const ROMANIZATIONS = { HEPBURN: 'hepburn', }; /** * Default config for WanaKana, user passed options will be merged with these * @type {DefaultOptions} * @name defaultOptions * @property {Boolean} [useObsoleteKana=false] - Set to true to use obsolete characters, such as ゐ and ゑ. * @example * toHiragana('we', { useObsoleteKana: true }) * // => 'ゑ' * @property {Boolean} [passRomaji=false] - Set to true to pass romaji when using mixed syllabaries with toKatakana() or toHiragana() * @example * toHiragana('only convert the katakana: ヒラガナ', { passRomaji: true }) * // => "only convert the katakana: ひらがな" * @property {Object} [convertLongVowelMark=true] - Set to false to prevent conversions of 'ー' to extended vowels with toHiragana() * @example * toHiragana('ラーメン', { convertLongVowelMark: false }); * // => 'らーめん * @property {Boolean} [upcaseKatakana=false] - Set to true to convert katakana to uppercase using toRomaji() * @example * toRomaji('ひらがなカタカナ', { upcaseKatakana: true }) * // => "hiragana KATAKANA" * @property {Boolean|String} [IMEMode=false] - Set to true, 'toHiragana', or 'toKatakana' to handle conversion while it is being typed. * @property {String} [romanization='hepburn'] - choose toRomaji() romanization map (currently only 'hepburn') * @property {Object} [customKanaMapping] - custom map will be merged with default conversion * @example * toKana('wanakana', { customKanaMapping: { na: 'に', ka: 'Bana' }) }; * // => 'わにBanaに' * @property {Object} [customRomajiMapping] - custom map will be merged with default conversion * @example * toRomaji('つじぎり', { customRomajiMapping: { じ: 'zi', つ: 'tu', り: 'li' }) }; * // => 'tuzigili' */ const DEFAULT_OPTIONS = { useObsoleteKana: false, passRomaji: false, upcaseKatakana: false, IMEMode: false, convertLongVowelMark: true, romanization: ROMANIZATIONS.HEPBURN, }; const LATIN_UPPERCASE_START = 0x41; const LATIN_UPPERCASE_END = 0x5a; const HIRAGANA_START = 0x3041; const HIRAGANA_END = 0x3096; const KATAKANA_START = 0x30a1; const KATAKANA_END = 0x30fc; const KANJI_START = 0x4e00; const KANJI_END = 0x9faf; const PROLONGED_SOUND_MARK = 0x30fc; const KANA_SLASH_DOT = 0x30fb; const MODERN_ENGLISH = [0x0000, 0x007f]; const HEPBURN_MACRON_RANGES = [ [0x0100, 0x0101], // Ā ā [0x0112, 0x0113], // Ē ē [0x012a, 0x012b], // Ī ī [0x014c, 0x014d], // Ō ō [0x016a, 0x016b], // Ū ū ]; const SMART_QUOTE_RANGES = [ [0x2018, 0x2019], // ‘ ’ [0x201c, 0x201d], // “ ” ]; const ROMAJI_RANGES = [MODERN_ENGLISH, ...HEPBURN_MACRON_RANGES]; const EN_PUNCTUATION_RANGES = [ [0x20, 0x2f], [0x3a, 0x3f], [0x5b, 0x60], [0x7b, 0x7e], ...SMART_QUOTE_RANGES, ]; var safeIsNaN = Number.isNaN || function ponyfill(value) { return typeof value === 'number' && value !== value; }; function isEqual(first, second) { if (first === second) { return true; } if (safeIsNaN(first) && safeIsNaN(second)) { return true; } return false; } function areInputsEqual(newInputs, lastInputs) { if (newInputs.length !== lastInputs.length) { return false; } for (var i = 0; i < newInputs.length; i++) { if (!isEqual(newInputs[i], lastInputs[i])) { return false; } } return true; } function memoizeOne(resultFn, isEqual) { if (isEqual === void 0) { isEqual = areInputsEqual; } var cache = null; function memoized() { var newArgs = []; for (var _i = 0; _i < arguments.length; _i++) { newArgs[_i] = arguments[_i]; } if (cache && cache.lastThis === this && isEqual(newArgs, cache.lastArgs)) { return cache.lastResult; } var lastResult = resultFn.apply(this, newArgs); cache = { lastResult: lastResult, lastArgs: newArgs, lastThis: this, }; return lastResult; } memoized.clear = function clear() { cache = null; }; return memoized; } var has = Object.prototype.hasOwnProperty; function find(iter, tar, key) { for (key of iter.keys()) { if (dequal(key, tar)) return key; } } function dequal(foo, bar) { var ctor, len, tmp; if (foo === bar) return true; if (foo && bar && (ctor=foo.constructor) === bar.constructor) { if (ctor === Date) return foo.getTime() === bar.getTime(); if (ctor === RegExp) return foo.toString() === bar.toString(); if (ctor === Array) { if ((len=foo.length) === bar.length) { while (len-- && dequal(foo[len], bar[len])); } return len === -1; } if (ctor === Set) { if (foo.size !== bar.size) { return false; } for (len of foo) { tmp = len; if (tmp && typeof tmp === 'object') { tmp = find(bar, tmp); if (!tmp) return false; } if (!bar.has(tmp)) return false; } return true; } if (ctor === Map) { if (foo.size !== bar.size) { return false; } for (len of foo) { tmp = len[0]; if (tmp && typeof tmp === 'object') { tmp = find(bar, tmp); if (!tmp) return false; } if (!dequal(len[1], bar.get(tmp))) { return false; } } return true; } if (ctor === ArrayBuffer) { foo = new Uint8Array(foo); bar = new Uint8Array(bar); } else if (ctor === DataView) { if ((len=foo.byteLength) === bar.byteLength) { while (len-- && foo.getInt8(len) === bar.getInt8(len)); } return len === -1; } if (ArrayBuffer.isView(foo)) { if ((len=foo.byteLength) === bar.byteLength) { while (len-- && foo[len] === bar[len]); } return len === -1; } if (!ctor || typeof foo === 'object') { len = 0; for (ctor in foo) { if (has.call(foo, ctor) && ++len && !has.call(bar, ctor)) return false; if (!(ctor in bar) || !dequal(foo[ctor], bar[ctor])) return false; } return Object.keys(bar).length === len; } } return foo !== foo && bar !== bar; } /** * Easy re-use of merging with default options * @param {Object} opts user options * @returns user options merged over default options */ const mergeWithDefaultOptions = (opts = {}) => Object.assign({}, DEFAULT_OPTIONS, opts); function applyMapping(string, mapping, convertEnding) { const root = mapping; function nextSubtree(tree, nextChar) { const subtree = tree[nextChar]; if (subtree === undefined) { return undefined; } // if the next child node does not have a node value, set its node value to the input return Object.assign({ '': tree[''] + nextChar }, tree[nextChar]); } function newChunk(remaining, currentCursor) { // start parsing a new chunk const firstChar = remaining.charAt(0); return parse( Object.assign({ '': firstChar }, root[firstChar]), remaining.slice(1), currentCursor, currentCursor + 1 ); } function parse(tree, remaining, lastCursor, currentCursor) { if (!remaining) { if (convertEnding || Object.keys(tree).length === 1) { // nothing more to consume, just commit the last chunk and return it // so as to not have an empty element at the end of the result return tree[''] ? [[lastCursor, currentCursor, tree['']]] : []; } // if we don't want to convert the ending, because there are still possible continuations // return null as the final node value return [[lastCursor, currentCursor, null]]; } if (Object.keys(tree).length === 1) { return [[lastCursor, currentCursor, tree['']]].concat( newChunk(remaining, currentCursor) ); } const subtree = nextSubtree(tree, remaining.charAt(0)); if (subtree === undefined) { return [[lastCursor, currentCursor, tree['']]].concat( newChunk(remaining, currentCursor) ); } // continue current branch return parse(subtree, remaining.slice(1), lastCursor, currentCursor + 1); } return newChunk(string, 0); } // transform the tree, so that for example hepburnTree['ゔ']['ぁ'][''] === 'va' // or kanaTree['k']['y']['a'][''] === 'きゃ' function transform(tree) { return Object.entries(tree).reduce((map, [char, subtree]) => { const endOfBranch = typeOf(subtree) === 'string'; // eslint-disable-next-line no-param-reassign map[char] = endOfBranch ? { '': subtree } : transform(subtree); return map; }, {}); } function getSubTreeOf(tree, string) { return string.split('').reduce((correctSubTree, char) => { if (correctSubTree[char] === undefined) { // eslint-disable-next-line no-param-reassign correctSubTree[char] = {}; } return correctSubTree[char]; }, tree); } /** * Creates a custom mapping tree, returns a function that accepts a defaultMap which the newly created customMapping will be merged with and returned * (customMap) => (defaultMap) => mergedMap * @param {Object} customMap { 'ka' : 'な' } * @return {Function} (defaultMap) => defaultMergedWithCustomMap * @example * const sillyMap = createCustomMapping({ 'ちゃ': 'time', '茎': 'cookie'　}); * // sillyMap is passed defaultMapping to merge with when called in toRomaji() * toRomaji("It's 茎ちゃよ", { customRomajiMapping: sillyMap }); * // => 'It's cookie time yo'; */ function createCustomMapping(customMap = {}) { const customTree = {}; if (typeOf(customMap) === 'object') { Object.entries(customMap).forEach(([roma, kana]) => { let subTree = customTree; roma.split('').forEach((char) => { if (subTree[char] === undefined) { subTree[char] = {}; } subTree = subTree[char]; }); subTree[''] = kana; }); } return function makeMap(map) { const mapCopy = JSON.parse(JSON.stringify(map)); function transformMap(mapSubtree, customSubtree) { if (mapSubtree === undefined || typeOf(mapSubtree) === 'string') { return customSubtree; } return Object.entries(customSubtree).reduce( (newSubtree, [char, subtree]) => { // eslint-disable-next-line no-param-reassign newSubtree[char] = transformMap(mapSubtree[char], subtree); return newSubtree; }, mapSubtree ); } return transformMap(mapCopy, customTree); }; } // allow consumer to pass either function or object as customMapping function mergeCustomMapping(map, customMapping) { if (!customMapping) { return map; } return typeOf(customMapping) === 'function' ? customMapping(map) : createCustomMapping(customMapping)(map); } // NOTE: not exactly kunrei shiki, for example ぢゃ -> dya instead of zya, to avoid name clashing /* eslint-disable */ // prettier-ignore const BASIC_KUNREI = { a: 'あ', i: 'い', u: 'う', e: 'え', o: 'お', k: { a: 'か', i: 'き', u: 'く', e: 'け', o: 'こ', }, s: { a: 'さ', i: 'し', u: 'す', e: 'せ', o: 'そ', }, t: { a: 'た', i: 'ち', u: 'つ', e: 'て', o: 'と', }, n: { a: 'な', i: 'に', u: 'ぬ', e: 'ね', o: 'の', }, h: { a: 'は', i: 'ひ', u: 'ふ', e: 'へ', o: 'ほ', }, m: { a: 'ま', i: 'み', u: 'む', e: 'め', o: 'も', }, y: { a: 'や', u: 'ゆ', o: 'よ' }, r: { a: 'ら', i: 'り', u: 'る', e: 'れ', o: 'ろ', }, w: { a: 'わ', i: 'ゐ', e: 'ゑ', o: 'を', }, g: { a: 'が', i: 'ぎ', u: 'ぐ', e: 'げ', o: 'ご', }, z: { a: 'ざ', i: 'じ', u: 'ず', e: 'ぜ', o: 'ぞ', }, d: { a: 'だ', i: 'ぢ', u: 'づ', e: 'で', o: 'ど', }, b: { a: 'ば', i: 'び', u: 'ぶ', e: 'べ', o: 'ぼ', }, p: { a: 'ぱ', i: 'ぴ', u: 'ぷ', e: 'ぺ', o: 'ぽ', }, v: { a: 'ゔぁ', i: 'ゔぃ', u: 'ゔ', e: 'ゔぇ', o: 'ゔぉ', }, }; const SPECIAL_SYMBOLS$1 = { '.': '。', ',': '、', ':': '：', '/': '・', '!': '！', '?': '？', '~': '〜', '-': 'ー', '‘': '「', '’': '」', '“': '『', '”': '』', '[': '［', ']': '］', '(': '（', ')': '）', '{': '｛', '}': '｝', }; const CONSONANTS = { k: 'き', s: 'し', t: 'ち', n: 'に', h: 'ひ', m: 'み', r: 'り', g: 'ぎ', z: 'じ', d: 'ぢ', b: 'び', p: 'ぴ', v: 'ゔ', q: 'く', f: 'ふ', }; const SMALL_Y$1 = { ya: 'ゃ', yi: 'ぃ', yu: 'ゅ', ye: 'ぇ', yo: 'ょ' }; const SMALL_VOWELS = { a: 'ぁ', i: 'ぃ', u: 'ぅ', e: 'ぇ', o: 'ぉ' }; // typing one should be the same as having typed the other instead const ALIASES = { sh: 'sy', // sha -> sya ch: 'ty', // cho -> tyo cy: 'ty', // cyo -> tyo chy: 'ty', // chyu -> tyu shy: 'sy', // shya -> sya j: 'zy', // ja -> zya jy: 'zy', // jye -> zye // exceptions to above rules shi: 'si', chi: 'ti', tsu: 'tu', ji: 'zi', fu: 'hu', }; // xtu -> っ const SMALL_LETTERS = Object.assign( { tu: 'っ', wa: 'ゎ', ka: 'ヵ', ke: 'ヶ', }, SMALL_VOWELS, SMALL_Y$1 ); // don't follow any notable patterns const SPECIAL_CASES = { yi: 'い', wu: 'う', ye: 'いぇ', wi: 'うぃ', we: 'うぇ', kwa: 'くぁ', whu: 'う', // because it's not thya for てゃ but tha // and tha is not てぁ, but てゃ tha: 'てゃ', thu: 'てゅ', tho: 'てょ', dha: 'でゃ', dhu: 'でゅ', dho: 'でょ', }; const AIUEO_CONSTRUCTIONS = { wh: 'う', kw: 'く', qw: 'く', q: 'く', gw: 'ぐ', sw: 'す', ts: 'つ', th: 'て', tw: 'と', dh: 'で', dw: 'ど', fw: 'ふ', f: 'ふ', }; /* eslint-enable */ function createRomajiToKanaMap$1() { const kanaTree = transform(BASIC_KUNREI); // pseudo partial application const subtreeOf = (string) => getSubTreeOf(kanaTree, string); // add tya, sya, etc. Object.entries(CONSONANTS).forEach(([consonant, yKana]) => { Object.entries(SMALL_Y$1).forEach(([roma, kana]) => { // for example kyo -> き + ょ subtreeOf(consonant + roma)[''] = yKana + kana; }); }); Object.entries(SPECIAL_SYMBOLS$1).forEach(([symbol, jsymbol]) => { subtreeOf(symbol)[''] = jsymbol; }); // things like うぃ, くぃ, etc. Object.entries(AIUEO_CONSTRUCTIONS).forEach(([consonant, aiueoKana]) => { Object.entries(SMALL_VOWELS).forEach(([vowel, kana]) => { const subtree = subtreeOf(consonant + vowel); subtree[''] = aiueoKana + kana; }); }); // different ways to write ん ['n', "n'", 'xn'].forEach((nChar) => { subtreeOf(nChar)[''] = 'ん'; }); // c is equivalent to k, but not for chi, cha, etc. that's why we have to make a copy of k kanaTree.c = JSON.parse(JSON.stringify(kanaTree.k)); Object.entries(ALIASES).forEach(([string, alternative]) => { const allExceptLast = string.slice(0, string.length - 1); const last = string.charAt(string.length - 1); const parentTree = subtreeOf(allExceptLast); // copy to avoid recursive containment parentTree[last] = JSON.parse(JSON.stringify(subtreeOf(alternative))); }); function getAlternatives(string) { return [...Object.entries(ALIASES), ...[['c', 'k']]].reduce( (list, [alt, roma]) => (string.startsWith(roma) ? list.concat(string.replace(roma, alt)) : list), [] ); } Object.entries(SMALL_LETTERS).forEach(([kunreiRoma, kana]) => { const last = (char) => char.charAt(char.length - 1); const allExceptLast = (chars) => chars.slice(0, chars.length - 1); const xRoma = `x${kunreiRoma}`; const xSubtree = subtreeOf(xRoma); xSubtree[''] = kana; // ltu -> xtu -> っ const parentTree = subtreeOf(`l${allExceptLast(kunreiRoma)}`); parentTree[last(kunreiRoma)] = xSubtree; // ltsu -> ltu -> っ getAlternatives(kunreiRoma).forEach((altRoma) => { ['l', 'x'].forEach((prefix) => { const altParentTree = subtreeOf(prefix + allExceptLast(altRoma)); altParentTree[last(altRoma)] = subtreeOf(prefix + kunreiRoma); }); }); }); Object.entries(SPECIAL_CASES).forEach(([string, kana]) => { subtreeOf(string)[''] = kana; }); // add kka, tta, etc. function addTsu(tree) { return Object.entries(tree).reduce((tsuTree, [key, value]) => { if (!key) { // we have reached the bottom of this branch // eslint-disable-next-line no-param-reassign tsuTree[key] = `っ${value}`; } else { // more subtrees // eslint-disable-next-line no-param-reassign tsuTree[key] = addTsu(value); } return tsuTree; }, {}); } // have to explicitly name c here, because we made it a copy of k, not a reference [...Object.keys(CONSONANTS), 'c', 'y', 'w', 'j'].forEach((consonant) => { const subtree = kanaTree[consonant]; subtree[consonant] = addTsu(subtree); }); // nn should not be っん delete kanaTree.n.n; // solidify the results, so that there there is referential transparency within the tree return Object.freeze(JSON.parse(JSON.stringify(kanaTree))); } let romajiToKanaMap = null; function getRomajiToKanaTree() { if (romajiToKanaMap == null) { romajiToKanaMap = createRomajiToKanaMap$1(); } return romajiToKanaMap; } const USE_OBSOLETE_KANA_MAP = createCustomMapping({ wi: 'ゐ', we: 'ゑ', }); function IME_MODE_MAP(map) { // in IME mode, we do not want to convert single ns const mapCopy = JSON.parse(JSON.stringify(map)); mapCopy.n.n = { '': 'ん' }; mapCopy.n[' '] = { '': 'ん' }; return mapCopy; } /** * Tests if char is in English unicode uppercase range * @param {String} char * @return {Boolean} */ function isCharUpperCase(char = '') { if (isEmpty(char)) return false; return isCharInRange(char, LATIN_UPPERCASE_START, LATIN_UPPERCASE_END); } /** * Returns true if char is 'ー' * @param {String} char to test * @return {Boolean} */ function isCharLongDash(char = '') { if (isEmpty(char)) return false; return char.charCodeAt(0) === PROLONGED_SOUND_MARK; } /** * Tests if char is '・' * @param {String} char * @return {Boolean} true if '・' */ function isCharSlashDot(char = '') { if (isEmpty(char)) return false; return char.charCodeAt(0) === KANA_SLASH_DOT; } /** * Tests a character. Returns true if the character is [Hiragana](https://en.wikipedia.org/wiki/Hiragana). * @param {String} char character string to test * @return {Boolean} */ function isCharHiragana(char = '') { if (isEmpty(char)) return false; if (isCharLongDash(char)) return true; return isCharInRange(char, HIRAGANA_START, HIRAGANA_END); } /** * Convert [Hiragana](https://en.wikipedia.org/wiki/Hiragana) to [Katakana](https://en.wikipedia.org/wiki/Katakana) * Passes through any non-hiragana chars * @private * @param {String} [input=''] text input * @return {String} converted text * @example * hiraganaToKatakana('ひらがな') * // => "ヒラガナ" * hiraganaToKatakana('ひらがな is a type of kana') * // => "ヒラガナ is a type of kana" */ function hiraganaToKatakana(input = '') { const kata = []; input.split('').forEach((char) => { // Short circuit to avoid incorrect codeshift for 'ー' and '・' if (isCharLongDash(char) || isCharSlashDot(char)) { kata.push(char); } else if (isCharHiragana(char)) { // Shift charcode. const code = char.charCodeAt(0) + (KATAKANA_START - HIRAGANA_START); const kataChar = String.fromCharCode(code); kata.push(kataChar); } else { // Pass non-hiragana chars through kata.push(char); } }); return kata.join(''); } // memoize and deeply compare args so we only recreate when necessary const createRomajiToKanaMap = memoizeOne( (IMEMode, useObsoleteKana, customKanaMapping) => { let map = getRomajiToKanaTree(); map = IMEMode ? IME_MODE_MAP(map) : map; map = useObsoleteKana ? USE_OBSOLETE_KANA_MAP(map) : map; if (customKanaMapping) { map = mergeCustomMapping(map, customKanaMapping); } return map; }, dequal ); /** * Convert [Romaji](https://en.wikipedia.org/wiki/Romaji) to [Kana](https://en.wikipedia.org/wiki/Kana), lowercase text will result in [Hiragana](https://en.wikipedia.org/wiki/Hiragana) and uppercase text will result in [Katakana](https://en.wikipedia.org/wiki/Katakana). * @param {String} [input=''] text * @param {DefaultOptions} [options=defaultOptions] * @return {String} converted text * @example * toKana('onaji BUTTSUUJI') * // => 'おなじブッツウジ' * toKana('ONAJI buttsuuji') * // => 'オナジぶっつうじ' * toKana('座禅‘zazen’スタイル') * // => '座禅「ざぜん」スタイル' * toKana('batsuge-mu') * // => 'ばつげーむ' * toKana('!?.:/,~-‘’“”[](){}') // Punctuation conversion * // => '！？。：・、〜ー「」『』［］（）｛｝' * toKana('we', { useObsoleteKana: true }) * // => 'ゑ' * toKana('wanakana', { customKanaMapping: { na: 'に', ka: 'bana' } }); * // => 'わにbanaに' */ function toKana(input = '', options = {}, map) { let config; if (!map) { config = mergeWithDefaultOptions(options); map = createRomajiToKanaMap( config.IMEMode, config.useObsoleteKana, config.customKanaMapping ); } else { config = options; } // throw away the substring index information and just concatenate all the kana return splitIntoConvertedKana(input, config, map) .map((kanaToken) => { const [start, end, kana] = kanaToken; if (kana === null) { // haven't converted the end of the string, since we are in IME mode return input.slice(start); } const enforceHiragana = config.IMEMode === TO_KANA_METHODS.HIRAGANA; const enforceKatakana = config.IMEMode === TO_KANA_METHODS.KATAKANA || [...input.slice(start, end)].every(isCharUpperCase); return enforceHiragana || !enforceKatakana ? kana : hiraganaToKatakana(kana); }) .join(''); } /** * * @private * @param {String} [input=''] input text * @param {DefaultOptions} [options=defaultOptions] toKana options * @param {Object} [map] custom mapping * @returns {Array[]} [[start, end, token]] * @example * splitIntoConvertedKana('buttsuuji') * // => [[0, 2, 'ぶ'], [2, 6, 'っつ'], [6, 7, 'う'], [7, 9, 'じ']] */ function splitIntoConvertedKana(input = '', options = {}, map) { const { IMEMode, useObsoleteKana, customKanaMapping } = options; if (!map) { map = createRomajiToKanaMap(IMEMode, useObsoleteKana, customKanaMapping); } return applyMapping(input.toLowerCase(), map, !IMEMode); } /** * Tests a character. Returns true if the character is [Romaji](https://en.wikipedia.org/wiki/Romaji) (allowing [Hepburn romanisation](https://en.wikipedia.org/wiki/Hepburn_romanization)) * @param {String} char character string to test * @return {Boolean} */ function isCharRomaji(char = '') { if (isEmpty(char)) return false; return ROMAJI_RANGES.some(([start, end]) => isCharInRange(char, start, end)); } /** * Test if `input` is [Romaji](https://en.wikipedia.org/wiki/Romaji) (allowing [Hepburn romanisation](https://en.wikipedia.org/wiki/Hepburn_romanization)) * @param {String} [input=''] text * @param {Regexp} [allowed] additional test allowed to pass for each char * @return {Boolean} true if [Romaji](https://en.wikipedia.org/wiki/Romaji) * @example * isRomaji('Tōkyō and Ōsaka') * // => true * isRomaji('12a*b&c-d') * // => true * isRomaji('あアA') * // => false * isRomaji('お願い') * // => false * isRomaji('a！b&cーd') // Zenkaku punctuation fails * // => false * isRomaji('a！b&cーd', /[！ー]/) * // => true */ function isRomaji(input = '', allowed) { const augmented = typeOf(allowed) === 'regexp'; return isEmpty(input) ? false : [...input].every((char) => { const isRoma = isCharRomaji(char); return !augmented ? isRoma : isRoma || allowed.test(char); }); } /** * Tests a character. Returns true if the character is [Katakana](https://en.wikipedia.org/wiki/Katakana). * @param {String} char character string to test * @return {Boolean} */ function isCharKatakana(char = '') { return isCharInRange(char, KATAKANA_START, KATAKANA_END); } /** * Test if `input` is [Hiragana](https://en.wikipedia.org/wiki/Hiragana) * @param {String} [input=''] text * @return {Boolean} true if all [Hiragana](https://en.wikipedia.org/wiki/Hiragana) * @example * isHiragana('げーむ') * // => true * isHiragana('A') * // => false * isHiragana('あア') * // => false */ function isHiragana(input = '') { if (isEmpty(input)) return false; return [...input].every(isCharHiragana); } /** * Test if `input` is [Katakana](https://en.wikipedia.org/wiki/Katakana) * @param {String} [input=''] text * @return {Boolean} true if all [Katakana](https://en.wikipedia.org/wiki/Katakana) * @example * isKatakana('ゲーム') * // => true * isKatakana('あ') * // => false * isKatakana('A') * // => false * isKatakana('あア') * // => false */ function isKatakana(input = '') { if (isEmpty(input)) return false; return [...input].every(isCharKatakana); } /** * Tests a character. Returns true if the character is a CJK ideograph (kanji). * @param {String} char character string to test * @return {Boolean} */ function isCharKanji(char = '') { return isCharInRange(char, KANJI_START, KANJI_END); } /** * Tests if `input` is [Kanji](https://en.wikipedia.org/wiki/Kanji) ([Japanese CJK ideographs](https://en.wikipedia.org/wiki/CJK_Unified_Ideographs)) * @param {String} [input=''] text * @return {Boolean} true if all [Kanji](https://en.wikipedia.org/wiki/Kanji) * @example * isKanji('刀') * // => true * isKanji('切腹') * // => true * isKanji('勢い') * // => false * isKanji('あAア') * // => false * isKanji('🐸') * // => false */ function isKanji(input = '') { if (isEmpty(input)) return false; return [...input].every(isCharKanji); } /** * Test if `input` contains a mix of [Romaji](https://en.wikipedia.org/wiki/Romaji) *and* [Kana](https://en.wikipedia.org/wiki/Kana), defaults to pass through [Kanji](https://en.wikipedia.org/wiki/Kanji) * @param {String} input text * @param {Object} [options={ passKanji: true }] optional config to pass through kanji * @return {Boolean} true if mixed * @example * isMixed('Abあア')) * // => true * isMixed('お腹A')) // ignores kanji by default * // => true * isMixed('お腹A', { passKanji: false })) * // => false * isMixed('ab')) * // => false * isMixed('あア')) * // => false */ function isMixed(input = '', options = { passKanji: true }) { const chars = [...input]; let hasKanji = false; if (!options.passKanji) { hasKanji = chars.some(isKanji); } return (chars.some(isHiragana) || chars.some(isKatakana)) && chars.some(isRomaji) && !hasKanji; } const isCharInitialLongDash = (char, index) => isCharLongDash(char) && index < 1; const isCharInnerLongDash = (char, index) => isCharLongDash(char) && index > 0; const isKanaAsSymbol = (char) => ['ヶ', 'ヵ'].includes(char); const LONG_VOWELS = { a: 'あ', i: 'い', u: 'う', e: 'え', o: 'う', }; // inject toRomaji to avoid circular dependency between toRomaji <-> katakanaToHiragana function katakanaToHiragana( input = '', toRomaji, { isDestinationRomaji, convertLongVowelMark } = {} ) { let previousKana = ''; return input .split('') .reduce((hira, char, index) => { // Short circuit to avoid incorrect codeshift for 'ー' and '・' if ( isCharSlashDot(char) || isCharInitialLongDash(char, index) || isKanaAsSymbol(char) ) { return hira.concat(char); } // Transform long vowels: 'オー' to 'おう' if ( convertLongVowelMark && previousKana && isCharInnerLongDash(char, index) ) { // Transform previousKana back to romaji, and slice off the vowel const romaji = toRomaji(previousKana).slice(-1); // However, ensure 'オー' => 'おお' => 'oo' if this is a transform on the way to romaji if ( isCharKatakana(input[index - 1]) && romaji === 'o' && isDestinationRomaji ) { return hira.concat('お'); } return hira.concat(LONG_VOWELS[romaji]); // Transform all other chars } if (!isCharLongDash(char) && isCharKatakana(char)) { const code = char.charCodeAt(0) + (HIRAGANA_START - KATAKANA_START); const hiraChar = String.fromCharCode(code); previousKana = hiraChar; return hira.concat(hiraChar); } // Pass non katakana chars through previousKana = ''; return hira.concat(char); }, []) .join(''); } let kanaToHepburnMap = null; /* eslint-disable */ // prettier-ignore const BASIC_ROMAJI = { あ:'a', い:'i', う:'u', え:'e', お:'o', か:'ka', き:'ki', く:'ku', け:'ke', こ:'ko', さ:'sa', し:'shi', す:'su', せ:'se', そ:'so', た:'ta', ち:'chi', つ:'tsu', て:'te', と:'to', な:'na', に:'ni', ぬ:'nu', ね:'ne', の:'no', は:'ha', ひ:'hi', ふ:'fu', へ:'he', ほ:'ho', ま:'ma', み:'mi', む:'mu', め:'me', も:'mo', ら:'ra', り:'ri', る:'ru', れ:'re', ろ:'ro', や:'ya', ゆ:'yu', よ:'yo', わ:'wa', ゐ:'wi', ゑ:'we', を:'wo', ん: 'n', が:'ga', ぎ:'gi', ぐ:'gu', げ:'ge', ご:'go', ざ:'za', じ:'ji', ず:'zu', ぜ:'ze', ぞ:'zo', だ:'da', ぢ:'ji', づ:'zu', で:'de', ど:'do', ば:'ba', び:'bi', ぶ:'bu', べ:'be', ぼ:'bo', ぱ:'pa', ぴ:'pi', ぷ:'pu', ぺ:'pe', ぽ:'po', ゔぁ:'va', ゔぃ:'vi', ゔ:'vu', ゔぇ:'ve', ゔぉ:'vo', }; /* eslint-enable */ const SPECIAL_SYMBOLS = { '。': '.', '、': ',', '：': ':', '・': '/', '！': '!', '？': '?', '〜': '~', 'ー': '-', '「': '‘', '」': '’', '『': '“', '』': '”', '［': '[', '］': ']', '（': '(', '）': ')', '｛': '{', '｝': '}', '　': ' ', }; // んい -> n'i const AMBIGUOUS_VOWELS = ['あ', 'い', 'う', 'え', 'お', 'や', 'ゆ', 'よ']; const SMALL_Y = { ゃ: 'ya', ゅ: 'yu', ょ: 'yo' }; const SMALL_Y_EXTRA = { ぃ: 'yi', ぇ: 'ye' }; const SMALL_AIUEO = { ぁ: 'a', ぃ: 'i', ぅ: 'u', ぇ: 'e', ぉ: 'o', }; const YOON_KANA = [ 'き', 'に', 'ひ', 'み', 'り', 'ぎ', 'び', 'ぴ', 'ゔ', 'く', 'ふ', ]; const YOON_EXCEPTIONS = { し: 'sh', ち: 'ch', じ: 'j', ぢ: 'j', }; const SMALL_KANA = { っ: '', ゃ: 'ya', ゅ: 'yu', ょ: 'yo', ぁ: 'a', ぃ: 'i', ぅ: 'u', ぇ: 'e', ぉ: 'o', }; // going with the intuitive (yet incorrect) solution where っや -> yya and っぃ -> ii // in other words, just assume the sokuon could have been applied to anything const SOKUON_WHITELIST = { b: 'b', c: 't', d: 'd', f: 'f', g: 'g', h: 'h', j: 'j', k: 'k', m: 'm', p: 'p', q: 'q', r: 'r', s: 's', t: 't', v: 'v', w: 'w', x: 'x', z: 'z', }; function getKanaToHepburnTree() { if (kanaToHepburnMap == null) { kanaToHepburnMap = createKanaToHepburnMap(); } return kanaToHepburnMap; } function getKanaToRomajiTree(romanization) { switch (romanization) { case ROMANIZATIONS.HEPBURN: return getKanaToHepburnTree(); default: return {}; } } function createKanaToHepburnMap() { const romajiTree = transform(BASIC_ROMAJI); const subtreeOf = (string) => getSubTreeOf(romajiTree, string); const setTrans = (string, transliteration) => { subtreeOf(string)[''] = transliteration; }; Object.entries(SPECIAL_SYMBOLS).forEach(([jsymbol, symbol]) => { subtreeOf(jsymbol)[''] = symbol; }); [...Object.entries(SMALL_Y), ...Object.entries(SMALL_AIUEO)].forEach( ([roma, kana]) => { setTrans(roma, kana); } ); // きゃ -> kya YOON_KANA.forEach((kana) => { const firstRomajiChar = subtreeOf(kana)[''][0]; Object.entries(SMALL_Y).forEach(([yKana, yRoma]) => { setTrans(kana + yKana, firstRomajiChar + yRoma); }); // きぃ -> kyi Object.entries(SMALL_Y_EXTRA).forEach(([yKana, yRoma]) => { setTrans(kana + yKana, firstRomajiChar + yRoma); }); }); Object.entries(YOON_EXCEPTIONS).forEach(([kana, roma]) => { // じゃ -> ja Object.entries(SMALL_Y).forEach(([yKana, yRoma]) => { setTrans(kana + yKana, roma + yRoma[1]); }); // じぃ -> jyi, じぇ -> je setTrans(`${kana}ぃ`, `${roma}yi`); setTrans(`${kana}ぇ`, `${roma}e`); }); romajiTree['っ'] = resolveTsu(romajiTree); Object.entries(SMALL_KANA).forEach(([kana, roma]) => { setTrans(kana, roma); }); AMBIGUOUS_VOWELS.forEach((kana) => { setTrans(`ん${kana}`, `n'${subtreeOf(kana)['']}`); }); // NOTE: could be re-enabled with an option? // // んば -> mbo // const LABIAL = [ // 'ば', 'び', 'ぶ', 'べ', 'ぼ', // 'ぱ', 'ぴ', 'ぷ', 'ぺ', 'ぽ', // 'ま', 'み', 'む', 'め', 'も', // ]; // LABIAL.forEach((kana) => { // setTrans(`ん${kana}`, `m${subtreeOf(kana)['']}`); // }); return Object.freeze(JSON.parse(JSON.stringify(romajiTree))); } function resolveTsu(tree) { return Object.entries(tree).reduce((tsuTree, [key, value]) => { if (!key) { // we have reached the bottom of this branch const consonant = value.charAt(0); // eslint-disable-next-line no-param-reassign tsuTree[key] = Object.keys(SOKUON_WHITELIST).includes(consonant) ? SOKUON_WHITELIST[consonant] + value : value; } else { // more subtrees // eslint-disable-next-line no-param-reassign tsuTree[key] = resolveTsu(value); } return tsuTree; }, {}); } // memoize and deeply compare args so we only recreate when necessary const createKanaToRomajiMap = memoizeOne( (romanization, customRomajiMapping) => { let map = getKanaToRomajiTree(romanization); if (customRomajiMapping) { map = mergeCustomMapping(map, customRomajiMapping); } return map; }, dequal ); /** * Convert kana to romaji * @param {String} kana text input * @param {DefaultOptions} [options=defaultOptions] * @param {Object} map custom mapping * @return {String} converted text * @example * toRomaji('ひらがな　カタカナ') * // => 'hiragana katakana' * toRomaji('げーむ　ゲーム') * // => 'ge-mu geemu' * toRomaji('ひらがな　カタカナ', { upcaseKatakana: true }) * // => 'hiragana KATAKANA' * toRomaji('つじぎり', { customRomajiMapping: { じ: 'zi', つ: 'tu', り: 'li' } }); * // => 'tuzigili' */ function toRomaji(input = '', options = {}, map) { const config = mergeWithDefaultOptions(options); if (!map) { map = createKanaToRomajiMap( config.romanization, config.customRomajiMapping ); } // just throw away the substring index information and simply concatenate all the kana return splitIntoRomaji(input, config, map) .map((romajiToken) => { const [start, end, romaji] = romajiToken; const makeUpperCase = config.upcaseKatakana && isKatakana(input.slice(start, end)); return makeUpperCase ? romaji.toUpperCase() : romaji; }) .join(''); } function splitIntoRomaji(input, options, map) { if (!map) { map = createKanaToRomajiMap( options.romanization, options.customRomajiMapping ); } const config = Object.assign({}, { isDestinationRomaji: true }, options); return applyMapping( katakanaToHiragana(input, toRomaji, config), map, !options.IMEMode ); } /** * Tests a character. Returns true if the character is considered English punctuation. * @param {String} char character string to test * @return {Boolean} */ function isCharEnglishPunctuation(char = '') { if (isEmpty(char)) return false; return EN_PUNCTUATION_RANGES.some(([start, end]) => isCharInRange(char, start, end)); } /** * Convert input to [Hiragana](https://en.wikipedia.org/wiki/Hiragana) * @param {String} [input=''] text * @param {DefaultOptions} [options=defaultOptions] * @return {String} converted text * @example * toHiragana('toukyou, オオサカ') * // => 'とうきょう、　おおさか' * toHiragana('only カナ', { passRomaji: true }) * // => 'only かな' * toHiragana('wi') * // => 'うぃ' * toHiragana('wi', { useObsoleteKana: true }) * // => 'ゐ' */ function toHiragana(input = '', options = {}) { const config = mergeWithDefaultOptions(options); if (config.passRomaji) { return katakanaToHiragana(input, toRomaji, config); } if (isMixed(input, { passKanji: true })) { const convertedKatakana = katakanaToHiragana(input, toRomaji, config); return toKana(convertedKatakana.toLowerCase(), config); } if (isRomaji(input) || isCharEnglishPunctuation(input)) { return toKana(input.toLowerCase(), config); } return katakanaToHiragana(input, toRomaji, config); } /** * Convert input to [Katakana](https://en.wikipedia.org/wiki/Katakana) * @param {String} [input=''] text * @param {DefaultOptions} [options=defaultOptions] * @return {String} converted text * @example * toKatakana('toukyou, おおさか') * // => 'トウキョウ、　オオサカ' * toKatakana('only かな', { passRomaji: true }) * // => 'only カナ' * toKatakana('wi') * // => 'ウィ' * toKatakana('wi', { useObsoleteKana: true }) * // => 'ヰ' */ function toKatakana(input = '', options = {}) { const mergedOptions = mergeWithDefaultOptions(options); if (mergedOptions.passRomaji) { return hiraganaToKatakana(input); } if (isMixed(input) || isRomaji(input) || isCharEnglishPunctuation(input)) { const hiragana = toKana(input.toLowerCase(), mergedOptions); return hiraganaToKatakana(hiragana); } return hiraganaToKatakana(input); } const FuzzySet = function(arr, useLevenshtein, gramSizeLower, gramSizeUpper) { var fuzzyset = { }; // default options arr = arr || []; fuzzyset.gramSizeLower = gramSizeLower || 2; fuzzyset.gramSizeUpper = gramSizeUpper || 3; fuzzyset.useLevenshtein = (typeof useLevenshtein !== 'boolean') ? true : useLevenshtein; // define all the object functions and attributes fuzzyset.exactSet = {}; fuzzyset.matchDict = {}; fuzzyset.items = {}; // helper functions var levenshtein = function(str1, str2) { var current = [], prev, value; for (var i = 0; i <= str2.length; i++) for (var j = 0; j <= str1.length; j++) { if (i && j) if (str1.charAt(j - 1) === str2.charAt(i - 1)) value = prev; else value = Math.min(current[j], current[j - 1], prev) + 1; else value = i + j; prev = current[j]; current[j] = value; } return current.pop(); }; // return an edit distance from 0 to 1 var _distance = function(str1, str2) { if (str1 === null && str2 === null) throw 'Trying to compare two null values'; if (str1 === null || str2 === null) return 0; str1 = String(str1); str2 = String(str2); var distance = levenshtein(str1, str2); if (str1.length > str2.length) { return 1 - distance / str1.length; } else { return 1 - distance / str2.length; } }; // u00C0-u00FF is latin characters // u0621-u064a is arabic letters // u0660-u0669 is arabic numerals // TODO: figure out way to do this for more languages var _nonWordRe = /[^a-zA-Z0-9\u00C0-\u00FF\u0621-\u064A\u0660-\u0669, ]+/g; var _iterateGrams = function(value, gramSize) { gramSize = gramSize || 2; var simplified = '-' + value.toLowerCase().replace(_nonWordRe, '') + '-', lenDiff = gramSize - simplified.length, results = []; if (lenDiff > 0) { for (var i = 0; i < lenDiff; ++i) { simplified += '-'; } } for (var i = 0; i < simplified.length - gramSize + 1; ++i) { results.push(simplified.slice(i, i + gramSize)); } return results; }; var _gramCounter = function(value, gramSize) { // return an object where key=gram, value=number of occurrences gramSize = gramSize || 2; var result = {}, grams = _iterateGrams(value, gramSize), i = 0; for (i; i < grams.length; ++i) { if (grams[i] in result) { result[grams[i]] += 1; } else { result[grams[i]] = 1; } } return result; }; // the main functions fuzzyset.get = function(value, defaultValue, minMatchScore) { // check for value in set, returning defaultValue or null if none found if (minMatchScore === undefined) { minMatchScore = .33; } var result = this._get(value, minMatchScore); if (!result && typeof defaultValue !== 'undefined') { return defaultValue; } return result; }; fuzzyset._get = function(value, minMatchScore) { var results = []; // start with high gram size and if there are no results, go to lower gram sizes for (var gramSize = this.gramSizeUpper; gramSize >= this.gramSizeLower; --gramSize) { results = this.__get(value, gramSize, minMatchScore); if (results && results.length > 0) { return results; } } return null; }; fuzzyset.__get = function(value, gramSize, minMatchScore) { var normalizedValue = this._normalizeStr(value), matches = {}, gramCounts = _gramCounter(normalizedValue, gramSize), items = this.items[gramSize], sumOfSquareGramCounts = 0, gram, gramCount, i, index, otherGramCount; for (gram in gramCounts) { gramCount = gramCounts[gram]; sumOfSquareGramCounts += Math.pow(gramCount, 2); if (gram in this.matchDict) { for (i = 0; i < this.matchDict[gram].length; ++i) { index = this.matchDict[gram][i][0]; otherGramCount = this.matchDict[gram][i][1]; if (index in matches) { matches[index] += gramCount * otherGramCount; } else { matches[index] = gramCount * otherGramCount; } } } } function isEmptyObject(obj) { for(var prop in obj) { if(obj.hasOwnProperty(prop)) return false; } return true; } if (isEmptyObject(matches)) { return null; } var vectorNormal = Math.sqrt(sumOfSquareGramCounts), results = [], matchScore; // build a results list of [score, str] for (var matchIndex in matches) { matchScore = matches[matchIndex]; results.push([matchScore / (vectorNormal * items[matchIndex][0]), items[matchIndex][1]]); } var sortDescending = function(a, b) { if (a[0] < b[0]) { return 1; } else if (a[0] > b[0]) { return -1; } else { return 0; } }; results.sort(sortDescending); if (this.useLevenshtein) { var newResults = [], endIndex = Math.min(50, results.length); // truncate somewhat arbitrarily to 50 for (var i = 0; i < endIndex; ++i) { newResults.push([_distance(results[i][1], normalizedValue), results[i][1]]); } results = newResults; results.sort(sortDescending); } newResults = []; results.forEach(function(scoreWordPair) { if (scoreWordPair[0] >= minMatchScore) { newResults.push([scoreWordPair[0], this.exactSet[scoreWordPair[1]]]); } }.bind(this)); return newResults; }; fuzzyset.add = function(value) { var normalizedValue = this._normalizeStr(value); if (normalizedValue in this.exactSet) { return false; } var i = this.gramSizeLower; for (i; i < this.gramSizeUpper + 1; ++i) { this._add(value, i); } }; fuzzyset._add = function(value, gramSize) { var normalizedValue = this._normalizeStr(value), items = this.items[gramSize] || [], index = items.length; items.push(0); var gramCounts = _gramCounter(normalizedValue, gramSize), sumOfSquareGramCounts = 0, gram, gramCount; for (gram in gramCounts) { gramCount = gramCounts[gram]; sumOfSquareGramCounts += Math.pow(gramCount, 2); if (gram in this.matchDict) { this.matchDict[gram].push([index, gramCount]); } else { this.matchDict[gram] = [[index, gramCount]]; } } var vectorNormal = Math.sqrt(sumOfSquareGramCounts); items[index] = [vectorNormal, normalizedValue]; this.items[gramSize] = items; this.exactSet[normalizedValue] = value; }; fuzzyset._normalizeStr = function(str) { if (Object.prototype.toString.call(str) !== '[object String]') throw 'Must use a string as argument to FuzzySet functions'; return str.toLowerCase(); }; // return length of items in set fuzzyset.length = function() { var count = 0, prop; for (prop in this.exactSet) { if (this.exactSet.hasOwnProperty(prop)) { count += 1; } } return count; }; // return is set is empty fuzzyset.isEmpty = function() { for (var prop in this.exactSet) { if (this.exactSet.hasOwnProperty(prop)) { return false; } } return true; }; // return list of values loaded into set fuzzyset.values = function() { var values = [], prop; for (prop in this.exactSet) { if (this.exactSet.hasOwnProperty(prop)) { values.push(this.exactSet[prop]); } } return values; }; // initialization var i = fuzzyset.gramSizeLower; for (i; i < fuzzyset.gramSizeUpp