UNPKG

closewords

Version:

A library for finding the most similar word from a list of words, supporting Japanese (including kanji). / 最も似た単語を単語群から検索する日本語(漢字含む)対応のライブラリ

183 lines (152 loc) 5.96 kB
const jaroWinkler = require("jaro-winkler"); const levenshtein = require("fast-levenshtein"); const { Worker } = require("worker_threads"); const path = require("path"); const isAlphabetOnly = require("./lib/src/isAlphabetOnly"); function convertToRomajiMultiThread(words) { return new Promise((resolve, reject) => { const worker = new Worker(path.resolve(__dirname, "./src/worker.js"), { workerData: { words, dicPath: path.resolve(__dirname, "./lib/dict") }, }); worker.on("message", (message) => { if (message.error) { reject(new Error(message.error)); } else { resolve(message); } }); worker.on("error", (err) => { reject(err); }); worker.on("exit", (code) => { reject(new Error(`Worker stopped unexpectedly with exit code ${code}`)); }); }); } /** * A candidate object with an optional pronounce property. * pronounce は任意で、アルファベット文字列のみを受け入れます。 * * @typedef {Object} Candidate * @property {string} word - Candidate word. / 候補単語 * @property {string} [pronounce] - Optional alphabetic string. / 任意のアルファベット文字列 */ /** * The result of a similarity comparison. * 類似度比較の結果を表します。 * * @typedef {Object} closeWordsResult * @property {string} word - Candidate word. / 候補単語 * @property {number} score - Similarity score. / 類似度スコア */ /** * Finds the closest strings in an array to the given word. * 与えられた単語に最も近い単語を候補リストから探します。 * * @async * @function closeWords * @param {string | Candidate} word - The reference word or object. / 比較対象の単語またはオブジェクト * @param {Array<string | Candidate>} candidates - Candidate words or objects. / 候補リスト * @param {boolean} [raw=false] - Whether to include similarity scores. / 類似度スコアを含むか * @returns {Promise<string[] | closeWordsResult[]>} The closest word(s) or detailed scores. / 最も類似した単語または詳細なスコア */ async function closeWords(word, candidates, raw = false) { return new Promise(async (resolve, reject) => { try { if (typeof word !== "string" && (typeof word !== "object" || !word.word)) throw new Error("word must be a string or an object with 'word'."); if ( typeof word === "object" && word.pronounce && !isAlphabetOnly(word.pronounce) ) throw new Error("word.pronounce must be an alphabetic string."); if ( !Array.isArray(candidates) || !candidates.every( (item) => typeof item === "string" || (typeof item === "object" && item.word) ) ) throw new Error( "Candidates must be an array of strings or objects with 'word'." ); if ( !candidates .filter((c) => typeof c === "object" && c.pronounce) .every((item) => isAlphabetOnly(item.pronounce)) ) throw new Error( "pronounces within candidates must be alphabetic strings." ); if (typeof raw !== "boolean") throw new Error("raw must be boolean."); const romajiWords = await convertToRomajiMultiThread([ word, ...candidates, ]); const romajiWord = romajiWords[0]; const romajiCandidates = romajiWords.slice(1); const searchWord = typeof word === "string" ? word : word.word; const baseLength = searchWord.length; const scores = candidates.map((candidate, index) => { const candidateWord = typeof candidate === "string" ? candidate : candidate.word; const candidateLength = candidateWord.length; const romajiScore = jaroWinkler(romajiWord, romajiCandidates[index]); const stringScore = 1 - levenshtein.get(searchWord, candidateWord) / Math.max(baseLength, candidateLength); // 部分一致 const commonSubstringLength = Math.min( searchWord.length, candidateWord.length, [...searchWord].filter((char, i) => char === candidateWord[i]).length ); const substringRatio = commonSubstringLength / Math.max(searchWord.length, candidateWord.length); // 漢字の一致率 const kanjiMatchCount = [...searchWord].filter((char) => candidateWord.includes(char) ).length; const kanjiRatio = kanjiMatchCount / Math.max(searchWord.length, candidateWord.length); // 特定の漢字一致 const exactKanjiBonus = searchWord === candidateWord ? 0.3 : kanjiRatio * 0.4; // 長さペナルティ const lengthPenalty = Math.max( 0.7, 1 - Math.abs(baseLength - candidateLength) / baseLength ); // 部分一致 const substringBonus = substringRatio > 0.5 ? substringRatio * 0.05 : 0; // スコア算出 const combinedScore = (romajiScore * 0.7 + stringScore * 0.2 + kanjiRatio * 0.1) * lengthPenalty + exactKanjiBonus + substringBonus; const finalScore = Math.min(combinedScore, 1); return { word: candidateWord, score: finalScore, }; }); scores.sort((a, b) => b.score - a.score); if (!raw) { const maxScore = scores[0]?.score; const result = scores .filter((item) => item.score === maxScore) .map((item) => item.word); resolve(result); } else { resolve(scores); } } catch (err) { reject(err); } }); } module.exports = { closeWords };