UNPKG

typo-ime

Version:

In-browser Pinyin-to-Hanzi IME with fuzzy search and bigram probabilistic suggestions

169 lines (152 loc) 5.79 kB
/**
 * @module TypoSM
 * @description
 * Web Worker script implementing a high-performance fuzzy search engine
 * for Pinyin-to-Hanzi conversion with probabilistic bigram context scoring.
 */

const DICTIONARY_URL = 'https://spotqing.pythonanywhere.com/static/dict.json';
const BIGRAM_URL =
  'https://spotqing.pythonanywhere.com/static/bigram_probabilities.json';

/**
 * Normalize text for matching: trim, strip digits (Pinyin tone numbers),
 * lowercase. This single helper is used for BOTH indexed entries and
 * incoming queries so the two sides can never drift apart.
 * @param {string} text - Raw input.
 * @returns {string} Normalized string (may be empty).
 */
function normalizeText(text) {
  return text.trim().replace(/[0-9]/g, '').toLowerCase();
}

// --- Levenshtein distance utility ---
/**
 * Compute the Levenshtein edit distance between two strings.
 * Uses a two-row rolling buffer (O(a) memory) instead of the full
 * O(a*b) matrix — this runs once per candidate record inside the
 * search loop, so the allocation savings are significant.
 * @param {string} a - First string.
 * @param {string} b - Second string.
 * @returns {number} Edit distance.
 */
function levenshteinDistance(a, b) {
  if (a === b) return 0;
  if (a.length === 0) return b.length;
  if (b.length === 0) return a.length;

  let prev = Array.from({ length: a.length + 1 }, (_, i) => i);
  let curr = new Array(a.length + 1).fill(0);

  for (let j = 1; j <= b.length; j++) {
    curr[0] = j;
    for (let i = 1; i <= a.length; i++) {
      const cost = a[i - 1] === b[j - 1] ? 0 : 1;
      curr[i] = Math.min(
        curr[i - 1] + 1,      // insertion
        prev[i] + 1,          // deletion
        prev[i - 1] + cost    // substitution
      );
    }
    [prev, curr] = [curr, prev]; // roll the rows
  }
  return prev[a.length];
}

// --- Global worker state ---
/** @type {Array<object>} Original dictionary records. */
let dictionary = [];
/** @type {Object<string,Object<string,number>>} Bigram probability map. */
let bigramProbabilities = {};
/**
 * Search index for fast lookups by key and normalized length.
 * Format: { pinyin: { length: [record, ...] }, hanzi: { length: [...] } }
 * @type {Object<string, Object<number, Array<object>>>}
 */
let searchIndex = {};

// --- Initialization: load and index data ---
/**
 * Fetches dictionary and bigram data, pre-processes strings,
 * and builds a length-based search index for each field.
 * @async
 * @throws {Error} If fetch responses are not OK.
 */
async function initialize() {
  console.log('[Worker] Loading data...');
  const [dictResponse, bigramResponse] = await Promise.all([
    fetch(DICTIONARY_URL),
    fetch(BIGRAM_URL)
  ]);
  if (!dictResponse.ok || !bigramResponse.ok) {
    throw new Error('Failed to fetch worker data.');
  }
  dictionary = await dictResponse.json();
  bigramProbabilities = await bigramResponse.json();

  console.log('[Worker] Pre-processing and indexing data...');
  searchIndex = { pinyin: {}, hanzi: {} };

  for (const record of dictionary) {
    for (const key of ['pinyin', 'hanzi']) {
      const target = record[key];
      if (typeof target !== 'string' || !target) continue;

      // 1. Pre-calc: normalize once, cached on the record.
      const searchableTarget = normalizeText(target);
      if (!searchableTarget) continue;
      record[`searchable_${key}`] = searchableTarget;

      // 2. Index by normalized length so searches only scan buckets
      //    within the configured length tolerance.
      const len = searchableTarget.length;
      if (!searchIndex[key][len]) searchIndex[key][len] = [];
      searchIndex[key][len].push(record);
    }
  }
  console.log('[Worker] Indexing complete.');
}

// --- Optimized search logic ---
/**
 * Perform a fuzzy search over indexed records using Levenshtein distance
 * and optional bigram context scoring.
 * @param {string} query - Search fragment (normalized internally).
 * @param {object} options - { key: 'pinyin'|'hanzi', threshold?: number, lengthTolerance?: number }
 * @param {string|null} previousWordHanzi - Optional bigram context.
 * @returns {Array<{ item: object, score: number, bigramScore: number }>} Sorted results.
 */
function performSearch(query, options, previousWordHanzi) {
  const key = options.key;
  const threshold = options.threshold ?? 0.6;
  const lengthTolerance = options.lengthTolerance ?? 2;

  if (!query || !key || !searchIndex[key]) return [];

  // FIX: normalize the query exactly like the indexed entries (trim,
  // strip tone digits, lowercase). Previously the query was only
  // lowercased, so a tone-numbered query such as "ni3" could never
  // match the digit-stripped index and also fell into the wrong
  // length bucket.
  const normalizedQuery = normalizeText(query);
  const queryLength = normalizedQuery.length;
  if (queryLength === 0) return [];

  const hasBigramContext = !!(
    previousWordHanzi && bigramProbabilities[previousWordHanzi]
  );
  const results = [];

  // Iterate only over length buckets within tolerance of the query.
  for (
    let len = Math.max(1, queryLength - lengthTolerance);
    len <= queryLength + lengthTolerance;
    len++
  ) {
    const bucket = searchIndex[key][len];
    if (!bucket) continue;

    for (const record of bucket) {
      const hanzi = record.hanzi;
      if (!hanzi) continue;

      const target = record[`searchable_${key}`];
      const distance = levenshteinDistance(normalizedQuery, target);
      // Similarity in [0, 1]: 1 = exact match.
      const score = 1 - distance / Math.max(queryLength, target.length);

      if (score >= threshold) {
        const bigramScore = hasBigramContext
          ? bigramProbabilities[previousWordHanzi]?.[hanzi] || 0
          : 0;
        results.push({ item: record, score, bigramScore });
      }
    }
  }

  // Primary sort: edit-distance similarity; tie-break: bigram context.
  return results.sort((a, b) => {
    if (b.score !== a.score) return b.score - a.score;
    return b.bigramScore - a.bigramScore;
  });
}

// --- Worker message handling ---
self.onmessage = async (event) => {
  const { type, payload, searchId } = event.data;

  switch (type) {
    case 'init':
      try {
        await initialize();
        self.postMessage({ type: 'init_success' });
      } catch (err) {
        console.error('[Worker] Init error:', err);
        self.postMessage({ type: 'init_error', error: err.message });
      }
      break;

    case 'search': {
      const { query, options, previousWordHanzi } = payload;
      const results = performSearch(query, options, previousWordHanzi);
      self.postMessage({ type: 'search_results', results, searchId });
      break;
    }
  }
};