UNPKG

kawesearch

Version:

This repository contains a fuzzy search library that provides a flexible way to search and match strings using various fuzzy matching algorithms. It supports multiple languages and allows for customization of search options such as algorithms, thresholds, and more.

533 lines (448 loc) 16.9 kB
/**
 * Levenshtein edit distance: the minimum number of single-character
 * insertions, deletions and substitutions needed to turn `a` into `b`.
 *
 * @param {string} a - First string.
 * @param {string} b - Second string.
 * @returns {number} Non-negative integer distance (0 when the strings are equal).
 */
function levenshtein(a, b) {
  if (a === b) return 0;
  if (a.length === 0) return b.length;
  if (b.length === 0) return a.length;

  // Only the previous row of the DP table is needed to compute the next one.
  let previousRow = Array.from({ length: b.length + 1 }, (_, j) => j);
  for (let i = 1; i <= a.length; i++) {
    const currentRow = [i];
    for (let j = 1; j <= b.length; j++) {
      const substitutionCost = a[i - 1] === b[j - 1] ? 0 : 1;
      currentRow[j] = Math.min(
        previousRow[j] + 1, // deletion
        currentRow[j - 1] + 1, // insertion
        previousRow[j - 1] + substitutionCost, // substitution / match
      );
    }
    previousRow = currentRow;
  }
  return previousRow[b.length];
}

/**
 * Cosine similarity between the character-frequency vectors of two strings.
 *
 * @param {string} str1 - First string.
 * @param {string} str2 - Second string.
 * @returns {number} Value in [0, 1]; 0 when either string is empty.
 */
function cosineSimilarity(str1, str2) {
  // A Map keeps arbitrary characters away from Object.prototype lookups.
  const vectorize = (str) => {
    const counts = new Map();
    for (const char of str) counts.set(char, (counts.get(char) ?? 0) + 1);
    return counts;
  };
  const dotProduct = (vec1, vec2) => {
    let sum = 0;
    for (const [char, count] of vec1) sum += count * (vec2.get(char) ?? 0);
    return sum;
  };
  const magnitude = (vec) => {
    let sumOfSquares = 0;
    for (const count of vec.values()) sumOfSquares += count * count;
    return Math.sqrt(sumOfSquares);
  };

  const vec1 = vectorize(str1);
  const vec2 = vectorize(str2);
  const mag1 = magnitude(vec1);
  const mag2 = magnitude(vec2);
  return mag1 === 0 || mag2 === 0 ? 0 : dotProduct(vec1, vec2) / (mag1 * mag2);
}

/**
 * TF-IDF weighted cosine similarity between two whitespace-tokenised strings.
 *
 * The two strings are treated as a two-document corpus. Term frequency is the
 * relative frequency of a word inside its string; inverse document frequency
 * is smoothed as `ln((1 + N) / (1 + df)) + 1` so that words shared by both
 * strings still contribute (an unsmoothed IDF would be 0 for every shared
 * word). Matching is case-sensitive and on whole words.
 *
 * @param {string} str1 - First string.
 * @param {string} str2 - Second string.
 * @returns {number} Value in [0, 1]; 0 when either string has no words.
 */
function tfIdfSimilarity(str1, str2) {
  const DOCUMENT_COUNT = 2;
  const tokenize = (str) => str.split(/\s+/).filter((token) => token !== '');
  const termFrequency = (tokens) => {
    const counts = new Map();
    for (const token of tokens) counts.set(token, (counts.get(token) ?? 0) + 1);
    const frequencies = new Map();
    for (const [token, count] of counts) frequencies.set(token, count / tokens.length);
    return frequencies;
  };

  const tokens1 = tokenize(str1);
  const tokens2 = tokenize(str2);
  if (tokens1.length === 0 || tokens2.length === 0) return 0;

  const tf1 = termFrequency(tokens1);
  const tf2 = termFrequency(tokens2);

  let dot = 0;
  let squaredNorm1 = 0;
  let squaredNorm2 = 0;
  for (const term of new Set([...tf1.keys(), ...tf2.keys()])) {
    const documentFrequency = (tf1.has(term) ? 1 : 0) + (tf2.has(term) ? 1 : 0);
    const idf = Math.log((1 + DOCUMENT_COUNT) / (1 + documentFrequency)) + 1;
    const weight1 = (tf1.get(term) ?? 0) * idf;
    const weight2 = (tf2.get(term) ?? 0) * idf;
    dot += weight1 * weight2;
    squaredNorm1 += weight1 * weight1;
    squaredNorm2 += weight2 * weight2;
  }
  return dot / Math.sqrt(squaredNorm1 * squaredNorm2);
}

/**
 * Smith-Waterman local alignment score, normalised to a similarity.
 *
 * Scoring: match +2, mismatch -1, gap -1. The best local score is divided by
 * the score of a perfect alignment of the longer string, so identical strings
 * yield exactly 1.
 *
 * @param {string} str1 - First string.
 * @param {string} str2 - Second string.
 * @returns {number} Value in [0, 1]; 0 when either string is empty.
 */
function smithWaterman(str1, str2) {
  const MATCH = 2;
  const MISMATCH = -1;
  const GAP = -1;
  if (str1.length === 0 || str2.length === 0) return 0;

  let previousRow = new Array(str2.length + 1).fill(0);
  let maxScore = 0;
  for (let i = 1; i <= str1.length; i++) {
    const currentRow = [0];
    for (let j = 1; j <= str2.length; j++) {
      const diagonal = previousRow[j - 1] + (str1[i - 1] === str2[j - 1] ? MATCH : MISMATCH);
      const up = previousRow[j] + GAP;
      const left = currentRow[j - 1] + GAP;
      const cell = Math.max(0, diagonal, up, left);
      currentRow[j] = cell;
      if (cell > maxScore) maxScore = cell;
    }
    previousRow = currentRow;
  }
  return maxScore / (MATCH * Math.max(str1.length, str2.length));
}

// Upper bound on cached results per function; the oldest entry is dropped first.
const MEMO_ENTRY_LIMIT = 5000;

// Cache for memoizedSimilarity: function -> (JSON-encoded arguments -> result).
const memo = new Map();

/**
 * Calls `func(...args)` and caches the result per function and argument list.
 *
 * Results are keyed by function identity and the JSON encoding of the
 * arguments, so argument lists such as ("a,b", "c") and ("a", "b,c") do not
 * share an entry, and neither do two different anonymous functions.
 *
 * @param {Function} func - Pure function to memoize.
 * @param {...*} args - JSON-serialisable arguments passed to `func`.
 * @returns {*} The (possibly cached) result of `func(...args)`.
 */
function memoizedSimilarity(func, ...args) {
  let cache = memo.get(func);
  if (cache === undefined) {
    cache = new Map();
    memo.set(func, cache);
  }
  const key = JSON.stringify(args);
  if (cache.has(key)) return cache.get(key);

  const result = func(...args);
  if (cache.size >= MEMO_ENTRY_LIMIT) {
    // Maps iterate in insertion order, so the first key is the oldest one.
    cache.delete(cache.keys().next().value);
  }
  cache.set(key, result);
  return result;
}

/**
 * Damerau-Levenshtein distance (optimal string alignment variant): like
 * Levenshtein, but swapping two adjacent characters counts as one edit.
 *
 * @param {string} a - First string.
 * @param {string} b - Second string.
 * @returns {number} Non-negative integer distance.
 */
function damerauLevenshtein(a, b) {
  const dp = Array.from({ length: a.length + 1 }, () => new Array(b.length + 1).fill(0));
  for (let i = 0; i <= a.length; i++) dp[i][0] = i;
  for (let j = 0; j <= b.length; j++) dp[0][j] = j;

  for (let i = 1; i <= a.length; i++) {
    for (let j = 1; j <= b.length; j++) {
      const cost = a.charCodeAt(i - 1) === b.charCodeAt(j - 1) ? 0 : 1;
      dp[i][j] = Math.min(dp[i - 1][j] + 1, dp[i][j - 1] + 1, dp[i - 1][j - 1] + cost);

      const isAdjacentSwap =
        i > 1 &&
        j > 1 &&
        a.charCodeAt(i - 1) === b.charCodeAt(j - 2) &&
        a.charCodeAt(i - 2) === b.charCodeAt(j - 1);
      if (isAdjacentSwap) {
        // `cost` is 1 whenever the swapped characters differ; when all four
        // characters are equal the swap is a no-op and costs nothing.
        dp[i][j] = Math.min(dp[i][j], dp[i - 2][j - 2] + cost);
      }
    }
  }
  return dp[a.length][b.length];
}

/**
 * Jaro-Winkler similarity.
 *
 * Computes the Jaro similarity (matches within a sliding window, penalised by
 * half the number of matched characters that appear in a different order) and
 * boosts it by 0.1 per character of the actual common prefix, up to 4
 * characters.
 *
 * @param {string} a - First string.
 * @param {string} b - Second string.
 * @returns {number} Value in [0, 1]; 0 when the strings share no matching
 *   characters (including when either is empty).
 */
function jaroWinkler(a, b) {
  const PREFIX_SCALE = 0.1;
  const MAX_PREFIX_LENGTH = 4;

  const jaro = (s, t) => {
    // The window must never be negative, otherwise 1-character strings can
    // never match each other.
    const matchWindow = Math.max(0, Math.floor(Math.max(s.length, t.length) / 2) - 1);
    const sMatched = new Array(s.length).fill(false);
    const tMatched = new Array(t.length).fill(false);

    let matches = 0;
    for (let i = 0; i < s.length; i++) {
      const from = Math.max(0, i - matchWindow);
      const to = Math.min(t.length, i + matchWindow + 1);
      for (let j = from; j < to; j++) {
        if (!tMatched[j] && s[i] === t[j]) {
          sMatched[i] = true;
          tMatched[j] = true;
          matches++;
          break;
        }
      }
    }
    if (matches === 0) return 0;

    // Walk the matched characters of both strings in order; every pair that
    // differs is half of a transposition.
    let outOfOrder = 0;
    let k = 0;
    for (let i = 0; i < s.length; i++) {
      if (!sMatched[i]) continue;
      while (!tMatched[k]) k++;
      if (s[i] !== t[k]) outOfOrder++;
      k++;
    }
    const transpositions = outOfOrder / 2;
    return (matches / s.length + matches / t.length + (matches - transpositions) / matches) / 3;
  };

  const jaroSimilarity = jaro(a, b);

  let commonPrefix = 0;
  const prefixLimit = Math.min(a.length, b.length, MAX_PREFIX_LENGTH);
  while (commonPrefix < prefixLimit && a[commonPrefix] === b[commonPrefix]) commonPrefix++;

  return jaroSimilarity + commonPrefix * PREFIX_SCALE * (1 - jaroSimilarity);
}

/**
 * American Soundex code of a word: the first letter followed by three digits.
 *
 * Vowels (and y) are not encoded but separate repeated consonant codes; h and
 * w are skipped entirely, so equal codes on both sides of them collapse.
 * Characters outside a-z are ignored.
 *
 * @param {string} word - Word to encode.
 * @returns {string} Four-character upper-case code, or '' when `word`
 *   contains no ASCII letters.
 */
function soundex(word) {
  const soundexMapping = {
    a: '0', e: '0', i: '0', o: '0', u: '0', y: '0',
    b: '1', f: '1', p: '1', v: '1',
    c: '2', g: '2', j: '2', k: '2', q: '2', s: '2', x: '2', z: '2',
    d: '3', t: '3',
    l: '4',
    m: '5', n: '5',
    r: '6',
  };
  const letters = word.toLowerCase().replace(/[^a-z]/g, '');
  if (letters === '') return '';

  let result = letters[0].toUpperCase();
  // The first letter's own code takes part in duplicate suppression.
  let prevCode = soundexMapping[letters[0]] ?? '';
  for (let i = 1; i < letters.length && result.length < 4; i++) {
    const char = letters[i];
    if (char === 'h' || char === 'w') continue;
    const code = soundexMapping[char];
    if (code !== '0' && code !== prevCode) result += code;
    prevCode = code;
  }
  return result.padEnd(4, '0');
}

/**
 * Simplified Metaphone phonetic key of a word.
 *
 * Applies the commonly published Metaphone rule list (silent initial letters,
 * C/G softening, PH -> F, TH -> 0, dropped non-initial vowels, ...). This is
 * an approximation of the full algorithm, intended for comparing two keys
 * produced by this same function. Characters outside a-z are ignored.
 *
 * @param {string} word - Word to encode.
 * @returns {string} Lower-case phonetic key ('' when `word` has no ASCII letters).
 */
function metaphone(word) {
  const isOneOf = (ch, set) => ch !== '' && set.includes(ch);
  const isVowel = (ch) => isOneOf(ch, 'aeiou');

  let text = String(word).toLowerCase().replace(/[^a-z]/g, '');
  if (text === '') return '';

  // Word-initial exceptions.
  if (/^(ae|gn|kn|pn|wr)/.test(text)) {
    text = text.slice(1);
  } else if (text[0] === 'x') {
    text = `s${text.slice(1)}`;
  } else if (text.startsWith('wh')) {
    text = `w${text.slice(2)}`;
  }

  const last = text.length - 1;
  let code = '';
  for (let i = 0; i < text.length; i++) {
    const ch = text[i];
    const prev = text[i - 1] ?? '';
    const next = text[i + 1] ?? '';
    const afterNext = text[i + 2] ?? '';

    // Doubled letters sound once, except "cc".
    if (ch === prev && ch !== 'c') continue;

    switch (ch) {
      case 'a':
      case 'e':
      case 'i':
      case 'o':
      case 'u':
        if (i === 0) code += ch;
        break;
      case 'b':
        if (!(prev === 'm' && i === last)) code += 'b';
        break;
      case 'c':
        if (prev === 's' && isOneOf(next, 'eiy')) break; // "sci", "sce", "scy"
        if (next === 'i' && afterNext === 'a') code += 'x';
        else if (next === 'h') code += prev === 's' ? 'k' : 'x';
        else if (isOneOf(next, 'eiy')) code += 's';
        else code += 'k';
        break;
      case 'd':
        code += next === 'g' && isOneOf(afterNext, 'eiy') ? 'j' : 't';
        break;
      case 'g':
        if (prev === 'd' && isOneOf(next, 'eiy')) break; // already emitted as "j" by the "d"
        if (next === 'h' && !(i + 1 === last || isVowel(afterNext))) break; // silent "gh"
        if (next === 'n' && (i + 1 === last || text.slice(i + 1) === 'ned')) break; // "-gn", "-gned"
        code += isOneOf(next, 'eiy') ? 'j' : 'k';
        break;
      case 'h':
        if (isVowel(next) && !isOneOf(prev, 'cspgt')) code += 'h';
        break;
      case 'k':
        if (prev !== 'c') code += 'k';
        break;
      case 'p':
        code += next === 'h' ? 'f' : 'p';
        break;
      case 'q':
        code += 'k';
        break;
      case 's':
        code += next === 'h' || (next === 'i' && isOneOf(afterNext, 'oa')) ? 'x' : 's';
        break;
      case 't':
        if (next === 'i' && isOneOf(afterNext, 'oa')) code += 'x';
        else if (next === 'h') code += '0';
        else if (!(next === 'c' && afterNext === 'h')) code += 't';
        break;
      case 'v':
        code += 'f';
        break;
      case 'w':
      case 'y':
        if (isVowel(next)) code += ch;
        break;
      case 'x':
        code += 'ks';
        break;
      case 'z':
        code += 's';
        break;
      default:
        // f, j, l, m, n, r keep their own sound.
        code += ch;
    }
  }
  return code;
}
/**
 * Reads `table[key]` only when `key` is an own property, so inherited names
 * such as "constructor" never leak out of a dictionary lookup.
 *
 * @param {Object|null|undefined} table - Plain-object dictionary.
 * @param {string} key - Key to look up.
 * @param {*} fallback - Value returned when the key is absent.
 * @returns {*} The stored value or `fallback`.
 */
function ownValue(table, key, fallback) {
  return table != null && Object.prototype.hasOwnProperty.call(table, key) ? table[key] : fallback;
}

/**
 * Jaccard similarity between the sets of characters of two strings.
 *
 * @param {string} str1 - First string.
 * @param {string} str2 - Second string.
 * @returns {number} |intersection| / |union| in [0, 1]; 1 when both strings are empty.
 */
function jaccardSimilarity(str1, str2) {
  const set1 = new Set(str1.split(""));
  const set2 = new Set(str2.split(""));
  let intersection = 0;
  for (const char of set1) {
    if (set2.has(char)) intersection++;
  }
  const union = set1.size + set2.size - intersection;
  // The union is empty only when both strings are empty; avoid 0 / 0.
  if (union === 0) return 1;
  return intersection / union;
}

/**
 * Jaccard similarity between the sets of character n-grams of two strings.
 *
 * @param {string} str1 - First string.
 * @param {string} str2 - Second string.
 * @param {number} [n=2] - N-gram length.
 * @returns {number} Value in [0, 1]. When neither string is long enough to
 *   produce an n-gram the result falls back to exact equality (1 or 0).
 */
function ngramSimilarity(str1, str2, n = 2) {
  const getNGrams = (str) => {
    const grams = new Set();
    for (let i = 0; i <= str.length - n; i++) grams.add(str.substring(i, i + n));
    return grams;
  };
  const ngrams1 = getNGrams(str1);
  const ngrams2 = getNGrams(str2);
  const union = new Set([...ngrams1, ...ngrams2]).size;
  if (union === 0) return str1 === str2 ? 1 : 0;

  let intersection = 0;
  for (const gram of ngrams1) {
    if (ngrams2.has(gram)) intersection++;
  }
  return intersection / union;
}

/**
 * Turns an edit distance into a similarity: 1 - distance / longest length.
 * Two empty strings are reported as identical instead of dividing by zero.
 */
function editSimilarity(distanceFn, query, word) {
  const longest = Math.max(query.length, word.length);
  if (longest === 0) return 1;
  return 1 - memoizedSimilarity(distanceFn, query, word) / longest;
}

/**
 * Compares the phonetic codes of both inputs. An empty code (input without
 * encodable letters) never counts as a match.
 */
function phoneticSimilarity(encode, query, word) {
  const queryCode = encode(query);
  return queryCode !== '' && queryCode === encode(word) ? 1 : 0;
}

// Algorithm name -> (query, word) => similarity, where higher means more alike.
const algorithmsMap = {
  'levenshtein': (query, word) => editSimilarity(levenshtein, query, word),
  'damerau-levenshtein': (query, word) => editSimilarity(damerauLevenshtein, query, word),
  'jaro-winkler': (query, word) => jaroWinkler(query, word),
  'soundex': (query, word) => phoneticSimilarity(soundex, query, word),
  'metaphone': (query, word) => phoneticSimilarity(metaphone, query, word),
  'jaccard': (query, word) => jaccardSimilarity(query, word),
  'ngram': (query, word) => ngramSimilarity(query, word, 2),
  'cosine': (query, word) => cosineSimilarity(query, word),
  'tf-idf': (query, word) => tfIdfSimilarity(query, word),
  'smith-waterman': (query, word) => smithWaterman(query, word),
};

/**
 * Decides whether `word` is a fuzzy match for `query`.
 *
 * The pair matches when at least one of the selected algorithms reports a
 * similarity greater than or equal to `threshold`. Unknown algorithm names
 * are ignored.
 *
 * @param {string} query - Search text.
 * @param {string} word - Candidate text.
 * @param {number} [threshold=0.8] - Minimum similarity for a match.
 * @param {string[]|string} [algorithms] - One algorithm name or a list of
 *   names (keys of `algorithmsMap`).
 * @param {Object} [customParams] - Accepted for compatibility; currently unused.
 * @returns {boolean} True when any selected algorithm reaches the threshold.
 */
function fuzzyMatch(query, word, threshold = 0.8, algorithms = ["damerau-levenshtein"], customParams = {}) {
  const selected = Array.isArray(algorithms) ? algorithms : [algorithms];
  return selected.some((name) => {
    const similarityOf = ownValue(algorithmsMap, name, undefined);
    return typeof similarityOf === 'function' && similarityOf(query, word) >= threshold;
  });
}

// Built-in user-facing messages, keyed by language code.
const messages = {
  tr: { suggest: "Bunu mu demek istediniz?", noResults: "Sonuç bulunamadı." },
  en: { suggest: "Did you mean this?", noResults: "No results found." },
  de: { suggest: "Meinten Sie das?", noResults: "Keine Ergebnisse gefunden." },
  az: { suggest: "Bu sözü demək istədiyinizə əminsiniz?", noResults: "Heç bir nəticə tapılmadı." },
  fr: { suggest: "Vouliez-vous dire ceci?", noResults: "Aucun résultat trouvé." },
  es: { suggest: "¿Quisiste decir esto?", noResults: "No se encontraron resultados." },
  it: { suggest: "Volevi dire questo?", noResults: "Nessun risultato trovato." },
  ru: { suggest: "Вы имели в виду это?", noResults: "Результатов не найдено." },
  pt: { suggest: "Quis dizer isto?", noResults: "Nenhum resultado encontrado." },
  ar: { suggest: "هل كنت تعني هذا؟", noResults: "لم يتم العثور على نتائج." },
};

/**
 * In-memory search over a list of items with `id`, `name` and optional `tags`.
 *
 * `search()` performs a debounced, cancellable, case-insensitive substring
 * search and, when nothing matches and suggestions are enabled, returns
 * "did you mean" suggestions based on edit distance. Results are kept in a
 * small LRU cache with a time-to-live.
 */
class Search {
  /**
   * @param {Array<{id: *, name?: string, tags?: string[]}>} data - Items to search.
   * @param {Object<string, string[]>} [synonyms] - Word -> list of synonyms.
   * @param {Object<string, number>} [synonymUsageFrequency] - Synonym -> weight
   *   used to order synonyms (missing entries weigh 1).
   * @param {Object} [options]
   * @param {string} [options.language="en"] - Language code for messages.
   * @param {string[]|string} [options.algorithm=["levenshtein"]]
   * @param {number} [options.threshold=0.8]
   * @param {boolean} [options.suggestOnNoMatch=true]
   * @param {number} [options.suggestionThreshold=0.5]
   * @param {?Function} [options.customSearch] - `(query, word) => boolean` override for `_match`.
   * @param {Object} [options.customMessages] - Language code -> message overrides.
   * @param {number} [options.debounceDelay=300] - Milliseconds.
   * @param {number} [options.cacheSize=100] - Maximum number of cached entries.
   * @param {number} [options.timeout=5000] - Milliseconds before `search()` rejects.
   * @param {number} [options.cacheTTL=60000] - Milliseconds a cached entry stays valid.
   */
  constructor(data, synonyms, synonymUsageFrequency = {}, options = {}) {
    this.data = data ?? [];
    this.synonyms = synonyms ?? {};
    this.synonymUsageFrequency = synonymUsageFrequency || {};
    this.language = options.language || "en";
    this.options = {
      algorithm: options.algorithm || ["levenshtein"],
      // `??` keeps an explicit 0 for the settings where 0 is meaningful.
      threshold: options.threshold ?? 0.8,
      suggestOnNoMatch: options.suggestOnNoMatch ?? true,
      suggestionThreshold: options.suggestionThreshold ?? 0.5,
      customSearch: options.customSearch || null,
      customMessages: options.customMessages || {},
      debounceDelay: options.debounceDelay ?? 300,
      cacheSize: options.cacheSize || 100,
      timeout: options.timeout || 5000,
      cacheTTL: options.cacheTTL || 60000,
    };
    this._initializeMessages();
    this.index = this.createIndex(this.data);
    this.lruCache = new Map();
    this.cacheExpiry = new Map();
    this.abortController = null;
  }

  /**
   * Builds `this.messages` from the built-in messages for the configured
   * language (English when the language is not bundled), overridden by any
   * custom messages supplied for that language.
   */
  _initializeMessages() {
    const builtIn = ownValue(messages, this.language, messages.en);
    const overrides = ownValue(this.options.customMessages, this.language, {});
    this.messages = { ...builtIn, ...overrides };
  }

  /** Returns the non-empty string fields (name and tags) of an item. */
  _searchableFields(item) {
    const tags = Array.isArray(item.tags) ? item.tags : [];
    return [item.name, ...tags].filter((field) => typeof field === 'string' && field !== '');
  }

  /**
   * Returns `word` followed by its synonyms, most frequently used first.
   *
   * @param {string} word
   * @returns {string[]} `[word, ...synonyms]`, or `[word]` for an empty word.
   */
  _resolveSynonyms(word) {
    if (!word) return [word];
    const cacheKey = `synonym:${word}`;
    const cached = this._getFromCache(cacheKey);
    if (cached !== null) return cached;

    const synonyms = ownValue(this.synonyms, word, []);
    const weightedSynonyms = synonyms
      .map((synonym) => ({ word: synonym, weight: ownValue(this.synonymUsageFrequency, synonym, 1) ?? 1 }))
      .sort((a, b) => b.weight - a.weight)
      .map((entry) => entry.word);
    const result = [word, ...weightedSynonyms];
    this._setToCache(cacheKey, result);
    return result;
  }

  /**
   * Splits text into overlapping 3-character substrings after lower-casing
   * and removing whitespace.
   *
   * @param {string} text
   * @returns {string[]} Trigrams in order of appearance (empty for short text).
   */
  generateTrigrams(text) {
    const sanitizedText = text.toLowerCase().replace(/\s+/g, '');
    const trigrams = [];
    for (let i = 0; i + 3 <= sanitizedText.length; i++) {
      trigrams.push(sanitizedText.substring(i, i + 3));
    }
    return trigrams;
  }

  /**
   * Builds an inverted index from lower-cased words to the ids of the items
   * whose name or tags contain them.
   *
   * @param {Array<Object>} data - Items with `id`, `name` and optional `tags`.
   * @returns {Object<string, Set>} Prototype-less object mapping word -> Set of ids.
   */
  createIndex(data) {
    // No prototype, so words such as "constructor" are ordinary keys.
    const invertedIndex = Object.create(null);
    for (const item of data) {
      for (const field of this._searchableFields(item)) {
        for (const word of field.toLowerCase().split(/\s+/)) {
          if (word === '') continue;
          if (!invertedIndex[word]) invertedIndex[word] = new Set();
          invertedIndex[word].add(item.id);
        }
      }
    }
    return invertedIndex;
  }

  /**
   * Debounced, cancellable search.
   *
   * Starting a new search aborts the previous pending one, which then rejects
   * immediately. The call rejects when `options.timeout` elapses first. All
   * timers are cleared once the call settles.
   *
   * @param {string} query
   * @returns {Promise<Array<Object>|{message: string, suggestions: Array<Object>}>}
   *   Matching items, or a suggestion object when nothing matched and
   *   suggestions are enabled.
   * @throws {Error} On timeout, cancellation, or a failure inside the search.
   */
  async search(query) {
    if (this.abortController) {
      this.abortController.abort();
    }
    const controller = new AbortController();
    this.abortController = controller;
    const { signal } = controller;

    let timeoutId;
    let debounceId;

    const timeoutPromise = new Promise((_, reject) => {
      timeoutId = setTimeout(() => {
        reject(new Error("Zaman aşımına uğradı"));
      }, this.options.timeout);
    });

    const searchPromise = new Promise((resolve, reject) => {
      const onAbort = () => {
        clearTimeout(debounceId);
        reject(new Error("İşlem iptal edildi"));
      };
      signal.addEventListener('abort', onAbort, { once: true });
      debounceId = setTimeout(() => {
        signal.removeEventListener('abort', onAbort);
        this._performSearch(query).then(resolve, reject);
      }, this.options.debounceDelay);
    });

    try {
      return await Promise.race([searchPromise, timeoutPromise]);
    } catch (error) {
      console.error("An error occurred during the search:", error);
      throw error;
    } finally {
      // Without this the pending timeout keeps the event loop alive.
      clearTimeout(timeoutId);
      clearTimeout(debounceId);
    }
  }

  /**
   * Case-insensitive substring search over item names and tags.
   *
   * @param {string} query
   * @returns {Promise<Array<Object>|{message: string, suggestions: Array<Object>}>}
   *   `[]` for an empty query; matching items; or the result of `_suggest`
   *   when nothing matched and `suggestOnNoMatch` is enabled.
   */
  async _performSearch(query) {
    if (!query) return [];
    // Namespaced so a raw query can never clash with "synonym:"/"suggestion:" entries.
    const cacheKey = `search:${query}`;
    const cached = this._getFromCache(cacheKey);
    if (cached !== null) return cached;

    const resolvedQuery = query.toLowerCase();
    const results = this.data.filter((item) =>
      this._searchableFields(item).some((field) => field.toLowerCase().includes(resolvedQuery)),
    );

    if (results.length === 0 && this.options.suggestOnNoMatch) {
      return this._suggest(query);
    }
    this._setToCache(cacheKey, results);
    return results;
  }

  /**
   * Finds items whose best field is at least `suggestionThreshold` similar to
   * the query (similarity = 1 - distance / longest length).
   *
   * @param {string} query
   * @returns {{message: string, suggestions: Array<Object>}} Suggested items
   *   in data order, with the matching user-facing message.
   */
  _suggest(query) {
    const cacheKey = `suggestion:${query}`;
    const cached = this._getFromCache(cacheKey);
    if (cached !== null) return cached;

    const resolvedQuery = query.toLowerCase();
    const suggestions = this.data.filter((item) => {
      let bestSimilarity = 0;
      let hasCandidate = false;
      for (const field of this._searchableFields(item)) {
        const resolvedField = field.toLowerCase();
        const distance = this._calculateDistance(resolvedQuery, resolvedField);
        const similarity = 1 - distance / Math.max(resolvedQuery.length, resolvedField.length);
        if (similarity > bestSimilarity) {
          bestSimilarity = similarity;
          hasCandidate = true;
        }
      }
      return hasCandidate && bestSimilarity >= this.options.suggestionThreshold;
    });

    const result = {
      message: suggestions.length > 0 ? this.messages.suggest : this.messages.noResults,
      suggestions,
    };
    this._setToCache(cacheKey, result);
    return result;
  }

  /**
   * Tests whether `word` matches `query`, using `options.customSearch` when
   * provided and `fuzzyMatch` with the configured algorithm(s) otherwise.
   *
   * @param {string} query
   * @param {string} word
   * @returns {boolean|*} Result of the custom search or of `fuzzyMatch`.
   */
  _match(query, word) {
    if (this.options.customSearch) {
      return this.options.customSearch(query, word);
    }
    return fuzzyMatch(query, word, this.options.threshold, this.options.algorithm);
  }

  /**
   * Edit distance used for suggestions.
   *
   * NOTE(review): the comparison only succeeds when `options.algorithm` is
   * the bare string "levenshtein"; the default (an array) and every other
   * value fall through to Damerau-Levenshtein. Confirm whether array
   * configurations should also select `levenshtein`.
   *
   * @param {string} query
   * @param {string} word
   * @returns {number} Edit distance between the two strings.
   */
  _calculateDistance(query, word) {
    return this.options.algorithm === "levenshtein"
      ? levenshtein(query, word)
      : damerauLevenshtein(query, word);
  }

  /**
   * Stores a value in the LRU cache and stamps its expiry time.
   * Overwriting an existing key refreshes it; otherwise the least recently
   * used entry is evicted once `cacheSize` is reached.
   *
   * @param {string} key
   * @param {*} value
   */
  _setToCache(key, value) {
    if (this.lruCache.has(key)) {
      // Re-inserting moves the key to the "most recent" end of the Map.
      this.lruCache.delete(key);
    } else if (this.lruCache.size >= this.options.cacheSize) {
      const oldestKey = this.lruCache.keys().next().value;
      this.lruCache.delete(oldestKey);
      this.cacheExpiry.delete(oldestKey);
    }
    this.lruCache.set(key, value);
    this.cacheExpiry.set(key, Date.now() + this.options.cacheTTL);
  }

  /**
   * Reads a value from the cache, marking it as most recently used.
   *
   * @param {string} key
   * @returns {*} The cached value, or `null` when the key is missing or its
   *   entry has expired (expired entries are removed).
   */
  _getFromCache(key) {
    if (!this.lruCache.has(key)) return null;

    const isFresh = Date.now() < this.cacheExpiry.get(key);
    if (!isFresh) {
      this.lruCache.delete(key);
      this.cacheExpiry.delete(key);
      return null;
    }
    const value = this.lruCache.get(key);
    this.lruCache.delete(key);
    this.lruCache.set(key, value);
    return value;
  }
}