@wcs-colab/plugin-fuzzy-phrase

Advanced fuzzy phrase matching plugin for Orama with semantic weighting and synonym expansion

508 lines (503 loc) 16.4 kB
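
Usage sketch (illustrative; assembled from the two exports in the code below rather than from package documentation — it assumes Orama's standard create/insert API and that Orama invokes the plugin's afterCreate hook):

const { create, insert } = require('@orama/orama');
const { pluginFuzzyPhrase, searchWithFuzzyPhrase } = require('@wcs-colab/plugin-fuzzy-phrase');

async function main() {
  const db = await create({
    schema: { content: 'string' },
    plugins: [pluginFuzzyPhrase({ textProperty: 'content', tolerance: 1 })]
  });
  await insert(db, { content: 'the quick brown fox jumps over the lazy dog' });
  // Typos within the configured tolerance still match:
  const results = await searchWithFuzzyPhrase(db, { term: 'quick brwon fox' });
  console.log(results.count, results.hits[0]?.score);
}

main().catch(console.error);
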
'use strict';

// src/fuzzy.ts
function boundedLevenshtein(a, b, bound) {
  if (a === b) {
    return { isBounded: true, distance: 0 };
  }
  const aLen = a.length;
  const bLen = b.length;
  if (Math.abs(aLen - bLen) > bound) {
    return { isBounded: false, distance: bound + 1 };
  }
  if (aLen > bLen) {
    [a, b] = [b, a];
  }
  const m = a.length;
  const n = b.length;
  let prevRow = new Array(n + 1);
  let currRow = new Array(n + 1);
  for (let j = 0; j <= n; j++) {
    prevRow[j] = j;
  }
  for (let i = 1; i <= m; i++) {
    currRow[0] = i;
    let minInRow = i;
    for (let j = 1; j <= n; j++) {
      const cost = a[i - 1] === b[j - 1] ? 0 : 1;
      currRow[j] = Math.min(
        prevRow[j] + 1,       // deletion
        currRow[j - 1] + 1,   // insertion
        prevRow[j - 1] + cost // substitution
      );
      minInRow = Math.min(minInRow, currRow[j]);
    }
    if (minInRow > bound) {
      return { isBounded: false, distance: bound + 1 };
    }
    [prevRow, currRow] = [currRow, prevRow];
  }
  const distance = prevRow[n];
  return { isBounded: distance <= bound, distance };
}
function fuzzyMatch(word, queryToken, tolerance) {
  if (word === queryToken) {
    return { matches: true, distance: 0, score: 1 };
  }
  if (word.startsWith(queryToken)) {
    return { matches: true, distance: 0, score: 0.95 };
  }
  const result = boundedLevenshtein(word, queryToken, tolerance);
  if (result.isBounded) {
    const score = 1 - result.distance * 0.2;
    return {
      matches: true,
      distance: result.distance,
      score: Math.max(0.1, score) // Minimum score of 0.1
    };
  }
  return { matches: false, distance: tolerance + 1, score: 0 };
}
function calculateAdaptiveTolerance(queryTokens, baseTolerance) {
  const queryLength = queryTokens.length;
  if (queryLength <= 2) {
    return baseTolerance;
  } else if (queryLength <= 4) {
    return baseTolerance + 1;
  } else if (queryLength <= 6) {
    return baseTolerance + 2;
  } else {
    return baseTolerance + 3;
  }
}

// src/candidates.ts
function extractVocabularyFromRadixTree(radixNode) {
  const vocabulary = /* @__PURE__ */ new Set();
  function traverse(node) {
    if (node.w) {
      vocabulary.add(node.w);
    }
    if (node.c) {
      for (const child of Object.values(node.c)) {
        traverse(child);
      }
    }
  }
  traverse(radixNode);
  return vocabulary;
}
function findCandidatesForToken(queryToken, vocabulary, tolerance, synonyms, synonymScore = 0.8) {
  const candidates = [];
  const seen = /* @__PURE__ */ new Set();
  if (vocabulary.has(queryToken)) {
    candidates.push({ word: queryToken, type: "exact", queryToken, distance: 0, score: 1 });
    seen.add(queryToken);
  }
  for (const word of vocabulary) {
    if (seen.has(word)) continue;
    const match = fuzzyMatch(word, queryToken, tolerance);
    if (match.matches) {
      candidates.push({ word, type: "fuzzy", queryToken, distance: match.distance, score: match.score });
      seen.add(word);
    }
  }
  if (synonyms && synonyms[queryToken]) {
    for (const synonym of synonyms[queryToken]) {
      if (seen.has(synonym)) continue;
      if (vocabulary.has(synonym)) {
        candidates.push({ word: synonym, type: "synonym", queryToken, distance: 0, score: synonymScore });
        seen.add(synonym);
      }
    }
  }
  return candidates;
}
function findAllCandidates(queryTokens, vocabulary, tolerance, synonyms, synonymScore = 0.8) {
  const candidatesMap = /* @__PURE__ */ new Map();
  for (const token of queryTokens) {
    const tokenCandidates = findCandidatesForToken(token, vocabulary, tolerance, synonyms, synonymScore);
    candidatesMap.set(token, tokenCandidates);
  }
  return candidatesMap;
}
function filterCandidatesByScore(candidatesMap, minScore) {
  const filtered = /* @__PURE__ */ new Map();
  for (const [token, candidates] of candidatesMap.entries()) {
    const filteredCandidates = candidates.filter((c) => c.score >= minScore);
    if (filteredCandidates.length > 0) {
      filtered.set(token, filteredCandidates);
    }
  }
  return filtered;
}
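
// Worked examples (added for this listing; outputs derived by hand from the
// functions above — they are not part of the published bundle):
//   fuzzyMatch('hello', 'hello', 1)     -> { matches: true, distance: 0, score: 1 }    exact match
//   fuzzyMatch('helicopter', 'heli', 1) -> { matches: true, distance: 0, score: 0.95 } prefix match
//   fuzzyMatch('hello', 'helo', 1)      -> { matches: true, distance: 1, score: 0.8 }  1 - 1 * 0.2
//   calculateAdaptiveTolerance(['quick', 'brown', 'fox'], 1) -> 2  (3 tokens fall in the "<= 4" bucket)
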
// src/scoring.ts
function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFrequency, totalDocuments) {
  const phrases = [];
  const queryTokens = Array.from(candidatesMap.keys());
  const wordMatches = [];
  for (let i = 0; i < documentTokens.length; i++) {
    const docWord = documentTokens[i];
    for (const [queryToken, candidates] of candidatesMap.entries()) {
      for (const candidate of candidates) {
        if (candidate.word === docWord) {
          wordMatches.push({
            word: docWord,
            queryToken,
            position: i,
            type: candidate.type,
            distance: candidate.distance,
            score: candidate.score
          });
        }
      }
    }
  }
  for (let i = 0; i < wordMatches.length; i++) {
    const phrase = buildPhraseFromPosition(wordMatches, i, queryTokens, config, documentFrequency, totalDocuments);
    if (phrase && phrase.words.length > 0) {
      phrases.push(phrase);
    }
  }
  return deduplicatePhrases(phrases);
}
function buildPhraseFromPosition(wordMatches, startIndex, queryTokens, config, documentFrequency, totalDocuments) {
  const startMatch = wordMatches[startIndex];
  const phraseWords = [startMatch];
  const coveredTokens = /* @__PURE__ */ new Set([startMatch.queryToken]);
  for (let i = startIndex + 1; i < wordMatches.length; i++) {
    const match = wordMatches[i];
    const gap = match.position - phraseWords[phraseWords.length - 1].position - 1;
    if (gap > config.maxGap) {
      break;
    }
    if (!coveredTokens.has(match.queryToken)) {
      phraseWords.push(match);
      coveredTokens.add(match.queryToken);
    }
    if (coveredTokens.size === queryTokens.length) {
      break;
    }
  }
  if (phraseWords.length > 0) {
    const score = calculatePhraseScore(phraseWords, queryTokens, config, documentFrequency, totalDocuments);
    return {
      words: phraseWords,
      startPosition: phraseWords[0].position,
      endPosition: phraseWords[phraseWords.length - 1].position,
      gap: phraseWords[phraseWords.length - 1].position - phraseWords[0].position,
      inOrder: isInOrder(phraseWords, queryTokens),
      score
    };
  }
  return null;
}
function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequency, totalDocuments) {
  let baseScore = 0;
  for (const word of phraseWords) {
    const weight = word.type === "exact" ? config.weights.exact : word.type === "fuzzy" ? config.weights.fuzzy : config.weights.fuzzy * 0.8;
    baseScore += word.score * weight;
  }
  baseScore /= phraseWords.length;
  const inOrder = isInOrder(phraseWords, queryTokens);
  const orderScore = inOrder ? 1 : 0.5;
  const span = phraseWords[phraseWords.length - 1].position - phraseWords[0].position + 1;
  const proximityScore = Math.max(0, 1 - span / (queryTokens.length * 5));
  const densityScore = phraseWords.length / queryTokens.length;
  const semanticScore = calculateSemanticScore(phraseWords, documentFrequency, totalDocuments);
  const weights = config.weights;
  const totalScore = baseScore + orderScore * weights.order + proximityScore * weights.proximity + densityScore * weights.density + semanticScore * weights.semantic;
  const maxPossibleScore = 1 + weights.order + weights.proximity + weights.density + weights.semantic;
  return Math.min(1, totalScore / maxPossibleScore);
}
function isInOrder(phraseWords, queryTokens) {
  const tokenOrder = new Map(queryTokens.map((token, index) => [token, index]));
  for (let i = 1; i < phraseWords.length; i++) {
    const prevOrder = tokenOrder.get(phraseWords[i - 1].queryToken) ?? -1;
    const currOrder = tokenOrder.get(phraseWords[i].queryToken) ?? -1;
    if (currOrder < prevOrder) {
      return false;
    }
  }
  return true;
}
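
// Worked example (added for this listing): with the default weights
// (order 0.3, proximity 0.2, density 0.2, semantic 0.15), maxPossibleScore is
// 1 + 0.3 + 0.2 + 0.2 + 0.15 = 1.85. A two-token query matched exactly, in
// order, at adjacent positions (span = 2) yields baseScore = 1, orderScore = 1,
// proximityScore = 1 - 2 / (2 * 5) = 0.8 and densityScore = 1; with a
// semanticScore of 0 the phrase scores (1 + 0.3 + 0.16 + 0.2 + 0) / 1.85 ≈ 0.90.
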
function calculateSemanticScore(phraseWords, documentFrequency, totalDocuments) {
  let tfidfSum = 0;
  for (const word of phraseWords) {
    const df = documentFrequency.get(word.word) || 1;
    const idf = Math.log(totalDocuments / df);
    tfidfSum += idf;
  }
  const avgTfidf = tfidfSum / phraseWords.length;
  return Math.min(1, avgTfidf / 10);
}
function deduplicatePhrases(phrases) {
  if (phrases.length === 0) return [];
  const sorted = phrases.slice().sort((a, b) => b.score - a.score);
  const result = [];
  const covered = /* @__PURE__ */ new Set();
  for (const phrase of sorted) {
    let overlaps = false;
    for (let pos = phrase.startPosition; pos <= phrase.endPosition; pos++) {
      if (covered.has(pos)) {
        overlaps = true;
        break;
      }
    }
    if (!overlaps) {
      result.push(phrase);
      for (let pos = phrase.startPosition; pos <= phrase.endPosition; pos++) {
        covered.add(pos);
      }
    }
  }
  return result.sort((a, b) => b.score - a.score);
}

// src/index.ts
var DEFAULT_CONFIG = {
  textProperty: "content",
  tolerance: 1,
  adaptiveTolerance: true,
  enableSynonyms: false,
  supabase: void 0,
  synonymMatchScore: 0.8,
  weights: {
    exact: 1,
    fuzzy: 0.8,
    order: 0.3,
    proximity: 0.2,
    density: 0.2,
    semantic: 0.15
  },
  maxGap: 5,
  minScore: 0.1
};
var pluginStates = /* @__PURE__ */ new WeakMap();
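
// Note (added for this listing): the config below is merged field by field
// against DEFAULT_CONFIG via ?? and optional chaining, so a partial override
// such as pluginFuzzyPhrase({ tolerance: 2, weights: { fuzzy: 0.9 } }) keeps
// every other option — including the remaining weights — at its default value.
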
function pluginFuzzyPhrase(userConfig = {}) {
  const config = {
    textProperty: userConfig.textProperty ?? DEFAULT_CONFIG.textProperty,
    tolerance: userConfig.tolerance ?? DEFAULT_CONFIG.tolerance,
    adaptiveTolerance: userConfig.adaptiveTolerance ?? DEFAULT_CONFIG.adaptiveTolerance,
    enableSynonyms: userConfig.enableSynonyms ?? DEFAULT_CONFIG.enableSynonyms,
    supabase: userConfig.supabase || DEFAULT_CONFIG.supabase,
    synonymMatchScore: userConfig.synonymMatchScore ?? DEFAULT_CONFIG.synonymMatchScore,
    weights: {
      exact: userConfig.weights?.exact ?? DEFAULT_CONFIG.weights.exact,
      fuzzy: userConfig.weights?.fuzzy ?? DEFAULT_CONFIG.weights.fuzzy,
      order: userConfig.weights?.order ?? DEFAULT_CONFIG.weights.order,
      proximity: userConfig.weights?.proximity ?? DEFAULT_CONFIG.weights.proximity,
      density: userConfig.weights?.density ?? DEFAULT_CONFIG.weights.density,
      semantic: userConfig.weights?.semantic ?? DEFAULT_CONFIG.weights.semantic
    },
    maxGap: userConfig.maxGap ?? DEFAULT_CONFIG.maxGap,
    minScore: userConfig.minScore ?? DEFAULT_CONFIG.minScore
  };
  const plugin = {
    name: "fuzzy-phrase",
    /**
     * Initialize plugin after index is created
     */
    afterCreate: async (orama) => {
      console.log("\u{1F52E} Initializing Fuzzy Phrase Plugin...");
      const state = {
        synonymMap: {},
        config,
        documentFrequency: /* @__PURE__ */ new Map(),
        totalDocuments: 0
      };
      if (config.enableSynonyms && config.supabase) {
        try {
          console.log("\u{1F4D6} Loading synonyms from Supabase...");
          state.synonymMap = await loadSynonymsFromSupabase(config.supabase);
          console.log(`\u2705 Loaded ${Object.keys(state.synonymMap).length} words with synonyms`);
        } catch (error) {
          console.error("\u26A0\uFE0F Failed to load synonyms:", error);
        }
      }
      if (orama.data && typeof orama.data === "object") {
        const docs = orama.data.docs || {};
        state.totalDocuments = Object.keys(docs).length;
        state.documentFrequency = calculateDocumentFrequencies(docs, config.textProperty);
        console.log(`\u{1F4CA} Calculated document frequencies for ${state.totalDocuments} documents`);
      }
      pluginStates.set(orama, state);
      console.log("\u2705 Fuzzy Phrase Plugin initialized");
    }
  };
  return plugin;
}
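
// Illustrative call (added for this listing; the result shape is read off the
// function below, but the literal values are invented):
//   const results = await searchWithFuzzyPhrase(db, { term: 'quick brwon fox', properties: ['content'] });
//   // -> { elapsed: { formatted: '1.23ms', raw: 1230000 }, hits: [{ id, score, document, _phrases }], count: 1 }
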
async function searchWithFuzzyPhrase(orama, params, language) {
  const startTime = performance.now();
  const state = pluginStates.get(orama);
  if (!state) {
    console.error("\u274C Plugin state not initialized");
    throw new Error("Fuzzy Phrase Plugin not properly initialized");
  }
  const { term, properties } = params;
  if (!term || typeof term !== "string") {
    return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
  }
  const textProperty = properties && properties[0] || state.config.textProperty;
  const queryTokens = tokenize(term);
  if (queryTokens.length === 0) {
    return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
  }
  const tolerance = state.config.adaptiveTolerance ? calculateAdaptiveTolerance(queryTokens, state.config.tolerance) : state.config.tolerance;
  console.log(`\u{1F50D} Fuzzy phrase search: "${term}" (${queryTokens.length} tokens, tolerance: ${tolerance})`);
  let vocabulary;
  try {
    const radixNode = orama.index?.indexes?.[textProperty]?.node;
    if (!radixNode) {
      console.error("\u274C Radix tree not found for property:", textProperty);
      return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
    }
    vocabulary = extractVocabularyFromRadixTree(radixNode);
    console.log(`\u{1F4DA} Extracted ${vocabulary.size} unique words from index`);
  } catch (error) {
    console.error("\u274C Failed to extract vocabulary:", error);
    return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
  }
  const candidatesMap = findAllCandidates(
    queryTokens,
    vocabulary,
    tolerance,
    state.config.enableSynonyms ? state.synonymMap : void 0,
    state.config.synonymMatchScore
  );
  const filteredCandidates = filterCandidatesByScore(candidatesMap, state.config.minScore);
  console.log(`\u{1F3AF} Found candidates: ${Array.from(filteredCandidates.values()).reduce((sum, c) => sum + c.length, 0)} total`);
  const documentMatches = [];
  const docs = orama.data?.docs || {};
  for (const [docId, doc] of Object.entries(docs)) {
    const text = doc[textProperty];
    if (!text || typeof text !== "string") {
      continue;
    }
    const docTokens = tokenize(text);
    const phrases = findPhrasesInDocument(
      docTokens,
      filteredCandidates,
      { weights: state.config.weights, maxGap: state.config.maxGap },
      state.documentFrequency,
      state.totalDocuments
    );
    if (phrases.length > 0) {
      const docScore = Math.max(...phrases.map((p) => p.score));
      documentMatches.push({ id: docId, phrases, score: docScore, document: doc });
    }
  }
  documentMatches.sort((a, b) => b.score - a.score);
  const hits = documentMatches.map((match) => ({
    id: match.id,
    score: match.score,
    document: match.document,
    // Store phrases for highlighting
    _phrases: match.phrases
  }));
  const elapsed = performance.now() - startTime;
  console.log(`\u2705 Found ${hits.length} results in ${elapsed.toFixed(2)}ms`);
  return {
    elapsed: {
      formatted: `${elapsed.toFixed(2)}ms`,
      raw: Math.floor(elapsed * 1e6) // nanoseconds
    },
    hits,
    count: hits.length
  };
}
async function loadSynonymsFromSupabase(supabaseConfig) {
  try {
    const { createClient } = await import('@supabase/supabase-js');
    const supabase = createClient(supabaseConfig.url, supabaseConfig.serviceKey);
    const { data, error } = await supabase.rpc("get_synonym_map");
    if (error) {
      throw new Error(`Supabase error: ${error.message}`);
    }
    return data || {};
  } catch (error) {
    console.error("Failed to load synonyms from Supabase:", error);
    throw error;
  }
}
function calculateDocumentFrequencies(docs, textProperty) {
  const df = /* @__PURE__ */ new Map();
  for (const doc of Object.values(docs)) {
    const text = doc[textProperty];
    if (!text || typeof text !== "string") {
      continue;
    }
    const words = new Set(tokenize(text));
    for (const word of words) {
      df.set(word, (df.get(word) || 0) + 1);
    }
  }
  return df;
}
function tokenize(text) {
  return text.toLowerCase().split(/\s+/).filter((token) => token.length > 0);
}

exports.pluginFuzzyPhrase = pluginFuzzyPhrase;
exports.searchWithFuzzyPhrase = searchWithFuzzyPhrase;
//# sourceMappingURL=index.cjs.map