/**
 * @wcs-colab/plugin-fuzzy-phrase
 * Advanced fuzzy phrase matching plugin for Orama, with semantic weighting
 * and synonym expansion.
 */
'use strict';
// src/fuzzy.ts
function boundedLevenshtein(a, b, bound) {
  if (a === b) {
    return { isBounded: true, distance: 0 };
  }
  const aLen = a.length;
  const bLen = b.length;
  // If the lengths differ by more than the bound, the distance must exceed it.
  if (Math.abs(aLen - bLen) > bound) {
    return { isBounded: false, distance: bound + 1 };
  }
  // Keep `a` as the shorter string so the row buffers stay as small as possible.
  if (aLen > bLen) {
    [a, b] = [b, a];
  }
  const m = a.length;
  const n = b.length;
  let prevRow = new Array(n + 1);
  let currRow = new Array(n + 1);
  for (let j = 0; j <= n; j++) {
    prevRow[j] = j;
  }
  for (let i = 1; i <= m; i++) {
    currRow[0] = i;
    let minInRow = i;
    for (let j = 1; j <= n; j++) {
      const cost = a[i - 1] === b[j - 1] ? 0 : 1;
      currRow[j] = Math.min(
        prevRow[j] + 1,        // deletion
        currRow[j - 1] + 1,    // insertion
        prevRow[j - 1] + cost  // substitution
      );
      minInRow = Math.min(minInRow, currRow[j]);
    }
    // Early exit: once every cell in a row exceeds the bound, no later row can recover.
    if (minInRow > bound) {
      return { isBounded: false, distance: bound + 1 };
    }
    [prevRow, currRow] = [currRow, prevRow];
  }
  const distance = prevRow[n];
  return {
    isBounded: distance <= bound,
    distance
  };
}
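// A minimal usage sketch (hand-checked values). Within the bound the exact
// distance is returned; once the bound is exceeded the function reports
// `bound + 1` rather than the true distance:
//
//   boundedLevenshtein("color", "colour", 1); // → { isBounded: true,  distance: 1 }
//   boundedLevenshtein("cat",   "dog",    1); // → { isBounded: false, distance: 2 }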
function fuzzyMatch(word, queryToken, tolerance) {
  if (word === queryToken) {
    return { matches: true, distance: 0, score: 1 };
  }
  // Prefix matches count as near-exact so partially typed queries still rank highly.
  if (word.startsWith(queryToken)) {
    return { matches: true, distance: 0, score: 0.95 };
  }
  const result = boundedLevenshtein(word, queryToken, tolerance);
  if (result.isBounded) {
    const score = 1 - result.distance * 0.2;
    return {
      matches: true,
      distance: result.distance,
      score: Math.max(0.1, score) // minimum score of 0.1
    };
  }
  return { matches: false, distance: tolerance + 1, score: 0 };
}
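// Scoring sketch: exact match → 1, prefix match → 0.95, otherwise each edit
// subtracts 0.2 (floored at 0.1). For example:
//
//   fuzzyMatch("testing", "test", 1); // prefix branch → score 0.95
//   fuzzyMatch("tost",    "test", 1); // one substitution → score 0.8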
function calculateAdaptiveTolerance(queryTokens, baseTolerance) {
  const queryLength = queryTokens.length;
  if (queryLength <= 2) {
    return baseTolerance;
  } else if (queryLength <= 4) {
    return baseTolerance + 1;
  } else if (queryLength <= 6) {
    return baseTolerance + 2;
  } else {
    return baseTolerance + 3;
  }
}
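// With the default base tolerance of 1 this works out to: 1–2 query tokens → 1,
// 3–4 tokens → 2, 5–6 tokens → 3, 7 or more → 4. Longer queries get a looser
// edit budget since they carry more redundant context per token.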
// src/candidates.ts
function extractVocabularyFromRadixTree(radixNode) {
  const vocabulary = /* @__PURE__ */ new Set();
  // Depth-first traversal collecting every completed word in the tree.
  function traverse(node) {
    if (node.w) {
      vocabulary.add(node.w);
    }
    if (node.c) {
      for (const child of Object.values(node.c)) {
        traverse(child);
      }
    }
  }
  traverse(radixNode);
  return vocabulary;
}
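// Note: this relies on the internal shape of Orama's radix-tree nodes (`w` for
// a stored word, `c` for the children map). That shape is an implementation
// detail rather than public API, so it may change between Orama versions.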
function findCandidatesForToken(queryToken, vocabulary, tolerance, synonyms, synonymScore = 0.8) {
  const candidates = [];
  const seen = /* @__PURE__ */ new Set();
  // 1. Exact match against the vocabulary.
  if (vocabulary.has(queryToken)) {
    candidates.push({
      word: queryToken,
      type: "exact",
      queryToken,
      distance: 0,
      score: 1
    });
    seen.add(queryToken);
  }
  // 2. Fuzzy matches within the edit-distance tolerance.
  for (const word of vocabulary) {
    if (seen.has(word)) continue;
    const match = fuzzyMatch(word, queryToken, tolerance);
    if (match.matches) {
      candidates.push({
        word,
        type: "fuzzy",
        queryToken,
        distance: match.distance,
        score: match.score
      });
      seen.add(word);
    }
  }
  // 3. Synonyms, but only those that actually occur in the index.
  if (synonyms && synonyms[queryToken]) {
    for (const synonym of synonyms[queryToken]) {
      if (seen.has(synonym)) continue;
      if (vocabulary.has(synonym)) {
        candidates.push({
          word: synonym,
          type: "synonym",
          queryToken,
          distance: 0,
          score: synonymScore
        });
        seen.add(synonym);
      }
    }
  }
  return candidates;
}
function findAllCandidates(queryTokens, vocabulary, tolerance, synonyms, synonymScore = 0.8) {
  const candidatesMap = /* @__PURE__ */ new Map();
  for (const token of queryTokens) {
    const tokenCandidates = findCandidatesForToken(
      token,
      vocabulary,
      tolerance,
      synonyms,
      synonymScore
    );
    candidatesMap.set(token, tokenCandidates);
  }
  return candidatesMap;
}
function filterCandidatesByScore(candidatesMap, minScore) {
  const filtered = /* @__PURE__ */ new Map();
  for (const [token, candidates] of candidatesMap.entries()) {
    const filteredCandidates = candidates.filter((c) => c.score >= minScore);
    if (filteredCandidates.length > 0) {
      filtered.set(token, filteredCandidates);
    }
  }
  return filtered;
}
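// Pipeline sketch with hypothetical data: a three-word vocabulary and a
// one-entry synonym map.
//
//   const vocab = new Set(["quick", "quack", "fast"]);
//   const map = findAllCandidates(["quick"], vocab, 1, { quick: ["fast"] }, 0.8);
//   // map.get("quick") → exact "quick" (1.0), fuzzy "quack" (0.8), synonym "fast" (0.8)
//   filterCandidatesByScore(map, 0.9); // keeps only the exact match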
// src/scoring.ts
function findPhrasesInDocument(documentTokens, candidatesMap, config, documentFrequency, totalDocuments) {
  const phrases = [];
  const queryTokens = Array.from(candidatesMap.keys());
  // Collect every position in the document where any candidate word occurs.
  const wordMatches = [];
  for (let i = 0; i < documentTokens.length; i++) {
    const docWord = documentTokens[i];
    for (const [queryToken, candidates] of candidatesMap.entries()) {
      for (const candidate of candidates) {
        if (candidate.word === docWord) {
          wordMatches.push({
            word: docWord,
            queryToken,
            position: i,
            type: candidate.type,
            distance: candidate.distance,
            score: candidate.score
          });
        }
      }
    }
  }
  // Grow a phrase from every match position, then keep the best non-overlapping ones.
  for (let i = 0; i < wordMatches.length; i++) {
    const phrase = buildPhraseFromPosition(
      wordMatches,
      i,
      queryTokens,
      config,
      documentFrequency,
      totalDocuments
    );
    if (phrase && phrase.words.length > 0) {
      phrases.push(phrase);
    }
  }
  return deduplicatePhrases(phrases);
}
function buildPhraseFromPosition(wordMatches, startIndex, queryTokens, config, documentFrequency, totalDocuments) {
  const startMatch = wordMatches[startIndex];
  const phraseWords = [startMatch];
  const coveredTokens = /* @__PURE__ */ new Set([startMatch.queryToken]);
  for (let i = startIndex + 1; i < wordMatches.length; i++) {
    const match = wordMatches[i];
    const gap = match.position - phraseWords[phraseWords.length - 1].position - 1;
    if (gap > config.maxGap) {
      break;
    }
    // Take at most one match per query token.
    if (!coveredTokens.has(match.queryToken)) {
      phraseWords.push(match);
      coveredTokens.add(match.queryToken);
    }
    if (coveredTokens.size === queryTokens.length) {
      break;
    }
  }
  const score = calculatePhraseScore(
    phraseWords,
    queryTokens,
    config,
    documentFrequency,
    totalDocuments
  );
  return {
    words: phraseWords,
    startPosition: phraseWords[0].position,
    endPosition: phraseWords[phraseWords.length - 1].position,
    // Note: this is the overall span between first and last match, not the largest gap.
    gap: phraseWords[phraseWords.length - 1].position - phraseWords[0].position,
    inOrder: isInOrder(phraseWords, queryTokens),
    score
  };
}
function calculatePhraseScore(phraseWords, queryTokens, config, documentFrequency, totalDocuments) {
  // Base score: weighted average of per-word match scores.
  let baseScore = 0;
  for (const word of phraseWords) {
    const weight = word.type === "exact"
      ? config.weights.exact
      : word.type === "fuzzy"
        ? config.weights.fuzzy
        : config.weights.fuzzy * 0.8; // synonym matches
    baseScore += word.score * weight;
  }
  baseScore /= phraseWords.length;
  // Order bonus: full credit when matches follow the query-token order.
  const inOrder = isInOrder(phraseWords, queryTokens);
  const orderScore = inOrder ? 1 : 0.5;
  // Proximity: tighter spans score higher; 5 positions per query token scores 0.
  const span = phraseWords[phraseWords.length - 1].position - phraseWords[0].position + 1;
  const proximityScore = Math.max(0, 1 - span / (queryTokens.length * 5));
  // Density: fraction of query tokens covered by the phrase.
  const densityScore = phraseWords.length / queryTokens.length;
  const semanticScore = calculateSemanticScore(
    phraseWords,
    documentFrequency,
    totalDocuments
  );
  // Combine and normalize to [0, 1] by the maximum achievable score.
  const weights = config.weights;
  const totalScore = baseScore +
    orderScore * weights.order +
    proximityScore * weights.proximity +
    densityScore * weights.density +
    semanticScore * weights.semantic;
  const maxPossibleScore = 1 + weights.order + weights.proximity + weights.density + weights.semantic;
  return Math.min(1, totalScore / maxPossibleScore);
}
function isInOrder(phraseWords, queryTokens) {
  const tokenOrder = new Map(queryTokens.map((token, index) => [token, index]));
  for (let i = 1; i < phraseWords.length; i++) {
    const prevOrder = tokenOrder.get(phraseWords[i - 1].queryToken) ?? -1;
    const currOrder = tokenOrder.get(phraseWords[i].queryToken) ?? -1;
    if (currOrder < prevOrder) {
      return false;
    }
  }
  return true;
}
function calculateSemanticScore(phraseWords, documentFrequency, totalDocuments) {
  // Guard: with an empty index, ln(0/df) below would yield -Infinity.
  if (totalDocuments === 0) {
    return 0;
  }
  // Average IDF of the matched words, scaled into [0, 1] (an average IDF of 10+ maps to 1).
  let tfidfSum = 0;
  for (const word of phraseWords) {
    const df = documentFrequency.get(word.word) || 1;
    const idf = Math.log(totalDocuments / df);
    tfidfSum += idf;
  }
  const avgTfidf = tfidfSum / phraseWords.length;
  return Math.min(1, avgTfidf / 10);
}
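// Worked example: with 100 documents total, a word that appears in 5 of them
// has idf = ln(100 / 5) ≈ 3.0, so a single-word phrase scores ≈ 0.3 here.
// Common words (high document frequency) pull the semantic score toward 0.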
function deduplicatePhrases(phrases) {
  if (phrases.length === 0) return [];
  // Greedy selection: best-scoring phrases first, skipping any phrase that
  // overlaps positions already claimed by a kept phrase.
  const sorted = phrases.slice().sort((a, b) => b.score - a.score);
  const result = [];
  const covered = /* @__PURE__ */ new Set();
  for (const phrase of sorted) {
    let overlaps = false;
    for (let pos = phrase.startPosition; pos <= phrase.endPosition; pos++) {
      if (covered.has(pos)) {
        overlaps = true;
        break;
      }
    }
    if (!overlaps) {
      result.push(phrase);
      for (let pos = phrase.startPosition; pos <= phrase.endPosition; pos++) {
        covered.add(pos);
      }
    }
  }
  return result.sort((a, b) => b.score - a.score);
}
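// Example: phrases covering positions 2–4 (score 0.9) and 3–5 (score 0.7)
// overlap at positions 3–4, so only the higher-scoring 2–4 phrase is kept.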
// src/index.ts
var DEFAULT_CONFIG = {
  textProperty: "content",
  tolerance: 1,
  adaptiveTolerance: true,
  enableSynonyms: false,
  supabase: void 0,
  synonymMatchScore: 0.8,
  weights: {
    exact: 1,
    fuzzy: 0.8,
    order: 0.3,
    proximity: 0.2,
    density: 0.2,
    semantic: 0.15
  },
  maxGap: 5,
  minScore: 0.1
};
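// Fields can be overridden individually; anything unspecified keeps the
// defaults above, and `weights` is merged key by key. A hypothetical stricter
// setup:
//
//   const plugin = pluginFuzzyPhrase({ tolerance: 2, maxGap: 3, weights: { order: 0.5 } });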
var pluginStates = /* @__PURE__ */ new WeakMap();
function pluginFuzzyPhrase(userConfig = {}) {
  // Merge the user's config over the defaults, field by field.
  const config = {
    textProperty: userConfig.textProperty ?? DEFAULT_CONFIG.textProperty,
    tolerance: userConfig.tolerance ?? DEFAULT_CONFIG.tolerance,
    adaptiveTolerance: userConfig.adaptiveTolerance ?? DEFAULT_CONFIG.adaptiveTolerance,
    enableSynonyms: userConfig.enableSynonyms ?? DEFAULT_CONFIG.enableSynonyms,
    supabase: userConfig.supabase || DEFAULT_CONFIG.supabase,
    synonymMatchScore: userConfig.synonymMatchScore ?? DEFAULT_CONFIG.synonymMatchScore,
    weights: {
      exact: userConfig.weights?.exact ?? DEFAULT_CONFIG.weights.exact,
      fuzzy: userConfig.weights?.fuzzy ?? DEFAULT_CONFIG.weights.fuzzy,
      order: userConfig.weights?.order ?? DEFAULT_CONFIG.weights.order,
      proximity: userConfig.weights?.proximity ?? DEFAULT_CONFIG.weights.proximity,
      density: userConfig.weights?.density ?? DEFAULT_CONFIG.weights.density,
      semantic: userConfig.weights?.semantic ?? DEFAULT_CONFIG.weights.semantic
    },
    maxGap: userConfig.maxGap ?? DEFAULT_CONFIG.maxGap,
    minScore: userConfig.minScore ?? DEFAULT_CONFIG.minScore
  };
  const plugin = {
    name: "fuzzy-phrase",
    /**
     * Initialize plugin state after the index is created.
     */
    afterCreate: async (orama) => {
      console.log("🔮 Initializing Fuzzy Phrase Plugin...");
      const state = {
        synonymMap: {},
        config,
        documentFrequency: /* @__PURE__ */ new Map(),
        totalDocuments: 0
      };
      if (config.enableSynonyms && config.supabase) {
        try {
          console.log("📖 Loading synonyms from Supabase...");
          state.synonymMap = await loadSynonymsFromSupabase(config.supabase);
          console.log(`✅ Loaded ${Object.keys(state.synonymMap).length} words with synonyms`);
        } catch (error) {
          // Synonym loading is best-effort: search still works without it.
          console.error("⚠️ Failed to load synonyms:", error);
        }
      }
      if (orama.data && typeof orama.data === "object") {
        const docs = orama.data.docs || {};
        state.totalDocuments = Object.keys(docs).length;
        state.documentFrequency = calculateDocumentFrequencies(docs, config.textProperty);
        console.log(`📊 Calculated document frequencies for ${state.totalDocuments} documents`);
      }
      pluginStates.set(orama, state);
      console.log("✅ Fuzzy Phrase Plugin initialized");
    }
  };
  return plugin;
}
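// Wiring sketch — assumes Orama's `create` accepts a `plugins` array, as in
// recent Orama releases (adjust for your version):
//
//   import { create, insert } from '@orama/orama';
//   const db = await create({
//     schema: { content: 'string' },
//     plugins: [pluginFuzzyPhrase({ textProperty: 'content' })]
//   });
//   await insert(db, { content: 'the quick brown fox' });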
async function searchWithFuzzyPhrase(orama, params, language) {
  const startTime = performance.now();
  const state = pluginStates.get(orama);
  if (!state) {
    console.error("❌ Plugin state not initialized");
    throw new Error("Fuzzy Phrase Plugin not properly initialized");
  }
  const { term, properties } = params;
  if (!term || typeof term !== "string") {
    return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
  }
  const textProperty = (properties && properties[0]) || state.config.textProperty;
  const queryTokens = tokenize(term);
  if (queryTokens.length === 0) {
    return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
  }
  const tolerance = state.config.adaptiveTolerance
    ? calculateAdaptiveTolerance(queryTokens, state.config.tolerance)
    : state.config.tolerance;
  console.log(`🔍 Fuzzy phrase search: "${term}" (${queryTokens.length} tokens, tolerance: ${tolerance})`);
  // Pull the full vocabulary out of the radix index for the target property.
  let vocabulary;
  try {
    const radixNode = orama.index?.indexes?.[textProperty]?.node;
    if (!radixNode) {
      console.error("❌ Radix tree not found for property:", textProperty);
      return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
    }
    vocabulary = extractVocabularyFromRadixTree(radixNode);
    console.log(`📚 Extracted ${vocabulary.size} unique words from index`);
  } catch (error) {
    console.error("❌ Failed to extract vocabulary:", error);
    return { elapsed: { formatted: "0ms", raw: 0 }, hits: [], count: 0 };
  }
  // Expand each query token into exact/fuzzy/synonym candidates, then drop weak ones.
  const candidatesMap = findAllCandidates(
    queryTokens,
    vocabulary,
    tolerance,
    state.config.enableSynonyms ? state.synonymMap : void 0,
    state.config.synonymMatchScore
  );
  const filteredCandidates = filterCandidatesByScore(
    candidatesMap,
    state.config.minScore
  );
  console.log(`🎯 Found candidates: ${Array.from(filteredCandidates.values()).reduce((sum, c) => sum + c.length, 0)} total`);
  // Score every document: a document's score is that of its best phrase.
  const documentMatches = [];
  const docs = orama.data?.docs || {};
  for (const [docId, doc] of Object.entries(docs)) {
    const text = doc[textProperty];
    if (!text || typeof text !== "string") {
      continue;
    }
    const docTokens = tokenize(text);
    const phrases = findPhrasesInDocument(
      docTokens,
      filteredCandidates,
      {
        weights: state.config.weights,
        maxGap: state.config.maxGap
      },
      state.documentFrequency,
      state.totalDocuments
    );
    if (phrases.length > 0) {
      const docScore = Math.max(...phrases.map((p) => p.score));
      documentMatches.push({
        id: docId,
        phrases,
        score: docScore,
        document: doc
      });
    }
  }
  documentMatches.sort((a, b) => b.score - a.score);
  const hits = documentMatches.map((match) => ({
    id: match.id,
    score: match.score,
    document: match.document,
    _phrases: match.phrases // kept for highlighting downstream
  }));
  const elapsed = performance.now() - startTime;
  console.log(`✅ Found ${hits.length} results in ${elapsed.toFixed(2)}ms`);
  return {
    elapsed: {
      formatted: `${elapsed.toFixed(2)}ms`,
      raw: Math.floor(elapsed * 1e6) // milliseconds → nanoseconds
    },
    hits,
    count: hits.length
  };
}
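// Search sketch: call the exported helper directly (the plugin's afterCreate
// hook only prepares state; it does not replace Orama's own `search`):
//
//   const results = await searchWithFuzzyPhrase(db, { term: 'quick brwn fox' });
//   console.log(results.count, results.hits[0]?.score);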
async function loadSynonymsFromSupabase(supabaseConfig) {
  try {
    const { createClient } = await import('@supabase/supabase-js');
    const supabase = createClient(supabaseConfig.url, supabaseConfig.serviceKey);
    const { data, error } = await supabase.rpc("get_synonym_map");
    if (error) {
      throw new Error(`Supabase error: ${error.message}`);
    }
    return data || {};
  } catch (error) {
    console.error("Failed to load synonyms from Supabase:", error);
    throw error;
  }
}
function calculateDocumentFrequencies(docs, textProperty) {
  const df = /* @__PURE__ */ new Map();
  for (const doc of Object.values(docs)) {
    const text = doc[textProperty];
    if (!text || typeof text !== "string") {
      continue;
    }
    // Use a Set so each word counts at most once per document.
    const words = new Set(tokenize(text));
    for (const word of words) {
      df.set(word, (df.get(word) || 0) + 1);
    }
  }
  return df;
}
function tokenize(text) {
  return text.toLowerCase().split(/\s+/).filter((token) => token.length > 0);
}
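// Note: tokenization is lowercase whitespace-splitting only; punctuation stays
// attached ("fox," !== "fox"), so match quality depends on how the document
// text is written. Swap in a stricter tokenizer here if your corpus needs one.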
exports.pluginFuzzyPhrase = pluginFuzzyPhrase;
exports.searchWithFuzzyPhrase = searchWithFuzzyPhrase;
//# sourceMappingURL=index.cjs.map