UNPKG

@codai/memorai-core

Version:

Simplified advanced memory engine - no tiers, just powerful semantic search with persistence

469 lines (468 loc) 18.9 kB
/**
 * Advanced Semantic Search Engine for Memorai
 * Provides fuzzy matching, semantic similarity, and intelligent query understanding
 */

/** Weights used for any ranking factor the caller does not override. */
const DEFAULT_WEIGHT_FACTORS = Object.freeze({
  semantic: 0.4,
  fuzzy: 0.2,
  recency: 0.15,
  frequency: 0.1,
  importance: 0.15,
});

/** Upper bound on entries kept in each in-memory cache. */
const MAX_CACHE_ENTRIES = 1000;

const MS_PER_DAY = 1000 * 60 * 60 * 24;

/** Anything that is not a letter, combining mark, digit, underscore or whitespace (Unicode-aware). */
const NON_WORD_CHARS = /[^\p{L}\p{M}\p{N}_\s]/gu;

/**
 * Known misspellings, pre-compiled as whole-word patterns so that a typo
 * which is a prefix of a correct word (e.g. "recal" in "recall") is not
 * rewritten inside that correct word.
 */
const TYPO_CORRECTIONS = [
  ['remmber', 'remember'],
  ['remembr', 'remember'],
  ['recal', 'recall'],
  ['retreive', 'retrieve'],
  ['serach', 'search'],
  ['searh', 'search'],
].map(([typo, correction]) => [new RegExp(`\\b${typo}\\b`, 'gi'), correction]);

/**
 * Static concept expansion table. A Map (not a plain object) so that query
 * words such as "constructor" cannot resolve to inherited Object properties.
 */
const CONCEPT_MAP = new Map([
  ['code', ['programming', 'development', 'software', 'function', 'method']],
  ['bug', ['error', 'issue', 'problem', 'defect', 'glitch']],
  ['test', ['testing', 'verification', 'validation', 'check', 'assertion']],
  ['user', ['customer', 'client', 'person', 'account', 'profile']],
  ['data', ['information', 'content', 'record', 'database', 'storage']],
]);

/** Hour ranges [start, end) per time-of-day label; "night" wraps past midnight. */
const TIME_RANGES = new Map([
  ['morning', [6, 12]],
  ['afternoon', [12, 18]],
  ['evening', [18, 22]],
  ['night', [22, 6]],
]);

/** Insert into a Map-based cache, evicting the oldest entry once the size bound is exceeded. */
function setBounded(cache, key, value) {
  cache.set(key, value);
  if (cache.size > MAX_CACHE_ENTRIES) {
    const oldestKey = cache.keys().next().value;
    if (oldestKey !== undefined) {
      cache.delete(oldestKey);
    }
  }
}

/** Convert a Date / timestamp / date string to epoch milliseconds; NaN when missing or unparsable. */
function toTimestamp(value) {
  if (value == null) {
    return Number.NaN;
  }
  return value instanceof Date ? value.getTime() : new Date(value).getTime();
}

/**
 * Advanced semantic search with fuzzy matching and intelligent ranking
 */
export class SemanticSearchEngine {
  /**
   * @param {{ embed: (text: string) => Promise<{ embedding: number[] }> }} embeddingService
   *   Service used to embed queries and memories that have no embedding yet.
   */
  constructor(embeddingService) {
    this.queryCache = new Map();
    this.conceptCache = new Map();
    this.embeddingService = embeddingService;
  }

  /**
   * Perform advanced semantic search with multiple ranking factors.
   *
   * @param {string} query - Raw user query.
   * @param {Array<object>} memories - Candidate memories; each is scored via `scoreMemory`.
   * @param {object} [options] - Search options. `weightFactors` may be partial;
   *   missing weights fall back to the defaults.
   * @param {object} [context] - Optional search context (recent queries, user preferences, time context).
   * @returns {Promise<Array<object>>} Scored results with `searchScore > 0.1`,
   *   sorted by descending `searchScore` and truncated to `limit` when it is truthy.
   */
  async search(query, memories, options = {}, context) {
    const {
      enableFuzzyMatching = true,
      fuzzyThreshold = 0.7,
      enableSemanticExpansion = true,
      enableTypoTolerance = true,
      weightFactors,
      contextWindow = 5,
      diversityFactor = 0.1,
      limit,
    } = options;
    // Merge rather than replace, so a partial override does not produce
    // undefined weights (which would turn every score into NaN).
    const weights = { ...DEFAULT_WEIGHT_FACTORS, ...weightFactors };

    // Step 1: Preprocess query
    const processedQuery = await this.preprocessQuery(query, enableTypoTolerance, context);

    // Step 2: Generate query embedding
    const queryEmbedding = await this.getQueryEmbedding(processedQuery);

    // Step 3: Expand query with related concepts if enabled
    const expandedQueries = enableSemanticExpansion
      ? await this.expandQuery(processedQuery, context)
      : [processedQuery];

    // Step 4: Score all memories (sequentially, so the embedding service is
    // never hit with one concurrent request per memory)
    const searchOptions = { enableFuzzyMatching, fuzzyThreshold, contextWindow };
    const scoredResults = [];
    for (const memory of memories) {
      const result = await this.scoreMemory(
        memory,
        processedQuery,
        expandedQueries,
        queryEmbedding,
        weights,
        searchOptions,
        context,
      );
      if (result.searchScore > 0.1) {
        // Minimum threshold
        scoredResults.push(result);
      }
    }

    // Step 5: Apply diversity factor and rank results
    const rankedResults = await this.applyDiversityRanking(scoredResults, diversityFactor);

    // Step 6: Sort by final score and apply limit.
    // NOTE(review): this sort orders by the raw searchScore, so the diversity
    // order from step 5 only survives as a tie-breaker (the sort is stable).
    // Kept as-is because callers may rely on descending-score output.
    const sortedResults = rankedResults.sort((a, b) => b.searchScore - a.searchScore);

    // Apply limit if specified
    return limit ? sortedResults.slice(0, limit) : sortedResults;
  }

  /**
   * Preprocess query to handle typos and normalize text.
   *
   * Lowercases, replaces punctuation with spaces (letters and digits of any
   * script are kept), collapses whitespace and trims the result.
   *
   * @param {string} query
   * @param {boolean} enableTypoTolerance - When true, runs `correctTypos` on the normalized text.
   * @param {object} [context]
   * @returns {Promise<string>}
   */
  async preprocessQuery(query, enableTypoTolerance, context) {
    // Trim last: punctuation at the edges turns into spaces during normalization.
    let processed = query
      .toLowerCase()
      .replace(NON_WORD_CHARS, ' ')
      .replace(/\s+/g, ' ')
      .trim();
    // Typo correction (simplified implementation)
    if (enableTypoTolerance) {
      processed = await this.correctTypos(processed, context);
    }
    return processed;
  }

  /**
   * Get or generate embedding for query with caching.
   *
   * @param {string} query
   * @returns {Promise<number[]>}
   */
  async getQueryEmbedding(query) {
    if (this.queryCache.has(query)) {
      return this.queryCache.get(query);
    }
    const { embedding } = await this.embeddingService.embed(query);
    setBounded(this.queryCache, query, embedding);
    return embedding;
  }

  /**
   * Expand query with semantically related concepts.
   *
   * @param {string} query
   * @param {object} [context]
   * @returns {Promise<string[]>} The query itself followed by its related concepts.
   */
  async expandQuery(query, context) {
    let relatedConcepts;
    if (this.conceptCache.has(query)) {
      relatedConcepts = this.conceptCache.get(query);
    } else {
      relatedConcepts = await this.generateRelatedConcepts(query, context);
      // Bounded like the query cache so long-running processes do not leak.
      setBounded(this.conceptCache, query, relatedConcepts);
    }
    return [query, ...relatedConcepts];
  }

  /**
   * Score a single memory against the query.
   *
   * @param {object} memory - Reads `content`, `importance`, and whatever the per-factor scorers read.
   * @param {string} query - Preprocessed query (not used directly; kept for interface compatibility).
   * @param {string[]} expandedQueries - Query plus related concepts, used for fuzzy matching.
   * @param {number[]} queryEmbedding
   * @param {object} weightFactors - May be partial; missing weights use the defaults.
   * @param {{ enableFuzzyMatching: boolean, fuzzyThreshold: number, contextWindow: number }} searchOptions
   * @param {object} [context]
   * @returns {Promise<object>} Result carrying the memory, the combined `searchScore`,
   *   each individual score, an explanation and extracted concepts.
   */
  async scoreMemory(memory, query, expandedQueries, queryEmbedding, weightFactors, searchOptions, context) {
    const weights = { ...DEFAULT_WEIGHT_FACTORS, ...weightFactors };

    // Calculate individual scores
    const semanticScore = await this.calculateSemanticScore(memory, queryEmbedding);
    const fuzzyScore = searchOptions.enableFuzzyMatching
      ? this.calculateFuzzyScore(memory.content, expandedQueries, searchOptions.fuzzyThreshold)
      : 0;
    const recencyScore = this.calculateRecencyScore(memory);
    const frequencyScore = this.calculateFrequencyScore(memory);
    // A memory without an importance value contributes 0 instead of NaN.
    const importanceScore = memory.importance ?? 0;
    const contextRelevance = context
      ? this.calculateContextRelevance(memory, context, searchOptions.contextWindow)
      : 0;

    // Calculate weighted final score, boosted by up to 20% for context relevance
    const weightedSum =
      semanticScore * weights.semantic +
      fuzzyScore * weights.fuzzy +
      recencyScore * weights.recency +
      frequencyScore * weights.frequency +
      importanceScore * weights.importance;
    const searchScore = weightedSum * (1 + contextRelevance * 0.2);

    // Generate explanation
    const explanation = this.generateExplanation(
      semanticScore,
      fuzzyScore,
      recencyScore,
      frequencyScore,
      importanceScore,
      contextRelevance,
    );

    // Extract related concepts
    const relatedConcepts = await this.extractRelatedConcepts(memory.content);

    return {
      memory,
      score: semanticScore, // Keep original for compatibility
      searchScore,
      fuzzyScore,
      semanticScore,
      recencyScore,
      frequencyScore,
      contextRelevance,
      explanation,
      relatedConcepts,
      relevance_reason: explanation,
    };
  }

  /**
   * Calculate semantic similarity score using embeddings.
   *
   * Side effect: when the memory has no embedding, one is generated from its
   * content and stored on `memory.embedding`.
   *
   * @param {object} memory
   * @param {number[]} queryEmbedding
   * @returns {Promise<number>}
   */
  async calculateSemanticScore(memory, queryEmbedding) {
    if (!memory.embedding) {
      // Generate embedding if not available
      const { embedding } = await this.embeddingService.embed(memory.content);
      memory.embedding = embedding;
    }
    return this.cosineSimilarity(queryEmbedding, memory.embedding);
  }

  /**
   * Calculate fuzzy matching score: the best `fuzzyMatch` of the lowercased
   * content against any of the queries.
   *
   * @param {string} content
   * @param {string[]} queries
   * @param {number} threshold
   * @returns {number}
   */
  calculateFuzzyScore(content, queries, threshold) {
    const loweredContent = content.toLowerCase();
    let maxScore = 0;
    for (const query of queries) {
      maxScore = Math.max(maxScore, this.fuzzyMatch(loweredContent, query, threshold));
    }
    return maxScore;
  }

  /**
   * Calculate recency score (more recent = higher score), in the range 0-1.
   *
   * Uses the latest of `updatedAt`, `lastAccessedAt` and `createdAt`; fields
   * that are missing or unparsable are ignored, and a memory with none of
   * them scores 0.
   *
   * @param {object} memory
   * @returns {number}
   */
  calculateRecencyScore(memory) {
    const timestamps = [memory.updatedAt, memory.lastAccessedAt, memory.createdAt]
      .map(toTimestamp)
      .filter((time) => Number.isFinite(time));
    if (timestamps.length === 0) {
      return 0;
    }
    // Clamp at 0 so a timestamp slightly in the future cannot score above 1.
    const daysSince = Math.max(0, (Date.now() - Math.max(...timestamps)) / MS_PER_DAY);
    // Exponential decay: score = e^(-days/30)
    return Math.exp(-daysSince / 30);
  }

  /**
   * Calculate frequency score based on access count.
   *
   * @param {object} memory - Reads `accessCount`; missing or negative counts are treated as 0.
   * @returns {number} Score in the range 0-1.
   */
  calculateFrequencyScore(memory) {
    const accessCount = Math.max(0, memory.accessCount ?? 0);
    // Normalize access count (log scale to prevent outliers from dominating)
    const normalizedCount = Math.log(accessCount + 1);
    // Scale to 0-1 range (assuming max reasonable access count is ~1000)
    return Math.min(normalizedCount / Math.log(1001), 1.0);
  }

  /**
   * Calculate context relevance score as the mean of the factors that apply:
   * recent-query similarity, user-preference match and time-of-day relevance.
   *
   * @param {object} memory
   * @param {object} context - Reads `recentQueries`, `userPreferences` and `timeContext`; each may be absent.
   * @param {number} contextWindow - Maximum number of recent queries considered.
   * @returns {number}
   */
  calculateContextRelevance(memory, context, contextWindow) {
    let relevanceScore = 0;
    let factors = 0;

    // Check against recent queries. Dividing by the number of queries actually
    // considered avoids a 0/0 when contextWindow is 0.
    const recentQueries = (context.recentQueries ?? []).slice(0, contextWindow);
    if (recentQueries.length > 0) {
      const loweredContent = memory.content.toLowerCase();
      const total = recentQueries.reduce(
        (sum, recentQuery) => sum + this.fuzzyMatch(loweredContent, recentQuery.toLowerCase(), 0.6),
        0,
      );
      relevanceScore += total / recentQueries.length;
      factors++;
    }

    // Check user preferences
    const userPreferences = context.userPreferences ?? {};
    if (Object.keys(userPreferences).length > 0) {
      relevanceScore += this.checkPreferenceMatch(memory, userPreferences);
      factors++;
    }

    // Time context relevance
    relevanceScore += this.calculateTimeRelevance(memory, context.timeContext);
    factors++;

    return factors > 0 ? relevanceScore / factors : 0.5;
  }

  /**
   * Apply diversity ranking to avoid too similar results.
   *
   * Greedy selection: repeatedly picks the candidate with the highest
   * `searchScore - penalty`, where the penalty is `diversityFactor` times the
   * summed content similarity to everything already selected.
   *
   * @param {Array<object>} results
   * @param {number} diversityFactor - 0 disables diversification.
   * @returns {Promise<Array<object>>} The same results, reordered.
   */
  async applyDiversityRanking(results, diversityFactor) {
    if (diversityFactor === 0 || results.length <= 1) {
      return results;
    }
    // Penalties are accumulated incrementally: after each pick, only the
    // similarity to the newly selected item is added, instead of recomputing
    // similarity to every selected item in every round.
    const remaining = results.map((result) => ({ result, penalty: 0 }));
    const diversifiedResults = [];
    while (remaining.length > 0) {
      let bestIndex = 0;
      let bestScore = -Infinity;
      for (let i = 0; i < remaining.length; i++) {
        const { result, penalty } = remaining[i];
        // Adjusted score = original score - diversity penalty
        const adjustedScore = result.searchScore - penalty;
        if (adjustedScore > bestScore) {
          bestScore = adjustedScore;
          bestIndex = i;
        }
      }
      const [{ result: selected }] = remaining.splice(bestIndex, 1);
      diversifiedResults.push(selected);
      for (const entry of remaining) {
        const similarity = await this.calculateContentSimilarity(
          entry.result.memory.content,
          selected.memory.content,
        );
        entry.penalty += similarity * diversityFactor;
      }
    }
    return diversifiedResults;
  }

  // Helper methods

  /**
   * Cosine similarity of two vectors.
   *
   * @param {number[]} a
   * @param {number[]} b
   * @returns {number} 0 when the lengths differ or either vector has zero magnitude.
   */
  cosineSimilarity(a, b) {
    if (a.length !== b.length) {
      return 0;
    }
    let dotProduct = 0;
    let normA = 0;
    let normB = 0;
    for (let i = 0; i < a.length; i++) {
      const aVal = a[i] ?? 0;
      const bVal = b[i] ?? 0;
      dotProduct += aVal * bVal;
      normA += aVal * aVal;
      normB += bVal * bVal;
    }
    const denominator = Math.sqrt(normA) * Math.sqrt(normB);
    // A zero vector has no direction; report "no similarity" instead of NaN.
    return denominator === 0 ? 0 : dotProduct / denominator;
  }

  /**
   * Fuzzy-match a pattern against a text.
   *
   * @param {string} text
   * @param {string} pattern
   * @param {number} threshold - Minimum Jaro-Winkler similarity for a non-substring match to count.
   * @returns {number} 1.0 for a substring match in either direction, 0.95/1.0 for a
   *   word-level substring match, otherwise the best Jaro-Winkler similarity at or
   *   above `threshold`, or 0. Empty text or pattern never matches.
   */
  fuzzyMatch(text, pattern, threshold) {
    // An empty string is a substring of everything; treat it as "nothing to match".
    if (!text || !pattern) {
      return 0;
    }
    // Check for exact substring match first
    if (text.includes(pattern) || pattern.includes(text)) {
      return 1.0;
    }
    // Check word-level fuzzy matching
    const loweredPattern = pattern.toLowerCase();
    let maxScore = 0;
    for (const word of text.split(/\s+/)) {
      // Leading/trailing whitespace yields empty tokens, which would otherwise
      // "match" any pattern via pattern.includes('').
      if (word === '') {
        continue;
      }
      if (word.includes(pattern) || pattern.includes(word)) {
        // Word-level match is good but not perfect unless it's exact
        maxScore = Math.max(maxScore, word.toLowerCase() === loweredPattern ? 1.0 : 0.95);
      } else {
        const similarity = this.jaroWinklerDistance(word, pattern);
        if (similarity >= threshold) {
          maxScore = Math.max(maxScore, similarity);
        }
      }
    }
    // Also check full text matching as fallback
    const fullTextSimilarity = this.jaroWinklerDistance(text, pattern);
    if (fullTextSimilarity >= threshold) {
      maxScore = Math.max(maxScore, fullTextSimilarity);
    }
    return maxScore;
  }

  /**
   * Simplified Jaro-Winkler similarity (despite the name, higher means more similar).
   *
   * @param {string} s1
   * @param {string} s2
   * @returns {number} 1 for identical strings, 0 when either is empty or nothing matches.
   */
  jaroWinklerDistance(s1, s2) {
    if (s1 === s2) {
      return 1;
    }
    const len1 = s1.length;
    const len2 = s2.length;
    if (len1 === 0 || len2 === 0) {
      return 0;
    }
    const matchDistance = Math.floor(Math.max(len1, len2) / 2) - 1;
    const s1Matches = new Array(len1).fill(false);
    const s2Matches = new Array(len2).fill(false);
    let matches = 0;
    let transpositions = 0;

    // Find matches
    for (let i = 0; i < len1; i++) {
      const start = Math.max(0, i - matchDistance);
      const end = Math.min(i + matchDistance + 1, len2);
      for (let j = start; j < end; j++) {
        if (s2Matches[j] || s1[i] !== s2[j]) {
          continue;
        }
        s1Matches[i] = true;
        s2Matches[j] = true;
        matches++;
        break;
      }
    }
    if (matches === 0) {
      return 0;
    }

    // Find transpositions: walk both match lists in order and count mismatches
    let k = 0;
    for (let i = 0; i < len1; i++) {
      if (!s1Matches[i]) {
        continue;
      }
      while (!s2Matches[k]) {
        k++;
      }
      if (s1[i] !== s2[k]) {
        transpositions++;
      }
      k++;
    }
    const jaro =
      (matches / len1 + matches / len2 + (matches - transpositions / 2) / matches) / 3;

    // Winkler boost for a shared prefix of up to 4 characters
    let prefix = 0;
    for (let i = 0; i < Math.min(len1, len2, 4); i++) {
      if (s1[i] !== s2[i]) {
        break;
      }
      prefix++;
    }
    return jaro + 0.1 * prefix * (1 - jaro);
  }

  /**
   * Simplified typo correction - in production, use a proper spell checker.
   *
   * Only whole words are corrected, so already-correct words that contain a
   * known typo as a substring (e.g. "recall") are left untouched.
   *
   * @param {string} text
   * @param {object} [_context] - Unused.
   * @returns {Promise<string>}
   */
  async correctTypos(text, _context) {
    let corrected = text;
    for (const [pattern, correction] of TYPO_CORRECTIONS) {
      corrected = corrected.replace(pattern, correction);
    }
    return corrected;
  }

  /**
   * Simplified concept expansion - in production, use a knowledge base or LLM.
   *
   * @param {string} query
   * @param {object} [_context] - Unused.
   * @returns {Promise<string[]>} De-duplicated related concepts for every query word found in the concept table.
   */
  async generateRelatedConcepts(query, _context) {
    const concepts = new Set();
    for (const word of query.toLowerCase().split(/\s+/)) {
      for (const concept of CONCEPT_MAP.get(word) ?? []) {
        concepts.add(concept);
      }
    }
    return [...concepts];
  }

  /**
   * Build a human-readable explanation from the individual scores.
   *
   * @returns {string}
   */
  generateExplanation(semanticScore, fuzzyScore, recencyScore, frequencyScore, importanceScore, contextRelevance) {
    const factors = [];
    if (semanticScore > 0.8) {
      factors.push('high semantic similarity');
    } else if (semanticScore > 0.6) {
      factors.push('moderate semantic similarity');
    }
    if (fuzzyScore > 0.8) {
      factors.push('exact text match');
    } else if (fuzzyScore > 0.6) {
      factors.push('partial text match');
    }
    if (recencyScore > 0.8) {
      factors.push('recently accessed');
    }
    if (frequencyScore > 0.8) {
      factors.push('frequently accessed');
    }
    if (importanceScore > 0.8) {
      factors.push('marked as important');
    }
    if (contextRelevance > 0.7) {
      factors.push('contextually relevant');
    }
    return factors.length > 0
      ? `Relevant due to: ${factors.join(', ')}`
      : 'Basic relevance match';
  }

  /**
   * Extract key concepts from content (simplified): the up-to-5 most frequent
   * words longer than 3 characters.
   *
   * @param {string} content
   * @returns {Promise<string[]>}
   */
  async extractRelatedConcepts(content) {
    const wordCount = new Map();
    const words = content
      .toLowerCase()
      .replace(NON_WORD_CHARS, ' ')
      .split(/\s+/) // any whitespace, so tabs and newlines separate words too
      .filter((word) => word.length > 3);
    for (const word of words) {
      wordCount.set(word, (wordCount.get(word) ?? 0) + 1);
    }
    return [...wordCount.entries()]
      .sort((a, b) => b[1] - a[1])
      .slice(0, 5)
      .map(([word]) => word);
  }

  /**
   * Score how well a memory matches the user's preferences.
   *
   * Each preference scores 1 when its key or value is one of the memory's
   * tags, 0.5 when its (non-empty) value appears in the content, else 0.
   *
   * @param {object} memory - Reads `tags` (optional) and `content`.
   * @param {object} preferences
   * @returns {number} Mean score over all preferences; 0 when there are none.
   */
  checkPreferenceMatch(memory, preferences) {
    const tags = memory.tags ?? [];
    const loweredContent = memory.content.toLowerCase();
    const entries = Object.entries(preferences);
    let matchScore = 0;
    for (const [key, value] of entries) {
      const valueStr = String(value);
      if (tags.includes(key) || tags.includes(valueStr)) {
        matchScore++;
      } else if (valueStr !== '' && loweredContent.includes(valueStr.toLowerCase())) {
        matchScore += 0.5;
      }
    }
    return entries.length > 0 ? matchScore / entries.length : 0;
  }

  /**
   * Simple time-based relevance (can be expanded): compares the local hour at
   * which the memory was created with the context's time of day.
   *
   * @param {object} memory - Reads `createdAt`.
   * @param {{ timeOfDay?: string }} [timeContext]
   * @returns {number} 1 when the hour falls in the range, otherwise 0.5
   *   (also 0.5 when the creation time or the time context is unavailable).
   */
  calculateTimeRelevance(memory, timeContext) {
    const createdAt = toTimestamp(memory.createdAt);
    if (!Number.isFinite(createdAt)) {
      return 0.5;
    }
    return this.getTimeOfDayScore(new Date(createdAt).getHours(), timeContext?.timeOfDay);
  }

  /**
   * @param {number} hour - Hour of day, 0-23.
   * @param {string} timeOfDay - 'morning' | 'afternoon' | 'evening' | 'night'.
   * @returns {number} 1 when `hour` is inside the range, 0.5 otherwise or for an unknown label.
   */
  getTimeOfDayScore(hour, timeOfDay) {
    const range = TIME_RANGES.get(timeOfDay);
    if (!range) {
      return 0.5;
    }
    const [start, end] = range;
    // A range whose start is after its end ("night") wraps around midnight.
    const inRange = start > end
      ? hour >= start || hour < end
      : hour >= start && hour < end;
    return inRange ? 1 : 0.5;
  }

  /**
   * Quick similarity check using word overlap (Jaccard similarity).
   *
   * @param {string} content1
   * @param {string} content2
   * @returns {Promise<number>} |intersection| / |union| of the two lowercased word sets.
   */
  async calculateContentSimilarity(content1, content2) {
    // "".split() still yields one (empty) token, so the union is never empty.
    const words1 = new Set(content1.toLowerCase().trim().split(/\s+/));
    const words2 = new Set(content2.toLowerCase().trim().split(/\s+/));
    let intersectionSize = 0;
    for (const word of words1) {
      if (words2.has(word)) {
        intersectionSize++;
      }
    }
    const unionSize = words1.size + words2.size - intersectionSize;
    return intersectionSize / unionSize;
  }
}