UNPKG

mongodocs-mcp

Version:

Lightning-fast semantic search for MongoDB documentation via Model Context Protocol. 10,000+ documents, <500ms search.

545 lines 23.5 kB
/**
 * Hybrid Search Engine - Combines Vector Search with Keyword Search
 * This fixes the relevance issues by using multiple retrieval strategies
 */
import { MongoDBClient } from './mongodb-client.js';
import { MongoDBQueryExpander } from './mongodb-query-expander.js';
import axios from 'axios';

/**
 * Escape every regular-expression metacharacter in `text` so it can be
 * embedded in a pattern and matched literally.
 * @param {string} text
 * @returns {string}
 */
function escapeRegExp(text) {
    return text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

/**
 * Scale a vector to unit length.
 * @param {number[]} vector
 * @returns {number[] | null} The unit vector, or null when the magnitude is
 *   zero or not finite (dividing would yield NaN/Infinity components).
 */
function l2Normalize(vector) {
    const magnitude = Math.sqrt(vector.reduce((sum, val) => sum + val * val, 0));
    if (!Number.isFinite(magnitude) || magnitude === 0) {
        return null;
    }
    return vector.map(val => val / magnitude);
}

/**
 * Test whether `term` occurs in `text` as a standalone token rather than as
 * part of a longer alphanumeric word. A boundary is only required on a side
 * where the term itself starts/ends with a letter or digit, so terms such as
 * "c#" or ".net" still match inside "asp.net".
 * @param {string} text - Lower-cased text to search.
 * @param {string} term - Lower-cased term to look for.
 * @returns {boolean}
 */
function containsStandaloneTerm(text, term) {
    if (!term) {
        return false;
    }
    const isAlnum = (ch) => /[a-z0-9]/i.test(ch);
    const lead = isAlnum(term[0]) ? '(?:^|[^a-z0-9])' : '';
    const trail = isAlnum(term[term.length - 1]) ? '(?:$|[^a-z0-9])' : '';
    return new RegExp(`${lead}${escapeRegExp(term)}${trail}`, 'i').test(text);
}

export class HybridSearchEngine {
    mongodb;
    queryExpander;
    voyageApiKey;
    voyageContextualUrl = 'https://api.voyageai.com/v1/contextualizedembeddings';
    VOYAGE_DIMENSIONS = 2048;
    initialized = false;
    // Scoring thresholds
    MIN_VECTOR_SCORE = 0.7; // search() drops merged results whose hybridScore is below this
    MIN_KEYWORD_SCORE = 0.3; // Reported by getStatus(); not applied as a filter in this class
    VECTOR_WEIGHT = 0.6; // Weight for vector search
    KEYWORD_WEIGHT = 0.4; // Weight for keyword search

    /**
     * @throws {Error} When the VOYAGE_API_KEY environment variable is not set.
     */
    constructor() {
        this.mongodb = MongoDBClient.getInstance();
        this.queryExpander = new MongoDBQueryExpander();
        this.voyageApiKey = process.env.VOYAGE_API_KEY;
        if (!this.voyageApiKey) {
            throw new Error('VOYAGE_API_KEY environment variable is required');
        }
    }

    /**
     * Connect to MongoDB and create the vector search index. Subsequent
     * calls after a successful initialization return immediately.
     */
    async initialize() {
        if (this.initialized)
            return;
        await this.mongodb.connect();
        await this.mongodb.createVectorSearchIndex();
        this.initialized = true;
        console.error('✅ Hybrid Search Engine initialized');
    }

    /**
     * Perform hybrid search combining vector and keyword strategies
     *
     * Progress is logged with console.error (stderr).
     *
     * @param {string} query - The user's search text.
     * @param {{limit?: number, filter?: object}} [options] - `limit` caps the
     *   number of returned documents (default 5); `filter` is passed to both
     *   retrieval strategies as a `$match` stage.
     * @returns {Promise<Array>} Formatted results (see formatResults); empty
     *   when the collection holds no documents or nothing passes the threshold.
     */
    async search(query, options = {}) {
        console.error(`🚀 MONGODB-OPTIMIZED Search: "${query}"`);
        const collection = this.mongodb.getVectorsCollection();
        const count = await collection.countDocuments();
        if (count === 0) {
            console.error('⚠️ No documents in database. Run indexer first!');
            return [];
        }
        console.error(`📊 Searching ${count} docs with MongoDB intelligence...`);
        // 🧠 MongoDB-specific query understanding
        const mongoIntent = this.detectMongoDBIntent(query);
        console.error(`🧠 Intent: ${mongoIntent.type} (${mongoIntent.confidence.toFixed(2)} confidence)`);
        // 🔍 Enhanced query expansion with MongoDB context
        const expandedQueries = this.queryExpander.expandQuery(query);
        console.error(`🔍 Expanded to ${expandedQueries.length} MongoDB-aware variations`);
        // Run both search strategies in parallel
        const [vectorResults, keywordResults] = await Promise.all([
            this.vectorSearch(query, expandedQueries, options),
            this.keywordSearch(query, expandedQueries, options)
        ]);
        console.error(` 🔢 Vector results: ${vectorResults.length}, Keyword results: ${keywordResults.length}`);
        // Merge and re-rank results
        const mergedResults = this.mergeResults(vectorResults, keywordResults);
        // Apply score threshold
        // NOTE(review): the same 0.7 cut-off is applied to vector-only,
        // keyword-only and blended scores alike - confirm that is intended.
        const filteredResults = mergedResults.filter(r => r.hybridScore >= this.MIN_VECTOR_SCORE);
        // Re-rank using Voyage AI if available
        const rerankedResults = await this.rerankResults(query, filteredResults);
        // Format final results
        const finalResults = this.formatResults(rerankedResults, options.limit || 5);
        console.error(`✅ Found ${finalResults.length} relevant results`);
        // If no results, provide suggestions
        if (finalResults.length === 0) {
            const suggestions = this.queryExpander.generateSuggestions(query);
            console.error('💡 Suggestions:', suggestions.join('\n '));
        }
        return finalResults;
    }

    /**
     * MongoDB-specific intent detection (our SECRET WEAPON!)
     *
     * The primary intent is chosen by substring match against fixed keyword
     * lists, checked in priority order (vector search, aggregation, CRUD,
     * indexing, troubleshooting). The programming language is detected with
     * standalone-token matching so that short indicators such as "go" do not
     * fire on words that merely contain them (e.g. "mongodb").
     *
     * @param {string} query
     * @returns {{type: string, confidence: number, components: string[], language: (string|undefined)}}
     */
    detectMongoDBIntent(query) {
        const queryLower = query.toLowerCase();
        // MongoDB operation patterns
        const patterns = {
            crud: ['insert', 'find', 'update', 'delete', 'replace', 'upsert', 'insertone', 'updateone'],
            aggregation: ['aggregate', 'pipeline', '$match', '$group', '$project', '$sort', '$lookup', '$unwind'],
            vectorSearch: ['vector search', '$vectorsearch', 'embedding', 'similarity', 'cosine', 'semantic'],
            indexing: ['index', 'createindex', 'compound', 'text index', 'performance'],
            troubleshooting: ['error', 'problem', 'fix', 'timeout', 'connection', 'slow'],
            drivers: {
                nodejs: ['node', 'javascript', 'mongoose', 'async', 'await'],
                python: ['python', 'pymongo', 'motor'],
                java: ['java', 'spring'],
                csharp: ['c#', '.net'],
                go: ['golang', 'go'],
                php: ['php', 'laravel'],
                ruby: ['ruby']
            }
        };
        let intent = {
            type: 'general',
            confidence: 0.5,
            components: [],
            language: undefined
        };
        // Detect primary intent
        if (patterns.vectorSearch.some(p => queryLower.includes(p))) {
            intent = { type: 'vector_search', confidence: 0.9, components: ['vector-search'], language: undefined };
        }
        else if (patterns.aggregation.some(p => queryLower.includes(p))) {
            intent = { type: 'aggregation', confidence: 0.8, components: ['aggregation'], language: undefined };
        }
        else if (patterns.crud.some(p => queryLower.includes(p))) {
            intent = { type: 'crud', confidence: 0.8, components: ['crud'], language: undefined };
        }
        else if (patterns.indexing.some(p => queryLower.includes(p))) {
            intent = { type: 'indexing', confidence: 0.7, components: ['indexing'], language: undefined };
        }
        else if (patterns.troubleshooting.some(p => queryLower.includes(p))) {
            intent = { type: 'troubleshooting', confidence: 0.7, components: ['troubleshooting'], language: undefined };
        }
        // Detect programming language (first matching driver wins)
        for (const [lang, indicators] of Object.entries(patterns.drivers)) {
            if (indicators.some(indicator => containsStandaloneTerm(queryLower, indicator))) {
                intent.language = lang;
                intent.confidence = Math.min(1, intent.confidence + 0.1);
                break;
            }
        }
        return intent;
    }

    /**
     * Vector search using embeddings
     *
     * Runs one `$vectorSearch` aggregation per embedded query variation and
     * returns every hit; the same document may appear more than once (once
     * per variation) and is de-duplicated later by mergeResults. Errors are
     * logged and whatever was collected so far is returned.
     *
     * @param {string} _query - Unused; kept for signature parity with keywordSearch.
     * @param {string[]} expandedQueries - Query variations to embed.
     * @param {{filter?: object}} options
     * @returns {Promise<Array<{document: object, vectorScore: number, keywordScore: number, hybridScore: number, source: string}>>}
     */
    async vectorSearch(_query, expandedQueries, options) {
        const results = [];
        try {
            // Generate embeddings for all query variations (already unit length)
            const embeddings = await this.generateQueryEmbeddings(expandedQueries);
            const collection = this.mongodb.getVectorsCollection();
            // Search with each embedding
            for (let i = 0; i < embeddings.length; i++) {
                const embedding = embeddings[i];
                // NOTE(review): pairing by position assumes one embedding came
                // back per query; queryUsed is informational only.
                const queryText = expandedQueries[i];
                const pipeline = [
                    {
                        $vectorSearch: {
                            index: 'semantic_search',
                            path: 'embedding',
                            queryVector: embedding,
                            numCandidates: 150, // Increased for better recall
                            limit: 20 // Get more candidates for re-ranking
                        }
                    },
                    {
                        $addFields: {
                            searchScore: { $meta: 'vectorSearchScore' },
                            queryUsed: queryText
                        }
                    }
                ];
                // Add filters if provided
                if (options.filter) {
                    pipeline.push({ $match: options.filter });
                }
                const docs = await collection.aggregate(pipeline).toArray();
                docs.forEach((doc, index) => {
                    // Clamp the score to [0, 1] after a small rank-based penalty
                    const baseScore = doc.searchScore || 0;
                    const diversityPenalty = index * 0.01; // Slight penalty for lower-ranked results
                    const normalizedScore = Math.max(0, Math.min(1, baseScore - diversityPenalty));
                    results.push({
                        document: doc,
                        vectorScore: normalizedScore,
                        keywordScore: 0,
                        hybridScore: normalizedScore,
                        source: 'vector'
                    });
                });
            }
        }
        catch (error) {
            console.error('Vector search error:', error);
        }
        return results;
    }

    /**
     * Keyword search using case-insensitive regex matching
     *
     * For each keyword query built by buildKeywordQueries, matches documents
     * whose `content`, `title` or `metadata.path` contains any of the terms.
     * The caller's filter is applied before the 20-document limit so that
     * filtering cannot discard documents the limit already let through.
     * Errors are logged and whatever was collected so far is returned.
     *
     * @param {string} query
     * @param {string[]} expandedQueries
     * @param {{filter?: object}} options
     * @returns {Promise<Array<{document: object, vectorScore: number, keywordScore: number, hybridScore: number, source: string}>>}
     */
    async keywordSearch(query, expandedQueries, options) {
        const results = [];
        try {
            const collection = this.mongodb.getVectorsCollection();
            // Build keyword search queries
            const keywordQueries = this.buildKeywordQueries(query, expandedQueries);
            for (const kQuery of keywordQueries) {
                const pipeline = [
                    {
                        $match: {
                            $or: [
                                { content: { $regex: kQuery.regex, $options: 'i' } },
                                { 'title': { $regex: kQuery.regex, $options: 'i' } },
                                { 'metadata.path': { $regex: kQuery.regex, $options: 'i' } }
                            ]
                        }
                    }
                ];
                // Add filters if provided (before $limit, see method comment)
                if (options.filter) {
                    pipeline.push({ $match: options.filter });
                }
                pipeline.push({
                    $addFields: {
                        keywordScore: this.calculateKeywordScore(kQuery.terms)
                    }
                }, { $limit: 20 });
                const docs = await collection.aggregate(pipeline).toArray();
                docs.forEach(doc => {
                    results.push({
                        document: doc,
                        vectorScore: 0,
                        keywordScore: doc.keywordScore || 0.5,
                        hybridScore: doc.keywordScore || 0.5,
                        source: 'keyword'
                    });
                });
            }
        }
        catch (error) {
            console.error('Keyword search error:', error);
        }
        return results;
    }

    /**
     * Merge vector and keyword results with product diversity
     *
     * - A document hit by several vector queries keeps its best vector score.
     * - The first five vector documents seen for each product receive a
     *   decreasing boost (0.25, 0.20, ... 0.05) on their hybrid score.
     * - A document found by both strategies is rescored with
     *   calculateHybridScore and tagged 'both'; this replaces the boost.
     * - A keyword-only document matched by several keyword queries stays
     *   'keyword' and keeps its best keyword score.
     *
     * Input result objects are copied, not mutated.
     *
     * @param {Array} vectorResults
     * @param {Array} keywordResults
     * @returns {Array} Unique results sorted by descending hybridScore.
     */
    mergeResults(vectorResults, keywordResults) {
        const merged = new Map();
        // Keep the best vector hit per document (first-seen order is preserved by Map)
        for (const result of vectorResults) {
            const id = result.document._id.toString();
            const existing = merged.get(id);
            if (existing && existing.vectorScore >= result.vectorScore) {
                continue;
            }
            merged.set(id, { ...result });
        }
        // Apply diversity boost for underrepresented products, once per document
        const productCounts = new Map();
        for (const result of merged.values()) {
            const product = result.document.product || result.document.metadata?.product || 'unknown';
            const currentCount = productCounts.get(product) || 0;
            const diversityBoost = Math.max(0, (5 - currentCount) * 0.05); // Boost less common products
            result.hybridScore += diversityBoost;
            productCounts.set(product, currentCount + 1);
        }
        // Merge keyword results
        for (const result of keywordResults) {
            const id = result.document._id.toString();
            const existing = merged.get(id);
            if (!existing) {
                // Only in keyword results
                merged.set(id, { ...result });
                continue;
            }
            if (existing.source === 'keyword') {
                // Same document matched by another keyword query: keep the best score
                if (result.keywordScore > existing.keywordScore) {
                    existing.keywordScore = result.keywordScore;
                    existing.hybridScore = result.hybridScore;
                }
                continue;
            }
            // Document found in both - calculate hybrid score
            const keywordScore = Math.max(existing.keywordScore, result.keywordScore);
            existing.keywordScore = keywordScore;
            existing.hybridScore = this.calculateHybridScore(existing.vectorScore, keywordScore);
            existing.source = 'both';
        }
        // Sort by hybrid score
        return Array.from(merged.values())
            .sort((a, b) => b.hybridScore - a.hybridScore);
    }

    /**
     * Re-rank results using Voyage AI reranker
     *
     * Sends the first 1000 characters of each result to the rerank endpoint
     * and blends the returned relevance score (70%) with the existing hybrid
     * score (30%), then re-sorts `results` in place. Retries up to three
     * times with a linearly growing delay; if every attempt fails the
     * original scores are kept.
     *
     * @param {string} query
     * @param {Array} results - Mutated in place (scores and order).
     * @returns {Promise<Array>} The same `results` array.
     */
    async rerankResults(query, results) {
        if (results.length === 0)
            return results;
        console.error(`🔄 ENHANCED reranking with RAGFlow-inspired optimizations...`);
        let retryCount = 0;
        const maxRetries = 3;
        while (retryCount < maxRetries) {
            try {
                // Prepare documents for reranking
                const documents = results.map(r => r.document.content?.substring(0, 1000) || '' // Use first 1000 chars
                );
                // Call Voyage reranker. top_k is omitted so that every document
                // gets a relevance score; otherwise un-reranked results would
                // keep an un-blended score and could outrank reranked ones.
                const response = await axios.post('https://api.voyageai.com/v1/rerank', {
                    query,
                    documents,
                    model: 'rerank-2.5'
                }, {
                    headers: {
                        'Authorization': `Bearer ${this.voyageApiKey}`,
                        'Content-Type': 'application/json',
                    },
                    timeout: 30000,
                });
                // Update scores based on reranking
                const rerankItems = response.data?.data;
                if (Array.isArray(rerankItems)) {
                    const rerankWeight = 0.7;
                    const originalWeight = 0.3;
                    rerankItems.forEach((item, position) => {
                        // The API field is `relevance_score`; the camelCase and
                        // `score` fallbacks are kept for backward compatibility.
                        const score = item.relevance_score ?? item.relevanceScore ?? item.score;
                        const resultIndex = item.index ?? position;
                        const indexIsValid = Number.isInteger(resultIndex)
                            && resultIndex >= 0
                            && resultIndex < results.length;
                        if (!indexIsValid || typeof score !== 'number') {
                            return;
                        }
                        // Use reranking score as primary, with original as fallback
                        results[resultIndex].hybridScore =
                            (score * rerankWeight) + (results[resultIndex].hybridScore * originalWeight);
                    });
                }
                // Re-sort by new scores
                results.sort((a, b) => b.hybridScore - a.hybridScore);
                break; // Success, exit retry loop
            }
            catch (error) {
                retryCount++;
                if (retryCount >= maxRetries) {
                    console.error(`❌ Reranking failed after ${maxRetries} retries (using original scores):`, error.message);
                    break;
                }
                else {
                    console.error(`⚠️ Reranking retry ${retryCount}/${maxRetries}:`, error.message);
                    await new Promise(resolve => setTimeout(resolve, 500 * retryCount)); // Quick backoff for reranking
                }
            }
        }
        return results;
    }

    /**
     * Generate embeddings for query variations
     *
     * @param {string[]} queries
     * @returns {Promise<number[][]>} Unit-length embeddings in response order.
     *   Entries without an embedding (or with zero magnitude) are skipped, so
     *   the result may be shorter than `queries`. Returns [] on any failure.
     */
    async generateQueryEmbeddings(queries) {
        if (!Array.isArray(queries) || queries.length === 0) {
            return [];
        }
        try {
            // Use the contextualized embeddings endpoint for queries
            const response = await axios.post(this.voyageContextualUrl, {
                inputs: queries.map(q => [q]), // Each query wrapped in array
                input_type: 'query', // Critical: asymmetric embeddings for query-document matching
                model: 'voyage-context-3',
                output_dimension: this.VOYAGE_DIMENSIONS
            }, {
                headers: {
                    'Authorization': `Bearer ${this.voyageApiKey}`,
                    'Content-Type': 'application/json',
                },
                timeout: 30000,
            });
            if (!response.data?.data) {
                console.error('No data returned from Voyage API');
                return [];
            }
            // Extract and normalize embeddings
            const normalizedEmbeddings = [];
            for (const queryResult of response.data.data) {
                const embedding = queryResult?.data?.[0]?.embedding;
                if (!embedding) {
                    continue;
                }
                const normalized = l2Normalize(embedding);
                if (normalized) {
                    normalizedEmbeddings.push(normalized);
                }
            }
            return normalizedEmbeddings;
        }
        catch (error) {
            console.error('Embedding generation failed:', error.response?.data || error);
            return [];
        }
    }

    /**
     * Build keyword search queries
     *
     * Produces one query for the original text plus one for each of the first
     * three expanded variations. Terms are lower-cased, split on whitespace,
     * stripped of empty strings, and regex-escaped so that user input such as
     * "$match" or "c++" is matched literally instead of being interpreted as
     * (or breaking) a regular expression. Queries with no terms and exact
     * duplicates are omitted.
     *
     * @param {string} query
     * @param {string[]} expandedQueries
     * @returns {Array<{regex: string, terms: string[]}>}
     */
    buildKeywordQueries(query, expandedQueries) {
        const queries = [];
        const seenPatterns = new Set();
        const addQuery = (text) => {
            const terms = String(text).toLowerCase().split(/\s+/).filter(Boolean);
            if (terms.length === 0) {
                return; // An empty alternation would match every document
            }
            const regex = terms.map(escapeRegExp).join('|');
            if (seenPatterns.has(regex)) {
                return;
            }
            seenPatterns.add(regex);
            queries.push({ regex, terms });
        };
        // Original query terms
        addQuery(query);
        // Expanded query terms (top 3)
        expandedQueries.slice(0, 3).forEach(addQuery);
        return queries;
    }

    /**
     * Calculate keyword relevance score
     *
     * The score depends only on how many terms the query has: 0.5 plus 0.1
     * per term, capped at 1.0. It does not measure how well any particular
     * document matches.
     *
     * @param {string[]} terms
     * @returns {number}
     */
    calculateKeywordScore(terms) {
        const baseScore = 0.5;
        const termBoost = 0.1;
        return Math.min(1.0, baseScore + (terms.length * termBoost));
    }

    /**
     * Calculate hybrid score combining vector and keyword scores
     * @param {number} vectorScore
     * @param {number} keywordScore
     * @returns {number} Weighted sum using VECTOR_WEIGHT and KEYWORD_WEIGHT.
     */
    calculateHybridScore(vectorScore, keywordScore) {
        return (vectorScore * this.VECTOR_WEIGHT) + (keywordScore * this.KEYWORD_WEIGHT);
    }

    /**
     * Format results for output with "Lost in the Middle" mitigation (2025 best practice)
     *
     * Groups result chunks by `document.documentId`, keeps the three
     * highest-scoring chunks of each document, and orders them with
     * reorderChunksForLLM. Documents appear in the order of their first
     * occurrence in `results`.
     *
     * @param {Array} results - Results sorted by descending hybridScore.
     * @param {number} limit - Maximum number of documents to return.
     * @returns {Array<{documentId: *, chunks: Array, metadata: *, maxScore: number, source: string}>}
     */
    formatResults(results, limit) {
        // Group chunks by document in a single pass
        const chunksByDocument = new Map();
        for (const result of results) {
            const docId = result.document.documentId;
            if (!chunksByDocument.has(docId)) {
                chunksByDocument.set(docId, []);
            }
            chunksByDocument.get(docId).push({
                content: result.document.content,
                score: result.hybridScore,
                metadata: result.document.metadata
            });
        }
        const formatted = [];
        const seen = new Set();
        for (const result of results) {
            if (formatted.length >= limit)
                break;
            const docId = result.document.documentId;
            if (seen.has(docId))
                continue;
            seen.add(docId);
            // Select the top 3 chunks first, then reorder; reordering first
            // would move the second-best chunk to the end and slice it away.
            const topChunks = [...chunksByDocument.get(docId)]
                .sort((a, b) => b.score - a.score)
                .slice(0, 3);
            // 2025: Apply "Lost in the Middle" mitigation
            // Place most relevant chunks at beginning and end
            const reorderedChunks = this.reorderChunksForLLM(topChunks);
            formatted.push({
                documentId: docId,
                chunks: reorderedChunks.map((chunk, idx) => ({
                    ...chunk,
                    chunkIndex: idx
                })),
                metadata: result.document.metadata,
                maxScore: result.hybridScore,
                source: result.source
            });
        }
        return formatted;
    }

    /**
     * Reorder chunks to mitigate "Lost in the Middle" problem (2025 best practice)
     * LLMs have U-shaped attention - best recall at beginning and end
     *
     * With more than two chunks the result is: best chunk first, second-best
     * chunk last, all remaining chunks in between in descending score order.
     * Every input chunk appears exactly once in the output.
     *
     * @param {Array<{score: number}>} chunks
     * @returns {Array} The input array itself when it has two or fewer
     *   elements, otherwise a new reordered array.
     */
    reorderChunksForLLM(chunks) {
        if (chunks.length <= 2)
            return chunks;
        const [best, secondBest, ...rest] = [...chunks].sort((a, b) => b.score - a.score);
        return [best, ...rest, secondBest];
    }

    /**
     * Find similar documents to provided content
     *
     * @param {string} content - Text to find neighbours for.
     * @param {number} [limit=5] - Maximum number of results.
     * @returns {Promise<Array<{title: string, url: string, score: number, content: string}>>}
     *   Empty when embedding or vector search fails.
     */
    async findSimilar(content, limit = 5) {
        // vectorSearch embeds the content itself and returns [] when that
        // fails, so no separate embedding request is needed here.
        const results = await this.vectorSearch(content, [content], { limit });
        return results.slice(0, limit).map(r => ({
            title: r.document.title || r.document.metadata?.title || 'Untitled',
            url: r.document.metadata?.url || '',
            score: r.vectorScore,
            content: r.document.content?.substring(0, 200) || ''
        }));
    }

    /**
     * Explain a concept using the documentation
     *
     * @param {string} concept - Term to search for.
     * @param {string} [depth='intermediate'] - 'beginner' (first 500 chars of
     *   the top chunk), 'advanced' (all chunks of every result), anything
     *   else (top chunk plus links to the next two documents).
     * @returns {Promise<string>} Markdown text.
     */
    async explainConcept(concept, depth = 'intermediate') {
        // Search for the concept
        const results = await this.search(concept, { limit: 5 });
        if (results.length === 0) {
            return `No documentation found for "${concept}". Try a different search term.`;
        }
        // Format explanation based on depth
        let explanation = `# ${concept}\n\n`;
        if (depth === 'beginner') {
            explanation += '## Simple Explanation\n\n';
            explanation += results[0].chunks[0]?.content?.substring(0, 500) || '';
        }
        else if (depth === 'advanced') {
            explanation += '## Detailed Explanation\n\n';
            results.forEach(result => {
                explanation += `### ${result.metadata?.title || 'Document'}\n`;
                result.chunks.forEach(chunk => {
                    // Chunks without content contribute an empty paragraph
                    explanation += `${chunk.content ?? ''}\n\n`;
                });
            });
        }
        else {
            // intermediate
            explanation += '## Overview\n\n';
            explanation += results[0].chunks[0]?.content || '';
            explanation += '\n\n## Related Documentation\n\n';
            results.slice(1, 3).forEach(result => {
                explanation += `- [${result.metadata?.title || 'Document'}](${result.metadata?.url || '#'})\n`;
            });
        }
        return explanation;
    }

    /**
     * Get search engine status
     *
     * @returns {Promise<object>} Document count, the distinct values of
     *   `metadata.product` and `metadata.version`, and the engine's model
     *   names, thresholds and weights.
     */
    async getStatus() {
        const collection = this.mongodb.getVectorsCollection();
        const count = await collection.countDocuments();
        // Get distinct products and versions
        const products = await collection.distinct('metadata.product');
        const versions = await collection.distinct('metadata.version');
        return {
            totalDocuments: count,
            products,
            versions,
            searchEngine: 'Hybrid (Vector + Keyword)',
            vectorModel: 'voyage-context-3',
            rerankerModel: 'rerank-2.5',
            minVectorScore: this.MIN_VECTOR_SCORE,
            minKeywordScore: this.MIN_KEYWORD_SCORE,
            weights: {
                vector: this.VECTOR_WEIGHT,
                keyword: this.KEYWORD_WEIGHT
            }
        };
    }
}
//# sourceMappingURL=hybrid-search-engine.js.map