mongodocs-mcp

Lightning-fast semantic search for MongoDB documentation via Model Context Protocol. 10,000+ documents, <500ms search.

324 lines • 14.3 kB
/**
 * ENHANCED Voyage AI Embedding Pipeline
 * Uses voyage-context-3 model for TRUE contextualized 2048-dimensional embeddings
 *
 * RESEARCH-BASED IMPROVEMENTS:
 * - RAGFlow-inspired dynamic batching with memory management
 * - LightRAG-inspired parallel processing with MAX_ASYNC control
 * - True contextual document grouping for maximum embedding quality
 */
import axios from 'axios';
import { MongoDBClient } from './mongodb-client.js';
import { SmartChunker } from './smart-chunker.js';
import pLimit from 'p-limit';
export class EmbeddingPipeline {
    voyageApiKey;
    voyageContextualUrl = 'https://api.voyageai.com/v1/contextualizedembeddings';
    mongodb;
    chunker;
    rateLimiter = pLimit(3); // Max 3 concurrent API calls
    // ENHANCED: Research-based API limits and batching
    MAX_TOTAL_TOKENS = 120000;
    MAX_TOTAL_CHUNKS = 16000;
    dynamicBatchSize = 8; // RAGFlow-inspired: start with 8, adjust based on performance
    MIN_BATCH_SIZE = 2; // Minimum for stability
    VOYAGE_DIMENSIONS = 2048; // 2025: maximum dimensions for best performance
    constructor() {
        const apiKey = process.env.VOYAGE_API_KEY;
        if (!apiKey) {
            throw new Error('VOYAGE_API_KEY environment variable is required');
        }
        this.voyageApiKey = apiKey;
        this.mongodb = MongoDBClient.getInstance();
        this.chunker = new SmartChunker();
    }
    /**
     * Embed all documents using Voyage Context-3's contextualized embeddings
     * This is the KEY differentiator - chunks are embedded with document context
     */
    async embedAllDocuments(chunkedDocs) {
        console.error(`🧠 Starting Voyage Context-3 embedding pipeline...`);
        console.error(`📊 Processing ${chunkedDocs.length} documents with ${chunkedDocs.reduce((sum, d) => sum + d.chunks.length, 0)} total chunks`);
        // Create batches respecting API limits
        const batches = this.createBatches(chunkedDocs);
        console.error(`📦 Created ${batches.length} batches for processing`);
        let processedChunks = 0;
        let totalTokensUsed = 0;
        for (let i = 0; i < batches.length; i++) {
            const batch = batches[i];
            console.error(`\n🔄 Processing batch ${i + 1}/${batches.length} (${batch.documents.length} documents, ${batch.totalChunks} chunks)`);
            try {
                const { embeddings, tokensUsed } = await this.processBatch(batch);
                processedChunks += embeddings.length;
                totalTokensUsed += tokensUsed;
                console.error(`✅ Batch ${i + 1} complete: ${embeddings.length} chunks embedded, ${tokensUsed} tokens used`);
                // Rate limiting
                if (i < batches.length - 1) {
                    await this.sleep(1000); // 1 second between batches
                }
            }
            catch (error) {
                console.error(`❌ Batch ${i + 1} failed:`, error);
                throw error;
            }
        }
        console.error(`\n🎉 Embedding complete! Processed ${processedChunks} chunks using ${totalTokensUsed} tokens`);
    }
    /**
     * Process a batch of documents with TRUE contextualized embeddings
     * Each document's chunks are embedded together for global context awareness
     */
    async processBatch(batch) {
        // CRITICAL: Group chunks by document for contextualized_embed
        const documentGroups = [];
        const metadataGroups = [];
        for (const doc of batch.documents) {
            const docChunks = [];
            const docMetadata = [];
            doc.chunks.forEach((chunk, idx) => {
                docChunks.push(chunk.content);
                docMetadata.push({
                    documentId: doc.documentId,
                    chunkIndex: idx,
                    totalChunks: doc.totalChunks,
                    ...doc.documentMetadata,
                    ...chunk.metadata,
                });
            });
            documentGroups.push(docChunks);
            metadataGroups.push(docMetadata);
        }
        // Call Voyage API with CONTEXTUALIZED embeddings
        const embeddings = await this.callContextualizedEmbed(documentGroups);
        // Flatten embeddings and metadata for MongoDB
        const flatEmbeddings = [];
        const flatMetadata = [];
        embeddings.forEach((docEmbeddings, docIdx) => {
            docEmbeddings.forEach((embedding, chunkIdx) => {
                flatEmbeddings.push(embedding);
                flatMetadata.push(metadataGroups[docIdx][chunkIdx]);
            });
        });
        // Prepare documents for MongoDB
        const vectorDocuments = this.prepareVectorDocuments(flatEmbeddings, batch, flatMetadata);
        // Insert into MongoDB
        await this.insertToMongoDB(vectorDocuments);
        // Estimate token usage (approximate)
        const totalTokens = documentGroups.reduce((sum, doc) => sum + doc.reduce((docSum, chunk) => docSum + chunk.length / 4, 0), 0);
        return {
            embeddings: vectorDocuments,
            tokensUsed: Math.round(totalTokens),
        };
    }
    /**
     * Call Voyage API with TRUE contextualized embeddings - USING THE CORRECT ENDPOINT!
     * This is the GAME CHANGER - chunks are embedded with full document context!
     */
    async callContextualizedEmbed(documentGroups) {
        const model = 'voyage-context-3';
        console.error(`🚀 Using ${model} with RESEARCH-ENHANCED contextualized embeddings!`);
        let retryCount = 0;
        const maxRetries = 3;
        while (retryCount < maxRetries) {
            try {
                // Call the CORRECT contextualized embeddings endpoint!
                const response = await this.rateLimiter(() => axios.post(this.voyageContextualUrl, {
                    inputs: documentGroups, // Array of arrays - each sub-array is a document's chunks
                    input_type: 'document',
                    model: model,
                    output_dimension: this.VOYAGE_DIMENSIONS
                }, {
                    headers: {
                        'Authorization': `Bearer ${this.voyageApiKey}`,
                        'Content-Type': 'application/json',
                    },
                    timeout: 60000, // Longer timeout for contextualized embeddings
                }));
                if (!response.data?.data) {
                    throw new Error('No data returned from Voyage contextualized API');
                }
                // Extract embeddings - the response structure is:
                // data: [ { data: [ { embedding: [...] }, { embedding: [...] } ] }, ... ]
                const allEmbeddings = [];
                for (const docResult of response.data.data) {
                    const docEmbeddings = [];
                    for (const chunk of docResult.data) {
                        if (chunk?.embedding) {
                            // Normalize the embedding for cosine similarity
                            const embedding = chunk.embedding;
                            const magnitude = Math.sqrt(embedding.reduce((sum, val) => sum + val * val, 0));
                            const normalized = embedding.map((v) => v / magnitude);
                            docEmbeddings.push(normalized);
                        }
                    }
                    allEmbeddings.push(docEmbeddings);
                }
                return allEmbeddings; // Success, exit retry loop
            }
            catch (error) {
                retryCount++;
                if (retryCount >= maxRetries) {
                    console.error(`❌ Voyage AI failed after ${maxRetries} retries:`, error.response?.data || error.message);
                    // RAGFlow-inspired: Try reducing batch size for next time
                    if (this.dynamicBatchSize > this.MIN_BATCH_SIZE) {
                        this.dynamicBatchSize = Math.max(this.dynamicBatchSize / 2, this.MIN_BATCH_SIZE);
                        console.error(`🔄 Reduced batch size to ${this.dynamicBatchSize} for stability`);
                    }
                    throw new Error(`Voyage API error: ${error.response?.data?.error || error.message || 'Unknown error'}`);
                }
                else {
                    console.error(`⚠️ Voyage AI retry ${retryCount}/${maxRetries}:`, error.response?.data?.message || error.message);
                    await new Promise(resolve => setTimeout(resolve, 1000 * retryCount)); // Linear backoff: wait longer after each retry
                }
            }
        }
        // This should never be reached due to the retry loop, but TypeScript requires it
        throw new Error('Unexpected end of callContextualizedEmbed method');
    }
    /**
     * Prepare vector documents for MongoDB insertion
     */
    prepareVectorDocuments(embeddings, batch, metadataMap) {
        const documents = [];
        let embeddingIndex = 0;
        for (const doc of batch.documents) {
            for (const chunk of doc.chunks) {
                const vectorDoc = {
                    content: chunk.content,
                    contentHash: this.chunker.hashContent(chunk.content),
                    embedding: embeddings[embeddingIndex],
                    embeddingModel: 'voyage-context-3',
                    embeddedAt: new Date(),
                    metadata: metadataMap[embeddingIndex],
                    searchMeta: {
                        clickCount: 0,
                        boostFactor: 1.0,
                    },
                };
                documents.push(vectorDoc);
                embeddingIndex++;
            }
        }
        return documents;
    }
    /**
     * Insert vector documents into MongoDB Atlas
     */
    async insertToMongoDB(documents) {
        const collection = this.mongodb.getVectorsCollection();
        try {
            // Use unordered insert to continue on duplicate errors
            const result = await collection.insertMany(documents, {
                ordered: false,
            });
            console.error(`  📝 Inserted ${result.insertedCount} documents to MongoDB`);
        }
        catch (error) {
            if (error.code === 11000) {
                // Duplicate key error - some documents already exist
                console.error('  ⚠️ Some documents already exist, updating...');
                await this.updateExistingDocuments(documents);
            }
            else {
                throw error;
            }
        }
    }
    /**
     * Update existing documents with new embeddings
     */
    async updateExistingDocuments(documents) {
        const collection = this.mongodb.getVectorsCollection();
        let updated = 0;
        for (const doc of documents) {
            try {
                await collection.replaceOne({ contentHash: doc.contentHash }, doc, { upsert: true });
                updated++;
            }
            catch (error) {
                console.error('Failed to update document:', error);
            }
        }
        console.error(`  📝 Updated ${updated} existing documents`);
    }
    /**
     * Create batches of documents respecting Voyage API limits
     */
    createBatches(chunkedDocs) {
        const batches = [];
        let currentBatch = [];
        let currentTokens = 0;
        let currentChunks = 0;
        for (const doc of chunkedDocs) {
            // Estimate tokens (rough approximation)
            const docTokens = doc.chunks.reduce((sum, chunk) => sum + chunk.metadata.tokenCount, 0);
            const docChunks = doc.chunks.length;
            // Check if adding this document would exceed limits
            if (currentBatch.length >= this.dynamicBatchSize ||
                currentTokens + docTokens > this.MAX_TOTAL_TOKENS ||
                currentChunks + docChunks > this.MAX_TOTAL_CHUNKS) {
                // Save current batch
                if (currentBatch.length > 0) {
                    batches.push({
                        documents: currentBatch,
                        totalTokens: currentTokens,
                        totalChunks: currentChunks,
                    });
                }
                // Start new batch
                currentBatch = [];
                currentTokens = 0;
                currentChunks = 0;
            }
            // Add document to current batch
            currentBatch.push(doc);
            currentTokens += docTokens;
            currentChunks += docChunks;
        }
        // Save final batch
        if (currentBatch.length > 0) {
            batches.push({
                documents: currentBatch,
                totalTokens: currentTokens,
                totalChunks: currentChunks,
            });
        }
        return batches;
    }
    /**
     * Embed a single query for search using contextualized endpoint
     */
    async embedQuery(query) {
        try {
            const response = await axios.post(this.voyageContextualUrl, {
                inputs: [[query]], // Single query wrapped in double array
                input_type: 'query', // Important: query type for asymmetric search
                model: 'voyage-context-3',
                output_dimension: this.VOYAGE_DIMENSIONS
            }, {
                headers: {
                    'Authorization': `Bearer ${this.voyageApiKey}`,
                    'Content-Type': 'application/json',
                },
                timeout: 30000,
            });
            if (!response.data?.data?.[0]?.data?.[0]?.embedding) {
                throw new Error('No embedding returned for query');
            }
            // Extract embedding from nested structure
            const embedding = response.data.data[0].data[0].embedding;
            // Normalize for cosine similarity
            const magnitude = Math.sqrt(embedding.reduce((sum, val) => sum + val * val, 0));
            return embedding.map((v) => v / magnitude);
        }
        catch (error) {
            console.error('Failed to embed query:', error.response?.data || error);
            throw error;
        }
    }
    sleep(ms) {
        return new Promise(resolve => setTimeout(resolve, ms));
    }
}
//# sourceMappingURL=embedding-pipeline.js.map
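
For reference, a minimal usage sketch of this class (not part of the published file). It assumes VOYAGE_API_KEY is set and that MongoDBClient is already configured to reach an Atlas cluster; the chunked-document shape is inferred from the fields this pipeline reads (documentId, totalChunks, documentMetadata, chunks[].content, chunks[].metadata.tokenCount), and the sample values are illustrative only.

// Hypothetical usage sketch — not part of embedding-pipeline.js.
// Assumes VOYAGE_API_KEY is set and MongoDB connectivity is handled by MongoDBClient.
import { EmbeddingPipeline } from './embedding-pipeline.js';

const pipeline = new EmbeddingPipeline(); // throws if VOYAGE_API_KEY is missing

// Document shape inferred from what createBatches/processBatch read
const chunkedDocs = [
    {
        documentId: 'manual/aggregation',
        totalChunks: 2,
        documentMetadata: { source: 'mongodb-manual' },
        chunks: [
            { content: 'Aggregation pipelines process documents in stages...', metadata: { tokenCount: 12 } },
            { content: '$match filters documents before later stages run...', metadata: { tokenCount: 11 } },
        ],
    },
];

await pipeline.embedAllDocuments(chunkedDocs); // batches, embeds, and writes vectors to MongoDB
const queryVector = await pipeline.embedQuery('how do I filter documents in an aggregation pipeline?');
console.log(queryVector.length); // 2048-dimensional, L2-normalized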