UNPKG

mongodocs-mcp

Version:

Lightning-fast semantic search for MongoDB documentation via Model Context Protocol. 10,000+ documents, <500ms search.

714 lines 40.2 kB
#!/usr/bin/env node
/**
 * MongoDB + Voyage AI documentation indexer.
 *
 * Clones the repositories listed in `UltimateMongoDBIndexer#SOURCES`, splits
 * their documentation (and code comments) into chunks, embeds every chunk with
 * Voyage AI's `voyage-context-3` model and upserts the result into the
 * `mongodb_semantic_docs.documents` collection.
 *
 * Progress is checkpointed in `indexing-state.json` so that a re-run skips
 * sources that finished cleanly and files whose content has not changed.
 *
 * Required environment variables (a `.env` file is honoured via dotenv):
 *   - MONGODB_URI     MongoDB connection string
 *   - VOYAGE_API_KEY  Voyage AI API key
 */
import { MongoClient } from 'mongodb';
import axios from 'axios';
import ora from 'ora';
import dotenv from 'dotenv';
import { simpleGit } from 'simple-git';
import * as fs from 'fs/promises';
import * as path from 'path';
import { createHash } from 'crypto';
import { pathToFileURL } from 'url';
import cliProgress from 'cli-progress';
import colors from 'colors';
import * as cheerio from 'cheerio';
import figlet from 'figlet';
import boxen from 'boxen';
import Table from 'cli-table3';

dotenv.config();

// ASCII art banner printed at start-up.
const ASCII_LOGO = `
███╗   ███╗ ██████╗ ███╗   ██╗ ██████╗  ██████╗ ██████╗ ██████╗
████╗ ████║██╔═══██╗████╗  ██║██╔════╝ ██╔═══██╗██╔══██╗██╔══██╗
██╔████╔██║██║   ██║██╔██╗ ██║██║  ███╗██║   ██║██║  ██║██████╔╝
██║╚██╔╝██║██║   ██║██║╚██╗██║██║   ██║██║   ██║██║  ██║██╔══██╗
██║ ╚═╝ ██║╚██████╔╝██║ ╚████║╚██████╔╝╚██████╔╝██████╔╝██████╔╝
╚═╝     ╚═╝ ╚═════╝ ╚═╝  ╚═══╝ ╚═════╝  ╚═════╝ ╚═════╝ ╚═════╝
                 ULTIMATE DOCUMENTATION INDEXER
`;

/** File that stores the resumable indexing checkpoint. */
const STATE_FILE = 'indexing-state.json';
/** Document count the progress bar and the final report measure against. */
const TARGET_DOCUMENTS = 10000;
/** Number of pending chunks that triggers an embed-and-store flush. */
const FLUSH_THRESHOLD = 500;
/** Maximum characters per chunk when splitting scraped web pages. */
const WEB_CHUNK_LENGTH = 2000;
/** Characters of each chunk that are sent to the embedding API. */
const MAX_EMBEDDING_INPUT_CHARS = 8000;
/** Upper bound on chunks sent in one embedding request. */
const MAX_CHUNKS_PER_REQUEST = 32;
/** Batches are never shrunk below this size when requests keep failing. */
const MIN_BATCH_SIZE = 4;

/** Directories that never contain documentation worth indexing. */
const SKIPPED_DIRECTORIES = new Set(['node_modules', '.git', 'build', 'dist', 'target']);
/** Extensions collected by `findDocFiles`. */
const INDEXED_EXTENSIONS = new Set(['.rst', '.md', '.markdown', '.txt', '.js', '.ts', '.py', '.java', '.cs', '.go', '.rb']);
/**
 * Extensions split on reStructuredText heading underlines. `.txt` is included
 * because it was previously collected but never chunked; plain text without
 * underlines simply becomes a single section.
 * NOTE(review): mongodb/docs is believed to keep its RST sources in `.txt`
 * files - confirm against the repository.
 */
const RST_EXTENSIONS = new Set(['.rst', '.txt']);
const MARKDOWN_EXTENSIONS = new Set(['.md', '.markdown']);
const CODE_EXTENSIONS = new Set(['.js', '.ts', '.py', '.java', '.cs', '.go', '.rb']);

const RST_SECTION_BREAK = /\n={3,}\n|\n-{3,}\n|\n\*{3,}\n/;
const MARKDOWN_SECTION_BREAK = /\n#{1,3}\s/;
const CODE_COMMENT_PATTERN = /\/\*\*[\s\S]*?\*\/|\/\/.*$/gm;
const DOCSTRING_PATTERN = /"""[\s\S]*?"""|'''[\s\S]*?'''/g;
/** A sentence, or the trailing remainder of the text when it has no terminator. */
const SENTENCE_PATTERN = /[^.!?]+(?:[.!?]+|$)/g;

/** Resolve after `ms` milliseconds. */
const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

/** Hex MD5 digest of `text`; used for change detection and ids, not for security. */
const md5 = (text) => createHash('md5').update(text).digest('hex');

/** Fresh checkpoint used when no usable state file exists. */
function createInitialState() {
    return {
        completedRepos: [],
        totalDocumentsIndexed: 0,
        lastCheckpoint: new Date(),
        failedFiles: [],
        contentHashes: {}
    };
}

/**
 * Scale `vector` to unit length so cosine similarity reduces to a dot product.
 * @param {number[]} vector
 * @returns {number[] | null} The normalised vector, or null when its magnitude
 *   is zero or not finite (dividing would produce NaN/Infinity components).
 */
function normalizeVector(vector) {
    const magnitude = Math.sqrt(vector.reduce((sum, value) => sum + value * value, 0));
    if (!Number.isFinite(magnitude) || magnitude === 0) {
        return null;
    }
    return vector.map((value) => value / magnitude);
}

/**
 * Build the id documents are upserted under. The format (product, version and
 * the first 8 hex characters of the content's MD5) is kept stable so that
 * re-indexing updates existing documents instead of duplicating them.
 */
function buildDocumentId(source, text) {
    return `${source.product}_${source.version}_${md5(text).substring(0, 8)}`;
}

class UltimateMongoDBIndexer {
    mongoClient;
    voyageApiKey;
    voyageContextualUrl = 'https://api.voyageai.com/v1/contextualizedembeddings';
    VOYAGE_DIMENSIONS = 2048;
    db;
    collection;
    state;
    startTime;

    // Processing configuration. Only `dynamicBatchSize` and `enableSmartRetry`
    // are read by this class; the remaining keys are currently unused.
    config = {
        maxParallelRepos: 3,
        maxAsyncEmbeddings: 8,
        dynamicBatchSize: 16, // chunks per embedding request (capped at 32)
        memoryThreshold: 0.8,
        chunkingStrategy: 'semantic',
        enableSmartRetry: true // retry a failed batch up to 3 times
    };

    // Repositories to index. Higher `priority` sources are processed first.
    SOURCES = [
        // === TIER 1: GENAI & RAG ===
        { type: 'github', name: 'GenAI Showcase', repo: 'mongodb-developer/GenAI-Showcase', branch: 'main', product: 'genai', version: 'latest', priority: 5 },
        { type: 'github', name: 'GenAI DevDay Notebooks', repo: 'mongodb-developer/genai-devday-notebooks', branch: 'main', product: 'genai-devday', version: 'latest', priority: 5 },
        { type: 'github', name: 'Multimodal Agents Lab', repo: 'mongodb-labs/multimodal-agents-lab', branch: 'main', product: 'multimodal', version: 'latest', priority: 5 },
        { type: 'github', name: 'LangChain MongoDB', repo: 'langchain-ai/langchain-mongodb', branch: 'main', product: 'langchain', version: 'latest', priority: 5 },
        { type: 'github', name: 'MongoDB with FastAPI', repo: 'mongodb-developer/mongodb-with-fastapi', branch: 'main', product: 'fastapi', version: 'latest', priority: 5 },
        { type: 'github', name: 'Azure Vector Search', repo: 'Azure-Samples/azure-vector-search-mongodb', branch: 'main', product: 'azure-vector', version: 'latest', priority: 4 },
        { type: 'github', name: 'MongoDB ADK Agents', repo: 'mongodb/chatbot-adk-agents', branch: 'main', product: 'adk-agents', version: 'latest', priority: 4 },
        { type: 'github', name: 'MCP Multi-agents', repo: 'modelcontextprotocol/mcp-multiagents', branch: 'main', product: 'mcp-agents', version: 'latest', priority: 4 },
        { type: 'github', name: 'Laravel MongoDB Starter', repo: 'mongodb-labs/laravel-mongodb-starter', branch: 'main', product: 'laravel', version: 'latest', priority: 3 },
        // === TIER 2: VOYAGE AI ===
        { type: 'github', name: 'Voyage Python SDK', repo: 'voyage-ai/voyageai-python', branch: 'main', product: 'voyage', version: 'latest', priority: 5 },
        { type: 'github', name: 'Voyage TypeScript SDK', repo: 'voyage-ai/typescript-sdk', branch: 'main', product: 'voyage-ts', version: 'latest', priority: 5 },
        { type: 'github', name: 'Voyage Python Client', repo: 'voyage-ai/voyage-python-client', branch: 'main', product: 'voyage-client', version: 'latest', priority: 4 },
        { type: 'github', name: 'Voyage Multimodal 3', repo: 'voyage-ai/multimodal-3', branch: 'main', product: 'voyage-mm3', version: 'latest', priority: 4 },
        { type: 'github', name: 'Voyage Large 2 Instruct', repo: 'voyage-ai/large-2-instruct', branch: 'main', product: 'voyage-large2', version: 'latest', priority: 4 },
        { type: 'github', name: 'Voyage Lite 02 Instruct', repo: 'voyage-ai/lite-02-instruct', branch: 'main', product: 'voyage-lite', version: 'latest', priority: 3 },
        { type: 'github', name: 'Voyage AWS Integration', repo: 'voyage-ai/aws-examples', branch: 'main', product: 'voyage-aws', version: 'latest', priority: 3 },
        { type: 'github', name: 'Voyage LangChain', repo: 'voyage-ai/langchain-voyageai', branch: 'main', product: 'voyage-langchain', version: 'latest', priority: 3 },
        { type: 'github', name: 'Voyage OpenAPI Spec', repo: 'voyage-ai/openapi', branch: 'main', product: 'voyage-openapi', version: 'latest', priority: 3 },
        // === TIER 3: MONGODB CORE & CLOUD ===
        { type: 'github', name: 'MongoDB 8.0 Manual', repo: 'mongodb/docs', branch: 'master', product: 'manual', version: '8.0', priority: 5 },
        { type: 'github', name: 'MongoDB Ecosystem', repo: 'mongodb/docs-ecosystem', branch: 'master', product: 'ecosystem', version: 'latest', priority: 4 },
        { type: 'github', name: 'Atlas Vector Search Notebooks', repo: 'mongodb/docs-notebooks', branch: 'main', product: 'atlas-vector', version: 'latest', priority: 5 },
        { type: 'github', name: 'MongoDB Chatbot', repo: 'mongodb/chatbot', branch: 'main', product: 'atlas-chatbot', version: 'latest', priority: 5 },
        { type: 'github', name: 'Atlas CLI', repo: 'mongodb/docs-atlas-cli', branch: 'main', product: 'atlas-cli', version: 'latest', priority: 4 },
        { type: 'github', name: 'Atlas Terraform Provider', repo: 'mongodb/terraform-provider-mongodbatlas', branch: 'master', product: 'terraform', version: 'latest', priority: 3 },
        { type: 'github', name: 'Atlas Kubernetes Operator', repo: 'mongodb/mongodb-atlas-kubernetes', branch: 'main', product: 'k8s-atlas', version: 'latest', priority: 3 },
        // === TIER 4: LABS ===
        { type: 'github', name: 'Vector Search Lab', repo: 'mongodb-labs/vector-search-lab', branch: 'main', product: 'vector-lab', version: 'latest', priority: 5 },
        { type: 'github', name: 'Aggregation Pipeline Lab', repo: 'mongodb-labs/aggregation-pipeline-lab', branch: 'main', product: 'agg-lab', version: 'latest', priority: 4 },
        { type: 'github', name: 'Queryable Encryption', repo: 'mongodb-labs/queryable-encryption-mongodb', branch: 'main', product: 'encryption', version: 'latest', priority: 4 },
        { type: 'github', name: 'Secure RAG', repo: 'mongodb-labs/secure-rag-mongodb', branch: 'main', product: 'secure-rag', version: 'latest', priority: 4 },
        { type: 'github', name: 'Service Tests', repo: 'mongodb-labs/service-tests', branch: 'main', product: 'service-tests', version: 'latest', priority: 3 },
        { type: 'github', name: 'Java Showcase', repo: 'mongodb-labs/java-showcase', branch: 'main', product: 'java-showcase', version: 'latest', priority: 3 },
        { type: 'github', name: 'Spring Search Obsidian', repo: 'mongodb-labs/spring-search-obsidian', branch: 'main', product: 'spring-search', version: 'latest', priority: 3 },
        // === TIER 5: MONGODB DRIVERS ===
        { type: 'github', name: 'Motor Async Python', repo: 'mongodb/motor', branch: 'master', product: 'motor', version: 'latest', priority: 5 },
        { type: 'github', name: 'PyMongo', repo: 'mongodb/mongo-python-driver', branch: 'master', product: 'pymongo', version: 'latest', priority: 5 },
        { type: 'github', name: 'Node.js Driver', repo: 'mongodb/node-mongodb-native', branch: 'main', product: 'nodejs', version: 'latest', priority: 4 },
        { type: 'github', name: 'Java Driver', repo: 'mongodb/mongo-java-driver', branch: 'master', product: 'java', version: 'latest', priority: 4 },
        { type: 'github', name: 'Rust Driver', repo: 'mongodb/mongo-rust-driver', branch: 'main', product: 'rust', version: 'latest', priority: 3 },
        { type: 'github', name: 'C# Driver', repo: 'mongodb/mongo-csharp-driver', branch: 'master', product: 'csharp', version: 'latest', priority: 3 },
        { type: 'github', name: 'PHP Driver', repo: 'mongodb/mongo-php-driver', branch: 'master', product: 'php', version: 'latest', priority: 3 },
        { type: 'github', name: 'Ruby Driver', repo: 'mongodb/mongo-ruby-driver', branch: 'master', product: 'ruby', version: 'latest', priority: 3 },
        // === TIER 6: TOOLS & DEVOPS ===
        { type: 'github', name: 'MongoDB Shell', repo: 'mongodb/mongosh', branch: 'main', product: 'shell', version: 'latest', priority: 4 },
        { type: 'github', name: 'MongoDB Tools', repo: 'mongodb/mongo-tools', branch: 'master', product: 'tools', version: 'latest', priority: 3 },
        { type: 'github', name: 'Compass GUI', repo: 'mongodb/compass', branch: 'main', product: 'compass', version: 'latest', priority: 3 },
        { type: 'github', name: 'Kubernetes Operator', repo: 'mongodb/mongodb-kubernetes-operator', branch: 'master', product: 'k8s-operator', version: 'latest', priority: 3 },
        { type: 'github', name: 'Kafka Connector', repo: 'mongodb/mongo-kafka', branch: 'master', product: 'kafka', version: 'latest', priority: 3 },
        // === TIER 7: SPECIFICATIONS & COMMUNITY ===
        { type: 'github', name: 'MongoDB Specifications', repo: 'mongodb/specifications', branch: 'master', product: 'specs', version: 'latest', priority: 3 },
        { type: 'github', name: 'Academia Python Lab', repo: 'mongodb-labs/academia-python-lab', branch: 'main', product: 'academia', version: 'latest', priority: 2 },
        { type: 'github', name: 'Stream Processing Examples', repo: 'mongodb-developer/stream-processing-examples', branch: 'main', product: 'streaming', version: 'latest', priority: 2 }
    ];

    /**
     * Reads `MONGODB_URI` and `VOYAGE_API_KEY` from the environment. When either
     * is missing it prints setup instructions and terminates the process with
     * exit code 1.
     */
    constructor() {
        const mongoUri = process.env.MONGODB_URI;
        const voyageKey = process.env.VOYAGE_API_KEY;
        if (!mongoUri || !voyageKey) {
            console.error(colors.red('\n❌ Missing required environment variables!'));
            console.log(colors.yellow('\nPlease set:'));
            console.log(' export MONGODB_URI="your-mongodb-connection-string"');
            console.log(' export VOYAGE_API_KEY="your-voyage-api-key"');
            process.exit(1);
        }
        this.mongoClient = new MongoClient(mongoUri);
        this.voyageApiKey = voyageKey;
        this.state = createInitialState();
    }

    /**
     * Run the full indexing pipeline: connect, load the checkpoint, index every
     * source that has not completed yet (highest priority first) and print the
     * final report. A failure in one source is logged and does not stop the run.
     *
     * @returns {Promise<boolean>} `true` when the run reached the final report,
     *   `false` when a fatal error (e.g. the database connection) aborted it.
     */
    async run() {
        console.clear();
        console.log(colors.cyan(ASCII_LOGO));
        this.printIntro();

        // Short countdown before starting.
        console.log(colors.yellow('\n🎬 Starting in...'));
        for (let remaining = 3; remaining > 0; remaining--) {
            await sleep(1000);
            console.log(colors.yellow.bold(` ${remaining}...`));
        }
        console.log(colors.green.bold(' GO! 🚀\n'));

        this.startTime = new Date();
        const mainSpinner = ora();
        let succeeded = true;
        try {
            mainSpinner.start(colors.cyan('Connecting to MongoDB Atlas...'));
            await this.mongoClient.connect();
            this.db = this.mongoClient.db('mongodb_semantic_docs');
            this.collection = this.db.collection('documents');
            mainSpinner.succeed(colors.green('✅ Connected to MongoDB Atlas'));

            await this.loadState();
            await this.printDatabaseStatus();

            const sortedSources = [...this.SOURCES].sort((a, b) => b.priority - a.priority);
            for (const [index, source] of sortedSources.entries()) {
                if (this.state.completedRepos.includes(source.name)) {
                    console.log(colors.gray(`\n✓ Skipping ${source.name} (already indexed)`));
                    continue;
                }
                const progressHeader = boxen(
                    colors.white.bold(`📦 Source ${index + 1}/${sortedSources.length}: ${source.name}\n`) +
                        colors.gray(`Product: ${source.product} | Version: ${source.version} | Priority: ${'⭐'.repeat(source.priority)}`),
                    {
                        padding: 0,
                        margin: { top: 1, bottom: 0 },
                        borderStyle: 'round',
                        borderColor: source.priority >= 4 ? 'yellow' : 'gray'
                    }
                );
                console.log(progressHeader);
                try {
                    const { added, complete } = await this.indexSource(source, mainSpinner);
                    this.state.totalDocumentsIndexed += added;
                    // Only a clean pass marks the source as done; anything else is
                    // retried on the next run (unchanged files are skipped by hash).
                    if (complete) {
                        this.state.completedRepos.push(source.name);
                    }
                    await this.saveState();
                    if (complete) {
                        console.log(colors.green(` ✅ Completed: +${added} documents\n`));
                    } else {
                        console.log(colors.yellow(` ⚠️ Partially indexed: +${added} documents (will be retried on the next run)\n`));
                    }
                    const totalCount = await this.collection.countDocuments();
                    const progressBar = this.createProgressBar(totalCount, TARGET_DOCUMENTS);
                    console.log(colors.cyan(` Total Progress: ${progressBar} ${totalCount}/10,000`));
                    this.printMilestone(totalCount);
                } catch (error) {
                    console.error(colors.red(` ❌ Failed to index ${source.name}:`), error);
                }
            }
            await this.printFinalStats();
        } catch (error) {
            succeeded = false;
            mainSpinner.fail(colors.red('Process failed'));
            console.error('Error:', error);
        } finally {
            await this.mongoClient.close();
        }
        return succeeded;
    }

    /** Print the introduction box shown before indexing starts. */
    printIntro() {
        const introBox = boxen(
            colors.white.bold('🚀 Welcome to the ULTIMATE MongoDB Documentation Indexer!\n\n') +
                colors.green('What this tool does:\n') +
                colors.white(` • Indexes 10,000+ documents from ${this.SOURCES.length} repositories\n`) +
                colors.white(' • Includes ALL drivers, tools, and integrations\n') +
                colors.white(' • Perfect Voyage AI embeddings (voyage-context-3)\n') +
                colors.white(' • Smart chunking for optimal retrieval\n') +
                colors.white(' • Incremental updates (skip unchanged files)\n\n') +
                colors.yellow.bold('⏱️ Expected time: 45-90 minutes\n') +
                colors.cyan.bold('💎 Result: The BEST MongoDB semantic search ever!'),
            { padding: 1, margin: 1, borderStyle: 'double', borderColor: 'cyan', align: 'left' }
        );
        console.log(introBox);
    }

    /** Print the document count and checkpoint summary found at start-up. */
    async printDatabaseStatus() {
        const currentCount = await this.collection.countDocuments();
        console.log(colors.cyan('\n📊 Current Database Status:'));
        const statsTable = new Table({
            head: [colors.white.bold('Metric'), colors.white.bold('Value')],
            colWidths: [30, 20],
            style: { head: [], border: [] }
        });
        statsTable.push(
            ['Documents in database', colors.yellow(currentCount.toLocaleString())],
            ['Completed sources', colors.yellow(`${this.state.completedRepos.length}/${this.SOURCES.length}`)],
            ['Failed files', colors.red(this.state.failedFiles.length.toString())],
            ['Target documents', colors.green('10,000+')]
        );
        console.log(statsTable.toString());
    }

    /** Print a celebration line when `totalCount` has just crossed a milestone. */
    printMilestone(totalCount) {
        if (totalCount >= 10000 && totalCount < 10100) {
            console.log(colors.green.bold(`\n🎯 MILESTONE: ${totalCount.toLocaleString()} documents! Continuing for complete coverage...`));
        } else if (totalCount >= 25000 && totalCount < 25100) {
            console.log(colors.green.bold(`\n🔥 MILESTONE: ${totalCount.toLocaleString()} documents! Amazing progress!`));
        } else if (totalCount >= 50000 && totalCount < 50100) {
            console.log(colors.green.bold(`\n💎 MILESTONE: ${totalCount.toLocaleString()} documents! Ultimate coverage!`));
        }
    }

    /**
     * Dispatch a source to the indexer for its type.
     * @returns {Promise<{ added: number, complete: boolean }>}
     * @throws {Error} When `source.type` is not supported, so the source is
     *   reported as failed instead of being silently marked as completed.
     */
    async indexSource(source, spinner) {
        switch (source.type) {
            case 'github':
                return this.indexGitHubSource(source, spinner);
            case 'web':
                return this.indexWebSource(source, spinner);
            default:
                throw new Error(`Unsupported source type "${source.type}" for ${source.name}`);
        }
    }

    /**
     * Render a 20-cell text progress bar.
     * @param {number} current Progress so far.
     * @param {number} total Value that represents 100%.
     * @returns {string} e.g. `[██████████░░░░░░░░░░] 50%`. The percentage is
     *   clamped to 0-100, and a non-positive `total` renders as 0%.
     */
    createProgressBar(current, total) {
        const rawPercentage = total > 0 ? (current / total) * 100 : 0;
        const percentage = Number.isNaN(rawPercentage)
            ? 0
            : Math.min(100, Math.max(0, Math.round(rawPercentage)));
        const filled = Math.round(percentage / 5);
        const empty = 20 - filled;
        return `[${'█'.repeat(filled)}${'░'.repeat(empty)}] ${percentage}%`;
    }

    /**
     * Index a GitHub repository.
     * @param {object} source Entry of `SOURCES` with `type: 'github'`.
     * @param {object} spinner ora spinner used for status output.
     * @returns {Promise<number>} Number of files processed (found minus
     *   unchanged); 0 when the repository could not be cloned.
     */
    async indexGitHubRepo(source, spinner) {
        const { added } = await this.indexGitHubSource(source, spinner);
        return added;
    }

    /**
     * Implementation behind `indexGitHubRepo` that also reports whether every
     * file was read and every chunk stored, so `run()` can decide whether the
     * source may be marked as completed.
     * @returns {Promise<{ added: number, complete: boolean }>}
     */
    async indexGitHubSource(source, spinner) {
        const repoPath = path.join('mongodb-docs', source.repo.replace('/', '_'), source.branch);

        spinner.start(colors.cyan(` 🔄 Cloning/updating ${source.repo}...`));
        try {
            await this.cloneOrUpdateRepo(source.repo, source.branch, repoPath);
            spinner.succeed(colors.green(` ✅ Repository ready`));
        } catch (error) {
            spinner.fail(colors.red(` ❌ Failed to clone ${source.repo}`));
            console.error(colors.red(` ${error.message}`));
            return { added: 0, complete: false };
        }

        spinner.start(colors.cyan(' 🔍 Scanning for documentation files...'));
        const files = await this.findDocFiles(repoPath);
        spinner.succeed(colors.green(` ✅ Found ${files.length} documentation files`));
        if (files.length === 0) {
            return { added: 0, complete: true };
        }

        const progressBar = new cliProgress.SingleBar({
            format: colors.cyan(' Processing') + ' |' + colors.cyan('{bar}') + '| {percentage}% | {value}/{total} files | ETA: {eta}s',
            barCompleteChar: '\u2588',
            barIncompleteChar: '\u2591',
            hideCursor: true,
            stopOnComplete: true,
            clearOnComplete: false
        }, cliProgress.Presets.shades_classic);
        progressBar.start(files.length, 0);

        let pendingChunks = [];
        let pendingHashes = new Map();
        let skipped = 0;
        let complete = true;

        // Embed and store everything gathered so far. Content hashes are only
        // committed (and checkpointed) once all of their chunks were stored, so a
        // failed batch or an interrupted run is retried instead of being skipped
        // as "unchanged".
        const flushPending = async () => {
            const chunksToStore = pendingChunks;
            const hashesToCommit = pendingHashes;
            pendingChunks = [];
            pendingHashes = new Map();
            const stored = await this.indexChunks(chunksToStore);
            if (stored !== chunksToStore.length) {
                complete = false;
                return;
            }
            for (const [file, hash] of hashesToCommit) {
                this.state.contentHashes[file] = hash;
            }
            await this.saveState();
        };

        for (const [index, file] of files.entries()) {
            try {
                const content = await fs.readFile(file, 'utf-8');
                const hash = md5(content);
                if (this.state.contentHashes[file] === hash) {
                    skipped++;
                    continue;
                }
                const fileChunks = await this.processDocFile(file, source, content);
                for (const chunk of fileChunks) {
                    pendingChunks.push(chunk);
                }
                pendingHashes.set(file, hash);
                if (pendingChunks.length >= FLUSH_THRESHOLD) {
                    await flushPending();
                }
            } catch (error) {
                complete = false;
                if (!this.state.failedFiles.includes(file)) {
                    this.state.failedFiles.push(file);
                }
            } finally {
                progressBar.update(index + 1);
            }
        }
        progressBar.stop();

        if (skipped > 0) {
            console.log(colors.gray(` ℹ️ Skipped ${skipped} unchanged files`));
        }
        // Also runs when no chunks are pending, to commit the hashes of files
        // that produced no chunks.
        await flushPending();
        return { added: files.length - skipped, complete };
    }

    /**
     * Index a web page.
     * @param {object} source Source with `url`, `name`, `product` and `version`.
     * @param {object} spinner ora spinner used for status output.
     * @returns {Promise<number>} Number of sections extracted; 0 on failure.
     */
    async indexWebPage(source, spinner) {
        const { added } = await this.indexWebSource(source, spinner);
        return added;
    }

    /**
     * Implementation behind `indexWebPage` that also reports completeness.
     * @returns {Promise<{ added: number, complete: boolean }>}
     */
    async indexWebSource(source, spinner) {
        spinner.start(colors.cyan(` 🌐 Fetching ${source.name}...`));
        try {
            const response = await axios.get(source.url, { timeout: 30000 });
            const $ = cheerio.load(response.data);
            const sections = [];
            $('article, .documentation, .content, main, .doc-content, .markdown-body').each((_, elem) => {
                const text = $(elem).text().trim();
                const title = $(elem).find('h1, h2').first().text() || source.name;
                if (text.length <= 100) {
                    return;
                }
                this.splitIntoChunks(text, WEB_CHUNK_LENGTH).forEach((chunk, idx) => {
                    sections.push({
                        content: chunk,
                        title: `${title} (Part ${idx + 1})`,
                        url: source.url,
                        product: source.product,
                        version: source.version,
                        sourceType: 'web',
                        // Without an id every section would be upserted onto the
                        // same `{ documentId: undefined }` filter.
                        documentId: buildDocumentId(source, chunk)
                    });
                });
            });
            spinner.succeed(colors.green(` ✅ Extracted ${sections.length} sections`));
            const stored = await this.indexChunks(sections);
            return { added: sections.length, complete: stored === sections.length };
        } catch (error) {
            spinner.fail(colors.red(` ❌ Failed to fetch ${source.url}`));
            return { added: 0, complete: false };
        }
    }

    /**
     * Split text into chunks of at most `maxLength` characters, breaking at
     * sentence boundaries where possible.
     *
     * Trailing text without terminal punctuation is kept, and a single sentence
     * longer than `maxLength` is cut into `maxLength`-sized pieces.
     *
     * @param {string} text Text to split.
     * @param {number} maxLength Maximum chunk length; a value that is not a
     *   number >= 1 disables the limit.
     * @returns {string[]} Trimmed, non-empty chunks in document order.
     */
    splitIntoChunks(text, maxLength) {
        const limit = Number.isFinite(maxLength) && maxLength >= 1 ? Math.floor(maxLength) : Infinity;
        const sentences = text.match(SENTENCE_PATTERN) || [text];
        const chunks = [];
        let currentChunk = '';
        const pushCurrent = () => {
            const trimmed = currentChunk.trim();
            if (trimmed.length > 0) {
                chunks.push(trimmed);
            }
            currentChunk = '';
        };
        for (const sentence of sentences) {
            if (currentChunk.length > 0 && currentChunk.length + sentence.length > limit) {
                pushCurrent();
            }
            if (sentence.length > limit) {
                for (let start = 0; start < sentence.length; start += limit) {
                    const piece = sentence.slice(start, start + limit).trim();
                    if (piece.length > 0) {
                        chunks.push(piece);
                    }
                }
                continue;
            }
            currentChunk += currentChunk.length > 0 ? ` ${sentence}` : sentence;
        }
        pushCurrent();
        return chunks;
    }

    /**
     * Split one documentation or source file into indexable chunks.
     *
     * - `.rst` / `.txt`: split on heading underline lines (`===`, `---`, `***`).
     * - `.md` / `.markdown`: split on level 1-3 headings.
     * - code files: extract doc-block and line comments plus triple-quoted strings.
     *
     * Sections and comments of 50 characters or fewer are ignored.
     *
     * @param {string} filePath Path of the file.
     * @param {object} source Source the file belongs to (`product`, `version`).
     * @param {string} [content] File content when the caller already read it;
     *   read from disk when omitted.
     * @returns {Promise<object[]>} Chunk objects ready for `indexChunks`.
     */
    async processDocFile(filePath, source, content) {
        const text = content ?? await fs.readFile(filePath, 'utf-8');
        const ext = path.extname(filePath);
        const toChunk = (body, hashInput, title, extra = {}) => ({
            content: body,
            sourceFile: filePath,
            product: source.product,
            version: source.version,
            title,
            documentId: buildDocumentId(source, hashInput),
            ...extra
        });

        let sectionBreak = null;
        if (RST_EXTENSIONS.has(ext)) {
            sectionBreak = RST_SECTION_BREAK;
        } else if (MARKDOWN_EXTENSIONS.has(ext)) {
            sectionBreak = MARKDOWN_SECTION_BREAK;
        }
        if (sectionBreak) {
            return text
                .split(sectionBreak)
                .filter((section) => section.trim().length > 50)
                .map((section) => {
                    const title = section.split('\n')[0].trim() || path.basename(filePath, ext);
                    // The id hashes the untrimmed section to stay compatible with
                    // documents indexed by earlier versions.
                    return toChunk(section.trim(), section, title);
                });
        }
        if (CODE_EXTENSIONS.has(ext)) {
            const codeComments = text.match(CODE_COMMENT_PATTERN) || [];
            const docstrings = text.match(DOCSTRING_PATTERN) || [];
            return [...codeComments, ...docstrings]
                .filter((doc) => doc.length > 50)
                .map((doc) => toChunk(doc, doc, `Code Documentation: ${path.basename(filePath)}`, {
                    codeLanguage: ext.substring(1)
                }));
        }
        return [];
    }

    /**
     * Embed chunks with Voyage AI and upsert them into MongoDB.
     *
     * Chunks are sent in batches of `config.dynamicBatchSize` (capped at 32). A
     * batch that keeps failing is retried from the same position with half the
     * size, down to a minimum of 4; only then is it skipped.
     *
     * @param {object[]} chunks Chunks with at least `content` and `documentId`.
     *   Stored chunks are mutated: `embedding`, `embeddingModel`,
     *   `embeddingDimensions` and `indexedAt` are added.
     * @returns {Promise<number>} Number of chunks written to the collection.
     */
    async indexChunks(chunks) {
        if (chunks.length === 0) {
            return 0;
        }
        const configuredSize = Math.floor(Number(this.config.dynamicBatchSize));
        let batchSize = Number.isFinite(configuredSize) && configuredSize >= 1
            ? Math.min(configuredSize, MAX_CHUNKS_PER_REQUEST)
            : MIN_BATCH_SIZE;
        const maxRetries = this.config.enableSmartRetry ? 3 : 1;

        // The bar counts chunks rather than batches because the batch size can
        // change while the loop is running.
        const embeddingBar = new cliProgress.SingleBar({
            format: colors.magenta(' Embeddings') + ' |' + colors.magenta('{bar}') + '| {percentage}% | {value}/{total} chunks',
            barCompleteChar: '\u2588',
            barIncompleteChar: '\u2591',
            hideCursor: true,
            stopOnComplete: true,
            clearOnComplete: true
        }, cliProgress.Presets.shades_classic);
        embeddingBar.start(chunks.length, 0);

        let stored = 0;
        let offset = 0;
        try {
            while (offset < chunks.length) {
                const batch = chunks.slice(offset, offset + batchSize);
                const storedInBatch = await this.embedWithRetry(batch, maxRetries);
                if (storedInBatch === null && batch.length > MIN_BATCH_SIZE) {
                    // Retry the same chunks in smaller requests before giving up.
                    batchSize = Math.max(MIN_BATCH_SIZE, Math.floor(batch.length / 2));
                    console.error(colors.yellow(` 🔄 Reducing batch size to ${batchSize} for stability`));
                    continue;
                }
                stored += storedInBatch ?? 0;
                offset += batch.length;
                embeddingBar.update(offset);
                if (storedInBatch !== null) {
                    // Small pause between requests to respect Voyage AI rate limits.
                    await sleep(100);
                }
            }
        } finally {
            embeddingBar.stop();
        }
        return stored;
    }

    /**
     * Try to embed and store one batch, retrying with exponential backoff.
     * @returns {Promise<number | null>} Chunks stored, or null when every
     *   attempt failed.
     */
    async embedWithRetry(batch, maxRetries) {
        for (let attempt = 1; attempt <= maxRetries; attempt++) {
            try {
                return await this.embedAndStoreBatch(batch);
            } catch (error) {
                if (attempt >= maxRetries) {
                    console.error(colors.red(`\n ❌ Failed batch after ${maxRetries} retries: ${error.message}`));
                    return null;
                }
                console.error(colors.yellow(`\n ⚠️ Retry ${attempt}/${maxRetries}: ${error.message}`));
                await sleep(1000 * 2 ** (attempt - 1));
            }
        }
        return null;
    }

    /**
     * Request contextualized embeddings for one batch and upsert the chunks that
     * received a usable embedding.
     *
     * NOTE(review): all chunks of the batch are sent as a single inner list of
     * `inputs`, i.e. the API treats them as chunks of one document even when
     * they come from different files - confirm this is intended.
     *
     * @returns {Promise<number>} Number of chunks written.
     * @throws When the HTTP request or the bulk write fails.
     */
    async embedAndStoreBatch(batch) {
        const model = 'voyage-context-3';
        const texts = batch.map((chunk) => chunk.content.substring(0, MAX_EMBEDDING_INPUT_CHARS));
        const response = await axios.post(this.voyageContextualUrl, {
            inputs: [texts],
            input_type: 'document',
            model,
            output_dimension: this.VOYAGE_DIMENSIONS
        }, {
            headers: {
                'Authorization': `Bearer ${this.voyageApiKey}`,
                'Content-Type': 'application/json',
            },
            timeout: 60000,
        });

        const embeddings = response.data?.data?.[0]?.data ?? [];
        const indexedAt = new Date();
        const bulkOps = [];
        batch.forEach((chunk, idx) => {
            const rawEmbedding = embeddings[idx]?.embedding;
            const normalized = Array.isArray(rawEmbedding) ? normalizeVector(rawEmbedding) : null;
            if (!normalized) {
                return; // No usable embedding: leave the chunk out of the write.
            }
            chunk.embedding = normalized;
            chunk.embeddingModel = model;
            chunk.embeddingDimensions = rawEmbedding.length;
            chunk.indexedAt = indexedAt;
            bulkOps.push({
                updateOne: {
                    filter: { documentId: chunk.documentId },
                    update: { $set: chunk },
                    upsert: true
                }
            });
        });
        if (bulkOps.length > 0) {
            await this.collection.bulkWrite(bulkOps, { ordered: false });
        }
        return bulkOps.length;
    }

    /**
     * Clone `repo` into `targetPath`, or update the checkout that is already
     * there.
     *
     * The checkout is detected through `targetPath/.git`, so git is never run
     * in a directory that merely sits inside some other repository. Update
     * errors are propagated instead of falling back to a clone over the
     * existing directory.
     *
     * @param {string} repo GitHub repository in `owner/name` form.
     * @param {string} branch Branch to check out.
     * @param {string} targetPath Local checkout directory.
     * @throws When the update or the clone fails.
     */
    async cloneOrUpdateRepo(repo, branch, targetPath) {
        let hasCheckout = true;
        try {
            await fs.access(path.join(targetPath, '.git'));
        } catch {
            hasCheckout = false;
        }
        if (hasCheckout) {
            const git = simpleGit(targetPath);
            await git.checkout(branch);
            await git.pull('origin', branch);
            return;
        }
        await fs.mkdir(path.dirname(targetPath), { recursive: true });
        await simpleGit().clone(`https://github.com/${repo}.git`, targetPath, ['--branch', branch, '--depth', '1', '--single-branch']);
    }

    /**
     * Recursively collect documentation and source files below `dir`.
     * @param {string} dir Directory to scan.
     * @returns {Promise<string[]>} Paths of files with an indexed extension.
     *   Directories that cannot be read are skipped.
     */
    async findDocFiles(dir) {
        const files = [];
        const walk = async (currentDir) => {
            let entries;
            try {
                entries = await fs.readdir(currentDir, { withFileTypes: true });
            } catch {
                return; // Deliberate best effort: ignore inaccessible directories.
            }
            for (const entry of entries) {
                const fullPath = path.join(currentDir, entry.name);
                if (entry.isDirectory()) {
                    if (!SKIPPED_DIRECTORIES.has(entry.name)) {
                        await walk(fullPath);
                    }
                } else if (entry.isFile() && INDEXED_EXTENSIONS.has(path.extname(entry.name))) {
                    files.push(fullPath);
                }
            }
        };
        await walk(dir);
        return files;
    }

    /**
     * Load the checkpoint from `indexing-state.json`. A missing file starts a
     * fresh run; an unreadable or malformed file is reported and ignored.
     * Missing or mistyped fields fall back to their defaults.
     */
    async loadState() {
        let saved;
        try {
            saved = JSON.parse(await fs.readFile(STATE_FILE, 'utf-8'));
        } catch (error) {
            if (error.code !== 'ENOENT') {
                console.log(colors.yellow(` ⚠️ Ignoring unreadable ${STATE_FILE}: ${error.message}`));
            }
            console.log(colors.yellow(' ℹ️ Starting fresh indexing (no previous state)'));
            return;
        }
        const defaults = createInitialState();
        const isRecord = (value) => value !== null && typeof value === 'object' && !Array.isArray(value);
        const savedRecord = isRecord(saved) ? saved : {};
        const checkpoint = new Date(savedRecord.lastCheckpoint);
        this.state = {
            ...savedRecord,
            completedRepos: Array.isArray(savedRecord.completedRepos) ? savedRecord.completedRepos : defaults.completedRepos,
            totalDocumentsIndexed: Number.isFinite(savedRecord.totalDocumentsIndexed) ? savedRecord.totalDocumentsIndexed : defaults.totalDocumentsIndexed,
            lastCheckpoint: Number.isNaN(checkpoint.getTime()) ? defaults.lastCheckpoint : checkpoint,
            failedFiles: Array.isArray(savedRecord.failedFiles) ? savedRecord.failedFiles : defaults.failedFiles,
            contentHashes: isRecord(savedRecord.contentHashes) ? savedRecord.contentHashes : defaults.contentHashes
        };
        console.log(colors.green(' ✅ Loaded previous indexing state'));
    }

    /**
     * Persist the checkpoint. The file is written to a temporary name and then
     * renamed so an interrupted write cannot leave a truncated state file.
     */
    async saveState() {
        this.state.lastCheckpoint = new Date();
        const temporaryFile = `${STATE_FILE}.tmp`;
        await fs.writeFile(temporaryFile, JSON.stringify(this.state, null, 2));
        await fs.rename(temporaryFile, STATE_FILE);
    }

    /**
     * Print the final report: totals, embedding-model distribution, a
     * per-product breakdown and a success or "run again" message.
     */
    async printFinalStats() {
        const [totalDocs, products, voyageContext3Count, voyage3Count, voyageCode3Count] = await Promise.all([
            this.collection.countDocuments(),
            this.collection.distinct('product'),
            this.collection.countDocuments({ embeddingModel: 'voyage-context-3' }),
            // Legacy model counts (documents indexed by earlier versions).
            this.collection.countDocuments({ embeddingModel: 'voyage-3' }),
            this.collection.countDocuments({ embeddingModel: 'voyage-code-3' })
        ]);
        const startedAt = this.startTime?.getTime() ?? Date.now();
        const elapsedMinutes = (Date.now() - startedAt) / 60000;
        const elapsed = Math.round(elapsedMinutes);
        // Runs shorter than a minute would otherwise divide by zero.
        const docsPerMinute = elapsedMinutes >= 1 ? Math.round(totalDocs / elapsedMinutes) : totalDocs;

        console.log('\n\n');
        console.log(colors.rainbow('═'.repeat(80)));
        console.log(colors.rainbow(' 🎉 INDEXING COMPLETE! 🎉 '));
        console.log(colors.rainbow('═'.repeat(80)));

        const finalStats = new Table({
            head: [colors.white.bold('Metric'), colors.white.bold('Value')],
            colWidths: [35, 25],
            style: { head: [], border: [] }
        });
        finalStats.push(
            ['Total Documents', colors.green.bold(totalDocs.toLocaleString())],
            ['Sources Processed', colors.green(`${this.state.completedRepos.length}/${this.SOURCES.length}`)],
            ['Products Indexed', colors.cyan(products.length.toString())],
            ['voyage-context-3 (2025) Documents', colors.green.bold(voyageContext3Count.toLocaleString())],
            ['voyage-3 (legacy) Documents', colors.blue(voyage3Count.toLocaleString())],
            ['voyage-code-3 (legacy) Documents', colors.magenta(voyageCode3Count.toLocaleString())],
            ['Failed Files', colors.yellow(this.state.failedFiles.length.toString())],
            ['Time Elapsed', colors.magenta(`${elapsed} minutes`)],
            ['Documents per Minute', colors.blue(docsPerMinute.toLocaleString())]
        );
        console.log(finalStats.toString());

        console.log(colors.cyan('\n📚 Documents by Product:'));
        const productTable = new Table({
            head: [colors.white.bold('Product'), colors.white.bold('Count')],
            colWidths: [25, 15]
        });
        const sortedProducts = [...products].sort();
        const productCounts = await Promise.all(
            sortedProducts.map((product) => this.collection.countDocuments({ product }))
        );
        sortedProducts.forEach((product, idx) => {
            productTable.push([product, colors.yellow(productCounts[idx].toLocaleString())]);
        });
        console.log(productTable.toString());

        if (totalDocs < TARGET_DOCUMENTS) {
            console.log(colors.yellow(`\n⚠️ Need ${(TARGET_DOCUMENTS - totalDocs).toLocaleString()} more documents to reach goal`));
            console.log(colors.cyan(' Run again to continue indexing remaining sources'));
            return;
        }
        const successBox = boxen(
            colors.green.bold('🚀 SUCCESS! You now have the BEST MongoDB semantic search!\n\n') +
                colors.white('✨ What you can do now:\n') +
                colors.white(' • Search ANY MongoDB concept instantly\n') +
                colors.white(' • Find code examples across ALL drivers\n') +
                colors.white(' • Discover integration patterns\n') +
                colors.white(' • Access Voyage AI embedding best practices\n\n') +
                colors.cyan.bold(`Total indexed: ${totalDocs.toLocaleString()} documents`),
            { padding: 1, margin: 1, borderStyle: 'double', borderColor: 'green', align: 'center' }
        );
        console.log(successBox);
        console.log(colors.rainbow('\n' + figlet.textSync('COMPLETE!', {
            font: 'Big',
            horizontalLayout: 'default',
            verticalLayout: 'default'
        })));
    }
}

// Run only when this file is the process entry point. `pathToFileURL` produces
// the same percent-encoded, platform-correct URL format as `import.meta.url`.
// NOTE(review): a launch through a symlink (e.g. an npm bin shim) may still not
// match, because `process.argv[1]` is not resolved to its real path here.
const entryPath = process.argv[1];
const isMainModule = Boolean(entryPath) &&
    [entryPath, `${entryPath}.ts`].some((candidate) => import.meta.url === pathToFileURL(candidate).href);

if (isMainModule) {
    console.log(colors.cyan('\n🔧 Setting up environment...'));
    // SECURITY: credentials are read from the environment / .env file only.
    // Earlier revisions shipped a hard-coded fallback connection string and API
    // key here; anything that was published that way must be treated as
    // compromised and rotated.
    const { MONGODB_URI: mongoUri, VOYAGE_API_KEY: voyageKey } = process.env;
    if (mongoUri) {
        console.log(colors.green(`✅ Using MongoDB: ${mongoUri.split('@')[1]?.split('/')[0] || 'Unknown'}`));
    }
    if (voyageKey) {
        // Never echo any part of the key itself.
        console.log(colors.green('✅ Voyage API key loaded from environment'));
    }
    // The constructor prints setup instructions and exits when a variable is missing.
    const indexer = new UltimateMongoDBIndexer();
    indexer.run()
        .then((succeeded) => {
            if (succeeded === false) {
                console.error(colors.red('\n❌ Process finished with errors'));
                process.exit(1);
            }
            console.log(colors.green('\n✅ Process completed successfully!'));
            process.exit(0);
        })
        .catch((error) => {
            console.error(colors.red('\n❌ Fatal error:'), error);
            process.exit(1);
        });
}

export { UltimateMongoDBIndexer };
//# sourceMappingURL=index-docs.js.map