mongodocs-mcp
Version:
Lightning-fast semantic search for MongoDB documentation via Model Context Protocol. 10,000+ documents, <500ms search.
714 lines • 40.2 kB
JavaScript
/**
* ULTIMATE MONGODB + VOYAGE AI DOCUMENTATION INDEXER
*
* The most comprehensive MongoDB documentation indexing tool ever created.
* Targets 10,000+ documents from the repositories listed in SOURCES, embedded with Voyage AI.
*
* Why this takes time but is WORTH IT:
* - Indexes ALL MongoDB documentation (not just the basics)
* - Perfect embeddings with Voyage AI's latest models
* - Smart chunking for optimal retrieval
* - Incremental updates to save time on reruns
* - Beautiful progress tracking so you know it's working
*/
import { MongoClient } from 'mongodb';
import axios from 'axios';
import ora from 'ora';
import dotenv from 'dotenv';
import { simpleGit } from 'simple-git';
import * as fs from 'fs/promises';
import * as path from 'path';
import { createHash } from 'crypto';
import cliProgress from 'cli-progress';
import colors from 'colors';
import * as cheerio from 'cheerio';
import figlet from 'figlet';
import boxen from 'boxen';
import Table from 'cli-table3';
// Load variables from a local .env file (if present) into process.env before
// any code below reads MONGODB_URI / VOYAGE_API_KEY.
dotenv.config();
// Banner text printed (in cyan) by UltimateMongoDBIndexer.run() right after the
// console is cleared. It is a template literal, so every character between the
// backticks — including the leading and trailing newline — is emitted as-is.
const ASCII_LOGO = `
███╗ ███╗ ██████╗ ███╗ ██╗ ██████╗ ██████╗ ██████╗ ██████╗
████╗ ████║██╔═══██╗████╗ ██║██╔════╝ ██╔═══██╗██╔══██╗██╔══██╗
██╔████╔██║██║ ██║██╔██╗ ██║██║ ███╗██║ ██║██║ ██║██████╔╝
██║╚██╔╝██║██║ ██║██║╚██╗██║██║ ██║██║ ██║██║ ██║██╔══██╗
██║ ╚═╝ ██║╚██████╔╝██║ ╚████║╚██████╔╝╚██████╔╝██████╔╝██████╔╝
╚═╝ ╚═╝ ╚═════╝ ╚═╝ ╚═══╝ ╚═════╝ ╚═════╝ ╚═════╝ ╚═════╝
ULTIMATE DOCUMENTATION INDEXER
`;
/**
 * Indexes documentation from the sources listed in `SOURCES` into the
 * `documents` collection of the `mongodb_semantic_docs` database, attaching a
 * normalized Voyage AI contextualized embedding to every stored chunk.
 *
 * Progress is checkpointed to `indexing-state.json` in the current working
 * directory: sources that completed are skipped on the next run, and files
 * whose MD5 content hash is unchanged are skipped inside a source.
 */
class UltimateMongoDBIndexer {
  /** MongoDB driver client, created in the constructor and closed at the end of run(). */
  mongoClient;
  /** Bearer token sent to the Voyage AI API. */
  voyageApiKey;
  voyageContextualUrl = 'https://api.voyageai.com/v1/contextualizedembeddings';
  /** Value sent as `output_dimension` with every embedding request. */
  VOYAGE_DIMENSIONS = 2048;
  db;
  collection;
  /** Checkpoint data; see createInitialState() for its shape. */
  state;
  startTime;
  // Processing configuration. Only `dynamicBatchSize` and `enableSmartRetry`
  // are read by the methods of this class; the remaining keys are kept for
  // compatibility with anything that inspects `config` from outside.
  config = {
    maxParallelRepos: 3, // LightRAG-inspired parallel processing
    maxAsyncEmbeddings: 8, // Concurrent embedding requests
    dynamicBatchSize: 16, // RAGFlow-inspired dynamic batching
    memoryThreshold: 0.8, // Memory usage threshold
    chunkingStrategy: 'semantic', // RAG_Techniques semantic chunking
    enableSmartRetry: true // Intelligent error recovery
  };
  // Sources to index, grouped by tier. run() processes them in descending
  // `priority` order, so the order within this list only breaks ties.
  SOURCES = [
    // === TIER 1: GENAI & RAG EXCELLENCE (The Future!) ===
    { type: 'github', name: 'GenAI Showcase', repo: 'mongodb-developer/GenAI-Showcase', branch: 'main', product: 'genai', version: 'latest', priority: 5 },
    { type: 'github', name: 'GenAI DevDay Notebooks', repo: 'mongodb-developer/genai-devday-notebooks', branch: 'main', product: 'genai-devday', version: 'latest', priority: 5 },
    { type: 'github', name: 'Multimodal Agents Lab', repo: 'mongodb-labs/multimodal-agents-lab', branch: 'main', product: 'multimodal', version: 'latest', priority: 5 },
    { type: 'github', name: 'LangChain MongoDB', repo: 'langchain-ai/langchain-mongodb', branch: 'main', product: 'langchain', version: 'latest', priority: 5 },
    { type: 'github', name: 'MongoDB with FastAPI', repo: 'mongodb-developer/mongodb-with-fastapi', branch: 'main', product: 'fastapi', version: 'latest', priority: 5 },
    { type: 'github', name: 'Azure Vector Search', repo: 'Azure-Samples/azure-vector-search-mongodb', branch: 'main', product: 'azure-vector', version: 'latest', priority: 4 },
    { type: 'github', name: 'MongoDB ADK Agents', repo: 'mongodb/chatbot-adk-agents', branch: 'main', product: 'adk-agents', version: 'latest', priority: 4 },
    { type: 'github', name: 'MCP Multi-agents', repo: 'modelcontextprotocol/mcp-multiagents', branch: 'main', product: 'mcp-agents', version: 'latest', priority: 4 },
    { type: 'github', name: 'Laravel MongoDB Starter', repo: 'mongodb-labs/laravel-mongodb-starter', branch: 'main', product: 'laravel', version: 'latest', priority: 3 },
    // === TIER 2: COMPLETE VOYAGE AI INTEGRATION ===
    { type: 'github', name: 'Voyage Python SDK', repo: 'voyage-ai/voyageai-python', branch: 'main', product: 'voyage', version: 'latest', priority: 5 },
    { type: 'github', name: 'Voyage TypeScript SDK', repo: 'voyage-ai/typescript-sdk', branch: 'main', product: 'voyage-ts', version: 'latest', priority: 5 },
    { type: 'github', name: 'Voyage Python Client', repo: 'voyage-ai/voyage-python-client', branch: 'main', product: 'voyage-client', version: 'latest', priority: 4 },
    { type: 'github', name: 'Voyage Multimodal 3', repo: 'voyage-ai/multimodal-3', branch: 'main', product: 'voyage-mm3', version: 'latest', priority: 4 },
    { type: 'github', name: 'Voyage Large 2 Instruct', repo: 'voyage-ai/large-2-instruct', branch: 'main', product: 'voyage-large2', version: 'latest', priority: 4 },
    { type: 'github', name: 'Voyage Lite 02 Instruct', repo: 'voyage-ai/lite-02-instruct', branch: 'main', product: 'voyage-lite', version: 'latest', priority: 3 },
    { type: 'github', name: 'Voyage AWS Integration', repo: 'voyage-ai/aws-examples', branch: 'main', product: 'voyage-aws', version: 'latest', priority: 3 },
    { type: 'github', name: 'Voyage LangChain', repo: 'voyage-ai/langchain-voyageai', branch: 'main', product: 'voyage-langchain', version: 'latest', priority: 3 },
    { type: 'github', name: 'Voyage OpenAPI Spec', repo: 'voyage-ai/openapi', branch: 'main', product: 'voyage-openapi', version: 'latest', priority: 3 },
    // === TIER 3: MONGODB CORE & CLOUD ===
    { type: 'github', name: 'MongoDB 8.0 Manual', repo: 'mongodb/docs', branch: 'master', product: 'manual', version: '8.0', priority: 5 },
    { type: 'github', name: 'MongoDB Ecosystem', repo: 'mongodb/docs-ecosystem', branch: 'master', product: 'ecosystem', version: 'latest', priority: 4 },
    { type: 'github', name: 'Atlas Vector Search Notebooks', repo: 'mongodb/docs-notebooks', branch: 'main', product: 'atlas-vector', version: 'latest', priority: 5 },
    { type: 'github', name: 'MongoDB Chatbot', repo: 'mongodb/chatbot', branch: 'main', product: 'atlas-chatbot', version: 'latest', priority: 5 },
    { type: 'github', name: 'Atlas CLI', repo: 'mongodb/docs-atlas-cli', branch: 'main', product: 'atlas-cli', version: 'latest', priority: 4 },
    { type: 'github', name: 'Atlas Terraform Provider', repo: 'mongodb/terraform-provider-mongodbatlas', branch: 'master', product: 'terraform', version: 'latest', priority: 3 },
    { type: 'github', name: 'Atlas Kubernetes Operator', repo: 'mongodb/mongodb-atlas-kubernetes', branch: 'main', product: 'k8s-atlas', version: 'latest', priority: 3 },
    // === TIER 4: CUTTING-EDGE LABS ===
    { type: 'github', name: 'Vector Search Lab', repo: 'mongodb-labs/vector-search-lab', branch: 'main', product: 'vector-lab', version: 'latest', priority: 5 },
    { type: 'github', name: 'Aggregation Pipeline Lab', repo: 'mongodb-labs/aggregation-pipeline-lab', branch: 'main', product: 'agg-lab', version: 'latest', priority: 4 },
    { type: 'github', name: 'Queryable Encryption', repo: 'mongodb-labs/queryable-encryption-mongodb', branch: 'main', product: 'encryption', version: 'latest', priority: 4 },
    { type: 'github', name: 'Secure RAG', repo: 'mongodb-labs/secure-rag-mongodb', branch: 'main', product: 'secure-rag', version: 'latest', priority: 4 },
    { type: 'github', name: 'Service Tests', repo: 'mongodb-labs/service-tests', branch: 'main', product: 'service-tests', version: 'latest', priority: 3 },
    { type: 'github', name: 'Java Showcase', repo: 'mongodb-labs/java-showcase', branch: 'main', product: 'java-showcase', version: 'latest', priority: 3 },
    { type: 'github', name: 'Spring Search Obsidian', repo: 'mongodb-labs/spring-search-obsidian', branch: 'main', product: 'spring-search', version: 'latest', priority: 3 },
    // === TIER 5: ALL MONGODB DRIVERS ===
    { type: 'github', name: 'Motor Async Python', repo: 'mongodb/motor', branch: 'master', product: 'motor', version: 'latest', priority: 5 },
    { type: 'github', name: 'PyMongo', repo: 'mongodb/mongo-python-driver', branch: 'master', product: 'pymongo', version: 'latest', priority: 5 },
    { type: 'github', name: 'Node.js Driver', repo: 'mongodb/node-mongodb-native', branch: 'main', product: 'nodejs', version: 'latest', priority: 4 },
    { type: 'github', name: 'Java Driver', repo: 'mongodb/mongo-java-driver', branch: 'master', product: 'java', version: 'latest', priority: 4 },
    { type: 'github', name: 'Rust Driver', repo: 'mongodb/mongo-rust-driver', branch: 'main', product: 'rust', version: 'latest', priority: 3 },
    { type: 'github', name: 'C# Driver', repo: 'mongodb/mongo-csharp-driver', branch: 'master', product: 'csharp', version: 'latest', priority: 3 },
    { type: 'github', name: 'PHP Driver', repo: 'mongodb/mongo-php-driver', branch: 'master', product: 'php', version: 'latest', priority: 3 },
    { type: 'github', name: 'Ruby Driver', repo: 'mongodb/mongo-ruby-driver', branch: 'master', product: 'ruby', version: 'latest', priority: 3 },
    // === TIER 6: TOOLS & DEVOPS ===
    { type: 'github', name: 'MongoDB Shell', repo: 'mongodb/mongosh', branch: 'main', product: 'shell', version: 'latest', priority: 4 },
    { type: 'github', name: 'MongoDB Tools', repo: 'mongodb/mongo-tools', branch: 'master', product: 'tools', version: 'latest', priority: 3 },
    { type: 'github', name: 'Compass GUI', repo: 'mongodb/compass', branch: 'main', product: 'compass', version: 'latest', priority: 3 },
    { type: 'github', name: 'Kubernetes Operator', repo: 'mongodb/mongodb-kubernetes-operator', branch: 'master', product: 'k8s-operator', version: 'latest', priority: 3 },
    { type: 'github', name: 'Kafka Connector', repo: 'mongodb/mongo-kafka', branch: 'master', product: 'kafka', version: 'latest', priority: 3 },
    // === TIER 7: SPECIFICATIONS & COMMUNITY ===
    { type: 'github', name: 'MongoDB Specifications', repo: 'mongodb/specifications', branch: 'master', product: 'specs', version: 'latest', priority: 3 },
    { type: 'github', name: 'Academia Python Lab', repo: 'mongodb-labs/academia-python-lab', branch: 'main', product: 'academia', version: 'latest', priority: 2 },
    { type: 'github', name: 'Stream Processing Examples', repo: 'mongodb-developer/stream-processing-examples', branch: 'main', product: 'streaming', version: 'latest', priority: 2 }
  ];

  /**
   * Reads MONGODB_URI and VOYAGE_API_KEY from the environment. If either is
   * missing, prints setup instructions and terminates the process with code 1.
   */
  constructor() {
    const mongoUri = process.env.MONGODB_URI;
    const voyageKey = process.env.VOYAGE_API_KEY;
    if (!mongoUri || !voyageKey) {
      console.error(colors.red('\n❌ Missing required environment variables!'));
      console.log(colors.yellow('\nPlease set:'));
      console.log(' export MONGODB_URI="your-mongodb-connection-string"');
      console.log(' export VOYAGE_API_KEY="your-voyage-api-key"');
      process.exit(1);
    }
    this.mongoClient = new MongoClient(mongoUri);
    this.voyageApiKey = voyageKey;
    this.state = UltimateMongoDBIndexer.createInitialState();
  }

  /** Resolve after `ms` milliseconds. */
  static sleep(ms) {
    return new Promise((resolve) => setTimeout(resolve, ms));
  }

  /** Build the checkpoint state used when no previous state is available. */
  static createInitialState() {
    return {
      completedRepos: [],
      totalDocumentsIndexed: 0,
      lastCheckpoint: new Date(),
      failedFiles: [],
      contentHashes: {}
    };
  }

  /**
   * Turn parsed checkpoint JSON into a complete state object.
   *
   * Missing or wrongly-typed fields fall back to their initial values so the
   * rest of the class can rely on `completedRepos`/`failedFiles` being arrays
   * and `contentHashes` being an object. `lastCheckpoint` is revived from its
   * JSON string form into a Date. Unknown keys are preserved.
   *
   * @param {unknown} raw - Result of JSON.parse on the state file.
   * @returns {object} A state object with every expected field present.
   */
  static normalizeState(raw) {
    const defaults = UltimateMongoDBIndexer.createInitialState();
    const isPlainObject = (value) => value !== null && typeof value === 'object' && !Array.isArray(value);
    if (!isPlainObject(raw)) {
      return defaults;
    }
    const checkpoint = new Date(raw.lastCheckpoint);
    return {
      ...raw,
      completedRepos: Array.isArray(raw.completedRepos) ? raw.completedRepos : defaults.completedRepos,
      totalDocumentsIndexed: Number.isFinite(raw.totalDocumentsIndexed) ? raw.totalDocumentsIndexed : 0,
      lastCheckpoint: Number.isNaN(checkpoint.getTime()) ? defaults.lastCheckpoint : checkpoint,
      failedFiles: Array.isArray(raw.failedFiles) ? raw.failedFiles : defaults.failedFiles,
      contentHashes: isPlainObject(raw.contentHashes) ? raw.contentHashes : defaults.contentHashes
    };
  }

  /**
   * Run the whole indexing pipeline: banner, connect, load checkpoint, index
   * every source not yet completed (highest priority first), print statistics.
   *
   * A failure inside one source is logged and the loop continues with the next
   * source; that source is not marked completed and will be retried on the
   * next run. A failure outside the per-source loop (for example the MongoDB
   * connection) is logged and then rethrown so that callers can report it and
   * set a non-zero exit code. The MongoDB client is always closed.
   *
   * @returns {Promise<void>}
   * @throws Whatever error aborted the run outside the per-source handling.
   */
  async run() {
    console.clear();
    console.log(colors.cyan(ASCII_LOGO));
    const introBox = boxen(colors.white.bold('🚀 Welcome to the ULTIMATE MongoDB Documentation Indexer!\n\n') +
      colors.green('What this tool does:\n') +
      colors.white(' • Indexes 10,000+ documents from 35+ MongoDB repos\n') +
      colors.white(' • Includes ALL drivers, tools, and integrations\n') +
      colors.white(' • Perfect Voyage AI embeddings (voyage-context-3)\n') +
      colors.white(' • Smart chunking for optimal retrieval\n') +
      colors.white(' • Incremental updates (skip unchanged files)\n\n') +
      colors.yellow.bold('⏱️ Expected time: 45-90 minutes\n') +
      colors.cyan.bold('💎 Result: The BEST MongoDB semantic search ever!'), {
      padding: 1,
      margin: 1,
      borderStyle: 'double',
      borderColor: 'cyan',
      align: 'left'
    });
    console.log(introBox);
    // Three-second countdown before any work starts.
    console.log(colors.yellow('\n🎬 Starting in...'));
    for (let i = 3; i > 0; i--) {
      await UltimateMongoDBIndexer.sleep(1000);
      console.log(colors.yellow.bold(` ${i}...`));
    }
    console.log(colors.green.bold(' GO! 🚀\n'));
    this.startTime = new Date();
    const mainSpinner = ora();
    try {
      mainSpinner.start(colors.cyan('Connecting to MongoDB Atlas...'));
      await this.mongoClient.connect();
      this.db = this.mongoClient.db('mongodb_semantic_docs');
      this.collection = this.db.collection('documents');
      mainSpinner.succeed(colors.green('✅ Connected to MongoDB Atlas'));
      await this.loadState();
      const currentCount = await this.collection.countDocuments();
      console.log(colors.cyan('\n📊 Current Database Status:'));
      const statsTable = new Table({
        head: [colors.white.bold('Metric'), colors.white.bold('Value')],
        colWidths: [30, 20],
        style: { head: [], border: [] }
      });
      statsTable.push(['Documents in database', colors.yellow(currentCount.toLocaleString())], ['Completed sources', colors.yellow(`${this.state.completedRepos.length}/${this.SOURCES.length}`)], ['Failed files', colors.red(this.state.failedFiles.length.toString())], ['Target documents', colors.green('10,000+')]);
      console.log(statsTable.toString());
      // Copy before sorting so the SOURCES field keeps its declared order.
      const sortedSources = [...this.SOURCES].sort((a, b) => b.priority - a.priority);
      for (const [index, source] of sortedSources.entries()) {
        if (this.state.completedRepos.includes(source.name)) {
          console.log(colors.gray(`\n✓ Skipping ${source.name} (already indexed)`));
          continue;
        }
        const progressHeader = boxen(colors.white.bold(`📦 Source ${index + 1}/${sortedSources.length}: ${source.name}\n`) +
          colors.gray(`Product: ${source.product} | Version: ${source.version} | Priority: ${'⭐'.repeat(source.priority)}`), {
          padding: 0,
          margin: { top: 1, bottom: 0 },
          borderStyle: 'round',
          borderColor: source.priority >= 4 ? 'yellow' : 'gray'
        });
        console.log(progressHeader);
        try {
          const documentsAdded = await this.indexSource(source, mainSpinner);
          // Only reached when the source did not throw, so a failed clone or
          // fetch is never recorded as completed.
          this.state.completedRepos.push(source.name);
          this.state.totalDocumentsIndexed += documentsAdded;
          await this.saveState();
          console.log(colors.green(` ✅ Completed: +${documentsAdded} documents\n`));
          const totalCount = await this.collection.countDocuments();
          const progressBar = this.createProgressBar(totalCount, 10000);
          console.log(colors.cyan(` Total Progress: ${progressBar} ${totalCount}/10,000`));
          // Each milestone is announced only within a 100-document window so
          // it is not repeated after every subsequent source.
          if (totalCount >= 10000 && totalCount < 10100) {
            console.log(colors.green.bold(`\n🎯 MILESTONE: ${totalCount.toLocaleString()} documents! Continuing for complete coverage...`));
          }
          else if (totalCount >= 25000 && totalCount < 25100) {
            console.log(colors.green.bold(`\n🔥 MILESTONE: ${totalCount.toLocaleString()} documents! Amazing progress!`));
          }
          else if (totalCount >= 50000 && totalCount < 50100) {
            console.log(colors.green.bold(`\n💎 MILESTONE: ${totalCount.toLocaleString()} documents! Ultimate coverage!`));
          }
        }
        catch (error) {
          console.error(colors.red(` ❌ Failed to index ${source.name}:`), error);
        }
      }
      await this.printFinalStats();
    }
    catch (error) {
      mainSpinner.fail(colors.red('Process failed'));
      console.error('Error:', error);
      // Rethrow: swallowing here made callers report success after a failure.
      throw error;
    }
    finally {
      await this.mongoClient.close();
    }
  }

  /**
   * Dispatch one source to the indexer matching its `type`.
   *
   * @param {object} source - Entry from SOURCES.
   * @param {object} spinner - ora spinner used for status output.
   * @returns {Promise<number>} Number of chunks stored for the source.
   * @throws {Error} When `source.type` is neither 'github' nor 'web'.
   */
  async indexSource(source, spinner) {
    switch (source.type) {
      case 'github':
        return this.indexGitHubRepo(source, spinner);
      case 'web':
        return this.indexWebPage(source, spinner);
      default:
        throw new Error(`Unsupported source type: ${source.type}`);
    }
  }

  /**
   * Render a 20-cell text progress bar followed by the percentage.
   *
   * The percentage is clamped to 0..100; a non-positive `total` or a
   * non-numeric ratio renders as 0% instead of throwing.
   *
   * @param {number} current - Amount done so far.
   * @param {number} total - Amount that counts as 100%.
   * @returns {string} For example `[██████████░░░░░░░░░░] 50%`.
   */
  createProgressBar(current, total) {
    const ratio = total > 0 ? current / total : 0;
    const percentage = Number.isFinite(ratio)
      ? Math.min(100, Math.max(0, Math.round(ratio * 100)))
      : 0;
    const filled = Math.round(percentage / 5);
    return `[${'█'.repeat(filled)}${'░'.repeat(20 - filled)}] ${percentage}%`;
  }

  /**
   * Clone or update one GitHub repository, chunk its documentation files and
   * store the chunks with embeddings.
   *
   * Files whose MD5 hash equals the one in `state.contentHashes` are skipped.
   * A file's hash is recorded only after the flush containing its chunks
   * stored every chunk; otherwise the file is added to `state.failedFiles`.
   *
   * @param {object} source - Entry from SOURCES (uses repo, branch, product, version).
   * @param {object} spinner - ora spinner used for status output.
   * @returns {Promise<number>} Number of chunks written to the collection.
   * @throws When the repository cannot be cloned or updated.
   */
  async indexGitHubRepo(source, spinner) {
    const repoPath = path.join('mongodb-docs', source.repo.replace('/', '_'), source.branch);
    spinner.start(colors.cyan(` 🔄 Cloning/updating ${source.repo}...`));
    try {
      await this.cloneOrUpdateRepo(source.repo, source.branch, repoPath);
      spinner.succeed(colors.green(` ✅ Repository ready`));
    }
    catch (error) {
      spinner.fail(colors.red(` ❌ Failed to clone ${source.repo}`));
      // Propagate so the caller does not mark this source as completed.
      throw error;
    }
    spinner.start(colors.cyan(' 🔍 Scanning for documentation files...'));
    const files = await this.findDocFiles(repoPath);
    spinner.succeed(colors.green(` ✅ Found ${files.length} documentation files`));
    if (files.length === 0) {
      return 0;
    }
    const progressBar = new cliProgress.SingleBar({
      format: colors.cyan(' Processing') + ' |' + colors.cyan('{bar}') + '| {percentage}% | {value}/{total} files | ETA: {eta}s',
      barCompleteChar: '\u2588',
      barIncompleteChar: '\u2591',
      hideCursor: true,
      stopOnComplete: true,
      clearOnComplete: false
    }, cliProgress.Presets.shades_classic);
    progressBar.start(files.length, 0);

    const FLUSH_THRESHOLD = 500; // chunks buffered before an embedding pass
    let pendingChunks = [];
    let pendingHashes = new Map(); // file path -> hash, awaiting a successful flush
    let skipped = 0;
    let storedChunks = 0;

    const recordFailure = (file) => {
      if (!this.state.failedFiles.includes(file)) {
        this.state.failedFiles.push(file);
      }
    };

    // Embed and store everything buffered so far, then either commit the
    // hashes of the contributing files or mark those files as failed.
    const flush = async () => {
      const chunksToIndex = pendingChunks;
      const hashesToCommit = pendingHashes;
      pendingChunks = [];
      pendingHashes = new Map();
      let outcome = { indexed: 0, failed: 0 };
      if (chunksToIndex.length > 0) {
        try {
          outcome = await this.indexChunks(chunksToIndex);
        }
        catch (error) {
          console.error(colors.red(`\n ❌ Failed to index chunks: ${error.message}`));
          outcome = { indexed: 0, failed: chunksToIndex.length };
        }
      }
      storedChunks += outcome.indexed;
      for (const [file, hash] of hashesToCommit) {
        if (outcome.failed === 0) {
          this.state.contentHashes[file] = hash;
        }
        else {
          recordFailure(file);
        }
      }
    };

    for (const [position, file] of files.entries()) {
      try {
        const content = await fs.readFile(file, 'utf-8');
        const hash = createHash('md5').update(content).digest('hex');
        if (this.state.contentHashes[file] === hash) {
          skipped++;
          continue;
        }
        // Pass the content along so the file is not read a second time.
        const fileChunks = await this.processDocFile(file, source, content);
        for (const chunk of fileChunks) {
          pendingChunks.push(chunk);
        }
        pendingHashes.set(file, hash);
        if (pendingChunks.length >= FLUSH_THRESHOLD) {
          await flush();
        }
      }
      catch (error) {
        recordFailure(file);
      }
      finally {
        // Advance the bar for skipped and failed files as well.
        progressBar.update(position + 1);
      }
    }
    progressBar.stop();
    if (skipped > 0) {
      console.log(colors.gray(` ℹ️ Skipped ${skipped} unchanged files`));
    }
    await flush();
    return storedChunks;
  }

  /**
   * Fetch one web page, extract its main content areas and index them.
   *
   * @param {object} source - Source entry; uses url, name, product and version.
   * @param {object} spinner - ora spinner used for status output.
   * @returns {Promise<number>} Number of sections written to the collection.
   * @throws When the page cannot be fetched or parsed.
   */
  async indexWebPage(source, spinner) {
    spinner.start(colors.cyan(` 🌐 Fetching ${source.name}...`));
    try {
      const response = await axios.get(source.url, { timeout: 30000 });
      const $ = cheerio.load(response.data);
      const sections = [];
      $('article, .documentation, .content, main, .doc-content, .markdown-body').each((_, elem) => {
        const text = $(elem).text().trim();
        if (text.length <= 100) {
          return;
        }
        const title = $(elem).find('h1, h2').first().text().trim() || source.name;
        this.splitIntoChunks(text, 2000).forEach((chunk, idx) => {
          sections.push({
            content: chunk,
            title: `${title} (Part ${idx + 1})`,
            url: source.url,
            product: source.product,
            version: source.version,
            sourceType: 'web',
            // indexChunks upserts by documentId; without one every section
            // would overwrite the same document.
            documentId: `${source.product}_${source.version}_${createHash('md5').update(chunk).digest('hex').substring(0, 8)}`
          });
        });
      });
      spinner.succeed(colors.green(` ✅ Extracted ${sections.length} sections`));
      if (sections.length === 0) {
        return 0;
      }
      const { indexed } = await this.indexChunks(sections);
      return indexed;
    }
    catch (error) {
      spinner.fail(colors.red(` ❌ Failed to fetch ${source.url}`));
      // Propagate so the caller does not mark this source as completed.
      throw error;
    }
  }

  /**
   * Split text into chunks of roughly `maxLength` characters on sentence
   * boundaries (`.`, `!`, `?`).
   *
   * Trailing text without terminal punctuation is kept as its own sentence.
   * A single sentence longer than `maxLength` is cut into `maxLength`-sized
   * pieces. Sentences are concatenated as they appear in the input, so the
   * original spacing is preserved. When `maxLength` is not a finite number
   * >= 1 no length limit is applied.
   *
   * @param {string} text - Text to split.
   * @param {number} maxLength - Target maximum chunk length in characters.
   * @returns {string[]} Trimmed, non-empty chunks in input order.
   */
  splitIntoChunks(text, maxLength) {
    const limit = Number.isFinite(maxLength) && maxLength >= 1 ? Math.floor(maxLength) : Infinity;
    // `(?:[.!?]+|$)` lets the last fragment match even without punctuation.
    const sentences = text.match(/[^.!?]+(?:[.!?]+|$)/g) || [text];
    const chunks = [];
    let currentChunk = '';
    const flush = () => {
      const trimmed = currentChunk.trim();
      if (trimmed.length > 0) {
        chunks.push(trimmed);
      }
      currentChunk = '';
    };
    for (const sentence of sentences) {
      if (currentChunk.length > 0 && currentChunk.length + sentence.length > limit) {
        flush();
      }
      if (sentence.length > limit) {
        // Hard-split a sentence that can never fit into one chunk.
        flush();
        for (let start = 0; start < sentence.length; start += limit) {
          const piece = sentence.slice(start, start + limit).trim();
          if (piece.length > 0) {
            chunks.push(piece);
          }
        }
        continue;
      }
      currentChunk += sentence;
    }
    flush();
    return chunks;
  }

  /**
   * Split one documentation file into chunk objects ready for indexChunks().
   *
   * - `.rst` and `.txt`: split on lines made of three or more `=`, `-` or `*`.
   * - `.md` / `.markdown`: split before level 1-3 headings.
   * - `.js .ts .py .java .cs .go .rb`: extract JSDoc-style block comments,
   *   `//` line comments and triple-quoted docstrings.
   * Pieces of 50 characters or fewer are dropped; other extensions yield [].
   *
   * The documentId hash is computed over the untrimmed section text.
   *
   * @param {string} filePath - Path of the file; stored as `sourceFile`.
   * @param {object} source - Source entry supplying product and version.
   * @param {string} [content] - File content, if the caller already read it.
   * @returns {Promise<object[]>} Chunk objects (without embeddings).
   */
  async processDocFile(filePath, source, content) {
    const text = content ?? await fs.readFile(filePath, 'utf-8');
    const ext = path.extname(filePath);
    const makeChunk = (body, hashInput, title, extra = {}) => ({
      content: body,
      sourceFile: filePath,
      product: source.product,
      version: source.version,
      title,
      documentId: `${source.product}_${source.version}_${createHash('md5').update(hashInput).digest('hex').substring(0, 8)}`,
      ...extra
    });

    const isMarkdown = ext === '.md' || ext === '.markdown';
    // findDocFiles() collects .txt files, so they are chunked like .rst
    // instead of being silently ignored.
    const isRstLike = ext === '.rst' || ext === '.txt';
    if (isMarkdown || isRstLike) {
      const separator = isMarkdown ? /\n#{1,3}\s/ : /\n={3,}\n|\n-{3,}\n|\n\*{3,}\n/;
      const fallbackTitle = path.basename(filePath, ext);
      return text
        .split(separator)
        .filter((section) => section.trim().length > 50)
        .map((section) => {
          const body = section.trim();
          let firstLine = body.split('\n')[0].trim();
          if (isMarkdown) {
            // The first section keeps its leading "# " because the separator
            // only matches headings that follow a newline.
            firstLine = firstLine.replace(/^#{1,6}\s+/, '');
          }
          return makeChunk(body, section, firstLine || fallbackTitle);
        });
    }

    if (['.js', '.ts', '.py', '.java', '.cs', '.go', '.rb'].includes(ext)) {
      const codeComments = text.match(/\/\*\*[\s\S]*?\*\/|\/\/.*$/gm) || [];
      const docstrings = text.match(/"""[\s\S]*?"""|'''[\s\S]*?'''/g) || [];
      const title = `Code Documentation: ${path.basename(filePath)}`;
      return [...codeComments, ...docstrings]
        .filter((doc) => doc.length > 50)
        .map((doc) => makeChunk(doc, doc, title, { codeLanguage: ext.substring(1) }));
    }
    return [];
  }

  /**
   * Embed chunks in batches and upsert them into the collection.
   *
   * Each batch is attempted up to three times (once when
   * `config.enableSmartRetry` is false) with a linearly growing delay. When a
   * batch still fails, the batch size is halved (not below 4) and the same
   * chunks are retried; once the batch cannot shrink any further its chunks
   * are counted as failed and processing moves on. Failures are logged, not
   * thrown.
   *
   * @param {object[]} chunks - Chunk objects; each needs `content` and `documentId`.
   * @returns {Promise<{indexed: number, failed: number}>} How many chunks were
   *   written and how many could not be.
   */
  async indexChunks(chunks) {
    const summary = { indexed: 0, failed: 0 };
    if (!Array.isArray(chunks) || chunks.length === 0) {
      return summary;
    }
    const MIN_BATCH_SIZE = 4;
    // Upper bound of 32 is carried over from the previous implementation,
    // which truncated larger batches. NOTE(review): confirm against the
    // current Voyage AI request limits.
    const MAX_BATCH_SIZE = 32;
    const configuredSize = Math.floor(Number(this.config.dynamicBatchSize)) || 1;
    let batchSize = Math.min(MAX_BATCH_SIZE, Math.max(1, configuredSize));
    const maxAttempts = this.config.enableSmartRetry ? 3 : 1;

    const embeddingBar = new cliProgress.SingleBar({
      format: colors.magenta(' Embeddings') + ' |' + colors.magenta('{bar}') + '| {percentage}% | Batch {value}/{total}',
      barCompleteChar: '\u2588',
      barIncompleteChar: '\u2591',
      hideCursor: true,
      stopOnComplete: true,
      clearOnComplete: true
    }, cliProgress.Presets.shades_classic);
    embeddingBar.start(Math.ceil(chunks.length / batchSize), 0);

    let offset = 0;
    let batchesDone = 0;
    while (offset < chunks.length) {
      const batch = chunks.slice(offset, offset + batchSize);
      let stored = null;
      let lastError = null;
      for (let attempt = 1; attempt <= maxAttempts && stored === null; attempt++) {
        try {
          stored = await this.embedAndStoreBatch(batch);
        }
        catch (error) {
          lastError = error;
          if (attempt < maxAttempts) {
            console.error(colors.yellow(`\n ⚠️ Retry ${attempt}/${maxAttempts}: ${error.message}`));
            // Linear backoff: 1s, 2s, ...
            await UltimateMongoDBIndexer.sleep(1000 * attempt);
          }
        }
      }
      if (stored === null) {
        console.error(colors.red(`\n ❌ Failed batch after ${maxAttempts} retries: ${lastError.message}`));
        if (batchSize > MIN_BATCH_SIZE && batch.length > MIN_BATCH_SIZE) {
          batchSize = Math.max(Math.floor(batchSize / 2), MIN_BATCH_SIZE);
          console.error(colors.yellow(` 🔄 Reducing batch size to ${batchSize} for stability`));
          // Smaller batches mean more of them; keep the bar's total in sync.
          embeddingBar.setTotal(batchesDone + Math.ceil((chunks.length - offset) / batchSize));
          continue; // retry the same offset with the smaller batch
        }
        summary.failed += batch.length;
      }
      else {
        summary.indexed += stored;
        summary.failed += batch.length - stored;
        // Short pause between successful requests as simple rate limiting.
        await UltimateMongoDBIndexer.sleep(100);
      }
      offset += batch.length;
      batchesDone++;
      embeddingBar.update(batchesDone);
    }
    embeddingBar.stop();
    return summary;
  }

  /**
   * Request contextualized embeddings for one batch and upsert the result.
   *
   * All texts of the batch are sent as one inner list of `inputs`. Each
   * returned vector is scaled to unit length before it is stored. Chunks with
   * no `documentId`, no returned embedding or a zero-length vector are left
   * out. The input chunk objects are not modified.
   *
   * @param {object[]} batch - Chunks to embed; `content` is cut to 8000 characters.
   * @returns {Promise<number>} Number of documents sent to bulkWrite.
   * @throws When the HTTP request or the bulk write fails, or when the
   *   response does not contain an embeddings array.
   */
  async embedAndStoreBatch(batch) {
    const model = 'voyage-context-3';
    const maxTextLength = 8000;
    const texts = batch.map((chunk) => chunk.content.substring(0, maxTextLength));
    const response = await axios.post(this.voyageContextualUrl, {
      inputs: [texts],
      input_type: 'document',
      model,
      output_dimension: this.VOYAGE_DIMENSIONS
    }, {
      headers: {
        'Authorization': `Bearer ${this.voyageApiKey}`,
        'Content-Type': 'application/json',
      },
      timeout: 60000,
    });
    const embeddings = response.data?.data?.[0]?.data;
    if (!Array.isArray(embeddings)) {
      // Treat an unexpected payload as an error so the batch is retried
      // instead of being silently dropped.
      throw new Error('Voyage AI response did not contain embeddings');
    }
    const indexedAt = new Date();
    const bulkOps = [];
    batch.forEach((chunk, idx) => {
      const embedding = embeddings[idx]?.embedding;
      if (!Array.isArray(embedding) || !chunk.documentId) {
        return;
      }
      const magnitude = Math.sqrt(embedding.reduce((sum, val) => sum + val * val, 0));
      if (!(magnitude > 0)) {
        return; // a zero vector cannot be normalized
      }
      bulkOps.push({
        updateOne: {
          filter: { documentId: chunk.documentId },
          update: {
            $set: {
              ...chunk,
              embedding: embedding.map((val) => val / magnitude),
              embeddingModel: model,
              embeddingDimensions: embedding.length,
              indexedAt
            }
          },
          upsert: true
        }
      });
    });
    if (bulkOps.length > 0) {
      await this.collection.bulkWrite(bulkOps, { ordered: false });
    }
    return bulkOps.length;
  }

  /**
   * Make sure `targetPath` holds an up-to-date checkout of `branch`.
   *
   * If `targetPath/.git` exists the checkout is updated with checkout + pull;
   * otherwise the repository is cloned shallowly. Checking for `.git` rather
   * than for the directory itself keeps git from operating on an enclosing
   * repository when `targetPath` is an empty or unrelated directory.
   *
   * @param {string} repo - GitHub "owner/name".
   * @param {string} branch - Branch to check out.
   * @param {string} targetPath - Local directory of the checkout.
   * @returns {Promise<void>}
   * @throws When the git update or clone fails.
   */
  async cloneOrUpdateRepo(repo, branch, targetPath) {
    const git = simpleGit();
    let hasCheckout = true;
    try {
      await fs.access(path.join(targetPath, '.git'));
    }
    catch {
      hasCheckout = false;
    }
    if (hasCheckout) {
      await git.cwd(targetPath);
      await git.checkout(branch);
      await git.pull('origin', branch);
      return;
    }
    await fs.mkdir(path.dirname(targetPath), { recursive: true });
    await git.clone(`https://github.com/${repo}.git`, targetPath, ['--branch', branch, '--depth', '1', '--single-branch']);
  }

  /**
   * Recursively collect files with a documentation or source-code extension.
   *
   * Directories named node_modules, .git, build, dist or target are not
   * entered. Directories that cannot be read are skipped silently.
   *
   * @param {string} dir - Root directory to scan.
   * @returns {Promise<string[]>} Paths of matching files.
   */
  async findDocFiles(dir) {
    const docExtensions = new Set(['.rst', '.md', '.markdown', '.txt', '.js', '.ts', '.py', '.java', '.cs', '.go', '.rb']);
    const skippedDirs = new Set(['node_modules', '.git', 'build', 'dist', 'target']);
    const found = [];
    const walk = async (currentDir) => {
      let entries;
      try {
        entries = await fs.readdir(currentDir, { withFileTypes: true });
      }
      catch {
        return; // best effort: ignore inaccessible directories
      }
      for (const entry of entries) {
        const fullPath = path.join(currentDir, entry.name);
        if (entry.isDirectory()) {
          if (!skippedDirs.has(entry.name)) {
            await walk(fullPath);
          }
        }
        else if (entry.isFile() && docExtensions.has(path.extname(entry.name))) {
          found.push(fullPath);
        }
      }
    };
    await walk(dir);
    return found;
  }

  /**
   * Load the checkpoint from `indexing-state.json`, if there is one.
   *
   * A missing file means a fresh start. An unreadable or corrupt file is
   * reported and also results in a fresh start. Loaded data is passed
   * through normalizeState() so partial files cannot break later steps.
   *
   * @returns {Promise<void>}
   */
  async loadState() {
    let stateFile;
    try {
      stateFile = await fs.readFile('indexing-state.json', 'utf-8');
    }
    catch (error) {
      if (error.code !== 'ENOENT') {
        console.log(colors.yellow(` ⚠️ Could not read indexing-state.json: ${error.message}`));
      }
      console.log(colors.yellow(' ℹ️ Starting fresh indexing (no previous state)'));
      return;
    }
    try {
      this.state = UltimateMongoDBIndexer.normalizeState(JSON.parse(stateFile));
      console.log(colors.green(' ✅ Loaded previous indexing state'));
    }
    catch (error) {
      console.log(colors.yellow(` ⚠️ Ignoring corrupt indexing-state.json: ${error.message}`));
      console.log(colors.yellow(' ℹ️ Starting fresh indexing (no previous state)'));
    }
  }

  /**
   * Stamp the state with the current time and write it to
   * `indexing-state.json` as pretty-printed JSON.
   *
   * @returns {Promise<void>}
   */
  async saveState() {
    this.state.lastCheckpoint = new Date();
    await fs.writeFile('indexing-state.json', JSON.stringify(this.state, null, 2));
  }

  /**
   * Print the closing report: totals, per-model counts, per-product counts
   * and either a success banner (10,000+ documents) or a hint to run again.
   *
   * @returns {Promise<void>}
   */
  async printFinalStats() {
    // These queries are independent of each other, so run them together.
    const [totalDocs, products, voyageContext3Count, voyage3Count, voyageCode3Count] = await Promise.all([
      this.collection.countDocuments(),
      this.collection.distinct('product'),
      this.collection.countDocuments({ embeddingModel: 'voyage-context-3' }),
      // Legacy model counts (for backwards compatibility)
      this.collection.countDocuments({ embeddingModel: 'voyage-3' }),
      this.collection.countDocuments({ embeddingModel: 'voyage-code-3' })
    ]);
    const elapsedMs = this.startTime ? Date.now() - this.startTime.getTime() : 0;
    const elapsed = Math.round(elapsedMs / 1000 / 60);
    // Runs shorter than a minute would otherwise divide by zero.
    const docsPerMinute = Math.round(totalDocs / Math.max(elapsed, 1));
    console.log('\n\n');
    console.log(colors.rainbow('═'.repeat(80)));
    console.log(colors.rainbow(' 🎉 INDEXING COMPLETE! 🎉 '));
    console.log(colors.rainbow('═'.repeat(80)));
    const finalStats = new Table({
      head: [colors.white.bold('Metric'), colors.white.bold('Value')],
      colWidths: [35, 25],
      style: { head: [], border: [] }
    });
    finalStats.push(['Total Documents', colors.green.bold(totalDocs.toLocaleString())], ['Sources Processed', colors.green(`${this.state.completedRepos.length}/${this.SOURCES.length}`)], ['Products Indexed', colors.cyan(products.length.toString())], ['voyage-context-3 (2025) Documents', colors.green.bold(voyageContext3Count.toLocaleString())], ['voyage-3 (legacy) Documents', colors.blue(voyage3Count.toLocaleString())], ['voyage-code-3 (legacy) Documents', colors.magenta(voyageCode3Count.toLocaleString())], ['Failed Files', colors.yellow(this.state.failedFiles.length.toString())], ['Time Elapsed', colors.magenta(`${elapsed} minutes`)], ['Documents per Minute', colors.blue(docsPerMinute.toLocaleString())]);
    console.log(finalStats.toString());
    console.log(colors.cyan('\n📚 Documents by Product:'));
    const productTable = new Table({
      head: [colors.white.bold('Product'), colors.white.bold('Count')],
      colWidths: [25, 15]
    });
    for (const product of [...products].sort()) {
      const count = await this.collection.countDocuments({ product });
      productTable.push([product, colors.yellow(count.toLocaleString())]);
    }
    console.log(productTable.toString());
    if (totalDocs < 10000) {
      console.log(colors.yellow(`\n⚠️ Need ${(10000 - totalDocs).toLocaleString()} more documents to reach goal`));
      console.log(colors.cyan(' Run again to continue indexing remaining sources'));
      return;
    }
    const successBox = boxen(colors.green.bold('🚀 SUCCESS! You now have the BEST MongoDB semantic search!\n\n') +
      colors.white('✨ What you can do now:\n') +
      colors.white(' • Search ANY MongoDB concept instantly\n') +
      colors.white(' • Find code examples across ALL drivers\n') +
      colors.white(' • Discover integration patterns\n') +
      colors.white(' • Access Voyage AI embedding best practices\n\n') +
      colors.cyan.bold(`Total indexed: ${totalDocs.toLocaleString()} documents`), {
      padding: 1,
      margin: 1,
      borderStyle: 'double',
      borderColor: 'green',
      align: 'center'
    });
    console.log(successBox);
    console.log(colors.rainbow('\n' + figlet.textSync('COMPLETE!', {
      font: 'Big',
      horizontalLayout: 'default',
      verticalLayout: 'default'
    })));
  }
}
// Run if executed directly
// NOTE(review): this string comparison assumes process.argv[1] is an absolute
// path that needs no URL-encoding (no spaces, no Windows drive letters);
// url.pathToFileURL would be more robust — confirm the supported platforms.
const isMainModule = import.meta.url === `file://${process.argv[1]}` ||
    import.meta.url === `file://${process.argv[1]}.ts`;
if (isMainModule) {
    console.log(colors.cyan('\n🔧 Setting up environment...'));
    // Load .env (idempotent) so both variables below can come from it.
    dotenv.config();
    // SECURITY: credentials must come from the environment only. Earlier
    // revisions fell back to a hardcoded connection string and API key here;
    // any secret that was ever committed that way should be rotated.
    const missing = ['MONGODB_URI', 'VOYAGE_API_KEY'].filter((name) => !process.env[name]);
    if (missing.length > 0) {
        console.error(colors.red(`\n❌ Missing required environment variables: ${missing.join(', ')}`));
        console.log(colors.yellow('Set them in your shell or in a .env file before running the indexer.'));
        process.exit(1);
    }
    // Print only the host part of the URI (text between '@' and the next '/'),
    // never the credentials, and no part of the API key.
    console.log(colors.green(`✅ Using MongoDB: ${process.env.MONGODB_URI.split('@')[1]?.split('/')[0] || 'Unknown'}`));
    console.log(colors.green('✅ Using Voyage API key from environment'));
    const indexer = new UltimateMongoDBIndexer();
    indexer.run()
        .then(() => {
        console.log(colors.green('\n✅ Process completed successfully!'));
        process.exit(0);
    })
        .catch((error) => {
        console.error(colors.red('\n❌ Fatal error:'), error);
        process.exit(1);
    });
}
export { UltimateMongoDBIndexer };
//# sourceMappingURL=index-docs.js.map