
mongodocs-mcp

Lightning-fast semantic search for MongoDB documentation via Model Context Protocol. 10,000+ documents, <500ms search.
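Since the package is consumed over MCP, a stdio connection from a client is the usual route. Below is a hypothetical client-side sketch using the official @modelcontextprotocol/sdk; the `npx -y mongodocs-mcp` launch command is an assumption, as are any required environment variables (MongoDB and Voyage AI credentials), so check the package README for the actual binary name and configuration.

// Hypothetical client sketch: launch command and env requirements are
// assumptions, not taken from this package's source.
import { Client } from "@modelcontextprotocol/sdk/client/index.js";
import { StdioClientTransport } from "@modelcontextprotocol/sdk/client/stdio.js";

const transport = new StdioClientTransport({
    command: "npx",
    args: ["-y", "mongodocs-mcp"], // assumed bin name; see the README
});
const client = new Client({ name: "docs-search-demo", version: "1.0.0" }, { capabilities: {} });
await client.connect(transport);

// Discover whatever search tools the server actually exposes.
const { tools } = await client.listTools();
console.log(tools.map(t => t.name));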

document-refresher.js (284 lines, 10.3 kB)
/**
 * Document Refresher for MongoDB Semantic MCP
 * Handles incremental updates to keep documentation current
 */
import { MongoDBClient } from './mongodb-client.js';
import { UniversalFetcher } from './universal-fetcher.js';
import { SmartChunker } from './smart-chunker.js';
import { EmbeddingPipeline } from './embedding-pipeline.js';
import crypto from 'crypto';

export class DocumentRefresher {
    mongodb;
    fetcher;
    chunker;
    pipeline;
    initialized = false;

    constructor() {
        this.mongodb = MongoDBClient.getInstance();
        this.fetcher = new UniversalFetcher();
        this.chunker = new SmartChunker();
        this.pipeline = new EmbeddingPipeline();
    }

    async initialize() {
        if (this.initialized) {
            return;
        }
        await this.mongodb.connect();
        this.initialized = true;
    }

    /**
     * Refresh documentation based on mode
     */
    async refresh(options) {
        console.error(`🔄 Starting ${options.mode} refresh...`);
        try {
            if (options.mode === 'full') {
                return await this.fullRefresh(options);
            }
            else {
                return await this.incrementalRefresh(options);
            }
        }
        catch (error) {
            console.error('Refresh failed:', error);
            throw error;
        }
    }

    /**
     * Incremental refresh - only update changed documents
     */
    async incrementalRefresh(options) {
        const result = {
            documentsChecked: 0,
            documentsUpdated: 0,
            newDocuments: 0,
            deletedDocuments: 0,
            errors: [],
        };
        // Define sources to fetch based on products
        const sources = this.getSourcesToFetch(options.products);
        // Fetch all documentation using universal fetcher
        console.error('📥 Fetching latest documentation...');
        const currentDocs = await this.fetcher.fetchFromSources(sources);
        result.documentsChecked = currentDocs.length;
        // Get existing document hashes from MongoDB
        const existingHashes = await this.getExistingDocumentHashes();
        // Compare and identify changes
        const newDocs = [];
        const updatedDocs = [];
        const currentHashes = new Set();
        for (const doc of currentDocs) {
            const hash = this.hashDocument(doc);
            currentHashes.add(doc.id);
            if (!existingHashes.has(doc.id)) {
                // New document
                newDocs.push(doc);
                result.newDocuments++;
            }
            else if (existingHashes.get(doc.id) !== hash) {
                // Updated document
                updatedDocs.push(doc);
                result.documentsUpdated++;
            }
        }
        // Identify deleted documents
        const deletedDocIds = Array.from(existingHashes.keys()).filter(id => !currentHashes.has(id));
        result.deletedDocuments = deletedDocIds.length;
        // Process new documents
        if (newDocs.length > 0) {
            console.error(`📄 Processing ${newDocs.length} new documents...`);
            await this.processNewDocuments(newDocs);
        }
        // Process updated documents
        if (updatedDocs.length > 0) {
            console.error(`📝 Updating ${updatedDocs.length} documents...`);
            await this.updateDocuments(updatedDocs);
        }
        // Remove deleted documents
        if (deletedDocIds.length > 0) {
            console.error(`🗑️ Removing ${deletedDocIds.length} deleted documents...`);
            await this.removeDocuments(deletedDocIds);
        }
        // Update metadata
        await this.updateRefreshMetadata(result);
        console.error(`✅ Refresh complete: ${result.newDocuments} new, ${result.documentsUpdated} updated, ${result.deletedDocuments} deleted`);
        return result;
    }

    /**
     * Full refresh - reprocess all documentation
     */
    async fullRefresh(options) {
        console.error('🔄 Starting full refresh...');
        const result = {
            documentsChecked: 0,
            documentsUpdated: 0,
            newDocuments: 0,
            deletedDocuments: 0,
            errors: [],
        };
        // Clear existing data
        console.error('🗑️ Clearing existing data...');
        const collection = this.mongodb.getVectorsCollection();
        const deleteResult = await collection.deleteMany({});
        result.deletedDocuments = deleteResult.deletedCount || 0;
        // Define sources to fetch based on products
        const sources = this.getSourcesToFetch(options.products);
        // Fetch all documentation using universal fetcher
        console.error('📥 Fetching all documentation...');
        const documents = await this.fetcher.fetchFromSources(sources);
        result.documentsChecked = documents.length;
        result.newDocuments = documents.length;
        // Process all documents
        await this.processNewDocuments(documents);
        console.error(`✅ Full refresh complete: ${result.newDocuments} documents processed`);
        return result;
    }

    /**
     * Get existing document hashes from MongoDB
     */
    async getExistingDocumentHashes() {
        const collection = this.mongodb.getDocumentsCollection();
        const docs = await collection
            .find({}, { projection: { documentId: 1, contentHash: 1 } })
            .toArray();
        const hashes = new Map();
        for (const doc of docs) {
            hashes.set(doc.documentId, doc.contentHash);
        }
        return hashes;
    }

    /**
     * Process new documents
     */
    async processNewDocuments(documents) {
        if (documents.length === 0)
            return;
        // Chunk documents
        const chunkedDocs = await this.chunker.chunkDocuments(documents);
        // Generate embeddings and store
        await this.pipeline.embedAllDocuments(chunkedDocs);
        // Store document metadata
        const docsCollection = this.mongodb.getDocumentsCollection();
        const docMetadata = documents.map(doc => ({
            documentId: doc.id,
            contentHash: this.hashDocument(doc),
            metadata: doc.metadata,
            lastUpdated: new Date(),
        }));
        await docsCollection.insertMany(docMetadata, { ordered: false }).catch((err) => {
            // Ignore duplicate key errors (code 11000); rethrow anything else
            if (err.code !== 11000) {
                throw err;
            }
        });
    }

    /**
     * Update existing documents
     */
    async updateDocuments(documents) {
        const collection = this.mongodb.getVectorsCollection();
        const docsCollection = this.mongodb.getDocumentsCollection();
        for (const doc of documents) {
            try {
                // Remove old chunks
                await collection.deleteMany({ 'metadata.documentId': doc.id });
                // Process and add new chunks
                const chunked = await this.chunker.chunkDocuments([doc]);
                await this.pipeline.embedAllDocuments(chunked);
                // Update document metadata
                await docsCollection.updateOne({ documentId: doc.id }, {
                    $set: {
                        contentHash: this.hashDocument(doc),
                        metadata: doc.metadata,
                        lastUpdated: new Date(),
                    },
                }, { upsert: true });
            }
            catch (error) {
                console.error(`Failed to update document ${doc.id}:`, error);
            }
        }
    }

    /**
     * Remove deleted documents
     */
    async removeDocuments(documentIds) {
        if (documentIds.length === 0)
            return;
        const collection = this.mongodb.getVectorsCollection();
        const docsCollection = this.mongodb.getDocumentsCollection();
        // Remove vectors
        await collection.deleteMany({
            'metadata.documentId': { $in: documentIds },
        });
        // Remove document metadata
        await docsCollection.deleteMany({
            documentId: { $in: documentIds },
        });
    }

    /**
     * Generate hash for document content
     */
    hashDocument(doc) {
        const content = JSON.stringify({
            content: doc.content,
            metadata: doc.metadata,
        });
        return crypto.createHash('sha256').update(content).digest('hex');
    }

    /**
     * Update refresh metadata
     */
    async updateRefreshMetadata(result) {
        const collection = this.mongodb.getDatabase().collection('metadata');
        await collection.updateOne({ type: 'refresh' }, {
            $set: {
                lastRefresh: new Date(),
                lastResult: result,
            },
            $inc: {
                totalRefreshes: 1,
            },
        }, { upsert: true });
    }

    /**
     * Get last refresh information
     */
    async getLastRefreshInfo() {
        const collection = this.mongodb.getDatabase().collection('metadata');
        const metadata = await collection.findOne({ type: 'refresh' });
        return metadata || { lastRefresh: null, totalRefreshes: 0 };
    }

    /**
     * Get sources to fetch based on requested products
     */
    getSourcesToFetch(products) {
        const sources = [];
        // MongoDB documentation sources
        if (!products || products.includes('manual')) {
            sources.push({
                type: 'github',
                name: 'MongoDB Manual',
                repo: 'mongodb/docs',
                branch: 'master',
                product: 'manual',
                version: '8.0',
                priority: 5
            });
        }
        // Voyage AI documentation
        if (!products || products.includes('voyage')) {
            sources.push({
                type: 'github',
                name: 'Voyage Python SDK',
                repo: 'voyage-ai/voyageai-python',
                branch: 'main',
                product: 'voyage',
                version: 'latest',
                priority: 5
            });
        }
        // Add more sources as needed based on products
        return sources;
    }
}
//# sourceMappingURL=document-refresher.js.map
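
For orientation, a minimal usage sketch of the refresher itself follows. It is not part of the published file: the import path is local to this sketch, and the shape of the options object ({ mode, products }) is inferred from refresh() and getSourcesToFetch() above rather than from package documentation.

// Minimal sketch, assuming the exports above; the options shape is
// inferred from refresh() / getSourcesToFetch().
import { DocumentRefresher } from './document-refresher.js';

const refresher = new DocumentRefresher();
await refresher.initialize(); // connects the shared MongoDB client

// Incremental pass limited to the MongoDB Manual sources.
const result = await refresher.refresh({ mode: 'incremental', products: ['manual'] });
console.error(`checked=${result.documentsChecked} new=${result.newDocuments} updated=${result.documentsUpdated}`);

// Fall back to a full rebuild if no refresh has ever been recorded.
const { lastRefresh } = await refresher.getLastRefreshInfo();
if (!lastRefresh) {
    await refresher.refresh({ mode: 'full' });
}

One design detail worth noting: the file logs exclusively through console.error. For an MCP server speaking JSON-RPC over stdio, stdout is reserved for protocol frames, so routing all diagnostics to stderr keeps the transport clean.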