mongodocs-mcp
Lightning-fast semantic search for MongoDB documentation via the Model Context Protocol. Indexes 10,000+ documents; searches return in under 500 ms.
284 lines • 10.3 kB
JavaScript
/**
* Document Refresher for MongoDB Semantic MCP
* Handles incremental updates to keep documentation current
*/
import { MongoDBClient } from './mongodb-client.js';
import { UniversalFetcher } from './universal-fetcher.js';
import { SmartChunker } from './smart-chunker.js';
import { EmbeddingPipeline } from './embedding-pipeline.js';
import crypto from 'crypto';
export class DocumentRefresher {
mongodb;
fetcher;
chunker;
pipeline;
initialized = false;
constructor() {
this.mongodb = MongoDBClient.getInstance();
this.fetcher = new UniversalFetcher();
this.chunker = new SmartChunker();
this.pipeline = new EmbeddingPipeline();
}
async initialize() {
if (this.initialized) {
return;
}
await this.mongodb.connect();
this.initialized = true;
}
/**
* Refresh documentation based on mode
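* @param {{ mode: 'full' | 'incremental', products?: string[] }} options -
*   shape inferred from this file's usage (not an authoritative typedef)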
*/
async refresh(options) {
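// Logs go to stderr: stdio-based MCP servers reserve stdout for protocol messages.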
console.error(`🔄 Starting ${options.mode} refresh...`);
try {
if (options.mode === 'full') {
return await this.fullRefresh(options);
} else {
return await this.incrementalRefresh(options);
}
} catch (error) {
console.error('Refresh failed:', error);
throw error;
}
}
/**
* Incremental refresh - only update changed documents
*/
async incrementalRefresh(options) {
const result = {
documentsChecked: 0,
documentsUpdated: 0,
newDocuments: 0,
deletedDocuments: 0,
errors: [],
};
// Define sources to fetch based on products
const sources = this.getSourcesToFetch(options.products);
// Fetch all documentation using universal fetcher
console.error('📥 Fetching latest documentation...');
const currentDocs = await this.fetcher.fetchFromSources(sources);
result.documentsChecked = currentDocs.length;
// Get existing document hashes from MongoDB
const existingHashes = await this.getExistingDocumentHashes();
// Compare and identify changes
const newDocs = [];
const updatedDocs = [];
const currentIds = new Set(); // IDs present in the latest fetch (the name "currentHashes" was misleading)
for (const doc of currentDocs) {
const hash = this.hashDocument(doc);
currentIds.add(doc.id);
if (!existingHashes.has(doc.id)) {
// New document
newDocs.push(doc);
result.newDocuments++;
} else if (existingHashes.get(doc.id) !== hash) {
// Updated document
updatedDocs.push(doc);
result.documentsUpdated++;
}
}
// Identify deleted documents
const deletedDocIds = Array.from(existingHashes.keys()).filter(id => !currentIds.has(id));
result.deletedDocuments = deletedDocIds.length;
// Process new documents
if (newDocs.length > 0) {
console.error(`📄 Processing ${newDocs.length} new documents...`);
await this.processNewDocuments(newDocs);
}
// Process updated documents
if (updatedDocs.length > 0) {
console.error(`📝 Updating ${updatedDocs.length} documents...`);
await this.updateDocuments(updatedDocs, result.errors);
}
// Remove deleted documents
if (deletedDocIds.length > 0) {
console.error(`🗑️ Removing ${deletedDocIds.length} deleted documents...`);
await this.removeDocuments(deletedDocIds);
}
// Update metadata
await this.updateRefreshMetadata(result);
console.error(`✅ Refresh complete: ${result.newDocuments} new, ${result.documentsUpdated} updated, ${result.deletedDocuments} deleted`);
return result;
}
/**
* Full refresh - reprocess all documentation
*/
async fullRefresh(options) {
console.error('🔄 Starting full refresh...');
const result = {
documentsChecked: 0,
documentsUpdated: 0,
newDocuments: 0,
deletedDocuments: 0,
errors: [],
};
// Clear existing data
console.error('🗑️ Clearing existing data...');
const collection = this.mongodb.getVectorsCollection();
const deleteResult = await collection.deleteMany({});
result.deletedDocuments = deleteResult.deletedCount || 0;
// Define sources to fetch based on products
const sources = this.getSourcesToFetch(options.products);
// Fetch all documentation using universal fetcher
console.error('📥 Fetching all documentation...');
const documents = await this.fetcher.fetchFromSources(sources);
result.documentsChecked = documents.length;
result.newDocuments = documents.length;
// Process all documents
await this.processNewDocuments(documents);
console.error(`✅ Full refresh complete: ${result.newDocuments} documents processed`);
return result;
}
/**
* Get existing document hashes from MongoDB
*/
async getExistingDocumentHashes() {
const collection = this.mongodb.getDocumentsCollection();
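// Project only the two fields needed for change detection to keep the scan cheap.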
const docs = await collection
.find({}, { projection: { documentId: 1, contentHash: 1 } })
.toArray();
const hashes = new Map();
for (const doc of docs) {
hashes.set(doc.documentId, doc.contentHash);
}
return hashes;
}
/**
* Process new documents
*/
async processNewDocuments(documents) {
if (documents.length === 0) return;
// Chunk documents
const chunkedDocs = await this.chunker.chunkDocuments(documents);
// Generate embeddings and store
await this.pipeline.embedAllDocuments(chunkedDocs);
// Store document metadata
const docsCollection = this.mongodb.getDocumentsCollection();
const docMetadata = documents.map(doc => ({
documentId: doc.id,
contentHash: this.hashDocument(doc),
metadata: doc.metadata,
lastUpdated: new Date(),
}));
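// ordered: false lets the remaining inserts continue past duplicates;
// only non-duplicate errors are re-thrown below.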
await docsCollection.insertMany(docMetadata, { ordered: false }).catch((err) => {
if (err.code !== 11000) { // Ignore duplicate key errors
throw err;
}
});
}
/**
* Update existing documents; per-document failures are appended to errors
*/
async updateDocuments(documents, errors = []) {
const collection = this.mongodb.getVectorsCollection();
const docsCollection = this.mongodb.getDocumentsCollection();
for (const doc of documents) {
try {
// Remove old chunks
await collection.deleteMany({ 'metadata.documentId': doc.id });
// Process and add new chunks
const chunked = await this.chunker.chunkDocuments([doc]);
await this.pipeline.embedAllDocuments(chunked);
// Update document metadata
await docsCollection.updateOne({ documentId: doc.id }, {
$set: {
contentHash: this.hashDocument(doc),
metadata: doc.metadata,
lastUpdated: new Date(),
},
}, { upsert: true });
} catch (error) {
// Record per-document failures so the caller's result.errors is populated
// (previously that array was never filled).
console.error(`Failed to update document ${doc.id}:`, error);
errors.push(`Failed to update ${doc.id}: ${error.message}`);
}
}
}
/**
* Remove deleted documents
*/
async removeDocuments(documentIds) {
if (documentIds.length === 0) return;
const collection = this.mongodb.getVectorsCollection();
const docsCollection = this.mongodb.getDocumentsCollection();
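// Assumes an index on metadata.documentId; without one each deleteMany scans the collection.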
// Remove vectors
await collection.deleteMany({
'metadata.documentId': { $in: documentIds },
});
// Remove document metadata
await docsCollection.deleteMany({
documentId: { $in: documentIds },
});
}
/**
* Generate hash for document content
*/
hashDocument(doc) {
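// Hash covers content plus metadata. JSON.stringify key order must be stable
// across fetches, or unchanged documents will be flagged as updated.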
const content = JSON.stringify({
content: doc.content,
metadata: doc.metadata,
});
return crypto.createHash('sha256').update(content).digest('hex');
}
/**
* Update refresh metadata
*/
async updateRefreshMetadata(result) {
const collection = this.mongodb.getDatabase().collection('metadata');
await collection.updateOne({ type: 'refresh' }, {
$set: {
lastRefresh: new Date(),
lastResult: result,
},
$inc: {
totalRefreshes: 1,
},
}, { upsert: true });
}
/**
* Get last refresh information
*/
async getLastRefreshInfo() {
const collection = this.mongodb.getDatabase().collection('metadata');
const metadata = await collection.findOne({ type: 'refresh' });
return metadata || { lastRefresh: null, totalRefreshes: 0 };
}
/**
* Get sources to fetch based on requested products
*/
getSourcesToFetch(products) {
const sources = [];
// MongoDB documentation sources
if (!products || products.includes('manual')) {
sources.push({
type: 'github',
name: 'MongoDB Manual',
repo: 'mongodb/docs',
branch: 'master',
product: 'manual',
version: '8.0',
priority: 5
});
}
// Voyage AI documentation
if (!products || products.includes('voyage')) {
sources.push({
type: 'github',
name: 'Voyage Python SDK',
repo: 'voyage-ai/voyageai-python',
branch: 'main',
product: 'voyage',
version: 'latest',
priority: 5
});
}
// Add more sources as needed based on products
return sources;
}
}
//# sourceMappingURL=document-refresher.js.map
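Example: driving a refresh from a maintenance script. A minimal sketch; the options shape ({ mode, products }) is inferred from the class above, the script name is hypothetical, and it assumes MongoDBClient reads its connection string from the environment.

// refresh-docs.js (hypothetical driver script)
import { DocumentRefresher } from './document-refresher.js';

const refresher = new DocumentRefresher();
await refresher.initialize();

// Incremental pass over just the MongoDB Manual sources
const result = await refresher.refresh({ mode: 'incremental', products: ['manual'] });
console.error(`Refresh done: ${result.newDocuments} new, ` +
    `${result.documentsUpdated} updated, ${result.deletedDocuments} deleted`);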