embedocs-mcp
Version:
Transform any GitHub repository into searchable vector embeddings. MCP server with smart indexing, voyage-context-3 embeddings, and semantic search for Claude/Cursor IDEs.
498 lines • 19.2 kB
JavaScript
/**
* SINGLE Storage Service - The ONLY place that interacts with MongoDB
* Clean, simple, testable
*/
import { MongoClient } from 'mongodb';
import { config } from '../config/index.js';
/**
 * Singleton wrapper around the MongoDB collection that holds the embedded
 * document chunks. It owns the client connection, the Atlas search indexes
 * (vector + full-text), the search queries and a small amount of repository
 * bookkeeping.
 *
 * Obtain the shared instance with `StorageService.getInstance()` and call
 * `connect()` before using any method that touches the database.
 */
export class StorageService {
    /** Name of the Atlas Search (full-text) index used by keyword search. */
    static TEXT_INDEX_NAME = 'text_index';
    /** Lazily created shared instance; see getInstance(). */
    static instance;
    /** Connected MongoClient, or null while disconnected. */
    client = null;
    /** Database handle derived from the client, or null while disconnected. */
    db = null;
    constructor() { }
    /**
     * Return the shared StorageService, creating it on first use.
     * @returns {StorageService}
     */
    static getInstance() {
        // Refer to the class by name so the method also works when detached
        // from the class (e.g. `const { getInstance } = StorageService`).
        if (!StorageService.instance) {
            StorageService.instance = new StorageService();
        }
        return StorageService.instance;
    }
    /**
     * Escape every regular-expression metacharacter in `text` so the result
     * matches the input literally when used as a `$regex` pattern.
     * @param {string} text - Untrusted text to embed in a pattern.
     * @returns {string} The escaped text.
     */
    static escapeRegex(text) {
        return String(text).replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    }
    /**
     * Open the MongoDB connection (no-op when already connected) and make sure
     * both search indexes exist, waiting for them to become READY.
     *
     * NOTE(review): progress is written with console.log (stdout). If this
     * class is loaded by a process that speaks a protocol over stdout, that
     * output may interfere - confirm against the entry point.
     *
     * @throws {Error} When MONGODB_URI is not set, or when the driver fails to connect.
     */
    async connect() {
        if (this.client && this.db) {
            return;
        }
        const uri = process.env.MONGODB_URI;
        if (!uri) {
            throw new Error('MONGODB_URI is required');
        }
        const client = new MongoClient(uri, {
            maxPoolSize: config.storage.maxPoolSize,
            minPoolSize: config.storage.minPoolSize,
        });
        try {
            await client.connect();
        }
        catch (error) {
            // Release the half-opened client so a later retry does not leak it.
            // A failure while closing is irrelevant next to the connect error.
            await client.close().catch(() => { });
            throw error;
        }
        // Publish the connection only once it is actually usable.
        this.client = client;
        this.db = client.db(config.storage.database);
        console.log('🔍 Checking search indexes...');
        const vectorReady = await this.ensureVectorIndex();
        const textReady = await this.ensureTextIndex();
        if (vectorReady && textReady) {
            console.log('✅ All search indexes are READY!');
        }
        else {
            console.warn('⚠️ Not all search indexes are READY; search may be degraded until they are.');
        }
    }
    /**
     * Close the MongoDB connection, if any, and reset the connection state.
     */
    async disconnect() {
        const client = this.client;
        if (!client) {
            return;
        }
        // Reset state first so a failing close() cannot leave a stale handle behind.
        this.client = null;
        this.db = null;
        await client.close();
    }
    /**
     * Return the configured documents collection.
     * @throws {Error} When connect() has not completed.
     */
    getCollection() {
        if (!this.db) {
            throw new Error('Not connected to MongoDB');
        }
        return this.db.collection(config.storage.collection);
    }
    /**
     * Insert or update documents, keyed by `documentId`, in one unordered bulk write.
     * @param {Array<object>} documents - Documents to store; an empty or missing list is a no-op.
     */
    async upsertDocuments(documents) {
        const collection = this.getCollection();
        if (!documents || documents.length === 0) {
            return;
        }
        const bulkOps = documents.map((doc) => ({
            updateOne: {
                filter: { documentId: doc.documentId },
                update: { $set: doc },
                upsert: true,
            },
        }));
        // Unordered: one failing operation does not stop the remaining ones.
        await collection.bulkWrite(bulkOps, { ordered: false });
    }
    /**
     * Delete every document in the collection.
     */
    async clean() {
        const collection = this.getCollection();
        await collection.deleteMany({});
    }
    /**
     * Count documents matching `filter`.
     * @param {object} [filter={}] - MongoDB query filter.
     * @returns {Promise<number>}
     */
    async count(filter = {}) {
        const collection = this.getCollection();
        return collection.countDocuments(filter);
    }
    /**
     * Run a `$vectorSearch` against the configured vector index.
     * @param {number[]} embedding - Query vector.
     * @param {number} [limit=10] - Maximum number of results.
     * @param {object} [filter] - Optional `$vectorSearch` pre-filter.
     * @returns {Promise<object[]>} Matches with `searchScore` added and `embedding` removed.
     */
    async vectorSearch(embedding, limit = 10, filter) {
        const collection = this.getCollection();
        const pipeline = [
            {
                $vectorSearch: {
                    index: config.storage.vectorIndexName,
                    path: 'embedding',
                    queryVector: embedding,
                    numCandidates: config.search.numCandidates,
                    limit,
                    ...(filter && { filter }),
                },
            },
            {
                $addFields: {
                    searchScore: { $meta: 'vectorSearchScore' },
                },
            },
            {
                $project: {
                    embedding: 0, // Exclude embeddings from results
                },
            },
        ];
        return collection.aggregate(pipeline).toArray();
    }
    /**
     * Vector search re-ranked with Maximum Marginal Relevance (MMR), which
     * trades relevance against diversity among the returned documents.
     *
     * @param {number[]} embedding - Query vector.
     * @param {object} [options]
     * @param {number} [options.limit=10] - Number of documents to return.
     * @param {number} [options.fetchK=20] - Candidates fetched before re-ranking.
     * @param {number} [options.lambdaMult=0.7] - Weight of relevance (1) versus diversity (0).
     * @param {object} [options.filter] - Optional `$vectorSearch` pre-filter.
     * @returns {Promise<object[]>} Selected documents without their `embedding`.
     */
    async vectorSearchMMR(embedding, options = {}) {
        const collection = this.getCollection();
        const { limit = 10, fetchK = 20, lambdaMult = 0.7, filter } = options;
        // Never fetch fewer candidates than the caller wants back; otherwise a
        // large `limit` would be silently capped at `fetchK`.
        const candidateCount = Math.max(fetchK, limit);
        // Step 1: fetch more candidates than needed.
        const pipeline = [
            {
                $vectorSearch: {
                    index: config.storage.vectorIndexName,
                    path: 'embedding',
                    queryVector: embedding,
                    numCandidates: Math.max(candidateCount * 2, 100), // Ensure enough candidates
                    limit: candidateCount,
                    ...(filter && { filter }),
                },
            },
            {
                $addFields: {
                    searchScore: { $meta: 'vectorSearchScore' },
                },
            },
            {
                $project: {
                    embedding: 1, // Keep embeddings for MMR calculation
                    documentId: 1,
                    content: 1,
                    title: 1,
                    product: 1,
                    metadata: 1,
                    searchScore: 1,
                },
            },
        ];
        const candidates = await collection.aggregate(pipeline).toArray();
        // Step 2: apply the MMR selection algorithm.
        return this.selectMMRDocuments(candidates, embedding, limit, lambdaMult);
    }
    /**
     * Greedy MMR selection. The first candidate is taken as-is (candidates are
     * expected in relevance order); every further pick maximises
     * `λ * relevance - (1 - λ) * max_similarity_to_selected`.
     *
     * @param {object[]} candidates - Documents carrying `searchScore` and `embedding`.
     * @param {number[]} _queryEmbedding - Unused; relevance comes from `searchScore`.
     * @param {number} limit - Maximum number of documents to select.
     * @param {number} lambdaMult - λ in the formula above.
     * @returns {object[]} Selected documents, in pick order, without `embedding`.
     */
    selectMMRDocuments(candidates, _queryEmbedding, limit, lambdaMult) {
        if (!candidates || candidates.length === 0 || limit <= 0) {
            return [];
        }
        const [firstDoc, ...rest] = candidates;
        const selected = [firstDoc];
        // Track each remaining candidate's highest similarity to anything
        // selected so far; it only needs updating against the newest pick.
        const pool = rest.map((doc) => ({ doc, maxSimilarity: 0 }));
        let latest = firstDoc;
        while (selected.length < limit && pool.length > 0) {
            let bestIndex = -1;
            let bestScore = -Infinity;
            for (let i = 0; i < pool.length; i++) {
                const entry = pool[i];
                if (latest.embedding && entry.doc.embedding) {
                    const similarity = this.cosineSimilarity(entry.doc.embedding, latest.embedding);
                    entry.maxSimilarity = Math.max(entry.maxSimilarity, similarity);
                }
                // Relevance score comes straight from the vector search.
                const relevanceScore = entry.doc.searchScore || 0;
                const mmrScore = lambdaMult * relevanceScore - (1 - lambdaMult) * entry.maxSimilarity;
                if (mmrScore > bestScore) {
                    bestScore = mmrScore;
                    bestIndex = i;
                }
            }
            if (bestIndex === -1) {
                // No candidate produced a comparable score (e.g. NaN); stop.
                break;
            }
            latest = pool.splice(bestIndex, 1)[0].doc;
            selected.push(latest);
        }
        // Remove embeddings from final results to save bandwidth.
        return selected.map(({ embedding: _embedding, ...docWithoutEmbedding }) => docWithoutEmbedding);
    }
    /**
     * Cosine similarity of two numeric vectors.
     * @param {number[]} a
     * @param {number[]} b
     * @returns {number} The similarity, or 0 when lengths differ or either vector has zero magnitude.
     */
    cosineSimilarity(a, b) {
        if (a.length !== b.length) {
            return 0;
        }
        let dotProduct = 0;
        let normA = 0;
        let normB = 0;
        for (let i = 0; i < a.length; i++) {
            dotProduct += a[i] * b[i];
            normA += a[i] * a[i];
            normB += b[i] * b[i];
        }
        const magnitude = Math.sqrt(normA) * Math.sqrt(normB);
        return magnitude === 0 ? 0 : dotProduct / magnitude;
    }
    /**
     * Keyword search over `content` and `title` using the Atlas Search text
     * index, with a case-insensitive literal substring search as fallback
     * when the `$search` stage fails.
     * Based on: https://github.com/JohnGUnderwood/atlas-hybrid-search
     *
     * @param {string} query - User search text.
     * @param {number} [limit=10] - Maximum number of results.
     * @returns {Promise<object[]>} Matching documents without `embedding`.
     */
    async keywordSearch(query, limit = 10) {
        const collection = this.getCollection();
        const pipeline = [
            {
                $search: {
                    index: StorageService.TEXT_INDEX_NAME,
                    text: {
                        query: query,
                        path: ['content', 'title'],
                        fuzzy: {
                            maxEdits: 2,
                            prefixLength: 3,
                        },
                    },
                },
            },
            {
                $limit: limit,
            },
            {
                $addFields: {
                    searchScore: { $meta: 'searchScore' },
                },
            },
            {
                $project: {
                    embedding: 0, // Exclude embeddings from results
                },
            },
        ];
        try {
            return await collection.aggregate(pipeline).toArray();
        }
        catch (error) {
            // Fallback to regex search if Atlas Search is not available
            console.warn('Atlas Search failed, falling back to regex search:', error);
            // The query is user text, not a pattern: escape it so characters
            // such as "(" or "*" match literally instead of producing an
            // invalid or pathological regular expression.
            const pattern = StorageService.escapeRegex(query);
            return collection
                .find({
                $or: [
                    { content: { $regex: pattern, $options: 'i' } },
                    { title: { $regex: pattern, $options: 'i' } },
                ],
            }, 
            // Keep the result shape consistent with the $search path.
            { projection: { embedding: 0 } })
                .limit(limit)
                .toArray();
        }
    }
    /**
     * Make sure the vector search index exists and wait for it to be READY.
     * Failures are logged rather than thrown so the caller can still connect.
     * @returns {Promise<boolean>} true when the index is READY, false otherwise.
     */
    async ensureVectorIndex() {
        const collection = this.getCollection();
        const indexName = config.storage.vectorIndexName;
        try {
            const indexes = await collection.listSearchIndexes().toArray();
            const existingIndex = indexes.find((i) => i.name === indexName);
            if (existingIndex && existingIndex.status === 'READY') {
                console.log(`✅ Vector index already exists and is READY`);
                return true;
            }
            if (!existingIndex) {
                console.log(`🔨 Creating vector search index (this takes 1-2 minutes)...`);
                await collection.createSearchIndex({
                    name: indexName,
                    type: 'vectorSearch',
                    definition: {
                        fields: [
                            {
                                type: 'vector',
                                path: 'embedding',
                                numDimensions: config.embedding.dimensions,
                                similarity: 'cosine',
                            },
                        ],
                    },
                });
                // CRITICAL: Wait for index to be READY (not just created)
                console.log(`⏳ Waiting for vector index to be ready...`);
            }
            else {
                console.log(`⏳ Vector index exists but not ready (status: ${existingIndex.status}). Waiting...`);
            }
            const ready = await this.waitForIndexReady(indexName);
            if (!ready) {
                // waitForIndexReady already logged why; do not claim READY.
                return false;
            }
            if (existingIndex) {
                console.log(`✅ Vector index is now READY`);
            }
            else {
                console.log(`✅ Vector index is READY with ${config.embedding.dimensions} dimensions`);
            }
            return true;
        }
        catch (error) {
            console.error('⚠️ Warning: Could not create vector index:', error);
            console.warn('The MCP will continue but vector search may not work until indexes are created.');
            // Don't throw - allow MCP to connect even without indexes
            return false;
        }
    }
    /**
     * Poll the collection's search indexes until `indexName` reports READY.
     *
     * @param {string} indexName - Search index to wait for.
     * @param {number} [maxWaitTime=120000] - Give up after this many milliseconds.
     * @param {number} [pollInterval=5000] - Delay between polls in milliseconds.
     * @returns {Promise<boolean>} true once READY; false when the index reports
     *   FAILED or the wait timed out (a warning is logged in both cases).
     */
    async waitForIndexReady(indexName, maxWaitTime = 120000, pollInterval = 5000) {
        const collection = this.getCollection();
        const startTime = Date.now();
        while (Date.now() - startTime < maxWaitTime) {
            const indexes = await collection.listSearchIndexes().toArray();
            const index = indexes.find((i) => i.name === indexName);
            const status = index?.status;
            if (status === 'READY') {
                return true;
            }
            if (status === 'FAILED') {
                // A failed build does not recover by waiting; stop polling.
                console.warn(`⚠️ Index ${indexName} reported status FAILED. Continuing anyway...`);
                return false;
            }
            const elapsed = Math.round((Date.now() - startTime) / 1000);
            console.log(`⏳ Waiting for ${indexName} to be ready... (${elapsed}s elapsed, status: ${status || 'CREATING'})`);
            await new Promise((resolve) => setTimeout(resolve, pollInterval));
        }
        console.warn(`⚠️ Index ${indexName} not ready after ${maxWaitTime / 1000} seconds. Continuing anyway...`);
        return false;
    }
    /**
     * Make sure the full-text index used by keywordSearch exists and wait for
     * it to be READY. Failures are logged rather than thrown.
     * Based on: https://github.com/JohnGUnderwood/atlas-hybrid-search/blob/main/create-search-indexes.mjs
     * @returns {Promise<boolean>} true when the index is READY, false otherwise.
     */
    async ensureTextIndex() {
        const collection = this.getCollection();
        const indexName = StorageService.TEXT_INDEX_NAME;
        try {
            const indexes = await collection.listSearchIndexes().toArray();
            const existingIndex = indexes.find((i) => i.name === indexName);
            if (existingIndex && existingIndex.status === 'READY') {
                console.log(`✅ Text index already exists and is READY`);
                return true;
            }
            if (!existingIndex) {
                console.log(`🔨 Creating text search index (this takes 1-2 minutes)...`);
                await collection.createSearchIndex({
                    name: indexName,
                    definition: {
                        mappings: {
                            dynamic: false,
                            fields: {
                                content: {
                                    type: 'string',
                                    analyzer: 'lucene.english',
                                    multi: {
                                        standardAnalyzer: {
                                            type: 'string',
                                            analyzer: 'lucene.standard',
                                        },
                                    },
                                },
                                title: {
                                    type: 'string',
                                    analyzer: 'lucene.standard',
                                    multi: {
                                        keywordAnalyzer: {
                                            type: 'string',
                                            analyzer: 'lucene.keyword',
                                        },
                                    },
                                },
                                product: {
                                    type: 'string',
                                    analyzer: 'lucene.keyword',
                                },
                                version: {
                                    type: 'string',
                                    analyzer: 'lucene.keyword',
                                },
                            },
                        },
                    },
                });
                // CRITICAL: Wait for index to be READY (not just created)
                console.log(`⏳ Waiting for text index to be ready...`);
            }
            else {
                console.log(`⏳ Text index exists but not ready (status: ${existingIndex.status}). Waiting...`);
            }
            const ready = await this.waitForIndexReady(indexName);
            if (!ready) {
                // waitForIndexReady already logged why; do not claim READY.
                return false;
            }
            if (existingIndex) {
                console.log(`✅ Text index is now READY`);
            }
            else {
                console.log(`✅ Text index is READY for keyword search`);
            }
            return true;
        }
        catch (error) {
            console.error('⚠️ Warning: Could not create text index:', error);
            console.warn('The MCP will continue but keyword search may not work until indexes are created.');
            // Don't throw - allow MCP to connect even without indexes
            return false;
        }
    }
    /**
     * Record the commit hash a repository was last indexed at, together with
     * the current time, in the `repository_states` collection.
     * @param {string} repoName
     * @param {string} commitHash
     * @throws {Error} When not connected.
     */
    async storeRepositoryHash(repoName, commitHash) {
        if (!this.db) {
            throw new Error('Not connected to MongoDB');
        }
        const stateCollection = this.db.collection('repository_states');
        await stateCollection.updateOne({ repoName }, {
            $set: {
                repoName,
                commitHash,
                lastIndexed: new Date(),
            },
        }, { upsert: true });
    }
    /**
     * Look up the commit hash stored for a repository.
     * @param {string} repoName
     * @returns {Promise<string|null>} The stored hash, or null when none is recorded.
     * @throws {Error} When not connected.
     */
    async getRepositoryHash(repoName) {
        if (!this.db) {
            throw new Error('Not connected to MongoDB');
        }
        const stateCollection = this.db.collection('repository_states');
        const state = await stateCollection.findOne({ repoName });
        return state?.commitHash || null;
    }
    /**
     * Report whether both search indexes are READY.
     * @returns {Promise<{ready: boolean, details: Array<{name: string, status: string}>}>}
     *   `details` is empty when the index listing itself failed.
     */
    async checkIndexesReady() {
        const collection = this.getCollection();
        try {
            const indexes = await collection.listSearchIndexes().toArray();
            const statusOf = (name) => indexes.find((i) => i.name === name)?.status;
            const vectorStatus = statusOf(config.storage.vectorIndexName);
            const textStatus = statusOf(StorageService.TEXT_INDEX_NAME);
            return {
                ready: vectorStatus === 'READY' && textStatus === 'READY',
                details: [
                    { name: config.storage.vectorIndexName, status: vectorStatus || 'NOT_FOUND' },
                    { name: StorageService.TEXT_INDEX_NAME, status: textStatus || 'NOT_FOUND' },
                ],
            };
        }
        catch {
            // Deliberate best effort: an unreadable index list means "not ready".
            return { ready: false, details: [] };
        }
    }
    /**
     * Collect summary statistics about the stored documents.
     * @returns {Promise<object>} Total count, distinct `product` and
     *   `embeddingModel` values, and the configured embedding model/dimensions.
     */
    async getStats() {
        const collection = this.getCollection();
        // The three queries are independent, so run them concurrently.
        const [total, products, models] = await Promise.all([
            collection.countDocuments(),
            collection.distinct('product'),
            collection.distinct('embeddingModel'),
        ]);
        return {
            totalDocuments: total,
            products,
            models,
            expectedDimensions: config.embedding.dimensions,
            expectedModel: config.embedding.model,
        };
    }
    /**
     * Fetch all chunks for a specific file
     * Used by the fetch-full-context tool to reconstruct complete files
     *
     * @param {string} filename - Matched against the documents' `title`.
     * @param {string} product - Matched against the documents' `product`.
     * @returns {Promise<object[]>} Chunks ordered by `metadata.chunkIndex`, then `documentId`.
     * @throws {Error} When the query fails; the original error is attached as `cause`.
     */
    async fetchFileChunks(filename, product) {
        const collection = this.getCollection();
        try {
            const chunks = await collection
                .find({
                title: filename,
                product: product,
            })
                .sort({
                'metadata.chunkIndex': 1, // Sort by chunk index if available
                documentId: 1, // Secondary sort by document ID
            })
                .toArray();
            console.error(`📄 Found ${chunks.length} chunks for ${filename} in ${product}`);
            return chunks;
        }
        catch (error) {
            console.error('Error fetching file chunks:', error);
            throw new Error(`Failed to fetch chunks for ${filename}: ${error instanceof Error ? error.message : 'Unknown error'}`, { cause: error });
        }
    }
    /**
     * Count documents for a specific product
     * @param {string} product
     * @returns {Promise<number>}
     */
    async countByProduct(product) {
        const collection = this.getCollection();
        return collection.countDocuments({ product });
    }
}
//# sourceMappingURL=storage.js.map