@knath2000/codebase-indexing-mcp
Version:
MCP server for codebase indexing with Voyage AI embeddings and Qdrant vector storage
596 lines • 24.3 kB
JavaScript
import { QdrantClient } from '@qdrant/js-client-rest';
/**
 * Wrapper around the Qdrant REST client for one collection of code-chunk
 * embeddings: collection setup, payload indexes, upserts, dense search,
 * an in-memory keyword search, and assorted delete/lookup helpers.
 */
export class QdrantVectorClient {
    /**
     * @param {string} url - Qdrant endpoint URL.
     * @param {string} [apiKey] - API key; omitted from the client options when falsy.
     * @param {string} collectionName - Collection every method operates on.
     * @param {number} embeddingDimension - Expected length of every stored/query vector.
     * @param {number} [keywordTimeoutMs=10000] - Time budget for the keywordSearch scroll.
     * @param {number} [keywordMaxChunks=20000] - Point budget for the keywordSearch scroll.
     */
    constructor(url, apiKey, collectionName, embeddingDimension, keywordTimeoutMs = 10000, keywordMaxChunks = 20000) {
        this.requestDurations = []; // Rolling window of recorded request durations (ms)
        this.maxDurationsToStore = 100; // Maximum size of that window
        this.url = url;
        this.apiKey = apiKey;
        this.collectionName = collectionName;
        this.embeddingDimension = embeddingDimension;
        // Only pass an apiKey option when one was actually supplied.
        const clientOptions = this.apiKey ? { url: this.url, apiKey: this.apiKey } : { url: this.url };
        this.client = new QdrantClient(clientOptions);
        this.keywordTimeoutMs = keywordTimeoutMs;
        this.keywordMaxChunks = keywordMaxChunks;
    }
    /**
     * Initialize the collection with the proper schema.
     *
     * Creates the collection when it is missing. When it exists with a
     * different vector size it is RECREATED, which deletes all stored points.
     * Payload indexes are ensured in every case.
     *
     * @throws {Error} Wrapping any failure from the underlying client calls.
     */
    async initializeCollection() {
        try {
            // Check if collection exists
            const collections = await this.client.getCollections();
            const collectionExists = collections.collections.some(col => col.name === this.collectionName);
            if (!collectionExists) {
                await this.client.createCollection(this.collectionName, {
                    vectors: {
                        size: this.embeddingDimension,
                        distance: 'Cosine'
                    }
                });
                console.log(`Created collection: ${this.collectionName}`);
                // Create payload indexes for filtering capabilities
                await this.createPayloadIndexes();
            }
            else {
                // Check if existing collection has correct dimensions
                const collectionInfo = await this.client.getCollection(this.collectionName);
                const existingDimensions = collectionInfo.config?.params?.vectors?.size;
                if (existingDimensions !== this.embeddingDimension) {
                    console.log(`â ī¸ Collection exists but has wrong dimensions: ${existingDimensions}, expected: ${this.embeddingDimension}`);
                    console.log(`đ Recreating collection with correct dimensions...`);
                    await this.recreateCollection();
                }
                else {
                    console.log(`â
Collection exists with correct dimensions: ${this.embeddingDimension}`);
                    // Ensure payload indexes exist (idempotent operation)
                    await this.createPayloadIndexes();
                }
            }
        }
        catch (error) {
            throw new Error(`Failed to initialize collection: ${error}`);
        }
    }
    /**
     * Recreate the collection with correct dimensions (deletes all existing data).
     *
     * A failed delete is logged and ignored, since the collection may simply
     * not exist; creation is attempted regardless.
     *
     * @throws {Error} When creating the collection or its indexes fails.
     */
    async recreateCollection() {
        try {
            console.log(`đī¸ Deleting existing collection: ${this.collectionName}`);
            // Delete existing collection
            try {
                await this.client.deleteCollection(this.collectionName);
                console.log(`â
Deleted existing collection`);
            }
            catch (deleteError) {
                console.log(`â ī¸ Collection deletion failed (might not exist): ${deleteError}`);
            }
            // Wait a moment for deletion to complete
            await new Promise(resolve => setTimeout(resolve, 1000));
            // Create new collection with correct dimensions
            console.log(`đ¯ Creating new collection with ${this.embeddingDimension} dimensions`);
            await this.client.createCollection(this.collectionName, {
                vectors: {
                    size: this.embeddingDimension,
                    distance: 'Cosine'
                }
            });
            console.log(`â
Successfully recreated collection: ${this.collectionName}`);
            // Create payload indexes for filtering capabilities
            await this.createPayloadIndexes();
        }
        catch (error) {
            throw new Error(`Failed to recreate collection: ${error}`);
        }
    }
    /**
     * Create keyword payload indexes for chunkType, language, filePath and fileKind.
     *
     * Each index is attempted independently: an "already exists"/"conflict"
     * error on one field no longer prevents the remaining fields from being
     * indexed (important when upgrading a collection that only has some of them).
     *
     * @throws {Error} On any other index-creation failure.
     */
    async createPayloadIndexes() {
        console.log(`đ§ [Qdrant] Creating payload indexes for enhanced filtering...`);
        // [payload field, message logged once its index has been created]
        const indexes = [
            ['chunkType', `â
[Qdrant] Created chunkType index for filtering by code elements`],
            ['language', `â
[Qdrant] Created language index for filtering by programming language`],
            ['filePath', `â
[Qdrant] Created filePath index for file-specific searches`],
            ['fileKind', `â
[Qdrant] Created fileKind index for distinguishing code vs documentation`]
        ];
        let alreadyPresent = 0;
        for (const [fieldName, createdMessage] of indexes) {
            try {
                await this.client.createPayloadIndex(this.collectionName, {
                    field_name: fieldName,
                    field_schema: 'keyword'
                });
                console.log(createdMessage);
            }
            catch (error) {
                // An index that already exists is not an error; keep going with the next field.
                const errorMsg = String(error).toLowerCase();
                if (errorMsg.includes('already exists') || errorMsg.includes('conflict')) {
                    alreadyPresent += 1;
                    continue;
                }
                console.error(`â [Qdrant] Failed to create payload indexes:`, error);
                throw new Error(`Failed to create payload indexes: ${error}`);
            }
        }
        if (alreadyPresent > 0) {
            console.log(`âšī¸ [Qdrant] Payload indexes already exist, skipping creation`);
        }
        else {
            console.log(`đ [Qdrant] All payload indexes created successfully - collection ready for -style filtered searches!`);
        }
    }
    /**
     * Create payload indexes on an existing collection (useful for upgrading existing collections).
     */
    async ensurePayloadIndexes() {
        console.log(`đ§ [Qdrant] Ensuring payload indexes exist for collection: ${this.collectionName}`);
        await this.createPayloadIndexes();
    }
    /**
     * Store embedding vectors in Qdrant in batches of 256.
     *
     * Every vector is validated against embeddingDimension BEFORE the first
     * upsert, so a malformed entry can no longer leave earlier batches written.
     * Upserts use wait=false, so the final point count that is logged may lag.
     *
     * @param {Array<{id: (string|number), vector: number[], payload: object}>} embeddings
     * @throws {Error} If the connection test fails, a vector has the wrong
     *   dimension, or an upsert / the final collection lookup fails.
     */
    async storeEmbeddings(embeddings) {
        console.log(`đ [Qdrant] Starting to store ${embeddings.length} embeddings`);
        // Early-exit
        if (embeddings.length === 0) {
            console.log('âšī¸ [Qdrant] No embeddings to store â skipping');
            return;
        }
        // Verify connection once
        console.log('đ [Qdrant] Verifying connection...');
        if (!(await this.testConnection())) {
            throw new Error('Qdrant connection test failed');
        }
        const batchSize = 256; // Safer batch size to avoid large payloads/timeouts
        // Helper to transform an embedding record into the Qdrant point format
        const toPoint = (emb) => ({
            id: emb.id,
            vector: emb.vector,
            payload: emb.payload
        });
        try {
            // Defensive: validate everything up front to avoid partial writes
            for (const emb of embeddings) {
                if (!emb.vector || emb.vector.length !== this.embeddingDimension) {
                    throw new Error(`Vector dimension mismatch for id=${emb.id}`);
                }
            }
            let stored = 0;
            for (let i = 0; i < embeddings.length; i += batchSize) {
                const batch = embeddings.slice(i, i + batchSize).map(toPoint);
                const start = Date.now();
                await this.client.upsert(this.collectionName, {
                    wait: false,
                    points: batch
                });
                const dur = Date.now() - start;
                stored += batch.length;
                console.log(`â
[Qdrant] Upserted batch (${batch.length}) â total stored so far: ${stored}/${embeddings.length} in ${dur}ms`);
            }
            // Final verification (may lag slightly because wait=false)
            const finalInfo = await this.getCollectionInfo();
            console.log(`đ [Qdrant] Collection now reports ${finalInfo.points_count || 0} points`);
        }
        catch (err) {
            console.error('â [Qdrant] storeEmbeddings failed:', err);
            throw err;
        }
    }
    /**
     * Store a single embedding vector (delegates to storeEmbeddings).
     */
    async storeEmbedding(embedding) {
        return this.storeEmbeddings([embedding]);
    }
    /**
     * Dense similarity search with optional payload filters.
     *
     * @param {object} query - Reads query, limit (default 50), threshold
     *   (default 0.25), language, filePath, chunkType and preferImplementation.
     * @param {number[]} queryVector - Must have exactly embeddingDimension entries.
     * @returns {Promise<Array<{id, score, chunk, snippet, context}>>}
     * @throws {Error} "Qdrant search failed: ..." for invalid vectors or client errors.
     */
    async searchSimilar(query, queryVector) {
        console.log(`đ [Qdrant] Starting search with query: "${query.query}"`);
        console.log(`đ [Qdrant] Search parameters:`, {
            limit: query.limit || 50, // Increased from 10 to 50 for better coverage
            threshold: query.threshold ?? 0.25,
            language: query.language,
            filePath: query.filePath,
            chunkType: query.chunkType,
            // Optional chaining: a null vector must reach the validation below
            // instead of throwing a bare TypeError while logging.
            vectorLength: queryVector?.length
        });
        try {
            // Validate input parameters
            if (!queryVector || queryVector.length === 0) {
                throw new Error('Query vector is empty or invalid');
            }
            if (queryVector.length !== this.embeddingDimension) {
                throw new Error(`Query vector dimension mismatch: expected ${this.embeddingDimension}, got ${queryVector.length}`);
            }
            const searchParams = {
                vector: queryVector,
                limit: query.limit || 50, // Increased from 10 to 50 for better coverage
                score_threshold: query.threshold ?? 0.25,
                with_payload: true,
                with_vector: false
            };
            // Build filters based on query parameters
            const filterConditions = [];
            if (query.language) {
                filterConditions.push({ key: 'language', match: { value: query.language } });
                console.log(`đ [Qdrant] Adding language filter: ${query.language}`);
            }
            if (query.filePath) {
                filterConditions.push({ key: 'filePath', match: { value: query.filePath } });
                console.log(`đ [Qdrant] Adding file path filter: ${query.filePath}`);
            }
            if (query.chunkType) {
                filterConditions.push({ key: 'chunkType', match: { value: query.chunkType } });
                console.log(`đ [Qdrant] Adding chunk type filter: ${query.chunkType}`);
            }
            // Add fileKind filter when preferImplementation is true
            if (query.preferImplementation === true) {
                filterConditions.push({ key: 'fileKind', match: { value: 'code' } });
                console.log(`đ [Qdrant] Adding fileKind filter: code (prefer implementation)`);
            }
            if (filterConditions.length > 0) {
                searchParams.filter = { must: filterConditions };
                console.log(`đ [Qdrant] Applied ${filterConditions.length} filter(s)`);
            }
            console.log(`đ [Qdrant] Executing search in collection: ${this.collectionName}`);
            const searchResult = await this.client.search(this.collectionName, searchParams);
            console.log(`â
[Qdrant] Search completed successfully, found ${searchResult.length} results`);
            const results = searchResult.map(result => {
                // A point stored without payload must not abort the whole search.
                const payload = result.payload ?? {};
                const chunk = this.payloadToCodeChunk(payload);
                chunk.id = result.id; // Set the ID from the search result
                return {
                    id: result.id,
                    score: result.score,
                    chunk,
                    snippet: this.createSnippet(payload),
                    context: this.createContextDescription(payload)
                };
            });
            console.log(`đ [Qdrant] Returning ${results.length} processed results`);
            return results;
        }
        catch (error) {
            console.error(`â [Qdrant] Search failed:`, error);
            if (error instanceof Error) {
                // Enhance error message with more context
                throw new Error(`Qdrant search failed: ${error.message} (Collection: ${this.collectionName}, Query: "${query.query}")`);
            }
            throw new Error(`Qdrant search failed: ${String(error)}`);
        }
    }
    /**
     * Simple keyword search across the indexed chunks, intended as a sparse
     * fallback that can be blended with dense search results.
     *
     * NOTE: this scrolls the collection in pages of 1000 (bounded by
     * keywordTimeoutMs and keywordMaxChunks) and scores in memory. If the
     * collection grows large, consider a Qdrant full-text payload index instead.
     *
     * @param {object} query - Reads query, limit (default 50), threshold
     *   (default 0.25; a result is kept only when score > threshold), language,
     *   filePath, chunkType and preferImplementation.
     * @returns {Promise<Array<{id: string, score: number, chunk, snippet, context}>>}
     *   Sorted by descending score; empty on blank query or scroll failure.
     */
    async keywordSearch(query) {
        const searchText = (query.query ?? '').toLowerCase();
        if (!searchText.trim()) {
            return [];
        }
        const startTime = Date.now();
        let scanned = 0;
        const pageLimit = 1000;
        let offset = null;
        const allPoints = [];
        try {
            do {
                const page = await this.client.scroll(this.collectionName, {
                    with_payload: true,
                    with_vector: false,
                    limit: pageLimit,
                    offset: offset ?? null
                });
                for (const point of page.points) {
                    allPoints.push({ id: point.id, payload: point.payload });
                }
                scanned += page.points.length;
                if (Date.now() - startTime > this.keywordTimeoutMs) {
                    console.warn(`[Qdrant] keywordSearch timeout after ${this.keywordTimeoutMs} ms, stopping scroll early (scanned ${scanned} points)`);
                    break;
                }
                if (scanned >= this.keywordMaxChunks) {
                    console.warn(`[Qdrant] keywordSearch reached max chunk limit (${this.keywordMaxChunks}), stopping scroll`);
                    break;
                }
                offset = page.next_page_offset;
                // The last page reports next_page_offset as null (or omits it).
                // Checking only for undefined restarted the scroll from the
                // beginning and collected duplicates until a budget was hit.
            } while (offset !== null && offset !== undefined);
        }
        catch (error) {
            console.error('[Qdrant] keywordSearch scroll failed:', error);
            return [];
        }
        const threshold = query.threshold ?? 0.25;
        const matches = [];
        for (const point of allPoints) {
            const payload = point.payload;
            // Points without a payload have nothing to match against.
            if (!payload) {
                continue;
            }
            // Cheap metadata filters first, scoring only for surviving points
            if (query.preferImplementation === true && payload.fileKind !== 'code') {
                continue;
            }
            if (query.language && payload.language !== query.language) {
                continue;
            }
            if (query.chunkType && payload.chunkType !== query.chunkType) {
                continue;
            }
            if (query.filePath && payload.filePath !== query.filePath) {
                continue;
            }
            const score = this.calculateKeywordScore(searchText, payload.content);
            if (score <= threshold) {
                continue;
            }
            matches.push({ point, score });
        }
        // Array.prototype.sort is stable, so ties keep scroll order.
        matches.sort((a, b) => b.score - a.score);
        return matches.slice(0, query.limit || 50).map(({ point, score }) => {
            const id = String(point.id);
            const chunk = this.payloadToCodeChunk(point.payload);
            chunk.id = id;
            return {
                id,
                score,
                chunk,
                snippet: this.createSnippet(point.payload),
                context: this.createContextDescription(point.payload)
            };
        });
    }
    /**
     * Delete all points whose payload filePath equals the given path.
     * @throws {Error} Wrapping the client failure.
     */
    async deleteByFilePath(filePath) {
        try {
            await this.client.delete(this.collectionName, {
                filter: {
                    must: [{ key: 'filePath', match: { value: filePath } }]
                }
            });
        }
        catch (error) {
            throw new Error(`Failed to delete embeddings for file ${filePath}: ${error}`);
        }
    }
    /**
     * Delete points by their IDs.
     * @throws {Error} Wrapping the client failure.
     */
    async deleteByIds(ids) {
        try {
            await this.client.delete(this.collectionName, {
                points: ids
            });
        }
        catch (error) {
            throw new Error(`Failed to delete embeddings by IDs: ${error}`);
        }
    }
    /**
     * Fetch the collection description returned by the client.
     * @throws {Error} Wrapping the client failure.
     */
    async getCollectionInfo() {
        try {
            return await this.client.getCollection(this.collectionName);
        }
        catch (error) {
            throw new Error(`Failed to get collection info: ${error}`);
        }
    }
    /**
     * Return the collection's reported points_count (0 when absent).
     * @throws {Error} Wrapping the lookup failure.
     */
    async countPoints() {
        try {
            const info = await this.getCollectionInfo();
            return info.points_count || 0;
        }
        catch (error) {
            throw new Error(`Failed to count points: ${error}`);
        }
    }
    /**
     * Clear the collection by issuing a delete with an empty `must` filter.
     * @throws {Error} Wrapping the client failure.
     */
    async clearCollection() {
        try {
            await this.client.delete(this.collectionName, {
                filter: { must: [] }
            });
        }
        catch (error) {
            throw new Error(`Failed to clear collection: ${error}`);
        }
    }
    /**
     * Retrieve points (payload and vector included) by their IDs.
     * @throws {Error} "Qdrant getPointsById failed: ..." on client failure.
     */
    async getPointsById(ids) {
        try {
            const response = await this.client.retrieve(this.collectionName, {
                ids: ids,
                with_payload: true,
                with_vector: true,
            });
            return response;
        }
        catch (error) {
            console.error(`â [Qdrant] Failed to retrieve points by ID:`, error);
            throw new Error(`Qdrant getPointsById failed: ${String(error)}`);
        }
    }
    /**
     * Average of the recorded request durations in ms (0 when none recorded).
     * Durations are currently recorded only by testConnection.
     */
    getAverageLatency() {
        if (this.requestDurations.length === 0) {
            return 0;
        }
        const sum = this.requestDurations.reduce((a, b) => a + b, 0);
        return sum / this.requestDurations.length;
    }
    /**
     * Test the connection by listing collections; records the call duration.
     * @returns {Promise<boolean>} false (after logging) when the call fails.
     */
    async testConnection() {
        try {
            const startTime = Date.now();
            await this.client.getCollections();
            const duration = Date.now() - startTime;
            this.addRequestDuration(duration);
            return true;
        }
        catch (error) {
            console.error('Qdrant connection test failed:', error);
            return false;
        }
    }
    /**
     * Append a duration to the rolling window, evicting the oldest entry once
     * more than maxDurationsToStore are held.
     */
    addRequestDuration(duration) {
        this.requestDurations.push(duration);
        if (this.requestDurations.length > this.maxDurationsToStore) {
            this.requestDurations.shift(); // Remove the oldest duration
        }
    }
    /**
     * Get stored embeddings (id, vector, payload) for a file path.
     * Only a single scroll page is read, so at most 1000 points are returned.
     * @throws {Error} Wrapping the client failure.
     */
    async getEmbeddingsByFilePath(filePath) {
        try {
            const searchResult = await this.client.scroll(this.collectionName, {
                filter: {
                    must: [{ key: 'filePath', match: { value: filePath } }]
                },
                limit: 1000,
                with_payload: true,
                with_vector: true
            });
            return searchResult.points.map(point => ({
                id: point.id,
                vector: point.vector,
                payload: point.payload
            }));
        }
        catch (error) {
            throw new Error(`Failed to get embeddings for file ${filePath}: ${error}`);
        }
    }
    /**
     * Check whether any point exists for filePath with a matching
     * metadata.lastModified value.
     *
     * Best-effort: a lookup failure is logged and reported as "not indexed"
     * rather than thrown.
     *
     * @returns {Promise<boolean>}
     */
    async isFileIndexed(filePath, lastModified) {
        try {
            const searchResult = await this.client.scroll(this.collectionName, {
                filter: {
                    must: [
                        { key: 'filePath', match: { value: filePath } },
                        { key: 'metadata.lastModified', match: { value: lastModified } }
                    ]
                },
                limit: 1
            });
            return searchResult.points.length > 0;
        }
        catch (error) {
            // Keep the best-effort contract, but do not hide the failure.
            console.warn(`[Qdrant] isFileIndexed check failed for ${filePath}; treating file as not indexed:`, error);
            return false;
        }
    }
    /**
     * Convert a point payload to a CodeChunk-shaped object.
     * The id is left empty and must be set by the caller.
     */
    payloadToCodeChunk(payload) {
        return {
            id: '', // Will be set by the calling function
            content: payload.content,
            filePath: payload.filePath,
            language: payload.language,
            startLine: payload.startLine,
            endLine: payload.endLine,
            chunkType: payload.chunkType,
            functionName: payload.functionName,
            className: payload.className,
            moduleName: payload.moduleName,
            contentHash: payload.contentHash,
            metadata: payload.metadata
        };
    }
    /**
     * Build a snippet: the full content when it has at most 5 lines, otherwise
     * the first 5 lines followed by "...". Non-string content yields ''.
     */
    createSnippet(payload) {
        const content = typeof payload.content === 'string' ? payload.content : '';
        const lines = content.split('\n');
        const maxLines = 5;
        if (lines.length <= maxLines) {
            return content;
        }
        const snippet = lines.slice(0, maxLines).join('\n');
        return snippet + '\n...';
    }
    /**
     * Build a one-line, " | "-separated description of a result: file path
     * (with a leading "/app/" stripped), line range, function/class/module
     * name, chunk type and language.
     */
    createContextDescription(payload) {
        const parts = [];
        // Add file path (relative format); tolerate a missing path
        const rawPath = String(payload.filePath ?? '');
        const filePath = rawPath.startsWith('/app/')
            ? rawPath.substring(5)
            : rawPath;
        parts.push(`đ ${filePath}`);
        // Add line range
        if (payload.startLine && payload.endLine) {
            parts.push(`đ Lines ${payload.startLine}-${payload.endLine}`);
        }
        // Add function/class name if available
        if (payload.functionName) {
            parts.push(`đ§ Function: ${payload.functionName}`);
        }
        else if (payload.className) {
            parts.push(`đĻ Class: ${payload.className}`);
        }
        else if (payload.moduleName) {
            parts.push(`đ Module: ${payload.moduleName}`);
        }
        // Add chunk type
        parts.push(`đˇī¸ Type: ${payload.chunkType}`);
        // Add language
        parts.push(`đģ ${payload.language}`);
        return parts.join(' | ');
    }
    /**
     * Keyword score: the total number of case-insensitive substring
     * occurrences of each whitespace-separated query token in content.
     *
     * This is a raw term count (no tokenizer, stop words or IDF weighting).
     *
     * @param {string} query - Search text; split on whitespace.
     * @param {string} content - Text to search; non-strings score 0.
     * @returns {number}
     */
    calculateKeywordScore(query, content) {
        if (typeof query !== 'string' || typeof content !== 'string') {
            return 0;
        }
        // Compare in one case so "Foo" in the code matches the query "foo".
        const haystack = content.toLowerCase();
        const tokens = query.toLowerCase().split(/\s+/).filter(Boolean);
        let score = 0;
        for (const token of tokens) {
            score += haystack.split(token).length - 1;
        }
        return score;
    }
}
//# sourceMappingURL=qdrant-client.js.map