/**
 * @knath2000/codebase-indexing-mcp
 * MCP server for codebase indexing with Voyage AI embeddings and Qdrant vector storage.
 */
import { EventEmitter } from 'events';
import { open, readFile, stat } from 'fs/promises';
import { join, resolve } from 'path';
import { glob } from 'glob';
import { VoyageClient } from '../clients/voyage-client.js';
import { QdrantVectorClient } from '../clients/qdrant-client.js';
import { CodeParser } from '../parsers/code-parser.js';
import { WorkspaceManager } from './workspace-manager.js';
import { IndexingStatus } from '../types.js';
export class IndexingService extends EventEmitter {
/**
 * Orchestrates codebase indexing: file discovery, parsing into chunks,
 * Voyage AI embedding generation, and storage in a Qdrant collection.
 * Emits 'progress' events while work is in flight.
 *
 * @param {object} config - Service configuration (API keys, URLs, batch size,
 *   collection name, supported extensions, exclude patterns, embedding model,
 *   max file size, ...).
 * @param {WorkspaceManager} [workspaceManager] - Optional workspace manager;
 *   a fresh instance is created when omitted.
 */
constructor(config, workspaceManager) {
super();
// Workspace is detected lazily in initialize(); null until then.
this.currentWorkspace = null;
this.config = config;
this.workspaceManager = workspaceManager || new WorkspaceManager();
this.voyageClient = new VoyageClient(config.voyageApiKey);
// Create a temporary Qdrant client with default collection
// This will be updated during initialize() with workspace-specific collection
this.qdrantClient = new QdrantVectorClient(config.qdrantUrl, config.qdrantApiKey, config.collectionName, // Temporary, will be replaced
this.voyageClient.getEmbeddingDimension(config.embeddingModel));
this.codeParser = new CodeParser();
// Mutable per-run progress snapshot, shared with 'progress' event listeners.
this.progress = {
totalFiles: 0,
processedFiles: 0,
totalChunks: 0,
processedChunks: 0,
currentFile: '',
status: IndexingStatus.IDLE,
startTime: new Date(),
errors: [],
incrementalUpdates: 0,
skippedFiles: 0,
cacheHits: 0
};
// Cumulative statistics, recomputed by updateStats() after each indexing run.
this.stats = {
totalFiles: 0,
totalChunks: 0,
totalSize: 0,
languageDistribution: {},
chunkTypeDistribution: {},
lastIndexed: new Date(),
indexingDuration: 0,
averageChunkSize: 0,
largestFile: '',
errors: 0,
warnings: 0,
incrementalUpdates: 0,
cacheHitRate: 0,
averageComplexity: 0,
tokensIndexed: 0,
memoryUsage: 0,
searchQueriesServed: 0,
averageSearchLatency: 0
};
}
/**
* Initialize the indexing service with enhanced workspace detection
*/
async initialize() {
try {
// Detect current workspace first
this.currentWorkspace = await this.workspaceManager.detectCurrentWorkspace();
// Update Qdrant client to use workspace-specific collection
this.updateQdrantClientForWorkspace(this.currentWorkspace);
// Test connections
const voyageTest = await this.voyageClient.testConnection();
if (!voyageTest) {
throw new Error('Failed to connect to Voyage AI');
}
const qdrantTest = await this.qdrantClient.testConnection();
if (!qdrantTest) {
throw new Error('Failed to connect to Qdrant');
}
// Initialize workspace-specific Qdrant collection
await this.qdrantClient.initializeCollection();
console.log(`🔧 IndexingService initialized for workspace: ${this.currentWorkspace.name}`);
console.log(`📊 Using collection: ${this.currentWorkspace.collectionName}`);
console.log(`📁 Workspace type: ${this.currentWorkspace.type}`);
console.log(`🎯 Folders: ${this.currentWorkspace.folders.length} folder(s)`);
}
catch (error) {
throw new Error(`Failed to initialize indexing service: ${error}`);
}
}
/**
* Update Qdrant client for workspace-specific collection
*/
updateQdrantClientForWorkspace(workspace) {
this.qdrantClient = new QdrantVectorClient(this.config.qdrantUrl, this.config.qdrantApiKey, workspace.collectionName, // Use workspace-specific collection name
this.voyageClient.getEmbeddingDimension(this.config.embeddingModel));
console.log(`🔄 Updated Qdrant client for workspace collection: ${workspace.collectionName}`);
}
/**
* Index a directory recursively
*/
async indexDirectory(directoryPath) {
const absolutePath = resolve(directoryPath);
try {
this.progress.status = IndexingStatus.SCANNING;
this.progress.startTime = new Date();
this.emit('progress', this.progress);
// Find all files to index
const files = await this.findFiles(absolutePath);
this.progress.totalFiles = files.length;
console.log(`Found ${files.length} files to index`);
this.emit('progress', this.progress);
// Process files in batches
const batchSize = 10;
const allChunks = [];
console.log(`📁 Processing ${files.length} files in batches of ${batchSize}`);
for (let i = 0; i < files.length; i += batchSize) {
const batch = files.slice(i, i + batchSize);
console.log(`🔄 Processing file batch ${Math.floor(i / batchSize) + 1}/${Math.ceil(files.length / batchSize)}`);
const batchChunks = await this.processBatch(batch);
allChunks.push(...batchChunks);
console.log(`📊 Accumulated ${allChunks.length} chunks so far`);
}
console.log(`🎯 Finished processing all files. Total chunks: ${allChunks.length}`);
this.progress.totalChunks = allChunks.length;
this.progress.status = IndexingStatus.EMBEDDING;
this.emit('progress', this.progress);
// Generate embeddings and store
console.log(`🚀 Starting embedding and storage phase for ${allChunks.length} chunks`);
await this.embedAndStore(allChunks);
console.log('✅ Embedding and storage completed successfully');
// Update stats
this.updateStats(allChunks);
this.progress.status = IndexingStatus.COMPLETED;
this.emit('progress', this.progress);
return this.stats;
}
catch (error) {
this.progress.status = IndexingStatus.ERROR;
this.progress.errors.push({
filePath: '',
error: error instanceof Error ? error.message : String(error),
timestamp: new Date(),
severity: 'critical'
});
this.emit('progress', this.progress);
throw error;
}
}
/**
 * Parse and index a single file, skipping it when it is excluded, too large,
 * empty, binary, or already indexed at its current mtime.
 *
 * Errors are recorded on this.progress.errors and an empty array is returned —
 * a single bad file never aborts a larger run.
 *
 * @param {string} filePath - Path of the file to index.
 * @returns {Promise<Array>} chunks produced for the file ([] when skipped or failed).
 */
async indexFile(filePath) {
const absolutePath = resolve(filePath);
try {
this.progress.status = IndexingStatus.PARSING;
this.progress.currentFile = filePath;
this.emit('progress', this.progress);
// Check if file should be skipped (exclude patterns / unsupported extension)
if (this.shouldSkipFile(absolutePath)) {
return [];
}
// Enforce the configured maximum file size
const fileStats = await stat(absolutePath);
if (fileStats.size > this.config.maxFileSize) {
console.log(`⚠️ Skipping ${filePath}: too large (${Math.round(fileStats.size / 1024 / 1024 * 100) / 100}MB > ${Math.round(this.config.maxFileSize / 1024 / 1024)}MB)`);
return [];
}
// Skip empty files
if (fileStats.size === 0) {
console.log(`⚠️ Skipping ${filePath}: empty file`);
return [];
}
// Check if file is binary (content heuristics, not extension)
const isBinary = await this.isBinaryFile(absolutePath);
if (isBinary) {
console.log(`⚠️ Skipping ${filePath}: detected as binary file`);
return [];
}
// Skip when the stored index is already current for this file's mtime
const isIndexed = await this.qdrantClient.isFileIndexed(absolutePath, fileStats.mtime.getTime());
if (isIndexed) {
console.log(`File ${filePath} is already indexed and up to date`);
return [];
}
// Parse the file into code chunks
// NOTE(review): when the file HAS changed, vectors from its previous version
// are not deleted here (reindexFile() does that explicitly) — confirm chunk
// IDs are stable per file location, otherwise stale chunks can accumulate.
const chunks = await this.codeParser.parseFile(absolutePath);
if (chunks.length === 0) {
return [];
}
// Generate embeddings and store
await this.embedAndStore(chunks);
return chunks;
}
catch (error) {
// Record the failure for progress reporting, then swallow it (best-effort).
const indexingError = {
filePath,
error: error instanceof Error ? error.message : String(error),
timestamp: new Date(),
severity: 'error'
};
this.progress.errors.push(indexingError);
this.emit('progress', this.progress);
console.error(`Error indexing file ${filePath}:`, error);
return [];
}
}
/**
* Re-index a file (force update)
*/
async reindexFile(filePath) {
const absolutePath = resolve(filePath);
try {
// Delete existing embeddings for this file
await this.qdrantClient.deleteByFilePath(absolutePath);
// Index the file
return await this.indexFile(filePath);
}
catch (error) {
console.error(`Error re-indexing file ${filePath}:`, error);
throw error;
}
}
/**
* Remove a file from the index
*/
async removeFile(filePath) {
const absolutePath = resolve(filePath);
try {
await this.qdrantClient.deleteByFilePath(absolutePath);
console.log(`Removed file ${filePath} from index`);
}
catch (error) {
console.error(`Error removing file ${filePath} from index:`, error);
throw error;
}
}
/**
* Clear entire index
*/
async clearIndex() {
try {
await this.qdrantClient.clearCollection();
console.log('Index cleared successfully');
}
catch (error) {
console.error('Error clearing index:', error);
throw error;
}
}
/**
* Get indexing progress
*/
getProgress() {
return { ...this.progress };
}
/**
* Get indexing statistics
*/
getStats() {
return { ...this.stats };
}
/**
* Get collection info from Qdrant
*/
async getCollectionInfo() {
return await this.qdrantClient.getCollectionInfo();
}
/**
* Count total indexed chunks
*/
async countIndexedChunks() {
return await this.qdrantClient.countPoints();
}
/**
 * Discover all indexable files under a directory: glob for everything, apply
 * exclude patterns, keep supported extensions only, then drop files that are
 * too large, empty, or binary.
 *
 * @param {string} directoryPath - Root directory to scan.
 * @returns {Promise<string[]>} paths of valid text files to index.
 */
async findFiles(directoryPath) {
const pattern = join(directoryPath, '**/*');
const allFiles = await glob(pattern, {
nodir: true,
ignore: this.config.excludePatterns
});
console.log(`Found ${allFiles.length} files after exclude pattern filtering`);
// Hardcoded filter to exclude node_modules paths
// (safety net in case excludePatterns does not already cover node_modules)
const filteredFiles = allFiles.filter(file => !file.includes('node_modules'));
// Keep only files whose extension appears in config.supportedExtensions.
// NOTE(review): a dot-less name like 'Makefile' yields ext === 'makefile',
// which only matches if '.makefile' is configured — confirm that is intended.
const supportedFiles = filteredFiles.filter(file => {
const ext = file.split('.').pop()?.toLowerCase();
return ext && this.config.supportedExtensions.includes(`.${ext}`);
});
console.log(`📝 ${supportedFiles.length} files have supported extensions`);
// Filter by file size and binary content (sequential stat/probe per file)
const validFiles = [];
let skippedSize = 0;
let skippedBinary = 0;
for (const file of supportedFiles) {
try {
const fileStat = await stat(file);
// Enforce the configured maximum file size
if (fileStat.size > this.config.maxFileSize) {
skippedSize++;
console.log(`⚠️ Skipping ${file}: too large (${Math.round(fileStat.size / 1024 / 1024 * 100) / 100}MB > ${Math.round(this.config.maxFileSize / 1024 / 1024)}MB)`);
continue;
}
// Skip empty files
if (fileStat.size === 0) {
console.log(`⚠️ Skipping ${file}: empty file`);
continue;
}
// Check if file is binary (content heuristics, not extension)
const isBinary = await this.isBinaryFile(file);
if (isBinary) {
skippedBinary++;
console.log(`⚠️ Skipping ${file}: detected as binary file`);
continue;
}
validFiles.push(file);
}
catch (error) {
// Unreadable files are logged and dropped rather than failing the scan.
console.warn(`❌ Could not process file ${file}:`, error);
}
}
console.log(`✅ Final result: ${validFiles.length} valid files to index`);
console.log(`📊 Filtering summary:`);
console.log(` - Skipped due to size (>${Math.round(this.config.maxFileSize / 1024 / 1024)}MB): ${skippedSize}`);
console.log(` - Skipped due to binary content: ${skippedBinary}`);
console.log(` - Valid text files: ${validFiles.length}`);
return validFiles;
}
/**
* Process a batch of files
*/
async processBatch(files) {
const chunks = [];
for (const file of files) {
this.progress.currentFile = file;
this.progress.status = IndexingStatus.PARSING;
this.emit('progress', this.progress);
try {
const fileChunks = await this.codeParser.parseFile(file);
chunks.push(...fileChunks);
this.progress.processedFiles++;
this.emit('progress', this.progress);
}
catch (error) {
const indexingError = {
filePath: file,
error: error instanceof Error ? error.message : String(error),
timestamp: new Date(),
severity: 'error'
};
this.progress.errors.push(indexingError);
this.emit('progress', this.progress);
}
}
return chunks;
}
/**
 * Generate Voyage embeddings for the given chunks (in config.batchSize
 * batches) and upsert them into Qdrant (in fixed batches of 100).
 *
 * Null/empty chunks are filtered out up front. An embedding failure aborts
 * the whole operation (rethrown); storage begins only after every batch has
 * been embedded.
 *
 * @param {Array} chunks - Parsed code chunks with content and location metadata.
 * @returns {Promise<void>}
 */
async embedAndStore(chunks) {
console.log(`🚀 Starting embedAndStore with ${chunks.length} chunks`);
// Filter out any null/undefined chunks early
const validChunksOnly = chunks.filter(chunk => chunk && chunk.content && chunk.content.trim().length > 0);
console.log(`🔍 After filtering: ${validChunksOnly.length} valid chunks (${chunks.length - validChunksOnly.length} filtered out)`);
if (validChunksOnly.length === 0) {
console.log('❌ No valid chunks to process, returning early');
return;
}
// Use filtered chunks for the rest of the method
chunks = validChunksOnly;
console.log('📊 Setting status to EMBEDDING');
this.progress.status = IndexingStatus.EMBEDDING;
this.emit('progress', this.progress);
const batchSize = this.config.batchSize;
console.log(`📦 Using batch size: ${batchSize}`);
const embeddings = [];
for (let i = 0; i < chunks.length; i += batchSize) {
const batch = chunks.slice(i, i + batchSize);
// Filter out any null/undefined chunks that might have slipped through
const validChunks = batch.filter(chunk => chunk && chunk.content);
if (validChunks.length === 0) {
console.log(`⚠️ Skipping batch ${Math.floor(i / batchSize) + 1} - no valid chunks`);
continue;
}
const texts = validChunks.map(chunk => chunk.content);
console.log(`🔄 Processing batch ${Math.floor(i / batchSize) + 1}/${Math.ceil(chunks.length / batchSize)} with ${validChunks.length} valid chunks (${batch.length} total)`);
console.log(`📝 First chunk preview: ${texts[0]?.substring(0, 100)}...`);
try {
console.log(`🌐 Calling Voyage API with model: ${this.config.embeddingModel}`);
// 'document' marks these as corpus texts (vs. 'query') for the retrieval model.
const vectors = await this.voyageClient.generateEmbeddingsBatch(texts, this.config.embeddingModel, 'document', batchSize);
console.log(`✅ Received ${vectors.length} embeddings from Voyage API`);
// Pair each chunk with its vector — assumes the API preserves input order.
for (let j = 0; j < validChunks.length; j++) {
const chunk = validChunks[j];
const vector = vectors[j];
// Payload stored alongside the vector so search hits carry full context.
const payload = {
content: chunk.content,
filePath: chunk.filePath,
language: chunk.language,
chunkType: chunk.chunkType,
startLine: chunk.startLine,
endLine: chunk.endLine,
functionName: chunk.functionName || undefined,
className: chunk.className || undefined,
moduleName: chunk.moduleName || undefined,
contentHash: chunk.contentHash,
tokenCount: this.estimateTokenCount(chunk.content),
metadata: chunk.metadata,
fileKind: this.getFileKind(chunk.filePath)
};
embeddings.push({
id: chunk.id,
vector,
payload
});
}
this.progress.processedChunks += validChunks.length;
this.emit('progress', this.progress);
console.log(`📈 Progress: ${this.progress.processedChunks}/${chunks.length} chunks processed`);
}
catch (error) {
console.error(`❌ Error generating embeddings for batch:`, error);
throw error;
}
}
console.log(`🎯 Completed embedding generation for all ${embeddings.length} chunks`);
// Store embeddings in Qdrant
console.log('💾 Starting Qdrant storage phase');
this.progress.status = IndexingStatus.STORING;
this.emit('progress', this.progress);
const storeBatchSize = 100;
console.log(`📦 Storing in batches of ${storeBatchSize}`);
for (let i = 0; i < embeddings.length; i += storeBatchSize) {
const batch = embeddings.slice(i, i + storeBatchSize);
console.log(`💾 Storing batch ${Math.floor(i / storeBatchSize) + 1}/${Math.ceil(embeddings.length / storeBatchSize)} with ${batch.length} embeddings`);
await this.qdrantClient.storeEmbeddings(batch);
}
console.log('✅ Successfully completed embedAndStore process');
}
/**
* Check if file should be skipped
*/
shouldSkipFile(filePath) {
// Check against exclude patterns
for (const pattern of this.config.excludePatterns) {
if (filePath.includes(pattern.replace('*', ''))) {
return true;
}
}
// Check file extension
const ext = filePath.split('.').pop()?.toLowerCase();
if (!ext || !this.config.supportedExtensions.includes(`.${ext}`)) {
return true;
}
return false;
}
/**
* Check if file is binary by examining its content
*/
async isBinaryFile(filePath) {
try {
// Read first 8KB of the file to check for binary content
const buffer = await readFile(filePath, { flag: 'r' });
const sampleSize = Math.min(8192, buffer.length);
const sample = buffer.subarray(0, sampleSize);
// Check for null bytes (common in binary files)
for (let i = 0; i < sample.length; i++) {
if (sample[i] === 0) {
return true;
}
}
// Check for high percentage of non-printable characters
let nonPrintableCount = 0;
for (let i = 0; i < sample.length; i++) {
const byte = sample[i];
// Consider bytes outside printable ASCII range (except common whitespace)
if (byte < 9 || (byte > 13 && byte < 32) || byte > 126) {
nonPrintableCount++;
}
}
// If more than 30% of bytes are non-printable, consider it binary
const nonPrintableRatio = nonPrintableCount / sample.length;
if (nonPrintableRatio > 0.3) {
return true;
}
// Check for common binary file signatures (magic numbers)
const binarySignatures = [
[0x89, 0x50, 0x4E, 0x47], // PNG
[0xFF, 0xD8, 0xFF], // JPEG
[0x47, 0x49, 0x46], // GIF
[0x25, 0x50, 0x44, 0x46], // PDF
[0x50, 0x4B, 0x03, 0x04], // ZIP
[0x50, 0x4B, 0x05, 0x06], // ZIP (empty)
[0x50, 0x4B, 0x07, 0x08], // ZIP (spanned)
[0x1F, 0x8B], // GZIP
[0x42, 0x5A, 0x68], // BZIP2
[0x7F, 0x45, 0x4C, 0x46], // ELF executable
[0x4D, 0x5A], // Windows PE executable
[0xCA, 0xFE, 0xBA, 0xBE], // Java class file
[0xFE, 0xED, 0xFA, 0xCE], // Mach-O binary (32-bit)
[0xFE, 0xED, 0xFA, 0xCF], // Mach-O binary (64-bit)
];
for (const signature of binarySignatures) {
if (sample.length >= signature.length) {
let matches = true;
for (let i = 0; i < signature.length; i++) {
if (sample[i] !== signature[i]) {
matches = false;
break;
}
}
if (matches) {
return true;
}
}
}
return false;
}
catch (error) {
// If we can't read the file, assume it might be binary to be safe
console.warn(`Could not check if file ${filePath} is binary:`, error);
return true;
}
}
/**
* Estimate token count for content (rough approximation)
*/
estimateTokenCount(content) {
// Rough approximation: 1 token ≈ 4 characters for code
return Math.ceil(content.length / 4);
}
/**
* Determines if a file contains implementation code or documentation
*/
getFileKind(filePath) {
const extension = filePath.split('.').pop()?.toLowerCase() || '';
const fileName = filePath.toLowerCase();
// Documentation file extensions and patterns
const docExtensions = ['md', 'txt', 'rst', 'adoc', 'asciidoc'];
const docPatterns = ['readme', 'changelog', 'license', 'contributing', 'docs/', 'documentation/', 'memory-bank/'];
// Check extension
if (docExtensions.includes(extension)) {
return 'docs';
}
// Check file path patterns
if (docPatterns.some(pattern => fileName.includes(pattern))) {
return 'docs';
}
// Default to code for programming language files
return 'code';
}
/**
 * Recompute aggregate statistics from the chunks produced by the latest run
 * plus the accumulated progress state (file counts, errors, timings).
 *
 * @param {Array} chunks - All chunks from the completed indexing run.
 */
updateStats(chunks) {
// Filter out null chunks that might have been created during parsing
const validChunks = chunks.filter(chunk => chunk && chunk.content != null);
this.stats.totalFiles = this.progress.processedFiles;
this.stats.totalChunks = validChunks.length;
this.stats.totalSize = validChunks.reduce((sum, chunk) => sum + chunk.content.length, 0);
// '|| 0' guards the NaN produced by 0/0 when there are no chunks.
this.stats.averageChunkSize = this.stats.totalSize / this.stats.totalChunks || 0;
this.stats.lastIndexed = new Date();
this.stats.indexingDuration = Date.now() - this.progress.startTime.getTime();
this.stats.errors = this.progress.errors.filter(e => e.severity === 'error' || e.severity === 'critical').length;
this.stats.warnings = this.progress.errors.filter(e => e.severity === 'warning').length;
// Language distribution (chunk counts per language)
this.stats.languageDistribution = {};
validChunks.forEach(chunk => {
this.stats.languageDistribution[chunk.language] =
(this.stats.languageDistribution[chunk.language] || 0) + 1;
});
// Chunk type distribution (chunk counts per chunk type)
this.stats.chunkTypeDistribution = {};
validChunks.forEach(chunk => {
this.stats.chunkTypeDistribution[chunk.chunkType] =
(this.stats.chunkTypeDistribution[chunk.chunkType] || 0) + 1;
});
// 'Largest file' — approximated as the file owning the single largest chunk.
// NOTE(review): this is per-chunk, not per-file size totals; confirm intended.
if (validChunks.length > 0) {
const largestChunk = validChunks.reduce((largest, chunk) => chunk.content.length > largest.content.length ? chunk : largest);
this.stats.largestFile = largestChunk.filePath;
}
else {
this.stats.largestFile = 'N/A';
}
}
}
//# sourceMappingURL=indexing-service.js.map