@wildcard-ai/deepcontext
Advanced codebase indexing and semantic search MCP server
/**
* IndexingOrchestrator - Core business logic for codebase indexing
*
* Orchestrates the complete indexing process:
* - File discovery and filtering
* - Symbol extraction with AST parsing
* - Chunk generation with dependency context
* - Embedding generation and storage
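*
* @example
* // Minimal usage sketch. The services object shape is inferred from this
* // file's own guards (namespaceManagerService is required; jinaApiService and
* // turbopufferService enable vector upload); constructing those services
* // happens elsewhere in the package and is assumed here.
* const orchestrator = new IndexingOrchestrator({
*   namespaceManagerService,
*   jinaApiService,
*   turbopufferService
* });
* const result = await orchestrator.indexCodebase({ codebasePath: '/abs/path/to/repo' });
* console.log(result.success, result.metadata.totalChunks);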
*/
import * as fs from 'fs/promises';
import * as path from 'path';
import * as crypto from 'crypto';
import { FileUtils } from '../../utils/FileUtils.js';
import { LanguageDetector } from '../../utils/LanguageDetector.js';
import { ContentFilterProvider } from './ContentFilterProvider.js';
import { TreeSitterSymbolExtractorFull } from './TreeSitterSymbolExtractor.treesitter-based.js';
import { TreeSitterChunkExtractor } from './TreeSitterChunkExtractor.js';
import { ConfigurationService } from '../../services/ConfigurationService.js';
import { Logger } from '../../utils/Logger.js';
export class IndexingOrchestrator {
fileUtils;
languageDetector;
contentFilter;
symbolExtractor;
chunkExtractor;
logger;
services;
constructor(services) {
this.fileUtils = new FileUtils();
this.languageDetector = new LanguageDetector();
this.contentFilter = new ContentFilterProvider();
this.symbolExtractor = new TreeSitterSymbolExtractorFull();
// Create a default configuration service for now - ideally this should be injected
const defaultConfigService = new ConfigurationService();
this.chunkExtractor = new TreeSitterChunkExtractor(defaultConfigService);
this.logger = new Logger('INDEXING-ORCHESTRATOR', 'debug');
this.services = services;
}
/**
* Main indexing orchestration method
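*
* @example
* // Request fields as read by this method; the defaults are inferred from the
* // `!== false` guards below, so treat this sketch as illustrative:
* await orchestrator.indexCodebase({
*   codebasePath: '/abs/path/to/repo',
*   forceReindex: false,               // when true, clears the namespace first
*   enableContentFiltering: true,      // on unless explicitly false
*   enableDependencyAnalysis: true,    // on unless explicitly false
*   supportedLanguages: ['typescript', 'python']
* });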
*/
async indexCodebase(request) {
const startTime = Date.now();
const errors = [];
this.logger.info(`🚀 Starting indexing: ${request.codebasePath}`);
this.logger.debug(`📋 Options: ${JSON.stringify({
force: request.forceReindex,
filtering: request.enableContentFiltering,
dependencies: request.enableDependencyAnalysis
})}`);
try {
// Ensure the symbol extractor is initialized (no-op if already done)
await this.symbolExtractor.initialize();
// Step 1: Discover files
const allFiles = await this.fileUtils.discoverFiles(request.codebasePath, request.supportedLanguages || ['typescript', 'javascript', 'python', 'java', 'cpp', 'go', 'rust']);
if (allFiles.length === 0) {
this.logger.warn(`⚠️ No files found in ${request.codebasePath}`);
errors.push({
file: request.codebasePath,
error: 'No supported files found in directory'
});
}
// Step 2: Apply content filtering
let filesToProcess = allFiles;
if (request.enableContentFiltering !== false) {
filesToProcess = await this.applyContentFiltering(allFiles, request.codebasePath);
}
this.logger.info(`📝 Processing: ${filesToProcess.length} files`);
// Step 3: Process files in batches
const chunks = [];
const batchSize = 10; // Process files in batches of 10 to bound concurrent reads and parses
for (let i = 0; i < filesToProcess.length; i += batchSize) {
const batch = filesToProcess.slice(i, i + batchSize);
const batchResults = await Promise.allSettled(batch.map(file => this.processFile(file, request)));
// Collect results and errors
batchResults.forEach((result, index) => {
if (result.status === 'fulfilled') {
chunks.push(...result.value);
}
else {
errors.push({
file: batch[index],
error: result.reason?.message || 'Unknown error'
});
}
});
this.logger.debug(`📊 Processed: ${Math.min(i + batchSize, filesToProcess.length)}/${filesToProcess.length} files`);
}
const indexingTime = Date.now() - startTime;
if (!this.services?.namespaceManagerService) {
throw new Error('NamespaceManagerService is required for indexing operations');
}
const namespace = this.services.namespaceManagerService.generateNamespace(request.codebasePath);
// Clear existing index if force reindex is requested
if (request.forceReindex && this.services?.turbopufferService) {
this.logger.info(`🗑️ Force reindex enabled - clearing existing namespace: ${namespace}`);
try {
await this.services.turbopufferService.clearNamespace(namespace);
this.logger.info(`✅ Successfully cleared namespace: ${namespace}`);
}
catch (error) {
this.logger.warn(`⚠️ Failed to clear namespace ${namespace}:`, error);
// Continue with indexing even if clearing fails
}
}
// Upload to vector store if services are provided
if (this.services?.jinaApiService && this.services?.turbopufferService && chunks.length > 0) {
this.logger.info(`Uploading ${chunks.length} chunks to vector store...`);
const uploadResult = await this.uploadChunksToVectorStore(namespace, chunks);
// Call metadata callback only if upload was successful
if (this.services.metadataCallback && uploadResult.success) {
const indexedData = {
namespace,
totalChunks: uploadResult.successfulChunks,
indexedAt: new Date().toISOString()
};
await this.services.metadataCallback(request.codebasePath, indexedData);
}
}
// Success reflects whether any chunks were generated; upload outcomes gate the metadata callback above
let success = chunks.length > 0;
let completionMessage = `✅ Complete: ${chunks.length} chunks in ${indexingTime}ms`;
if (chunks.length === 0) {
success = false;
completionMessage = `❌ No chunks generated from ${filesToProcess.length} files - possible causes: all files filtered out, parsing failures, or empty files`;
this.logger.warn(completionMessage);
// Register the failed indexing attempt for status tracking
if (this.services?.namespaceManagerService) {
await this.services.namespaceManagerService.registerFailedIndexing(request.codebasePath, 'No indexable content found - check if files contain valid code or adjust content filtering');
}
}
else {
this.logger.info(completionMessage);
}
return {
success,
metadata: {
codebasePath: request.codebasePath,
namespace,
totalFiles: filesToProcess.length,
totalChunks: chunks.length,
totalSymbols: chunks.reduce((sum, chunk) => sum + (chunk.symbols?.length || 0), 0),
indexingTime,
indexingMethod: 'full',
features: {
astExtraction: true,
contentFiltering: request.enableContentFiltering !== false,
dependencyAnalysis: request.enableDependencyAnalysis !== false,
incrementalUpdate: false
},
...(chunks.length === 0 && {
failureReason: 'No indexable content found - check if files contain valid code or adjust content filtering'
})
},
chunks,
errors
};
}
catch (error) {
console.error('[INDEXING] ❌ Fatal error:', error);
return {
success: false,
metadata: {
codebasePath: request.codebasePath,
namespace: this.services?.namespaceManagerService?.generateNamespace(request.codebasePath) || 'unknown',
totalFiles: 0,
totalChunks: 0,
totalSymbols: 0,
indexingTime: Date.now() - startTime,
indexingMethod: 'full',
features: {
astExtraction: false,
contentFiltering: false,
dependencyAnalysis: false,
incrementalUpdate: false
}
},
chunks: [],
errors: [{ file: 'system', error: error instanceof Error ? error.message : String(error) }]
};
}
}
/**
* Process a single file into semantic chunks using Tree-sitter AST parsing
* Uses TreeSitterChunkExtractor for meaningful code unit extraction
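*
* @example
* // Shape of each returned chunk, reconstructed from the mapping below:
* // {
* //   id, content, filePath, relativePath, startLine, endLine, language,
* //   symbols: [{ name, type, startLine, endLine, scope }, ...],
* //   imports: [{ module, symbols, line }, ...],
* //   exports: ['exportedName', ...]
* // }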
*/
async processFile(filePath, request) {
const content = await fs.readFile(filePath, 'utf-8');
const language = this.languageDetector.detectLanguage(filePath, content);
const relativePath = path.relative(request.codebasePath, filePath);
try {
// Use new TreeSitterChunkExtractor for semantic chunking
const chunkingResult = await this.chunkExtractor.extractSemanticChunks(content, language.language, filePath, relativePath);
// Extract symbols and imports at file level for efficiency
const fileSymbolResult = await this.symbolExtractor.extractSymbols(content, language.language, filePath);
this.logger.debug(`🔍 Symbol extraction for ${filePath}: ${fileSymbolResult.symbols.length} symbols, ${fileSymbolResult.imports.length} imports`);
// Convert SemanticChunk[] to CodeChunk[] format with enhanced symbols/imports
const chunks = chunkingResult.chunks.map(semanticChunk => {
// Find symbols that belong to this chunk (precise containment)
const candidateSymbols = fileSymbolResult.symbols
.filter(symbol =>
// Precise filtering: symbol is contained within chunk boundaries
symbol.startLine >= semanticChunk.startLine &&
symbol.endLine <= semanticChunk.endLine);
this.logger.debug(`📍 Chunk ${semanticChunk.startLine}-${semanticChunk.endLine}: ${candidateSymbols.length} candidate symbols`);
const chunkSymbols = candidateSymbols
.filter(symbol =>
// Filter out symbol types not supported by SymbolInfo interface
['function', 'class', 'interface', 'variable', 'constant', 'type', 'namespace', 'method', 'enum'].includes(symbol.type))
.map(symbol => ({
name: symbol.name,
type: symbol.type,
startLine: symbol.startLine,
endLine: symbol.endLine,
scope: symbol.scope
}));
this.logger.debug(`✅ Chunk symbols after filtering: ${chunkSymbols.length} symbols - ${chunkSymbols.map(s => s.name).join(', ')}`);
// Find imports that are relevant to this chunk
const chunkImports = fileSymbolResult.imports
.filter(imp => imp.line <= semanticChunk.endLine) // Include imports declared at or before the chunk's end (imports usually sit at the top of the file)
.map(imp => ({
module: imp.module,
symbols: imp.symbols,
line: imp.line
}));
return {
id: semanticChunk.id,
content: semanticChunk.content,
filePath: semanticChunk.filePath,
relativePath: semanticChunk.relativePath,
startLine: semanticChunk.startLine,
endLine: semanticChunk.endLine,
language: semanticChunk.language,
symbols: chunkSymbols,
imports: chunkImports,
exports: fileSymbolResult.exports.filter(exp =>
// Associate exports with chunks that contain them
chunkSymbols.some(sym => sym.name === exp))
};
});
this.logger.debug(`Created ${chunks.length} semantic chunks for ${filePath}`);
// Log chunk details for debugging
if (chunks.length > 0) {
const avgSize = chunks.reduce((sum, chunk) => sum + chunk.content.length, 0) / chunks.length;
this.logger.debug(`Average chunk size: ${avgSize.toFixed(0)} characters`);
}
return chunks;
}
catch (error) {
// Fallback to simpler chunking if semantic chunking fails
this.logger.warn(`Semantic chunking failed for ${filePath}, using fallback: ${error}`);
return this.createFallbackChunks(content, filePath, relativePath, language.language);
}
}
async applyContentFiltering(files, codebasePath) {
this.logger.info(`🔍 Content filtering ${files.length} files...`);
const batchSize = 50;
const filtered = [];
for (let i = 0; i < files.length; i += batchSize) {
const batch = files.slice(i, i + batchSize);
const results = await Promise.allSettled(batch.map(async (file) => {
try {
const relativePath = path.relative(codebasePath, file);
// Check file size first to avoid reading large files into memory
const stats = await fs.stat(file);
if (stats.size > 500000) { // 500KB limit (same as ContentFilterProvider)
return {
file,
shouldInclude: {
include: false,
reason: 'File too large (likely data file)',
confidence: 0.9
},
relativePath
};
}
// Only read content for files under size limit
const content = await fs.readFile(file, 'utf-8');
const shouldInclude = this.contentFilter.shouldInclude(relativePath, content);
return { file, shouldInclude, relativePath };
}
catch (error) {
this.logger.warn(`⚠️ Error filtering ${file}: ${error}`);
return null;
}
}));
results.forEach((result) => {
if (result.status === 'fulfilled' && result.value?.shouldInclude.include) {
filtered.push(result.value.file);
}
else if (result.status === 'fulfilled' && result.value) {
this.logger.debug(`🚫 Filtered: ${result.value.relativePath} (${result.value.shouldInclude.reason})`);
}
});
}
this.logger.info(`✅ Content filtering complete: ${filtered.length}/${files.length} files included`);
return filtered;
}
/**
* Upload chunks to vector store with embedding generation
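*
* @example
* // Return shape; the counts are batch-granular approximations (see the math
* // at the end of this method), e.g. for 40 chunks all uploaded:
* // { success: true, successfulChunks: 40, skippedChunks: 0 }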
*/
async uploadChunksToVectorStore(namespace, chunks) {
if (!chunks.length || !this.services?.jinaApiService || !this.services?.turbopufferService) {
this.logger.warn(`⚠️ Vector store upload skipped: chunks=${chunks.length}, jinaApiService=${!!this.services?.jinaApiService}, turbopufferService=${!!this.services?.turbopufferService}`);
return { success: false, successfulChunks: 0, skippedChunks: chunks.length };
}
this.logger.info(`Uploading ${chunks.length} chunks to vector store and local metadata...`);
const batchSize = 10; // Chunks per embedding request; small batches ease rate limits
let successfulBatches = 0;
let skippedBatches = 0;
for (let i = 0; i < chunks.length; i += batchSize) {
const batch = chunks.slice(i, i + batchSize);
const batchNumber = Math.floor(i / batchSize) + 1;
const totalBatches = Math.ceil(chunks.length / batchSize);
try {
this.logger.info(`📦 Processing batch ${batchNumber}/${totalBatches} (${batch.length} chunks)`);
// Generate embeddings for batch
this.logger.debug(`🧠 Generating embeddings for ${batch.length} chunks...`);
const embeddings = await this.services.jinaApiService.generateEmbeddingBatch(batch.map(chunk => chunk.content));
this.logger.debug(`✅ Generated ${embeddings.length} embeddings`);
// Prepare upsert data in Turbopuffer v2 format with schema for full-text search
const upsertData = batch.map((chunk, idx) => ({
id: chunk.id,
vector: embeddings[idx],
content: chunk.content,
filePath: chunk.filePath,
relativePath: chunk.relativePath,
startLine: chunk.startLine,
endLine: chunk.endLine,
language: chunk.language,
symbols: (chunk.symbols || []).map(s => typeof s === 'string' ? s : s.name).join(',')
}));
// Upload to Turbopuffer
this.logger.debug(`⬆️ Upserting ${upsertData.length} vectors to namespace: ${namespace}`);
await this.services.turbopufferService.upsert(namespace, upsertData);
this.logger.info(`✅ Batch ${batchNumber}/${totalBatches} completed successfully`);
successfulBatches++;
// Add minimal delay between batches to avoid rate limiting
if (batchNumber < totalBatches) {
const delay = 100; // 100ms pause between batches
this.logger.debug(`⏱️ Waiting ${delay}ms before next batch...`);
await new Promise(resolve => setTimeout(resolve, delay));
}
}
catch (error) {
// After 3 exponential retries in wildcardFetch, skip this batch and continue
skippedBatches++;
this.logger.warn(`⚠️ Skipping batch ${batchNumber} after retries failed. Continuing with remaining batches.`);
this.logger.warn(`Skipped batch error:`, error instanceof Error ? error.message : String(error));
// Don't throw - just continue to next batch
// The batch will be lost but indexing continues
continue;
}
}
const totalBatches = Math.ceil(chunks.length / batchSize);
// Counts are batch-granular; cap at chunks.length because the final batch may hold fewer than batchSize chunks
const actualSuccessfulChunks = Math.min(successfulBatches * batchSize, chunks.length);
const actualSkippedChunks = Math.min(skippedBatches * batchSize, chunks.length - actualSuccessfulChunks);
if (skippedBatches > 0) {
this.logger.info(`✅ Upload complete: ${actualSuccessfulChunks}/${chunks.length} chunks uploaded to namespace: ${namespace} (${skippedBatches}/${totalBatches} batches skipped after retries failed)`);
}
else {
this.logger.info(`✅ Uploaded ${chunks.length} chunks to namespace: ${namespace}`);
}
// Return success only if at least some chunks were uploaded successfully
const success = actualSuccessfulChunks > 0;
return {
success,
successfulChunks: actualSuccessfulChunks,
skippedChunks: actualSkippedChunks
};
}
/**
* Create sensible fallback chunks when semantic parsing fails.
* Splits the file into fixed-size line ranges so content is still indexed as larger, meaningful chunks rather than single-line fragments.
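*
* @example
* // A 250-line file yields chunks covering lines 1-100, 101-200 and 201-250;
* // chunks that are all whitespace are skipped.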
*/
createFallbackChunks(content, filePath, relativePath, language) {
const lines = content.split('\n');
const chunks = [];
const chunkSize = 100; // lines per fallback chunk
for (let i = 0; i < lines.length; i += chunkSize) {
const startLine = i + 1;
const endLine = Math.min(i + chunkSize, lines.length);
const chunkLines = lines.slice(i, endLine);
const chunkContent = chunkLines.join('\n');
// Skip empty chunks
if (!chunkContent.trim())
continue;
chunks.push({
id: this.generateChunkId(filePath, startLine, chunkContent),
content: chunkContent,
filePath,
relativePath,
startLine,
endLine,
language,
symbols: [], // No symbols for fallback chunks
imports: [] // No imports for fallback chunks
});
}
return chunks;
}
/**
* Expand symbol to include complete logical unit (function body, class body, etc.)
* This provides simple but effective boundary expansion for symbols
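*
* @example
* // For a symbol starting at the `function` line, expansion pulls in the
* // preceding comment and the brace-matched body:
* //   // adds two numbers        <- included via the comment scan
* //   function add(a, b) {       <- symbol.startLine
* //     return a + b;
* //   }                          <- returned endLine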
*/
expandSymbolToLogicalUnit(symbol, lines, content) {
const declarationLine = symbol.startLine - 1; // Convert to 0-based
if (declarationLine < 0 || declarationLine >= lines.length) {
const fallbackContent = ''; // declarationLine is out of range here, so there is no line content to recover
return {
startLine: symbol.startLine,
endLine: symbol.startLine,
symbolContent: fallbackContent
};
}
const line = lines[declarationLine].trim();
let startIdx = declarationLine;
let endIdx = declarationLine;
// Find preceding comments
while (startIdx > 0) {
const prevLine = lines[startIdx - 1].trim();
if (prevLine === '' || prevLine.startsWith('//') || prevLine.startsWith('/*') ||
prevLine.startsWith('*') || prevLine.includes('*/')) {
startIdx--;
}
else {
break;
}
}
// Expand based on symbol type
if (symbol.type === 'class' || symbol.type === 'interface') {
endIdx = this.findBlockEnd(declarationLine, lines);
}
else if (symbol.type === 'function' || line.includes('=>')) {
if (line.includes('{')) {
endIdx = this.findBlockEnd(declarationLine, lines);
}
else {
// Simple arrow function or single line
endIdx = this.findStatementEnd(declarationLine, lines);
}
}
else {
// Variable, type, etc. - find statement end
endIdx = this.findStatementEnd(declarationLine, lines);
}
const symbolContent = lines.slice(startIdx, endIdx + 1).join('\n');
return {
startLine: startIdx + 1, // Convert back to 1-based
endLine: endIdx + 1,
symbolContent
};
}
/**
* Find block end using brace matching
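*
* Scans character-by-character from the start line and returns the line where
* the brace count returns to zero after the first '{'. This is a heuristic:
* braces inside strings or comments are counted too. Falls back to
* startLine + 50 if no balanced close is found.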
*/
findBlockEnd(startLineIndex, lines) {
let braceCount = 0;
let foundOpenBrace = false;
for (let i = startLineIndex; i < lines.length; i++) {
const line = lines[i];
for (const char of line) {
if (char === '{') {
braceCount++;
foundOpenBrace = true;
}
else if (char === '}') {
braceCount--;
if (foundOpenBrace && braceCount === 0) {
return i;
}
}
}
}
return Math.min(startLineIndex + 50, lines.length - 1);
}
/**
* Find statement end (for variables, simple functions, etc.)
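*
* @example
* // const x = foo(
* //   1,
* //   2
* // );   <- returned here: all brackets balanced and the line ends with ';'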
*/
findStatementEnd(startLineIndex, lines) {
let parenCount = 0;
let braceCount = 0;
let bracketCount = 0;
for (let i = startLineIndex; i < lines.length; i++) {
const line = lines[i];
// Count brackets
for (const char of line) {
switch (char) {
case '(':
parenCount++;
break;
case ')':
parenCount--;
break;
case '{':
braceCount++;
break;
case '}':
braceCount--;
break;
case '[':
bracketCount++;
break;
case ']':
bracketCount--;
break;
}
}
// Check if statement is complete
const trimmedLine = line.trim();
// Note: endsWith(';') also covers lines ending in ');'
if ((parenCount === 0 && braceCount === 0 && bracketCount === 0) &&
(trimmedLine.endsWith(';') || trimmedLine.endsWith('}'))) {
return i;
}
// Safety: stop at next declaration or max lines
if (i > startLineIndex &&
(trimmedLine.match(/^(const|let|var|function|class|interface|type)\s+/) ||
i - startLineIndex > 20)) {
return i - 1;
}
}
return Math.min(startLineIndex + 20, lines.length - 1);
}
generateChunkId(filePath, startLine, content) {
const input = `${filePath}:${startLine}:${content}`;
const hash = crypto.createHash('sha256').update(input, 'utf-8').digest('hex');
return `chunk_${hash.substring(0, 16)}`;
}
/**
* Get indexing status for codebases
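*
* @example
* // Return shape (note: fileCount actually carries chunk counts, see below):
* // { indexedCodebases, currentCodebase, incrementalStats, indexed, fileCount }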
*/
async getIndexingStatus(indexedCodebases, codebasePath) {
const indexedList = Array.from(indexedCodebases.values());
let currentCodebase;
let incrementalStats;
if (codebasePath) {
try {
const normalizedPath = path.resolve(codebasePath);
await fs.access(normalizedPath);
currentCodebase = indexedCodebases.get(normalizedPath);
}
catch (error) {
return {
indexedCodebases: indexedList,
indexed: false,
fileCount: 0
};
}
if (currentCodebase) {
incrementalStats = {
indexingMethod: 'full',
lastIndexed: currentCodebase.indexedAt
};
}
}
const indexed = codebasePath ? !!currentCodebase : indexedList.length > 0;
// Note: despite the name, fileCount reports chunk counts (per codebase, or summed across all)
const fileCount = currentCodebase?.totalChunks || indexedList.reduce((sum, cb) => sum + cb.totalChunks, 0);
return {
indexedCodebases: indexedList,
currentCodebase,
incrementalStats,
indexed,
fileCount
};
}
}
//# sourceMappingURL=IndexingOrchestrator.js.map