/*
 * bigparse
 * Version:
 * MCP server that gives Claude instant, intelligent access to your codebase using the Language Server Protocol.
 * 468 lines (407 loc) • 13.9 kB
 * text/typescript
 */
import { glob } from 'glob';
import * as path from 'path';
import * as fs from 'fs/promises';
import { createHash } from 'crypto';
import { LSPManager } from '../lsp/manager.js';
import { FileCache } from './cache.js';
import { EventEmitter } from 'events';
/**
 * Index record kept for a single source file.
 *
 * Records are held in memory keyed by file path and are also written to the
 * per-file cache and to the JSON index file.
 */
interface IndexedFile {
  /** Path of the file as it was discovered during indexing. */
  path: string;

  /** Language name derived from the file extension (`'plaintext'` if unrecognised). */
  language: string;

  /** File size in bytes as reported by `fs.stat`. */
  size: number;

  /** Hex-encoded SHA-256 digest of the file content; used to validate cache entries. */
  hash: string;

  /** Modification time reported by `fs.stat` when the file was indexed. */
  lastModified: Date;

  /** Symbols extracted from the file; empty when no symbols could be obtained. */
  symbols: SymbolInfo[];

  /** Optional cached copy of the file content. */
  content?: string;
}
/**
 * A named symbol found in a source file, together with the range it spans.
 */
interface SymbolInfo {
  /** Symbol name as reported by the language server. */
  name: string;

  /** Human-readable symbol kind, e.g. `'Class'` or `'Function'`. */
  kind: string;

  /** Source range covered by the symbol, copied from the language server's range. */
  location: {
    /** First position of the range. */
    start: { line: number; character: number };
    /** Last position of the range. */
    end: { line: number; character: number };
  };

  /** Name of the enclosing symbol, when the language server supplies one. */
  containerName?: string;
}
/**
 * All matches that a search produced inside one file.
 */
interface SearchResult {
  /** Path of the file the matches were found in. */
  file: string;

  /** Individual hits, in the order they were found. */
  matches: {
    /** 1-based line number of the hit. */
    line: number;
    /** 1-based column at which the hit starts. */
    column: number;
    /** Trimmed (and length-limited) text of the line containing the hit. */
    text: string;
    /** Indexed symbol whose range contains the hit, if any. */
    symbol?: SymbolInfo;
  }[];
}
/**
 * Builds, persists and searches an index of the source files in a codebase.
 *
 * For every discovered file the indexer stores size, content hash,
 * modification time and the symbols reported by the language server. The
 * index lives in memory, is mirrored to `index.json` inside a per-workspace
 * cache directory, and individual entries are additionally kept in a
 * hash-validated file cache.
 *
 * Events emitted while indexing:
 * - `indexing-start`    `{ basePath, languages }`
 * - `files-discovered`  `{ total }`
 * - `indexing-progress` `{ processed, total, filesIndexed, skipped, errors }`
 * - `indexing-complete` `{ filesIndexed, skipped, errors, duration }`
 * - `indexing-error`    `{ error }`
 */
export class CodeIndexer extends EventEmitter {
  /** Files larger than this (bytes) are not indexed. */
  private static readonly MAX_INDEX_FILE_SIZE = 20 * 1024 * 1024;
  /** Files larger than this (bytes) are not searched. */
  private static readonly MAX_SEARCH_FILE_SIZE = 10 * 1024 * 1024;
  /** Upper bound on the number of matches collected from a single file. */
  private static readonly MAX_MATCHES_PER_FILE = 100;
  /** Maximum number of characters of a matching line kept in a result. */
  private static readonly MAX_MATCH_TEXT_LENGTH = 200;
  /** Number of files searched concurrently. */
  private static readonly SEARCH_BATCH_SIZE = 5;
  /** Number of file results returned when the caller gives no usable limit. */
  private static readonly DEFAULT_SEARCH_LIMIT = 50;

  /**
   * Lower-case file extension -> language name. Covers every extension that
   * the glob patterns below can discover.
   */
  private static readonly LANGUAGE_BY_EXTENSION: ReadonlyMap<string, string> = new Map<string, string>([
    ['.ts', 'typescript'],
    ['.tsx', 'typescript'],
    ['.js', 'javascript'],
    ['.jsx', 'javascript'],
    ['.py', 'python'],
    ['.rs', 'rust'],
    ['.go', 'go'],
    ['.java', 'java'],
    ['.cpp', 'cpp'],
    ['.cc', 'cpp'],
    ['.cxx', 'cpp'],
    ['.hpp', 'cpp'],
    ['.c', 'c'],
    // Headers are shared by C, C++ and Objective-C; 'c' is used as the default.
    ['.h', 'c'],
    ['.cs', 'csharp'],
    ['.rb', 'ruby'],
    ['.php', 'php'],
    ['.swift', 'swift'],
    ['.kt', 'kotlin'],
    ['.scala', 'scala'],
    ['.r', 'r'],
    ['.m', 'objc'],
    ['.mm', 'objcpp'],
  ]);

  /** Language name -> glob patterns used to discover that language's files. */
  private static readonly PATTERNS_BY_LANGUAGE: ReadonlyMap<string, readonly string[]> = new Map<
    string,
    readonly string[]
  >([
    ['typescript', ['**/*.ts', '**/*.tsx']],
    ['javascript', ['**/*.js', '**/*.jsx']],
    ['python', ['**/*.py']],
    ['rust', ['**/*.rs']],
    ['go', ['**/*.go']],
    ['java', ['**/*.java']],
    ['cpp', ['**/*.cpp', '**/*.cc', '**/*.cxx', '**/*.hpp', '**/*.h']],
    ['c', ['**/*.c', '**/*.h']],
    ['csharp', ['**/*.cs']],
    ['ruby', ['**/*.rb']],
    ['php', ['**/*.php']],
    ['swift', ['**/*.swift']],
    ['kotlin', ['**/*.kt']],
    ['scala', ['**/*.scala']],
    ['r', ['**/*.r', '**/*.R']],
    ['objc', ['**/*.m', '**/*.h']],
    ['objcpp', ['**/*.mm']],
  ]);

  /** Symbol kind names; numeric kind `k` maps to the entry at index `k - 1`. */
  private static readonly SYMBOL_KINDS: readonly string[] = [
    'File', 'Module', 'Namespace', 'Package', 'Class', 'Method',
    'Property', 'Field', 'Constructor', 'Enum', 'Interface',
    'Function', 'Variable', 'Constant', 'String', 'Number',
    'Boolean', 'Array', 'Object', 'Key', 'Null', 'EnumMember',
    'Struct', 'Event', 'Operator', 'TypeParameter',
  ];

  private index: Map<string, IndexedFile> = new Map();
  private lspManager: LSPManager;
  private indexPath = '.index';
  private cache: FileCache<IndexedFile>;
  // Never assigned in this class; closed on shutdown if something sets it.
  private fileWatcher: { close(): unknown } | undefined;
  private isIndexing = false;
  private maxConcurrentIndexing = 4;

  /**
   * @param rootPath Workspace root. It is handed to the LSP manager and also
   *   selects the cache directory, so different workspaces do not share an
   *   index. Without it the `'default'` cache directory is used.
   */
  constructor(rootPath?: string) {
    super();
    // Use a proper cache directory in user's home folder for MCP servers
    const homeDir = process.env.HOME || process.env.USERPROFILE || '';
    const cacheBaseDir = process.env.MCP_CACHE_DIR || path.join(homeDir, '.cache', 'bigparse');
    // Base64 of the root path with the characters '/', '+' and '=' replaced,
    // so the id can be used as a directory name.
    const workspaceId = rootPath
      ? Buffer.from(rootPath).toString('base64').replace(/[/+=]/g, '_')
      : 'default';
    this.indexPath = path.join(cacheBaseDir, workspaceId);
    this.lspManager = new LSPManager(rootPath);
    this.cache = new FileCache<IndexedFile>(path.join(this.indexPath, 'cache'));
  }

  /**
   * Prepares the file cache and the LSP manager, then restores a previously
   * saved index if one exists. A missing or unreadable index is not an error.
   */
  async initialize(): Promise<void> {
    await this.cache.initialize();
    await this.lspManager.initialize();
    await this.loadIndex();
  }

  /**
   * Discovers and indexes the source files below `basePath`.
   *
   * Files are processed in batches of `maxConcurrentIndexing`. A failure on
   * one file is recorded in `errors` and does not stop the run; a failure of
   * the run itself (e.g. discovery or saving) is recorded as
   * `Indexing failed: …` and reported through the `indexing-error` event.
   *
   * @param basePath Directory to scan.
   * @param languages Optional language names restricting which files are
   *   scanned; all known languages when omitted or empty.
   * @returns Counts of newly indexed and skipped (cached or oversized) files,
   *   collected error messages and the elapsed time in milliseconds.
   * @throws Error if another indexing run is still in progress.
   */
  async indexCodebase(basePath: string, languages?: string[]): Promise<{
    filesIndexed: number;
    errors: string[];
    duration: number;
    skipped: number;
  }> {
    if (this.isIndexing) {
      throw new Error('Indexing already in progress');
    }
    this.isIndexing = true;
    const startTime = Date.now();
    const errors: string[] = [];
    let filesIndexed = 0;
    let skipped = 0;
    try {
      await this.ensureIndexDirectory();
      this.emit('indexing-start', { basePath, languages });
      const files = await this.discoverFiles(basePath, languages);
      this.emit('files-discovered', { total: files.length });
      // Process files in batches for better performance
      const batchSize = this.maxConcurrentIndexing;
      for (let i = 0; i < files.length; i += batchSize) {
        const batch = files.slice(i, i + batchSize);
        const results = await Promise.allSettled(batch.map(file => this.indexFile(file)));
        results.forEach((result, index) => {
          if (result.status === 'fulfilled') {
            if (result.value === 'indexed') {
              filesIndexed++;
            } else {
              skipped++;
            }
          } else {
            console.error(`Failed to index ${batch[index]}:`, result.reason);
            errors.push(`Failed to index ${batch[index]}: ${result.reason}`);
          }
        });
        this.emit('indexing-progress', {
          processed: i + batch.length,
          total: files.length,
          filesIndexed,
          skipped,
          errors: errors.length,
        });
      }
      await this.saveIndex();
      this.emit('indexing-complete', {
        filesIndexed,
        skipped,
        errors: errors.length,
        duration: Date.now() - startTime,
      });
    } catch (error) {
      errors.push(`Indexing failed: ${error}`);
      this.emit('indexing-error', { error });
    } finally {
      this.isIndexing = false;
    }
    return {
      filesIndexed,
      errors,
      duration: Date.now() - startTime,
      skipped,
    };
  }

  /**
   * Searches the indexed files for `query`.
   *
   * @param query Regular expression source, matched case-insensitively. If it
   *   is not a valid regular expression it is searched for literally.
   * @param options.fileTypes Extensions to restrict the search to; accepted
   *   with or without the leading dot (`'.ts'` or `'ts'`).
   * @param options.limit Maximum number of files returned (default 50; values
   *   that are not positive numbers fall back to the default).
   * @returns One entry per file that contains at least one match.
   */
  async searchCode(query: string, options: {
    fileTypes?: string[];
    limit?: number;
  } = {}): Promise<SearchResult[]> {
    const requestedLimit = options.limit;
    const limit =
      typeof requestedLimit === 'number' && requestedLimit > 0
        ? requestedLimit
        : CodeIndexer.DEFAULT_SEARCH_LIMIT;
    // path.extname() yields '.ts', so normalise 'ts' to '.ts' before comparing.
    const extensions = new Set(
      (options.fileTypes ?? []).map(type => (type.startsWith('.') ? type : `.${type}`))
    );
    const candidates = Array.from(this.index.entries()).filter(
      ([filePath]) => extensions.size === 0 || extensions.has(path.extname(filePath))
    );

    const results: SearchResult[] = [];
    let totalMatches = 0;
    // Process files in smaller batches to avoid memory issues
    const batchSize = CodeIndexer.SEARCH_BATCH_SIZE;
    for (let i = 0; i < candidates.length && results.length < limit; i += batchSize) {
      const batch = candidates.slice(i, i + batchSize);
      const batchResults = await Promise.all(
        batch.map(([filePath, indexedFile]) => this.searchInFile(filePath, query, indexedFile))
      );
      for (const fileResult of batchResults) {
        if (fileResult.matches.length === 0) {
          continue;
        }
        results.push(fileResult);
        totalMatches += fileResult.matches.length;
        if (results.length >= limit) {
          break;
        }
      }
      // Force garbage collection hint (only available when node exposes gc)
      if (global.gc) {
        global.gc();
      }
    }
    console.error(`Search complete: found ${totalMatches} matches in ${results.length} files`);
    return results;
  }

  /**
   * Resolves the glob patterns for `languages` below `basePath` and returns
   * each matching file exactly once, even when several patterns match it.
   */
  private async discoverFiles(basePath: string, languages?: string[]): Promise<string[]> {
    const discovered = new Set<string>();
    for (const pattern of this.getFilePatterns(languages)) {
      const matches = await glob(pattern, {
        cwd: basePath,
        absolute: true,
        ignore: ['**/node_modules/**', '**/dist/**', '**/.git/**', '**/.*/**'],
      });
      for (const match of matches) {
        discovered.add(match);
      }
    }
    return Array.from(discovered);
  }

  /**
   * Indexes one file.
   *
   * @returns `'skipped'` for files above the size limit, `'cached'` when the
   *   cache holds an entry for the current content hash, `'indexed'` otherwise.
   * @throws Any I/O or cache error, so the caller can report the file as failed.
   */
  private async indexFile(filePath: string): Promise<'indexed' | 'cached' | 'skipped'> {
    const stats = await fs.stat(filePath);
    // Skip very large files to prevent memory issues
    if (stats.size > CodeIndexer.MAX_INDEX_FILE_SIZE) {
      console.error(`Skipping ${filePath} - file too large (${(stats.size / 1024 / 1024).toFixed(2)}MB)`);
      return 'skipped';
    }
    const content = await fs.readFile(filePath, 'utf-8');
    const hash = createHash('sha256').update(content).digest('hex');

    // Check cache first; the hash ties the entry to the current content.
    const cachedData = await this.cache.get(filePath, hash);
    if (cachedData) {
      this.index.set(filePath, CodeIndexer.reviveIndexedFile(cachedData));
      return 'cached';
    }

    const language = this.detectLanguage(filePath);
    const symbols = await this.extractSymbols(filePath, language);
    const indexedFile: IndexedFile = {
      path: filePath,
      language,
      size: stats.size,
      hash,
      lastModified: stats.mtime,
      symbols,
    };
    this.index.set(filePath, indexedFile);
    await this.cache.set(filePath, indexedFile, hash);
    return 'indexed';
  }

  /**
   * Asks the LSP manager for the document symbols of `filePath` and converts
   * them to `SymbolInfo`. Symbols without `location.range` are skipped with a
   * warning. Returns an empty list when the request fails.
   */
  private async extractSymbols(filePath: string, _language: string): Promise<SymbolInfo[]> {
    try {
      const lspSymbols = await this.lspManager.getDocumentSymbols(filePath);
      const symbols: SymbolInfo[] = [];
      for (const symbol of lspSymbols) {
        // SymbolInformation always has location.range
        const range = symbol.location?.range;
        if (!range) {
          console.warn(`Symbol ${symbol.name} has no range information`);
          continue;
        }
        symbols.push({
          name: symbol.name,
          kind: this.symbolKindToString(symbol.kind),
          location: {
            start: { line: range.start.line, character: range.start.character },
            end: { line: range.end.line, character: range.end.character },
          },
          containerName: symbol.containerName,
        });
      }
      return symbols;
    } catch (error) {
      // Language server might not be available, continue without symbols
      console.error(`Failed to extract symbols from ${filePath}:`, error);
      return [];
    }
  }

  /**
   * Compiles `query` as a global, case-insensitive regular expression. A
   * query that is not valid regex syntax is escaped and matched literally.
   */
  private buildSearchRegex(query: string): RegExp {
    try {
      return new RegExp(query, 'gi');
    } catch {
      const literal = query.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
      return new RegExp(literal, 'gi');
    }
  }

  /**
   * Finds the matches of `query` in one file.
   *
   * Lines are scanned in order and at most `MAX_MATCHES_PER_FILE` matches are
   * collected. Zero-length matches are ignored. Read errors are logged and
   * produce a result with the matches found so far.
   */
  private async searchInFile(filePath: string, query: string, indexedFile: IndexedFile): Promise<SearchResult> {
    const matches: SearchResult['matches'] = [];
    const maxMatches = CodeIndexer.MAX_MATCHES_PER_FILE;
    try {
      // Skip very large files to prevent memory issues
      const stats = await fs.stat(filePath);
      if (stats.size > CodeIndexer.MAX_SEARCH_FILE_SIZE) {
        console.error(`Skipping search in ${filePath} - file too large (${(stats.size / 1024 / 1024).toFixed(2)}MB)`);
        return { file: filePath, matches };
      }
      const content = await fs.readFile(filePath, 'utf-8');
      const regex = this.buildSearchRegex(query);
      for (const [lineIndex, line] of content.split(/\r?\n/).entries()) {
        if (matches.length >= maxMatches) {
          break;
        }
        // The regex is global, so its cursor must restart for every line.
        regex.lastIndex = 0;
        for (let match = regex.exec(line); match !== null; match = regex.exec(line)) {
          if (match[0].length === 0) {
            // An empty match does not move the cursor; step past it manually.
            regex.lastIndex += 1;
            continue;
          }
          matches.push({
            line: lineIndex + 1,
            column: match.index + 1,
            text: line.trim().substring(0, CodeIndexer.MAX_MATCH_TEXT_LENGTH), // Limit text length
            symbol: this.findSymbolAtLocation(indexedFile.symbols, lineIndex, match.index),
          });
          // Limit matches per file
          if (matches.length >= maxMatches) {
            break;
          }
        }
      }
    } catch (error) {
      console.error(`Failed to search in ${filePath}:`, error);
    }
    return { file: filePath, matches };
  }

  /**
   * Returns the innermost symbol whose range contains the given position
   * (both range ends inclusive), or `undefined` when no symbol contains it.
   * When several symbols have the same range the first one wins.
   */
  private findSymbolAtLocation(symbols: SymbolInfo[], line: number, column: number): SymbolInfo | undefined {
    let innermost: SymbolInfo | undefined;
    for (const symbol of symbols) {
      const { start, end } = symbol.location;
      const atOrAfterStart = line > start.line || (line === start.line && column >= start.character);
      const atOrBeforeEnd = line < end.line || (line === end.line && column <= end.character);
      if (!atOrAfterStart || !atOrBeforeEnd) {
        continue;
      }
      if (innermost === undefined || CodeIndexer.isNarrower(symbol, innermost)) {
        innermost = symbol;
      }
    }
    return innermost;
  }

  /**
   * Whether `candidate` is nested more deeply than `current`: it starts
   * later, or starts at the same position and ends earlier.
   */
  private static isNarrower(candidate: SymbolInfo, current: SymbolInfo): boolean {
    const compare = (
      a: { line: number; character: number },
      b: { line: number; character: number }
    ): number => a.line - b.line || a.character - b.character;
    const startOrder = compare(candidate.location.start, current.location.start);
    if (startOrder !== 0) {
      return startOrder > 0;
    }
    return compare(candidate.location.end, current.location.end) < 0;
  }

  /**
   * Maps a file path to a language name by its extension (case-insensitive);
   * unknown extensions yield `'plaintext'`.
   */
  private detectLanguage(filePath: string): string {
    const ext = path.extname(filePath).toLowerCase();
    return CodeIndexer.LANGUAGE_BY_EXTENSION.get(ext) ?? 'plaintext';
  }

  /**
   * Returns the glob patterns for the given language names (matched
   * case-insensitively; unknown names are ignored), or for all known
   * languages when none are given. Each pattern appears once.
   */
  private getFilePatterns(languages?: string[]): string[] {
    const table = CodeIndexer.PATTERNS_BY_LANGUAGE;
    const selected =
      languages && languages.length > 0
        ? languages.map(lang => table.get(lang.toLowerCase()) ?? [])
        : Array.from(table.values());
    // Several languages share patterns (e.g. '**/*.h'); keep the first occurrence only.
    return Array.from(new Set(selected.flat()));
  }

  /** Converts a 1-based numeric symbol kind to its name, or `'Unknown'`. */
  private symbolKindToString(kind: number): string {
    return CodeIndexer.SYMBOL_KINDS[kind - 1] ?? 'Unknown';
  }

  /**
   * Creates the index directory (and any missing parents). An existing
   * directory is fine; real failures such as missing permissions are thrown.
   */
  private async ensureIndexDirectory(): Promise<void> {
    await fs.mkdir(this.indexPath, { recursive: true });
  }

  /** Writes the in-memory index to `index.json`, creating the directory if needed. */
  private async saveIndex(): Promise<void> {
    await this.ensureIndexDirectory();
    const indexData = Array.from(this.index.entries());
    await fs.writeFile(
      path.join(this.indexPath, 'index.json'),
      JSON.stringify(indexData, null, 2)
    );
  }

  /**
   * Normalises an index entry that may have been through JSON: turns
   * `lastModified` back into a `Date` and guarantees a `symbols` array.
   */
  private static reviveIndexedFile(entry: IndexedFile): IndexedFile {
    return {
      ...entry,
      lastModified: new Date(entry.lastModified),
      symbols: Array.isArray(entry.symbols) ? entry.symbols : [],
    };
  }

  /**
   * Replaces the in-memory index with the content of `index.json`.
   *
   * Never throws: a missing file is silently ignored, any other read or parse
   * problem is logged and leaves the current index untouched. Malformed
   * entries inside an otherwise valid file are dropped.
   */
  async loadIndex(): Promise<void> {
    const indexFile = path.join(this.indexPath, 'index.json');
    let raw: string;
    try {
      raw = await fs.readFile(indexFile, 'utf-8');
    } catch (error) {
      const missing =
        typeof error === 'object' && error !== null && (error as { code?: unknown }).code === 'ENOENT';
      if (!missing) {
        console.error(`Failed to read index ${indexFile}:`, error);
      }
      // Index doesn't exist yet (or cannot be read); keep the current one.
      return;
    }

    let parsed: unknown;
    try {
      parsed = JSON.parse(raw);
    } catch (error) {
      console.error(`Ignoring corrupt index ${indexFile}:`, error);
      return;
    }
    if (!Array.isArray(parsed)) {
      console.error(`Ignoring index ${indexFile}: expected a list of entries`);
      return;
    }

    const restored = new Map<string, IndexedFile>();
    for (const entry of parsed as unknown[]) {
      if (!Array.isArray(entry)) {
        continue;
      }
      const [key, value] = entry as unknown[];
      if (typeof key !== 'string' || typeof value !== 'object' || value === null) {
        continue;
      }
      restored.set(key, CodeIndexer.reviveIndexedFile(value as IndexedFile));
    }
    this.index = restored;
  }

  /**
   * Saves the index and releases resources. The LSP manager is shut down and
   * the file watcher closed even if saving fails; the failure is still
   * reported to the caller afterwards.
   */
  async shutdown(): Promise<void> {
    this.isIndexing = false;
    try {
      await this.saveIndex();
    } finally {
      try {
        await this.lspManager.shutdown();
      } finally {
        if (this.fileWatcher) {
          this.fileWatcher.close();
        }
      }
    }
  }

  /** Returns the statistics reported by the file cache. */
  async getCacheStats(): Promise<any> {
    return this.cache.getStats();
  }

  /**
   * Empties the file cache and the in-memory index and removes the saved
   * `index.json`, so the cleared index cannot come back on the next load.
   */
  async clearCache(): Promise<void> {
    await this.cache.clear();
    this.index.clear();
    try {
      await fs.unlink(path.join(this.indexPath, 'index.json'));
    } catch (error) {
      const missing =
        typeof error === 'object' && error !== null && (error as { code?: unknown }).code === 'ENOENT';
      if (!missing) {
        throw error;
      }
    }
  }

  /**
   * Summarises the in-memory index: number of files, number of files per
   * language and the total number of symbols.
   */
  getIndexStats(): {
    totalFiles: number;
    languages: Record<string, number>;
    totalSymbols: number;
  } {
    const languages: Record<string, number> = {};
    let totalSymbols = 0;
    for (const file of this.index.values()) {
      languages[file.language] = (languages[file.language] ?? 0) + 1;
      totalSymbols += file.symbols.length;
    }
    return {
      totalFiles: this.index.size,
      languages,
      totalSymbols,
    };
  }
}