claude-code-graph
Version:
Claude Code with live structural graphs for large codebases
446 lines (378 loc) • 12.8 kB
JavaScript
/**
* Fast Tree-sitter Parser - Optimized for large codebases
* Uses shallow import-only queries for 10-20x speedup
*/
import { spawn } from 'child_process';
import { readFile, readdir } from 'fs/promises';
import { existsSync } from 'fs';
import { join, dirname, extname } from 'path';
import { fileURLToPath } from 'url';
// ES modules do not provide the CommonJS `__filename` / `__dirname` globals,
// so derive this module's file path and containing directory from its URL.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
// Query file name (.scm) for each supported source-file extension.
// JavaScript and TypeScript variants share one query file, as do C and C++.
const QUERIES = Object.fromEntries([
  ['.py', 'python.scm'],
  ...['.js', '.ts', '.jsx', '.tsx'].map((extension) => [extension, 'javascript.scm']),
  ...['.c', '.cpp', '.cc', '.cxx', '.h', '.hpp'].map((extension) => [extension, 'cpp.scm']),
]);
// Language name for each supported source-file extension (lower-case, with
// the leading dot). Extensions missing from this table have no language.
const LANGUAGES = Object.fromEntries([
  ['.py', 'python'],
  ['.js', 'javascript'],
  ['.ts', 'typescript'],
  ['.jsx', 'javascript'],
  ['.tsx', 'typescript'],
  ['.c', 'c'],
  ['.cpp', 'cpp'],
  ['.cc', 'cpp'],
  ['.cxx', 'cpp'],
  ['.h', 'c'],
  ['.hpp', 'cpp'],
]);
/**
 * Builds a shallow dependency graph (import/include nodes and edges) for a set
 * of source files. Files are grouped by language and queried through the
 * `tree-sitter` CLI; whenever that fails, a line-based regex parser is used.
 */
class FastTreeSitter {
  /**
   * @param {string} rootPath - Working directory used when spawning `tree-sitter`.
   */
  constructor(rootPath) {
    this.rootPath = rootPath;
    // Query (.scm) files are looked up in a `queries` directory beside this module.
    this.queriesDir = join(__dirname, 'queries');
  }

  /**
   * Append every node and edge of `source` onto `target`.
   *
   * Plain loops are used instead of `push(...array)` because spreading a very
   * large array into a single call can exceed the engine's argument limit.
   *
   * @param {{nodes: object[], edges: object[]}} target - Accumulator, mutated in place.
   * @param {{nodes: object[], edges: object[]}} source - Result to append.
   */
  static #mergeResult(target, source) {
    for (const node of source.nodes) {
      target.nodes.push(node);
    }
    for (const edge of source.edges) {
      target.edges.push(edge);
    }
  }

  /**
   * Main public API for parsing all files in a directory.
   * Equivalent to `parseFiles(files)` with default options.
   *
   * @param {string[]} files - File paths to parse.
   * @returns {Promise<{nodes: object[], edges: object[], metadata: object}>}
   */
  async parseAllFiles(files) {
    return this.parseFiles(files);
  }

  /**
   * Parse files with shallow import-only queries.
   *
   * @param {string[]} files - File paths; files whose extension has no known
   *   language are ignored.
   * @param {object} [options]
   * @param {number} [options.maxFiles=10000] - Only the first `maxFiles` entries are processed.
   * @param {number} [options.batchSize] - Files per tree-sitter invocation after
   *   the initial probe; a positive integer. When omitted or invalid, 20 is used.
   * @param {number} [options.timeout=600000] - Per-invocation timeout in milliseconds.
   * @returns {Promise<{nodes: object[], edges: object[], metadata: object}>}
   *   `metadata.files_processed` counts the files of every language group that
   *   was parsed without the group as a whole failing; `metadata.total_files`
   *   is the number of files considered after applying `maxFiles`.
   */
  async parseFiles(files, options = {}) {
    const {
      maxFiles = 10000, // Support massive codebases like React
      batchSize, // Forwarded to parseLanguageBatch, which applies the default
      timeout = 600000, // 10 minute timeout for huge repos
    } = options;

    console.error(`🌳 Fast parsing ${Math.min(files.length, maxFiles)} files...`);

    // Limit files but be much more generous
    const filesToProcess = files.slice(0, maxFiles);
    const graph = { nodes: [], edges: [] };
    let filesProcessed = 0;

    // Group files by language for batch processing
    const filesByLang = this.groupFilesByLanguage(filesToProcess);
    for (const [lang, langFiles] of Object.entries(filesByLang)) {
      if (langFiles.length === 0) continue;
      console.error(` Processing ${langFiles.length} ${lang} files...`);
      try {
        // Try tree-sitter first, fallback to regex parsing
        let result;
        try {
          result = await this.parseLanguageBatch(lang, langFiles, { timeout, batchSize });
        } catch (tsError) {
          console.error(` Tree-sitter unavailable for ${lang}, using regex fallback...`);
          result = await this.parseWithRegex(lang, langFiles);
        }
        FastTreeSitter.#mergeResult(graph, result);
        // Count files, not nodes: one file can contribute many import nodes or none.
        filesProcessed += langFiles.length;
      } catch (error) {
        // Best effort: one failing language must not abort the others.
        console.warn(` Failed to parse ${lang} files:`, error.message);
      }
    }

    return {
      nodes: graph.nodes,
      edges: graph.edges,
      metadata: {
        tool: 'fast-treesitter',
        timestamp: new Date().toISOString(),
        files_processed: filesProcessed,
        total_files: filesToProcess.length,
        languages: Object.keys(filesByLang),
        query_type: 'import-only',
      },
    };
  }

  /**
   * Group files by language for efficient batch processing.
   *
   * @param {string[]} files - File paths.
   * @returns {Object<string, string[]>} Language name -> files, keyed by the
   *   language looked up from each file's lower-cased extension. Files with an
   *   unknown extension are dropped.
   */
  groupFilesByLanguage(files) {
    const groups = {};
    for (const file of files) {
      const lang = LANGUAGES[extname(file).toLowerCase()];
      if (!lang) continue;
      if (!groups[lang]) groups[lang] = [];
      groups[lang].push(file);
    }
    return groups;
  }

  /**
   * Parse a batch of files for a specific language.
   *
   * A probe of up to three files is run first. If the probe fails, all files
   * are parsed with the regex fallback. Otherwise the remaining files are
   * queried in batches, and only a failing batch falls back to regex.
   *
   * @param {string} language - Language name (e.g. 'python', 'cpp').
   * @param {string[]} files - Files of that language.
   * @param {object} [options] - Passed through to `runTreeSitterQuery`.
   * @param {number} [options.batchSize] - Positive integer batch size; defaults to 20.
   * @returns {Promise<{nodes: object[], edges: object[]}>}
   * @throws {Error} When no query file exists for `language`.
   */
  async parseLanguageBatch(language, files, options = {}) {
    const queryFile = this.getQueryFile(language);
    if (!queryFile) {
      throw new Error(`No query file found for language: ${language}`);
    }
    if (files.length === 0) {
      // Nothing to query; avoid spawning tree-sitter without any input paths.
      return { nodes: [], edges: [] };
    }

    // A non-positive or non-integer size would stall or skew the batch loop.
    const { batchSize } = options;
    const chunkSize = Number.isInteger(batchSize) && batchSize > 0 ? batchSize : 20;
    const collected = { nodes: [], edges: [] };

    // Try a small test batch first to see if tree-sitter works
    const testBatch = files.slice(0, 3);
    try {
      const probe = await this.runTreeSitterQuery(language, testBatch, queryFile, options);
      FastTreeSitter.#mergeResult(collected, probe);
    } catch (error) {
      console.error(` Tree-sitter failed for ${language}: ${error.message}`);
      console.error(` Using regex fallback for all ${files.length} files...`);
      return await this.parseWithRegex(language, files);
    }

    // Tree-sitter works: process the remaining files in batches.
    const remainingFiles = files.slice(testBatch.length);
    for (let i = 0; i < remainingFiles.length; i += chunkSize) {
      const batch = remainingFiles.slice(i, i + chunkSize);
      let result;
      try {
        result = await this.runTreeSitterQuery(language, batch, queryFile, options);
      } catch (error) {
        console.warn(` Batch ${i}-${i + batch.length} failed, using regex for this batch...`);
        result = await this.parseWithRegex(language, batch);
      }
      FastTreeSitter.#mergeResult(collected, result);
    }
    return collected;
  }

  /**
   * Get the appropriate query file for a language.
   *
   * @param {string} language - Language name.
   * @returns {string|null} Path of the query file inside `this.queriesDir`, or
   *   `null` when the language is unknown or the file does not exist.
   */
  getQueryFile(language) {
    // Map language to query file
    const queryMap = {
      python: 'python.scm',
      javascript: 'javascript.scm',
      typescript: 'javascript.scm', // TS uses JS query
      c: 'cpp.scm',
      cpp: 'cpp.scm',
    };
    const queryFileName = queryMap[language];
    if (!queryFileName) return null;
    const queryPath = join(this.queriesDir, queryFileName);
    return existsSync(queryPath) ? queryPath : null;
  }

  /**
   * Run tree-sitter query on a batch of files.
   *
   * Spawns `tree-sitter query <queryFile> --captures <files...>` with
   * `this.rootPath` as working directory and parses its stdout.
   *
   * @param {string} language - Language name (not used by the command itself).
   * @param {string[]} files - Files passed to the CLI.
   * @param {string} queryFile - Path of the .scm query file.
   * @param {object} [options]
   * @param {number} [options.timeout=30000] - Milliseconds before the child is killed.
   * @returns {Promise<{nodes: object[], edges: object[]}>} Rejects on spawn
   *   error, non-zero exit code, timeout, or unparsable output.
   */
  async runTreeSitterQuery(language, files, queryFile, options = {}) {
    const { timeout = 30000 } = options;
    return new Promise((resolve, reject) => {
      // Build tree-sitter command with shallow query
      // Note: tree-sitter CLI syntax varies by version
      const args = ['query', queryFile, '--captures', ...files];
      console.error(` Debug: Running tree-sitter ${args.join(' ')}`);

      const child = spawn('tree-sitter', args, {
        cwd: this.rootPath,
        stdio: ['pipe', 'pipe', 'pipe'],
      });

      // Decode as UTF-8 on the stream so a multi-byte character split across
      // two chunks is not corrupted by per-chunk string conversion.
      child.stdout.setEncoding('utf8');
      child.stderr.setEncoding('utf8');
      let stdout = '';
      let stderr = '';
      child.stdout.on('data', (chunk) => {
        stdout += chunk;
      });
      child.stderr.on('data', (chunk) => {
        stderr += chunk;
      });

      let timedOut = false;
      const timer = setTimeout(() => {
        timedOut = true;
        child.kill();
        reject(new Error(`Tree-sitter query timeout after ${timeout}ms`));
      }, timeout);

      child.on('close', (code) => {
        clearTimeout(timer);
        // The promise was already rejected by the timeout; skip the misleading
        // failure diagnostics for the killed child.
        if (timedOut) return;
        if (code === 0) {
          try {
            const result = this.parseTreeSitterOutput(stdout, files);
            console.error(` Debug: Parsed ${result.nodes.length} nodes, ${result.edges.length} edges`);
            resolve(result);
          } catch (error) {
            console.error(` Debug: Parse output error: ${error.message}`);
            reject(new Error(`Failed to parse tree-sitter output: ${error.message}`));
          }
        } else {
          console.error(` Debug: Tree-sitter failed with code ${code}`);
          console.error(` Debug: stderr: ${stderr}`);
          console.error(` Debug: stdout: ${stdout}`);
          reject(new Error(`Tree-sitter failed with code ${code}: ${stderr}`));
        }
      });

      child.on('error', (error) => {
        clearTimeout(timer);
        reject(error);
      });
    });
  }

  /**
   * Parse tree-sitter capture output into nodes and edges.
   *
   * Each line is expected to look like `file:line:col: @capture.name "value"`;
   * lines that do not match are skipped.
   * NOTE(review): the CLI's output format varies by version; confirm this
   * pattern against the installed tree-sitter.
   *
   * @param {string} output - Raw stdout of the tree-sitter command.
   * @param {string[]} files - Files that were queried (currently unused).
   * @returns {{nodes: object[], edges: object[]}} One node per matching line;
   *   an edge is added when the capture name contains 'import' or 'include'.
   */
  parseTreeSitterOutput(output, files) {
    const nodes = [];
    const edges = [];
    const capturePattern = /^(.+?):(\d+):(\d+):\s*@(\S+)\s+"(.+)"$/;

    // Split on LF or CRLF so a trailing '\r' cannot defeat the `$` anchor.
    const rawLines = output.trim().split(/\r?\n/).filter((text) => text.trim());
    for (const rawLine of rawLines) {
      try {
        const match = rawLine.match(capturePattern);
        if (!match) continue;

        // Distinct names here: reusing the loop variable's name for the
        // captured line number would shadow it and break the match above.
        const [, file, lineText, colText, captureName, value] = match;
        const lineNumber = Number.parseInt(lineText, 10);
        const column = Number.parseInt(colText, 10);

        // Create node for the import/include
        nodes.push({
          id: `${file}:${lineText}:${colText}`,
          type: this.getCaptureType(captureName),
          file,
          line: lineNumber,
          column,
          name: value,
          capture: captureName,
        });

        // Create edge for the dependency relationship
        if (captureName.includes('import') || captureName.includes('include')) {
          edges.push({
            from: file,
            to: value,
            type: 'dependency',
            source_line: lineNumber,
            capture_type: captureName,
          });
        }
      } catch (error) {
        console.warn(`Failed to parse line: ${rawLine}`, error.message);
      }
    }
    return { nodes, edges };
  }

  /**
   * Get semantic type from capture name.
   *
   * @param {string} captureName - Capture name without the leading '@'.
   * @returns {'import'|'include'|'require'|'dependency'} The first of
   *   'import', 'include', 'require' contained in the name, else 'dependency'.
   */
  getCaptureType(captureName) {
    for (const kind of ['import', 'include', 'require']) {
      if (captureName.includes(kind)) return kind;
    }
    return 'dependency';
  }

  /**
   * Fallback regex-based parsing when tree-sitter is unavailable.
   * Files are read one at a time as UTF-8; a file that cannot be read is
   * logged and skipped.
   *
   * @param {string} language - Language name selecting the regex set.
   * @param {string[]} files - File paths to read.
   * @returns {Promise<{nodes: object[], edges: object[]}>}
   */
  async parseWithRegex(language, files) {
    const collected = { nodes: [], edges: [] };
    for (const file of files) {
      try {
        const content = await readFile(file, 'utf8');
        FastTreeSitter.#mergeResult(collected, this.extractImportsWithRegex(content, file, language));
      } catch (error) {
        console.warn(` Failed to read ${file}:`, error.message);
      }
    }
    return collected;
  }

  /**
   * Extract imports using language-specific regex patterns.
   * Each line is trimmed and matched on its own, so statements spanning
   * several lines are not detected. Reported line numbers are 1-based.
   *
   * @param {string} content - File contents.
   * @param {string} filePath - Path recorded on the produced nodes and edges.
   * @param {string} language - 'python', 'javascript', 'typescript', 'c' or
   *   'cpp'; any other value yields an empty result.
   * @returns {{nodes: object[], edges: object[]}}
   */
  extractImportsWithRegex(content, filePath, language) {
    const nodes = [];
    const edges = [];
    const record = (lineNumber, target, kind) =>
      this.addImportNode(nodes, edges, filePath, lineNumber, target, kind);

    for (const [index, text] of content.split('\n').entries()) {
      const trimmed = text.trim();
      const lineNumber = index + 1;
      let matches;

      switch (language) {
        case 'python':
          // import module
          matches = trimmed.match(/^import\s+([a-zA-Z_][a-zA-Z0-9_]*(?:\.[a-zA-Z_][a-zA-Z0-9_]*)*)/);
          if (matches) record(lineNumber, matches[1], 'import');
          // from module import ...
          matches = trimmed.match(/^from\s+([a-zA-Z_][a-zA-Z0-9_]*(?:\.[a-zA-Z_][a-zA-Z0-9_]*)*)\s+import/);
          if (matches) record(lineNumber, matches[1], 'from_import');
          break;
        case 'javascript':
        case 'typescript':
          // import ... from 'module'
          matches = trimmed.match(/import\s+.*\s+from\s+['"']([^'"]+)['"']/);
          if (matches) record(lineNumber, matches[1], 'es6_import');
          // require('module')
          matches = trimmed.match(/require\s*\(\s*['"']([^'"]+)['"']\s*\)/);
          if (matches) record(lineNumber, matches[1], 'require');
          break;
        case 'c':
        case 'cpp':
          // #include "header.h" or #include <header.h>
          matches = trimmed.match(/^#include\s*[<"]([^>"]+)[>"]/);
          if (matches) record(lineNumber, matches[1], 'include');
          break;
        default:
          break;
      }
    }
    return { nodes, edges };
  }

  /**
   * Add an import node and edge.
   *
   * @param {object[]} nodes - Node list, mutated in place.
   * @param {object[]} edges - Edge list, mutated in place.
   * @param {string} filePath - File containing the import.
   * @param {number} lineNum - 1-based line number of the import.
   * @param {string} importTarget - Imported module or header name.
   * @param {string} importType - Kind of import; also recorded as `regex_<importType>`.
   */
  addImportNode(nodes, edges, filePath, lineNum, importTarget, importType) {
    const capture = `regex_${importType}`;
    nodes.push({
      id: `${filePath}:${lineNum}`,
      type: importType,
      file: filePath,
      line: lineNum,
      name: importTarget,
      capture,
    });
    edges.push({
      from: filePath,
      to: importTarget,
      type: 'dependency',
      source_line: lineNum,
      capture_type: capture,
    });
  }
}
// CLI interface
// Run only when this module is the process entry point. Filesystem paths are
// compared instead of hand-building a `file://` string, so the check also
// holds when the script path contains characters that are percent-encoded in
// a URL (spaces, non-ASCII) and on Windows, where file URLs look like
// `file:///C:/...`.
if (process.argv[1] && fileURLToPath(import.meta.url) === process.argv[1]) {
  // argv layout: node <script> <rootPath> <file1> [file2] ...
  const [, , rootPath, ...files] = process.argv;
  if (!rootPath || files.length === 0) {
    console.error('Usage: fast-treesitter.js <rootPath> <file1> [file2] ...');
    process.exit(1);
  }
  const parser = new FastTreeSitter(rootPath);
  try {
    const result = await parser.parseFiles(files);
    // The graph goes to stdout as JSON; progress and diagnostics go to stderr.
    console.log(JSON.stringify(result, null, 2));
  } catch (error) {
    console.error('Parse failed:', error.message);
    process.exit(1);
  }
}
export { FastTreeSitter };