// mnemos-coder
// Version:
// CLI-based coding agent with graph-based execution loop and terminal UI
// 609 lines • 23.1 kB
// JavaScript
/**
* Tree-sitter based code parser for semantic chunking
* Supports TypeScript, JavaScript, and other languages
*/
import Parser from 'tree-sitter';
import TypeScript from 'tree-sitter-typescript';
import JavaScript from 'tree-sitter-javascript';
import Java from 'tree-sitter-java';
import Python from 'tree-sitter-python';
import Go from 'tree-sitter-go';
import Rust from 'tree-sitter-rust';
import Cpp from 'tree-sitter-cpp';
// import C from 'tree-sitter-c'; // Removed due to version conflicts
import * as path from 'path';
import { SymbolAwareChunker } from './SymbolAwareChunker.js';
/**
 * Splits source files into semantic chunks using tree-sitter grammars.
 *
 * A parser is created per supported grammar at construction time. Files whose
 * language has no working parser (or whose parse throws) are chunked with a
 * simple fixed-size line window instead.
 */
export class CodeParser {
  /** File extension (lower-case, without the dot) -> language identifier. */
  static #LANGUAGE_BY_EXTENSION = new Map([
    ['ts', 'typescript'],
    ['tsx', 'tsx'],
    ['js', 'javascript'],
    ['jsx', 'jsx'],
    ['mjs', 'javascript'],
    ['cjs', 'javascript'],
    ['java', 'java'],
    ['py', 'python'],
    ['go', 'go'],
    ['rs', 'rust'],
    ['cpp', 'cpp'],
    ['cc', 'cpp'],
    ['cxx', 'cpp'],
    ['c++', 'cpp'],
    ['c', 'c'],
    ['h', 'c'],
    ['hpp', 'cpp'],
  ]);

  /** Symbol type reported by the symbol-aware chunker -> legacy chunk type. */
  static #CHUNK_TYPE_BY_SYMBOL_TYPE = new Map([
    ['function', 'function'],
    ['method', 'function'],
    ['constructor', 'function'],
    ['class', 'class'],
    ['interface', 'interface'],
    ['struct', 'interface'],
    ['enum', 'interface'],
    ['condition_block', 'other'],
    ['loop_block', 'other'],
    ['import', 'import'],
    ['variable', 'variable'],
    ['other', 'other'],
  ]);

  /** AST node type -> chunk type. A Map avoids hits on Object.prototype keys. */
  static #CHUNK_TYPE_BY_NODE_TYPE = new Map(Object.entries({
    // Functions
    function_declaration: 'function',
    method_definition: 'function',
    method_declaration: 'function',
    constructor_declaration: 'function',
    arrow_function: 'function',
    function_expression: 'function',
    function_definition: 'function',
    async_function_definition: 'function',
    function_item: 'function',
    function_declarator: 'function',
    // Classes and types
    class_declaration: 'class',
    class_definition: 'class',
    class_specifier: 'class',
    interface_declaration: 'interface',
    type_alias_declaration: 'interface',
    enum_declaration: 'interface',
    annotation_type_declaration: 'interface',
    namespace_declaration: 'interface',
    type_declaration: 'interface',
    trait_item: 'interface',
    struct_item: 'interface',
    enum_item: 'interface',
    type_item: 'interface',
    impl_item: 'interface',
    struct_specifier: 'interface',
    enum_specifier: 'interface',
    namespace_definition: 'interface',
    mod_item: 'interface',
    // Variables and fields
    variable_declaration: 'variable',
    field_declaration: 'variable',
    const_declaration: 'variable',
    let_declaration: 'variable',
    assignment: 'variable',
    var_declaration: 'variable',
    static_item: 'variable',
    const_item: 'variable',
    typedef_declaration: 'variable',
    declaration: 'variable',
    // Imports and exports
    import_statement: 'import',
    import_declaration: 'import',
    package_declaration: 'import',
    export_statement: 'import',
    import_from_statement: 'import',
    use_declaration: 'import',
    using_declaration: 'import',
    preproc_include: 'import',
    package_clause: 'import',
    template_declaration: 'import',
    // Other
    decorated_definition: 'other',
    comment: 'comment',
  }));

  /** Node types that are chunked for every language. */
  static #BASE_CHUNKABLE_TYPES = [
    'function_declaration',
    'method_definition',
    'arrow_function',
    'function_expression',
    'class_declaration',
    'interface_declaration',
    'type_alias_declaration',
    'enum_declaration',
    'namespace_declaration',
    'variable_declaration',
    'const_declaration',
    'let_declaration',
  ];

  /** Extra node types appended for TypeScript / TSX sources. */
  static #TS_CHUNKABLE_TYPES = [
    'interface_declaration',
    'type_alias_declaration',
    'enum_declaration',
    'namespace_declaration',
    'module_declaration',
  ];

  /** Extra node types appended per language (matched by exact language id). */
  static #EXTRA_CHUNKABLE_TYPES = new Map([
    ['java', [
      'method_declaration',
      'constructor_declaration',
      'class_declaration',
      'interface_declaration',
      'enum_declaration',
      'annotation_type_declaration',
      'field_declaration',
      'package_declaration',
      'import_declaration',
    ]],
    ['python', [
      'function_definition',
      'async_function_definition',
      'class_definition',
      'assignment',
      'import_statement',
      'import_from_statement',
      'decorated_definition',
    ]],
    ['go', [
      'function_declaration',
      'method_declaration',
      'type_declaration',
      'var_declaration',
      'const_declaration',
      'import_declaration',
      'package_clause',
    ]],
    ['rust', [
      'function_item',
      'impl_item',
      'trait_item',
      'struct_item',
      'enum_item',
      'type_item',
      'static_item',
      'const_item',
      'use_declaration',
      'mod_item',
    ]],
    ['cpp', [
      'function_definition',
      'function_declarator',
      'class_specifier',
      'struct_specifier',
      'enum_specifier',
      'namespace_definition',
      'using_declaration',
      'preproc_include',
      'template_declaration',
    ]],
    ['c', [
      'function_definition',
      'function_declarator',
      'struct_specifier',
      'enum_specifier',
      'typedef_declaration',
      'preproc_include',
      'declaration',
    ]],
  ]);

  /** Node types whose `name` / `parameters` fields describe a function. */
  static #FUNCTION_NODE_TYPES = new Set([
    'function_declaration', 'method_definition', 'method_declaration', 'constructor_declaration',
    'function_definition', 'async_function_definition', 'function_item', 'function_declarator',
  ]);

  /** Node types whose `name` field is recorded as `metadata.className`. */
  static #CLASS_NODE_TYPES = new Set(['class_declaration', 'class_definition', 'class_specifier']);

  /** Node types whose `name` field is recorded as `metadata.typeName`. */
  static #TYPE_NODE_TYPES = new Set([
    'interface_declaration', 'type_alias_declaration', 'enum_declaration', 'annotation_type_declaration',
    'type_declaration', 'trait_item', 'struct_item', 'enum_item', 'type_item', 'struct_specifier',
    'enum_specifier', 'namespace_definition', 'mod_item',
  ]);

  /** Child node types counted as parameters of a function. */
  static #PARAMETER_NODE_TYPES = new Set([
    'required_parameter', 'optional_parameter', 'formal_parameter', 'parameter', 'parameter_declaration',
  ]);

  /** Language key -> initialized tree-sitter parser. */
  parsers = new Map();
  /** Grammar display name -> reason its parser could not be initialized. */
  parserInitErrors = new Map();
  options;
  symbolChunker;

  /**
   * @param {object} [options] Overrides for the defaults listed below.
   */
  constructor(options = {}) {
    this.options = {
      maxChunkSize: 50, // Lines; reduced from 500 to stay within token limits
      includeComments: true,
      includeImports: true,
      contextLines: 1, // Reduced context lines
      // Symbol-aware defaults
      useSymbolAwareChunking: true,
      maxTokens: 512, // Default for most embedding models
      preserveSignatures: true,
      splitLargeFunctions: true,
      includeControlFlow: true,
      ...options,
    };
    this.initializeParsers();
    // The symbol-aware chunker receives a projection of the parser options.
    const chunkingOptions = {
      maxTokens: this.options.maxTokens,
      preserveSignatures: this.options.preserveSignatures,
      includeContext: this.options.includeImports,
      contextLines: this.options.contextLines,
      minChunkSize: 20, // Minimum meaningful chunk size in tokens
      splitLargeFunctions: this.options.splitLargeFunctions,
      includeControlFlow: this.options.includeControlFlow,
      overlapRatio: 0.1, // 10% overlap by default
    };
    this.symbolChunker = new SymbolAwareChunker(chunkingOptions);
  }

  /**
   * Create one parser per available grammar and register it under every key
   * of that grammar. A grammar that is missing or fails its smoke test is
   * skipped; the reason is kept in `parserInitErrors` rather than logged.
   */
  initializeParsers() {
    const languages = [
      { name: 'TypeScript', lang: TypeScript?.typescript, keys: ['typescript', 'ts'] },
      { name: 'TSX', lang: TypeScript?.tsx, keys: ['tsx'] },
      { name: 'JavaScript', lang: JavaScript, keys: ['javascript', 'js', 'jsx'] },
      { name: 'Java', lang: Java, keys: ['java'] },
      { name: 'Python', lang: Python, keys: ['python', 'py'] },
      { name: 'Go', lang: Go, keys: ['go'] },
      { name: 'Rust', lang: Rust, keys: ['rust', 'rs'] },
      { name: 'C++', lang: Cpp, keys: ['cpp', 'cc', 'cxx', 'c++'] },
      // { name: 'C', lang: C, keys: ['c'] } // Disabled due to version conflicts
    ];
    for (const { name, lang, keys } of languages) {
      if (!lang) {
        this.parserInitErrors.set(name, 'language binding not available');
        continue;
      }
      try {
        const parser = new Parser();
        parser.setLanguage(lang);
        // Smoke test: only register the parser if it can actually parse.
        parser.parse('test');
        for (const key of keys) {
          this.parsers.set(key, parser);
        }
      } catch (error) {
        // Best effort: an unusable grammar must not prevent construction.
        this.parserInitErrors.set(name, error instanceof Error ? error.message : String(error));
      }
    }
  }

  /**
   * Detect language from file extension.
   *
   * @param {string} filePath Path or file name.
   * @returns {string} Language identifier, or 'unknown' for unmapped extensions.
   */
  detectLanguage(filePath) {
    const ext = path.extname(filePath).toLowerCase().slice(1);
    return CodeParser.#LANGUAGE_BY_EXTENSION.get(ext) ?? 'unknown';
  }

  /**
   * Parse code into semantic chunks.
   *
   * Falls back to line-based chunking when no parser is registered for the
   * detected language or when parsing/chunking throws.
   *
   * @param {string} content Full source text.
   * @param {string} filePath Path used for language detection.
   * @returns {Array<object>} Chunks with 1-based `startLine` / `endLine`.
   */
  parseCode(content, filePath) {
    const language = this.detectLanguage(filePath);
    const parser = this.parsers.get(language);
    if (!parser) {
      return this.fallbackChunking(content, language);
    }
    try {
      const tree = parser.parse(content);
      if (this.options.useSymbolAwareChunking) {
        return this.parseCodeWithSymbolAwareness(tree, content, filePath, language);
      }
      // Legacy chunking method
      const lines = content.split('\n');
      const astChunks = [];
      this.extractChunks(tree.rootNode, lines, astChunks, language);

      let chunks = astChunks;
      if (this.options.includeImports) {
        // Some languages list import nodes as chunkable, so the AST walk has
        // already produced them; adding them again would duplicate their
        // text when overlapping chunks are merged below.
        const keyOf = (chunk) => `${chunk.startLine}:${chunk.endLine}:${chunk.metadata?.nodeType}`;
        const alreadyCollected = new Set(astChunks.map(keyOf));
        const importChunks = this.extractImports(tree.rootNode, lines, language)
          .filter((chunk) => !alreadyCollected.has(keyOf(chunk)));
        chunks = [...importChunks, ...astChunks];
      }
      if (this.options.includeComments) {
        chunks = [...chunks, ...this.extractComments(content, lines, language)];
      }
      // Sort chunks by line number and merge overlapping ones
      chunks.sort((a, b) => a.startLine - b.startLine);
      return this.mergeOverlappingChunks(chunks);
    } catch (error) {
      // Deliberate best effort: any parse/chunking failure degrades to the
      // line-based fallback instead of failing the caller.
      return this.fallbackChunking(content, language);
    }
  }

  /**
   * Parse code using symbol-aware chunking.
   *
   * Converts the chunker's results to the ParsedChunk shape, optionally
   * appends regex-extracted comment chunks, and sorts by start line.
   *
   * @param {object} tree Parsed tree (its `rootNode` is handed to the chunker).
   * @param {string} content Full source text.
   * @param {string} filePath Path forwarded to the chunker.
   * @param {string} language Language identifier.
   * @returns {Array<object>} Chunks sorted by `startLine`.
   */
  parseCodeWithSymbolAwareness(tree, content, filePath, language) {
    const symbolChunks = this.symbolChunker.chunkBySymbols(tree.rootNode, content, language, filePath);
    const parsedChunks = symbolChunks.map((symbolChunk) => ({
      startLine: symbolChunk.startLine,
      endLine: symbolChunk.endLine,
      content: symbolChunk.content,
      chunkType: this.mapSymbolTypeToChunkType(symbolChunk.symbolType),
      metadata: {
        ...symbolChunk.metadata,
        symbolType: symbolChunk.symbolType,
        name: symbolChunk.name,
        signature: symbolChunk.signature,
        isOverlapped: symbolChunk.isOverlapped,
        chunkIndex: symbolChunk.chunkIndex,
        totalChunks: symbolChunk.totalChunks,
        id: symbolChunk.id,
      },
    }));
    // Add comments if enabled (using legacy method for now)
    if (this.options.includeComments) {
      const lines = content.split('\n');
      parsedChunks.push(...this.extractComments(content, lines, language));
    }
    return parsedChunks.sort((a, b) => a.startLine - b.startLine);
  }

  /**
   * Map symbol types to legacy chunk types.
   *
   * @param {string} symbolType Symbol type from the symbol-aware chunker.
   * @returns {string} Legacy chunk type; 'other' for unmapped symbol types.
   */
  mapSymbolTypeToChunkType(symbolType) {
    return CodeParser.#CHUNK_TYPE_BY_SYMBOL_TYPE.get(symbolType) ?? 'other';
  }

  /**
   * Extract semantic chunks from AST nodes (pre-order walk).
   *
   * @param {object} node Root of the subtree to walk.
   * @param {string[]} lines Source split into lines.
   * @param {Array<object>} chunks Output array; matching chunks are appended.
   * @param {string} language Language identifier.
   */
  extractChunks(node, lines, chunks, language) {
    // Resolve the chunkable types once per walk instead of once per node.
    const chunkableTypes = new Set(this.getChunkableTypes(language));
    const visit = (current) => {
      if (chunkableTypes.has(current.type)) {
        const chunk = this.nodeToChunk(current, lines, language);
        if (chunk && chunk.content.trim().length > 0) {
          chunks.push(chunk);
        }
      }
      for (const child of current.children) {
        visit(child);
      }
    };
    visit(node);
  }

  /**
   * Get chunkable node types for a language.
   *
   * @param {string} language Language identifier.
   * @returns {string[]} A fresh array: the base types followed by the
   *   language-specific ones (entries may repeat).
   */
  getChunkableTypes(language) {
    const baseTypes = CodeParser.#BASE_CHUNKABLE_TYPES;
    const extraTypes = CodeParser.#EXTRA_CHUNKABLE_TYPES.get(language);
    if (extraTypes) {
      return [...baseTypes, ...extraTypes];
    }
    const isTypeScript = language.includes('typescript') || language.includes('tsx');
    return isTypeScript
      ? [...baseTypes, ...CodeParser.#TS_CHUNKABLE_TYPES]
      : [...baseTypes];
  }

  /**
   * Convert AST node to ParsedChunk.
   *
   * @param {object} node AST node with 0-based `startPosition` / `endPosition` rows.
   * @param {string[]} lines Source split into lines.
   * @param {string} language Language identifier.
   * @returns {object|null} Chunk with 1-based line numbers, or null when the
   *   extracted content is blank.
   */
  nodeToChunk(node, lines, language) {
    const startRow = node.startPosition.row;
    const endRow = node.endPosition.row;
    if (endRow - startRow > this.options.maxChunkSize) {
      return this.splitLargeChunk(node, lines, language);
    }
    const content = this.extractNodeContent(node, lines);
    if (!content.trim()) {
      return null;
    }
    return {
      startLine: startRow + 1, // Convert to 1-based indexing
      endLine: endRow + 1,
      content,
      chunkType: this.mapNodeTypeToChunkType(node.type),
      metadata: this.extractMetadata(node, language),
    };
  }

  /**
   * Map AST node type to chunk type.
   *
   * @param {string} nodeType AST node type.
   * @returns {string} Chunk type; 'other' for unmapped node types.
   */
  mapNodeTypeToChunkType(nodeType) {
    return CodeParser.#CHUNK_TYPE_BY_NODE_TYPE.get(nodeType) ?? 'other';
  }

  /**
   * Extract content from AST node with context.
   *
   * The node's lines are widened by `options.contextLines` on both sides
   * (clamped to the file) and the result is truncated to 800 characters.
   *
   * @param {object} node AST node.
   * @param {string[]} lines Source split into lines.
   * @returns {string} Node text, possibly followed by a truncation marker.
   */
  extractNodeContent(node, lines) {
    const firstRow = Math.max(0, node.startPosition.row - this.options.contextLines);
    const lastRow = Math.min(lines.length - 1, node.endPosition.row + this.options.contextLines);
    const content = lines.slice(firstRow, lastRow + 1).join('\n');
    // Rough estimate: 1 token ≈ 4 characters, so 800 chars ≈ 200 tokens.
    const maxChars = 800;
    if (content.length > maxChars) {
      return content.substring(0, maxChars) + '\n// ... [truncated]';
    }
    return content;
  }

  /**
   * Extract metadata from AST node.
   *
   * @param {object} node AST node exposing `type` and `childForFieldName`.
   * @param {string} language Language identifier.
   * @returns {object} `{ nodeType, language }` plus, when the corresponding
   *   field exists on the node, `name`, `className`, `typeName` and
   *   `parameterCount`.
   */
  extractMetadata(node, language) {
    const metadata = {
      nodeType: node.type,
      language,
    };
    const isFunction = CodeParser.#FUNCTION_NODE_TYPES.has(node.type);
    if (isFunction) {
      const nameNode = node.childForFieldName('name');
      if (nameNode) {
        metadata.name = nameNode.text;
      }
    }
    if (CodeParser.#CLASS_NODE_TYPES.has(node.type)) {
      const nameNode = node.childForFieldName('name');
      if (nameNode) {
        metadata.className = nameNode.text;
      }
    }
    if (CodeParser.#TYPE_NODE_TYPES.has(node.type)) {
      const nameNode = node.childForFieldName('name');
      if (nameNode) {
        metadata.typeName = nameNode.text;
      }
    }
    if (isFunction) {
      const paramsNode = node.childForFieldName('parameters');
      if (paramsNode) {
        metadata.parameterCount = paramsNode.children
          .filter((child) => CodeParser.#PARAMETER_NODE_TYPES.has(child.type))
          .length;
      }
    }
    return metadata;
  }

  /**
   * Extract import statements.
   *
   * @param {object} rootNode Root of the AST to search.
   * @param {string[]} lines Source split into lines.
   * @param {string} language Language identifier.
   * @returns {Array<object>} One chunk per `import_statement` /
   *   `import_declaration` node with non-blank content.
   */
  extractImports(rootNode, lines, language) {
    const imports = [];
    const findImports = (node) => {
      if (node.type === 'import_statement' || node.type === 'import_declaration') {
        const chunk = this.nodeToChunk(node, lines, language);
        if (chunk) {
          imports.push(chunk);
        }
      }
      for (const child of node.children) {
        findImports(child);
      }
    };
    findImports(rootNode);
    return imports;
  }

  /**
   * Extract comments using regex (simpler than AST for comments).
   *
   * The match is purely textual, so comment markers that appear inside string
   * literals are matched as well. Comments of 10 or fewer non-blank-trimmed
   * characters are skipped.
   *
   * @param {string} content Full source text.
   * @param {string[]} lines Source split into lines (unused; kept for callers).
   * @param {string} language Language identifier.
   * @returns {Array<object>} Comment chunks with 1-based line numbers.
   */
  extractComments(content, lines, language) {
    const usesCStyleComments = language.includes('javascript') || language.includes('typescript') ||
      language === 'tsx' || language === 'jsx' ||
      language === 'java' || language === 'go' || language === 'rust' ||
      language === 'cpp' || language === 'c';
    let commentRegex;
    if (usesCStyleComments) {
      // C-style comments: /* */ and //
      commentRegex = /(\/\*[\s\S]*?\*\/|\/\/.*$)/gm;
    } else if (language === 'python') {
      // Python comments: # and """docstrings"""
      commentRegex = /("""[\s\S]*?"""|'''[\s\S]*?'''|#.*$)/gm;
    } else {
      // Default to # style comments
      commentRegex = /(#.*$)/gm;
    }

    const countNewlines = (from, to) => {
      let count = 0;
      for (let i = content.indexOf('\n', from); i !== -1 && i < to; i = content.indexOf('\n', i + 1)) {
        count++;
      }
      return count;
    };

    const comments = [];
    // Track the current line incrementally so each character is scanned once.
    let currentLine = 1;
    let scannedUpTo = 0;
    for (const match of content.matchAll(commentRegex)) {
      const text = match[0];
      const matchEnd = match.index + text.length;
      currentLine += countNewlines(scannedUpTo, match.index);
      const startLine = currentLine;
      currentLine += countNewlines(match.index, matchEnd);
      const endLine = currentLine;
      scannedUpTo = matchEnd;
      if (text.trim().length > 10) { // Only include substantial comments
        comments.push({
          startLine,
          endLine,
          content: text,
          chunkType: 'comment',
          metadata: { language },
        });
      }
    }
    return comments;
  }

  /**
   * Split large chunks into smaller ones.
   *
   * For now this returns a single chunk covering the whole node (its content
   * is still truncated by `extractNodeContent`).
   *
   * @param {object} node AST node.
   * @param {string[]} lines Source split into lines.
   * @param {string} language Language identifier.
   * @returns {object} Chunk with 1-based line numbers.
   */
  splitLargeChunk(node, lines, language) {
    return {
      startLine: node.startPosition.row + 1,
      endLine: node.endPosition.row + 1,
      content: this.extractNodeContent(node, lines),
      chunkType: this.mapNodeTypeToChunkType(node.type),
      metadata: this.extractMetadata(node, language),
    };
  }

  /**
   * Merge overlapping chunks.
   *
   * @param {Array<object>} chunks Chunks sorted by `startLine`.
   * @returns {Array<object>} Chunks where overlapping or adjacent neighbours
   *   are combined: contents joined with a newline, metadata shallow-merged,
   *   and `chunkType` set to 'other' when the merged types differ.
   */
  mergeOverlappingChunks(chunks) {
    if (chunks.length <= 1) {
      return chunks;
    }
    const merged = [];
    let current = chunks[0];
    for (const next of chunks.slice(1)) {
      const touches = current.endLine >= next.startLine - 1;
      if (!touches) {
        merged.push(current);
        current = next;
        continue;
      }
      current = {
        startLine: current.startLine,
        endLine: Math.max(current.endLine, next.endLine),
        content: current.content + '\n' + next.content,
        chunkType: current.chunkType === next.chunkType ? current.chunkType : 'other',
        metadata: { ...current.metadata, ...next.metadata },
      };
    }
    merged.push(current);
    return merged;
  }

  /**
   * Fallback chunking for unsupported languages.
   *
   * @param {string} content Full source text.
   * @param {string} language Language identifier stored in the metadata.
   * @returns {Array<object>} Consecutive 50-line chunks; blank windows are
   *   omitted.
   */
  fallbackChunking(content, language) {
    const lines = content.split('\n');
    const chunks = [];
    const chunkSize = 50; // Lines per chunk
    for (let start = 0; start < lines.length; start += chunkSize) {
      const end = Math.min(start + chunkSize, lines.length);
      const chunkContent = lines.slice(start, end).join('\n');
      if (chunkContent.trim().length === 0) {
        continue;
      }
      chunks.push({
        startLine: start + 1,
        endLine: end, // Exclusive 0-based end equals the inclusive 1-based end
        content: chunkContent,
        chunkType: 'other',
        metadata: { language, fallback: true },
      });
    }
    return chunks;
  }

  /**
   * Check if file should be parsed.
   *
   * @param {string} filePath Path or file name.
   * @returns {boolean} True when a parser is registered for the detected
   *   language or the extension maps to any known language.
   */
  shouldParseFile(filePath) {
    const language = this.detectLanguage(filePath);
    return this.parsers.has(language) || language !== 'unknown';
  }
}
//# sourceMappingURL=parser.js.map