UNPKG

mnemos-coder

Version:

CLI-based coding agent with graph-based execution loop and terminal UI

609 lines 23.1 kB
/** * Tree-sitter based code parser for semantic chunking * Supports TypeScript, JavaScript, and other languages */ import Parser from 'tree-sitter'; import TypeScript from 'tree-sitter-typescript'; import JavaScript from 'tree-sitter-javascript'; import Java from 'tree-sitter-java'; import Python from 'tree-sitter-python'; import Go from 'tree-sitter-go'; import Rust from 'tree-sitter-rust'; import Cpp from 'tree-sitter-cpp'; // import C from 'tree-sitter-c'; // Removed due to version conflicts import * as path from 'path'; import { SymbolAwareChunker } from './SymbolAwareChunker.js'; export class CodeParser { parsers = new Map(); options; symbolChunker; constructor(options = {}) { this.options = { maxChunkSize: 50, // Reduced from 500 to 50 lines to stay within token limits includeComments: true, includeImports: true, contextLines: 1, // Reduced context lines // New symbol-aware defaults useSymbolAwareChunking: true, maxTokens: 512, // Default for most embedding models preserveSignatures: true, splitLargeFunctions: true, includeControlFlow: true, ...options }; this.initializeParsers(); // Initialize symbol-aware chunker with options const chunkingOptions = { maxTokens: this.options.maxTokens, preserveSignatures: this.options.preserveSignatures, includeContext: this.options.includeImports, contextLines: this.options.contextLines, minChunkSize: 20, // Minimum meaningful chunk size in tokens splitLargeFunctions: this.options.splitLargeFunctions, includeControlFlow: this.options.includeControlFlow, overlapRatio: 0.1 // 10% overlap by default }; this.symbolChunker = new SymbolAwareChunker(chunkingOptions); } initializeParsers() { const languages = [ { name: 'TypeScript', lang: TypeScript?.typescript, keys: ['typescript', 'ts'] }, { name: 'TSX', lang: TypeScript?.tsx, keys: ['tsx'] }, { name: 'JavaScript', lang: JavaScript, keys: ['javascript', 'js', 'jsx'] }, { name: 'Java', lang: Java, keys: ['java'] }, { name: 'Python', lang: Python, keys: ['python', 'py'] }, { name: 'Go', lang: Go, keys: ['go'] }, { name: 'Rust', lang: Rust, keys: ['rust', 'rs'] }, { name: 'C++', lang: Cpp, keys: ['cpp', 'cc', 'cxx', 'c++'] }, // { name: 'C', lang: C, keys: ['c'] } // Disabled due to version conflicts ]; let successCount = 0; for (const { name, lang, keys } of languages) { if (!lang) { // console.warn(`Skipping ${name}: language not available`); continue; } try { const parser = new Parser(); parser.setLanguage(lang); // Type assertion for compatibility // Test the parser with a simple string to make sure it works parser.parse('test'); // If we get here, the parser works for (const key of keys) { this.parsers.set(key, parser); } successCount++; // Parser initialized successfully } catch (error) { // console.warn(`✗ Failed to initialize ${name} parser:`, error instanceof Error ? error.message : String(error)); } } // All parsers initialized: successCount out of total languages } /** * Detect language from file extension */ detectLanguage(filePath) { const ext = path.extname(filePath).toLowerCase().slice(1); const langMap = { 'ts': 'typescript', 'tsx': 'tsx', 'js': 'javascript', 'jsx': 'jsx', 'mjs': 'javascript', 'cjs': 'javascript', 'java': 'java', 'py': 'python', 'go': 'go', 'rs': 'rust', 'cpp': 'cpp', 'cc': 'cpp', 'cxx': 'cpp', 'c++': 'cpp', 'c': 'c', 'h': 'c', 'hpp': 'cpp' }; return langMap[ext] || 'unknown'; } /** * Parse code into semantic chunks */ parseCode(content, filePath) { const language = this.detectLanguage(filePath); const parser = this.parsers.get(language); if (!parser) { // Fallback to simple line-based chunking for unsupported languages return this.fallbackChunking(content, language); } try { const tree = parser.parse(content); // Use symbol-aware chunking if enabled if (this.options.useSymbolAwareChunking) { return this.parseCodeWithSymbolAwareness(tree, content, filePath, language); } // Legacy chunking method const lines = content.split('\n'); const chunks = []; // Extract semantic chunks from AST this.extractChunks(tree.rootNode, lines, chunks, language); // Add import chunks if enabled if (this.options.includeImports) { const importChunks = this.extractImports(tree.rootNode, lines, language); chunks.unshift(...importChunks); } // Add comment chunks if enabled if (this.options.includeComments) { const commentChunks = this.extractComments(content, lines, language); chunks.push(...commentChunks); } // Sort chunks by line number and merge overlapping ones return this.mergeOverlappingChunks(chunks.sort((a, b) => a.startLine - b.startLine)); } catch (error) { // console.warn(`Failed to parse ${filePath} with tree-sitter:`, error); return this.fallbackChunking(content, language); } } /** * Parse code using symbol-aware chunking */ parseCodeWithSymbolAwareness(tree, content, filePath, language) { // Get symbol-aware chunks const symbolChunks = this.symbolChunker.chunkBySymbols(tree.rootNode, content, language, filePath); // Convert symbol chunks to ParsedChunk format const parsedChunks = symbolChunks.map(symbolChunk => ({ startLine: symbolChunk.startLine, endLine: symbolChunk.endLine, content: symbolChunk.content, chunkType: this.mapSymbolTypeToChunkType(symbolChunk.symbolType), metadata: { ...symbolChunk.metadata, symbolType: symbolChunk.symbolType, name: symbolChunk.name, signature: symbolChunk.signature, isOverlapped: symbolChunk.isOverlapped, chunkIndex: symbolChunk.chunkIndex, totalChunks: symbolChunk.totalChunks, id: symbolChunk.id } })); // Add comments if enabled (using legacy method for now) if (this.options.includeComments) { const lines = content.split('\n'); const commentChunks = this.extractComments(content, lines, language); parsedChunks.push(...commentChunks); } return parsedChunks.sort((a, b) => a.startLine - b.startLine); } /** * Map symbol types to legacy chunk types */ mapSymbolTypeToChunkType(symbolType) { const mapping = { 'function': 'function', 'method': 'function', 'constructor': 'function', 'class': 'class', 'interface': 'interface', 'struct': 'interface', 'enum': 'interface', 'condition_block': 'other', 'loop_block': 'other', 'import': 'import', 'variable': 'variable', 'other': 'other' }; return mapping[symbolType]; } /** * Extract semantic chunks from AST nodes */ extractChunks(node, lines, chunks, language) { // Handle different node types based on language const chunkableTypes = this.getChunkableTypes(language); if (chunkableTypes.includes(node.type)) { const chunk = this.nodeToChunk(node, lines, language); if (chunk && chunk.content.trim().length > 0) { chunks.push(chunk); } } // Recursively process child nodes for (const child of node.children) { this.extractChunks(child, lines, chunks, language); } } /** * Get chunkable node types for a language */ getChunkableTypes(language) { const baseTypes = [ 'function_declaration', 'method_definition', 'arrow_function', 'function_expression', 'class_declaration', 'interface_declaration', 'type_alias_declaration', 'enum_declaration', 'namespace_declaration', 'variable_declaration', 'const_declaration', 'let_declaration' ]; const tsTypes = [ 'interface_declaration', 'type_alias_declaration', 'enum_declaration', 'namespace_declaration', 'module_declaration' ]; const javaTypes = [ 'method_declaration', 'constructor_declaration', 'class_declaration', 'interface_declaration', 'enum_declaration', 'annotation_type_declaration', 'field_declaration', 'package_declaration', 'import_declaration' ]; const pythonTypes = [ 'function_definition', 'async_function_definition', 'class_definition', 'assignment', 'import_statement', 'import_from_statement', 'decorated_definition' ]; const goTypes = [ 'function_declaration', 'method_declaration', 'type_declaration', 'var_declaration', 'const_declaration', 'import_declaration', 'package_clause' ]; const rustTypes = [ 'function_item', 'impl_item', 'trait_item', 'struct_item', 'enum_item', 'type_item', 'static_item', 'const_item', 'use_declaration', 'mod_item' ]; const cppTypes = [ 'function_definition', 'function_declarator', 'class_specifier', 'struct_specifier', 'enum_specifier', 'namespace_definition', 'using_declaration', 'preproc_include', 'template_declaration' ]; const cTypes = [ 'function_definition', 'function_declarator', 'struct_specifier', 'enum_specifier', 'typedef_declaration', 'preproc_include', 'declaration' ]; if (language === 'java') { return [...baseTypes, ...javaTypes]; } if (language === 'python') { return [...baseTypes, ...pythonTypes]; } if (language === 'go') { return [...baseTypes, ...goTypes]; } if (language === 'rust') { return [...baseTypes, ...rustTypes]; } if (language === 'cpp') { return [...baseTypes, ...cppTypes]; } if (language === 'c') { return [...baseTypes, ...cTypes]; } return language.includes('typescript') || language.includes('tsx') ? [...baseTypes, ...tsTypes] : baseTypes; } /** * Convert AST node to ParsedChunk */ nodeToChunk(node, lines, language) { const startLine = node.startPosition.row; const endLine = node.endPosition.row; if (endLine - startLine > this.options.maxChunkSize) { // Split large chunks return this.splitLargeChunk(node, lines, language); } const chunkType = this.mapNodeTypeToChunkType(node.type); const content = this.extractNodeContent(node, lines); if (!content.trim()) return null; const metadata = this.extractMetadata(node, language); return { startLine: startLine + 1, // Convert to 1-based indexing endLine: endLine + 1, content, chunkType, metadata }; } /** * Map AST node type to chunk type */ mapNodeTypeToChunkType(nodeType) { const typeMap = { // Functions 'function_declaration': 'function', 'method_definition': 'function', 'method_declaration': 'function', 'constructor_declaration': 'function', 'arrow_function': 'function', 'function_expression': 'function', 'function_definition': 'function', 'async_function_definition': 'function', 'function_item': 'function', 'function_declarator': 'function', // Classes and types 'class_declaration': 'class', 'class_definition': 'class', 'class_specifier': 'class', 'interface_declaration': 'interface', 'type_alias_declaration': 'interface', 'enum_declaration': 'interface', 'annotation_type_declaration': 'interface', 'namespace_declaration': 'interface', 'type_declaration': 'interface', 'trait_item': 'interface', 'struct_item': 'interface', 'enum_item': 'interface', 'type_item': 'interface', 'impl_item': 'interface', 'struct_specifier': 'interface', 'enum_specifier': 'interface', 'namespace_definition': 'interface', 'mod_item': 'interface', // Variables and fields 'variable_declaration': 'variable', 'field_declaration': 'variable', 'const_declaration': 'variable', 'let_declaration': 'variable', 'assignment': 'variable', 'var_declaration': 'variable', 'static_item': 'variable', 'const_item': 'variable', 'typedef_declaration': 'variable', 'declaration': 'variable', // Imports and exports 'import_statement': 'import', 'import_declaration': 'import', 'package_declaration': 'import', 'export_statement': 'import', 'import_from_statement': 'import', 'use_declaration': 'import', 'using_declaration': 'import', 'preproc_include': 'import', 'package_clause': 'import', 'template_declaration': 'import', // Other 'decorated_definition': 'other', 'comment': 'comment' }; return typeMap[nodeType] || 'other'; } /** * Extract content from AST node with context */ extractNodeContent(node, lines) { const startLine = Math.max(0, node.startPosition.row - this.options.contextLines); const endLine = Math.min(lines.length - 1, node.endPosition.row + this.options.contextLines); const content = lines.slice(startLine, endLine + 1).join('\n'); // Truncate content to stay within token limits (rough estimate: 1 token ≈ 4 characters) const maxChars = 800; // Conservative limit for ~200 tokens if (content.length > maxChars) { return content.substring(0, maxChars) + '\n// ... [truncated]'; } return content; } /** * Extract metadata from AST node */ extractMetadata(node, language) { const metadata = { nodeType: node.type, language }; // Extract function/method name const functionTypes = [ 'function_declaration', 'method_definition', 'method_declaration', 'constructor_declaration', 'function_definition', 'async_function_definition', 'function_item', 'function_declarator' ]; if (functionTypes.includes(node.type)) { const nameNode = node.childForFieldName('name'); if (nameNode) { metadata.name = nameNode.text; } } // Extract class name const classTypes = ['class_declaration', 'class_definition', 'class_specifier']; if (classTypes.includes(node.type)) { const nameNode = node.childForFieldName('name'); if (nameNode) { metadata.className = nameNode.text; } } // Extract interface/type name const typeTypes = [ 'interface_declaration', 'type_alias_declaration', 'enum_declaration', 'annotation_type_declaration', 'type_declaration', 'trait_item', 'struct_item', 'enum_item', 'type_item', 'struct_specifier', 'enum_specifier', 'namespace_definition', 'mod_item' ]; if (typeTypes.includes(node.type)) { const nameNode = node.childForFieldName('name'); if (nameNode) { metadata.typeName = nameNode.text; } } // Extract parameters for functions const paramFunctionTypes = [ 'function_declaration', 'method_definition', 'method_declaration', 'constructor_declaration', 'function_definition', 'async_function_definition', 'function_item', 'function_declarator' ]; if (paramFunctionTypes.includes(node.type)) { const paramsNode = node.childForFieldName('parameters'); if (paramsNode) { metadata.parameterCount = paramsNode.children.filter(c => c.type === 'required_parameter' || c.type === 'optional_parameter' || c.type === 'formal_parameter' || c.type === 'parameter' || c.type === 'parameter_declaration').length; } } return metadata; } /** * Extract import statements */ extractImports(rootNode, lines, language) { const imports = []; const findImports = (node) => { if (['import_statement', 'import_declaration'].includes(node.type)) { const chunk = this.nodeToChunk(node, lines, language); if (chunk) { imports.push(chunk); } } for (const child of node.children) { findImports(child); } }; findImports(rootNode); return imports; } /** * Extract comments using regex (simpler than AST for comments) */ extractComments(content, lines, language) { const comments = []; let commentRegex; if (language.includes('javascript') || language.includes('typescript') || language === 'java' || language === 'go' || language === 'rust' || language === 'cpp' || language === 'c') { // C-style comments: /* */ and // commentRegex = /(\/\*[\s\S]*?\*\/|\/\/.*$)/gm; } else if (language === 'python') { // Python comments: # and """docstrings""" commentRegex = /("""[\s\S]*?"""|'''[\s\S]*?'''|#.*$)/gm; } else { // Default to # style comments commentRegex = /(#.*$)/gm; } let match; while ((match = commentRegex.exec(content)) !== null) { const beforeMatch = content.substring(0, match.index); const startLine = beforeMatch.split('\n').length; const commentLines = match[0].split('\n').length; const endLine = startLine + commentLines - 1; if (match[0].trim().length > 10) { // Only include substantial comments comments.push({ startLine, endLine, content: match[0], chunkType: 'comment', metadata: { language } }); } } return comments; } /** * Split large chunks into smaller ones */ splitLargeChunk(node, lines, language) { // For now, just return the original chunk - can be enhanced to split intelligently return { startLine: node.startPosition.row + 1, endLine: node.endPosition.row + 1, content: this.extractNodeContent(node, lines), chunkType: this.mapNodeTypeToChunkType(node.type), metadata: this.extractMetadata(node, language) }; } /** * Merge overlapping chunks */ mergeOverlappingChunks(chunks) { if (chunks.length <= 1) return chunks; const merged = []; let current = chunks[0]; for (let i = 1; i < chunks.length; i++) { const next = chunks[i]; // If chunks overlap or are adjacent, merge them if (current.endLine >= next.startLine - 1) { current = { startLine: current.startLine, endLine: Math.max(current.endLine, next.endLine), content: current.content + '\n' + next.content, chunkType: current.chunkType === next.chunkType ? current.chunkType : 'other', metadata: { ...current.metadata, ...next.metadata } }; } else { merged.push(current); current = next; } } merged.push(current); return merged; } /** * Fallback chunking for unsupported languages */ fallbackChunking(content, language) { const lines = content.split('\n'); const chunks = []; const chunkSize = 50; // Lines per chunk for (let i = 0; i < lines.length; i += chunkSize) { const endLine = Math.min(i + chunkSize, lines.length); const chunkContent = lines.slice(i, endLine).join('\n'); if (chunkContent.trim().length > 0) { chunks.push({ startLine: i + 1, endLine: endLine, content: chunkContent, chunkType: 'other', metadata: { language, fallback: true } }); } } return chunks; } /** * Check if file should be parsed */ shouldParseFile(filePath) { const language = this.detectLanguage(filePath); return this.parsers.has(language) || language !== 'unknown'; } } //# sourceMappingURL=parser.js.map