UNPKG

@wildcard-ai/deepcontext

Version:

Advanced codebase indexing and semantic search MCP server

945 lines 43.2 kB
/** * TreeSitterChunkExtractor - AST-Based Semantic Chunking * * Creates meaningful code chunks based on AST structure rather than individual symbols. * Inspired by research from: * - the-dream-machine/ebdde5abc0e7432d66ca16bc48c8108d * - CintraAI/code-chunker * - yilinjz/astchunk * * Key Principle: Extract complete semantic units (full classes, functions, interfaces) * not individual symbol metadata. */ import { Logger } from '../../utils/Logger.js'; import * as crypto from 'crypto'; // Import Tree-sitter modules let Parser; let TypeScriptLanguage; let JavaScriptLanguage; let PythonLanguage; // Lazy load Tree-sitter modules async function loadTreeSitter() { if (!Parser) { Parser = (await import('tree-sitter')).default; const tsModule = await import('tree-sitter-typescript'); const jsModule = await import('tree-sitter-javascript'); const pyModule = await import('tree-sitter-python'); TypeScriptLanguage = tsModule.default.typescript; JavaScriptLanguage = jsModule.default; PythonLanguage = pyModule.default; } } export class TreeSitterChunkExtractor { configurationService; parsers = new Map(); initialized = false; logger; // Chunking parameters (based on research) MIN_CHUNK_SIZE = 30; // capture small functions while avoiding tiny fragments PREFERRED_CHUNK_SIZE = 1000; // sweet spot for search constructor(configurationService) { this.configurationService = configurationService; this.logger = new Logger('TREESITTER-CHUNKER', 'info'); } /** * Generate a short, unique ID that fits within Turbopuffer's 64-byte limit */ generateShortId(filePath, suffix) { // Extract just the filename from the path const fileName = filePath.split('/').pop() || filePath; const baseName = fileName.split('.')[0]; // Remove extensions // Create a short hash from the full path for uniqueness const pathHash = crypto.createHash('md5').update(filePath).digest('hex').substring(0, 8); // Combine into a short ID: basename_hash_suffix const shortId = `${baseName}_${pathHash}_${suffix}`; // Ensure it's under 64 bytes return shortId.length > 60 ? shortId.substring(0, 60) : shortId; } async initialize() { if (this.initialized) return; try { await loadTreeSitter(); // Initialize parsers const tsParser = new Parser(); tsParser.setLanguage(TypeScriptLanguage); this.parsers.set('typescript', tsParser); const jsParser = new Parser(); jsParser.setLanguage(JavaScriptLanguage); this.parsers.set('javascript', jsParser); const pyParser = new Parser(); pyParser.setLanguage(PythonLanguage); this.parsers.set('python', pyParser); this.initialized = true; this.logger.info('✅ Tree-sitter chunker initialized successfully (TypeScript, JavaScript, Python)'); } catch (error) { this.logger.error(`❌ Tree-sitter chunker initialization failed: ${error}`); throw error; } } /** * Extract semantic chunks from source code using AST structure */ async extractSemanticChunks(content, language, filePath, relativePath = filePath) { const startTime = Date.now(); await this.initialize(); if (!this.parsers.has(language)) { throw new Error(`Unsupported language: ${language}`); } const parser = this.parsers.get(language); const chunks = []; const parseErrors = []; try { // TreeSitter's actual limit based on testing const TREESITTER_LIMIT = 32768; // 32KB - TreeSitter's proven reliable limit if (content.length > TREESITTER_LIMIT) { this.logger.warn(`File ${filePath} (${content.length} chars) exceeds Tree-sitter limit, using smart pre-chunking`); return this.handleLargeFile(content, filePath, relativePath, language, parser); } // Parse the entire file to get AST const tree = parser.parse(content); const rootNode = tree.rootNode; // Find semantic units in the AST const semanticUnits = this.findSemanticUnits(rootNode, content); // Convert semantic units to chunks (pure semantic approach - no size splitting) for (const unit of semanticUnits) { const chunk = await this.createChunkFromUnit(unit, content, filePath, relativePath, language); if (chunk) { chunks.push(chunk); } } // If no semantic units found, create a single chunk for the entire file if (chunks.length === 0 && content.trim().length > 0) { const fallbackChunk = await this.createChunkFromContent(content, 1, content.split('\n').length, filePath, relativePath, language, 'mixed'); if (fallbackChunk) { chunks.push(fallbackChunk); } } else if (chunks.length > 0) { // Check if there's remaining content after the last chunk const lines = content.split('\n'); const lastChunk = chunks[chunks.length - 1]; if (lastChunk.endLine < lines.length) { const remainingContent = lines.slice(lastChunk.endLine).join('\n').trim(); if (remainingContent.length > 20) { // Only if substantial content remains const tailChunk = await this.createChunkFromContent(remainingContent, lastChunk.endLine + 1, lines.length, filePath, relativePath, language, 'mixed'); if (tailChunk) { chunks.push(tailChunk); } } } } const processingTime = Date.now() - startTime; const totalNodes = this.countNodes(rootNode); const averageChunkSize = chunks.reduce((sum, chunk) => sum + chunk.size, 0) / chunks.length || 0; this.logger.info(`Created ${chunks.length} semantic chunks from ${filePath}`); return { chunks, parseErrors, metadata: { totalNodes, totalChunks: chunks.length, averageChunkSize, processingTime } }; } catch (error) { const errorMessage = error instanceof Error ? error.message : String(error); this.logger.error(`Chunking failed for ${filePath}: ${errorMessage}`); parseErrors.push(`AST parsing failed: ${errorMessage}`); // Fallback to simple chunking return this.fallbackToSimpleChunking(content, filePath, relativePath, language); } } /** * Find semantic units in the AST (complete classes, functions, interfaces, etc.) */ findSemanticUnits(rootNode, sourceCode) { const units = []; // Define what constitutes a semantic unit based on AST node types const semanticNodeTypes = new Set([ // TypeScript/JavaScript 'class_declaration', 'interface_declaration', 'type_alias_declaration', 'function_declaration', 'method_definition', // Re-added for smart class splitting 'namespace_declaration', 'enum_declaration', // Python 'class_definition', 'function_definition', 'decorated_definition' // For @decorator functions/classes // Note: Removed individual import statements to avoid tiny chunks // Imports will be captured via symbol extraction instead ]); // Traverse AST to find semantic units this.traverseForSemanticUnits(rootNode, units, semanticNodeTypes, sourceCode); // Sort units by position units.sort((a, b) => a.startIndex - b.startIndex); // Merge small adjacent units and handle overlaps return this.optimizeSemanticUnits(units, sourceCode); } traverseForSemanticUnits(node, units, semanticTypes, sourceCode) { // Check if this node represents a semantic unit if (semanticTypes.has(node.type)) { const unitText = sourceCode.slice(node.startIndex, node.endIndex); // Include all semantic units regardless of size (pure semantic approach) if (unitText.length >= this.MIN_CHUNK_SIZE) { units.push({ type: this.mapNodeTypeToChunkType(node.type), startIndex: node.startIndex, endIndex: node.endIndex, startLine: node.startPosition.row + 1, endLine: node.endPosition.row + 1, node: node, content: unitText }); // Smart hierarchical chunking: For large classes, extract both class AND methods if (['class_declaration', 'class_definition'].includes(node.type)) { const classLines = node.endPosition.row - node.startPosition.row + 1; const className = this.getNodeName?.(node) || 'unknown'; this.logger?.info(`🔍 Found class: ${className} (${classLines} lines)`); // If class is large (>150 lines), also extract individual methods if (classLines > 150) { this.logger?.info(`📤 Splitting large class: ${className} (${classLines} lines > 150)`); this.extractMethodsFromLargeClass(node, units, semanticTypes, sourceCode); } else { this.logger?.info(`📦 Keeping small class intact: ${className} (${classLines} lines <= 150)`); } return; // Don't traverse normally to avoid duplicates } // For namespaces, don't traverse children (complete unit) if (['namespace_declaration'].includes(node.type)) { return; } } } // Continue traversal for child nodes for (const child of node.namedChildren) { this.traverseForSemanticUnits(child, units, semanticTypes, sourceCode); } } /** * Extract individual methods from large classes for better granularity */ extractMethodsFromLargeClass(classNode, units, semanticTypes, sourceCode) { let methodCount = 0; this.logger?.info(`🔧 Extracting methods from large class with ${classNode.namedChildren.length} children`); // Look for class_body first, then methods within it const classBody = classNode.namedChildren.find(child => child.type === 'class_body'); const nodesToCheck = classBody ? classBody.namedChildren : classNode.namedChildren; this.logger?.info(`🔍 Checking ${nodesToCheck.length} nodes for methods (using ${classBody ? 'class_body' : 'direct children'})`); for (const child of nodesToCheck) { this.logger?.info(` Child type: ${child.type} at lines ${child.startPosition.row + 1}-${child.endPosition.row + 1}`); if (child.type === 'method_definition') { const methodText = sourceCode.slice(child.startIndex, child.endIndex); const methodName = this.getNodeName(child) || 'unknown'; const methodLines = child.endPosition.row - child.startPosition.row + 1; this.logger?.info(` 🎯 Found method: ${methodName} (${methodLines} lines, ${methodText.length} chars)`); if (methodText.length >= this.MIN_CHUNK_SIZE) { units.push({ type: 'function', // Methods are treated as functions for chunking startIndex: child.startIndex, endIndex: child.endIndex, startLine: child.startPosition.row + 1, endLine: child.endPosition.row + 1, node: child, content: methodText }); methodCount++; this.logger?.info(` ✅ Added method chunk: ${methodName}`); } else { this.logger?.info(` ❌ Method too small: ${methodName} (${methodText.length} < ${this.MIN_CHUNK_SIZE})`); } } // Recursively check nested classes or other structures (but avoid infinite recursion) if (child.namedChildren.length > 0 && ['class_declaration', 'class_definition'].includes(child.type)) { this.extractMethodsFromLargeClass(child, units, semanticTypes, sourceCode); } } this.logger?.info(`🏁 Extracted ${methodCount} methods from large class`); } optimizeSemanticUnits(units, sourceCode) { const optimized = []; let currentUnit = null; for (const unit of units) { if (!currentUnit) { currentUnit = unit; continue; } const gap = unit.startIndex - currentUnit.endIndex; const combinedSize = (unit.endIndex - currentUnit.startIndex); // Merge if gap is small and combined size is reasonable const chunkingConfig = this.configurationService.getChunkingConfig(); if (gap < 100 && combinedSize <= chunkingConfig.maxChunkSize) { // Merge units currentUnit = { type: 'mixed', startIndex: currentUnit.startIndex, endIndex: unit.endIndex, startLine: currentUnit.startLine, endLine: unit.endLine, node: currentUnit.node, // Keep first node as reference content: sourceCode.slice(currentUnit.startIndex, unit.endIndex) }; } else { // Add current unit and start new one optimized.push(currentUnit); currentUnit = unit; } } if (currentUnit) { optimized.push(currentUnit); } return optimized; } async createChunkFromUnit(unit, sourceCode, filePath, relativePath, language) { // Note: Symbol extraction removed - handled by IndexingOrchestrator to avoid duplication // Extract imports using the comprehensive import extraction const imports = this.extractImportsFromContent(unit.content); // Generate unique chunk ID (short format for Turbopuffer) const chunkId = this.generateShortId(filePath, `${unit.startLine}-${unit.endLine}`); return { id: chunkId, content: unit.content, filePath, relativePath, startLine: unit.startLine, endLine: unit.endLine, language, chunkType: unit.type, symbols: [], // Will be populated by IndexingOrchestrator imports, size: unit.content.length, complexity: this.calculateComplexity(unit.content) }; } // Symbol extraction removed - handled by IndexingOrchestrator to avoid duplication // Symbol extraction removed - handled by IndexingOrchestrator to avoid duplication // Symbol extraction removed - handled by IndexingOrchestrator to avoid duplication /** * Split large semantic units (like huge classes) into manageable chunks * while preserving semantic boundaries */ async splitLargeSemanticUnit(unit, sourceCode, filePath, relativePath, language) { const chunkingConfig = this.configurationService.getChunkingConfig(); const chunks = []; this.logger.info(`Splitting large ${unit.type}: ${unit.content.length} chars at ${filePath}:${unit.startLine}-${unit.endLine}`); // For classes, try to split by methods while preserving class structure if (unit.type === 'class' && unit.node) { const classSplits = await this.splitClassIntoMethods(unit, sourceCode, filePath, relativePath, language); if (classSplits.length > 1) { return classSplits; } } // Fallback: Split by line boundaries while preserving scope return await this.splitByLineBoundaries(unit, sourceCode, filePath, relativePath, language); } /** * Split a large class by its methods while preserving class context */ async splitClassIntoMethods(unit, sourceCode, filePath, relativePath, language) { const chunks = []; const lines = unit.content.split('\n'); const chunkingConfig = this.configurationService.getChunkingConfig(); // Extract class header (class declaration + initial content) let classHeader = ''; let currentMethodChunk = ''; let methodStartLine = unit.startLine; let inMethod = false; let braceDepth = 0; let chunkIndex = 0; for (let i = 0; i < lines.length; i++) { const line = lines[i]; const trimmed = line.trim(); // Track brace depth braceDepth += (line.match(/\{/g) || []).length; braceDepth -= (line.match(/\}/g) || []).length; // Detect method boundaries (simple heuristic) const isMethodStart = trimmed.match(/^\s*(?:public|private|protected|async)?\s*\w+\s*\(/); if (!inMethod && (i < 5 || !isMethodStart)) { // Accumulate class header (first few lines and non-methods) classHeader += (classHeader ? '\n' : '') + line; } else { if (!inMethod && isMethodStart) { // Starting a new method inMethod = true; methodStartLine = unit.startLine + i; } currentMethodChunk += (currentMethodChunk ? '\n' : '') + line; // If we've accumulated enough content or reached method boundary if ((currentMethodChunk.length > chunkingConfig.maxChunkSize * 0.8) || (inMethod && braceDepth <= 1 && trimmed === '}')) { // Create chunk with class context const chunkContent = classHeader + '\n\n// --- Method section ---\n' + currentMethodChunk; chunks.push({ id: this.generateShortId(filePath, `class_${chunkIndex++}`), content: chunkContent, filePath, relativePath, startLine: methodStartLine, endLine: unit.startLine + i, language, chunkType: 'class', symbols: [], // Will be populated by IndexingOrchestrator imports: this.extractImportsFromContent(chunkContent), size: chunkContent.length, complexity: this.calculateComplexity(chunkContent) }); // Reset for next method currentMethodChunk = ''; inMethod = false; methodStartLine = unit.startLine + i + 1; } } } // Add final chunk if there's remaining content if (currentMethodChunk.trim()) { const chunkContent = classHeader + '\n\n// --- Final section ---\n' + currentMethodChunk; chunks.push({ id: this.generateShortId(filePath, `class_${chunkIndex}`), content: chunkContent, filePath, relativePath, startLine: methodStartLine, endLine: unit.endLine, language, chunkType: 'class', symbols: [], // Will be populated by IndexingOrchestrator imports: this.extractImportsFromContent(chunkContent), size: chunkContent.length, complexity: this.calculateComplexity(chunkContent) }); } return chunks.length > 1 ? chunks : []; } /** * Fallback: Split by line boundaries while preserving semantic structure */ async splitByLineBoundaries(unit, sourceCode, filePath, relativePath, language) { const chunks = []; const lines = unit.content.split('\n'); const chunkingConfig = this.configurationService.getChunkingConfig(); const linesPerChunk = Math.ceil(chunkingConfig.maxChunkSize / 80); // Assume ~80 chars per line for (let i = 0; i < lines.length; i += linesPerChunk) { const chunkLines = lines.slice(i, Math.min(i + linesPerChunk, lines.length)); const chunkContent = chunkLines.join('\n'); chunks.push({ id: this.generateShortId(filePath, `split_${Math.floor(i / linesPerChunk)}`), content: chunkContent, filePath, relativePath, startLine: unit.startLine + i, endLine: unit.startLine + i + chunkLines.length - 1, language, chunkType: unit.type, symbols: [], // Will be populated by IndexingOrchestrator imports: this.extractImportsFromContent(chunkContent), size: chunkContent.length, complexity: this.calculateComplexity(chunkContent) }); } return chunks; } calculateComplexity(content) { const lines = content.split('\n').length; const nestingLevel = (content.match(/{/g) || []).length; if (lines < 20 && nestingLevel < 3) return 'low'; if (lines < 100 && nestingLevel < 10) return 'medium'; return 'high'; } mapNodeTypeToChunkType(nodeType) { switch (nodeType) { // TypeScript/JavaScript case 'class_declaration': return 'class'; case 'interface_declaration': return 'interface'; case 'type_alias_declaration': return 'type'; case 'function_declaration': case 'method_definition': case 'arrow_function': return 'function'; case 'namespace_declaration': return 'module'; // Python case 'class_definition': return 'class'; case 'function_definition': return 'function'; case 'decorated_definition': return 'function'; // Treat decorated items as functions case 'import_statement': case 'import_from_statement': return 'module'; default: return 'mixed'; } } mapNodeTypeToSymbolType(nodeType) { switch (nodeType) { // TypeScript/JavaScript case 'class_declaration': return 'class'; case 'interface_declaration': return 'interface'; case 'type_alias_declaration': return 'type'; case 'function_declaration': return 'function'; // Python case 'class_definition': return 'class'; case 'function_definition': return 'function'; case 'decorated_definition': return 'function'; default: return 'variable'; } } getNodeName(node) { // Try to find identifier child node for (const child of node.namedChildren) { if (child.type === 'identifier') { return child.text; } } return null; } countNodes(node) { let count = 1; for (const child of node.children) { count += this.countNodes(child); } return count; } async handleLargeFile(content, filePath, relativePath, language, parser) { this.logger.info(`Using intelligent TreeSitter range-based parsing for large file: ${filePath}`); // Use intelligent range-based TreeSitter parsing instead of crude fallback return await this.intelligentRangeBasedParsing(content, filePath, relativePath, language, parser); } /** * Intelligent Range-Based TreeSitter Parsing * Splits large files into semantic ranges and parses each with TreeSitter */ async intelligentRangeBasedParsing(content, filePath, relativePath, language, parser) { const startTime = Date.now(); const WINDOW_SIZE = 30000; // 30KB windows (safe under 32KB limit) const OVERLAP_SIZE = 2000; // 2KB overlap for context preservation // Step 1: Find semantic boundaries (class/function/interface starts) const semanticBoundaries = this.findSemanticBoundaries(content); // Step 2: Create overlapping windows that respect semantic boundaries const windows = this.createIntelligentWindows(content, semanticBoundaries, WINDOW_SIZE, OVERLAP_SIZE); this.logger.info(`Created ${windows.length} intelligent windows for TreeSitter parsing`); const allChunks = []; const allErrors = []; let totalNodes = 0; // Step 3: Parse each window with TreeSitter for (let i = 0; i < windows.length; i++) { const window = windows[i]; try { this.logger.debug(`Parsing window ${i + 1}/${windows.length} (${window.content.length} chars)`); const tree = parser.parse(window.content); const rootNode = tree.rootNode; if (rootNode.hasError) { allErrors.push(`Window ${i} has parse errors`); } // Create comprehensive chunks from this window to ensure full content coverage const windowChunks = await this.createComprehensiveWindowChunks(rootNode, window.content, filePath, relativePath, language, i); totalNodes += this.countNodes(rootNode); // Adjust line numbers to file coordinates and add to collection for (const chunk of windowChunks) { chunk.startLine += window.startLine; chunk.endLine += window.startLine; chunk.id = this.generateShortId(filePath, `w${i}_${chunk.startLine}-${chunk.endLine}`); // Adjust symbol line numbers chunk.symbols.forEach(symbol => { symbol.startLine += window.startLine; symbol.endLine += window.startLine; }); allChunks.push(chunk); } } catch (error) { this.logger.warn(`TreeSitter parsing failed for window ${i}: ${error}`); allErrors.push(`Window ${i}: ${error}`); // Even if TreeSitter fails, create a semantic chunk for this window const fallbackChunk = this.createSemanticFallbackChunk(window, filePath, relativePath, language, i); allChunks.push(fallbackChunk); } } // Step 4: Remove duplicates from overlapping windows const deduplicatedChunks = this.removeDuplicateChunks(allChunks); const processingTime = Date.now() - startTime; const avgChunkSize = deduplicatedChunks.reduce((sum, chunk) => sum + chunk.size, 0) / deduplicatedChunks.length || 0; this.logger.info(`✅ Intelligent range-based parsing complete: ${deduplicatedChunks.length} chunks, ${processingTime}ms`); return { chunks: deduplicatedChunks, parseErrors: allErrors, metadata: { totalNodes, totalChunks: deduplicatedChunks.length, averageChunkSize: avgChunkSize, processingTime } }; } /** * Find semantic boundaries in code (class/function/interface starts) */ /** * Create comprehensive chunks from a window ensuring full content coverage */ async createComprehensiveWindowChunks(rootNode, windowContent, filePath, relativePath, language, windowIndex) { const chunks = []; const lines = windowContent.split('\n'); // First, find semantic units (functions, classes, etc.) const semanticUnits = this.findSemanticUnits(rootNode, windowContent); const coveredLines = new Set(); // Process semantic units first for (const unit of semanticUnits) { const chunk = await this.createChunkFromUnit(unit, windowContent, filePath, relativePath, language); if (chunk) { chunks.push(chunk); // Track which lines are covered for (let line = chunk.startLine; line <= chunk.endLine; line++) { coveredLines.add(line); } } } // All content should be covered by semantic units from intelligent windowing // No additional gap-filling needed for large files // Sort chunks by start line chunks.sort((a, b) => a.startLine - b.startLine); return chunks; } /** * Find gaps in line coverage */ /** * Create chunk from content string */ async createChunkFromContent(content, startLine, endLine, filePath, relativePath, language, chunkType) { if (content.trim().length === 0) { return null; } // Extract symbols from content if possible const symbols = []; try { // Simple regex-based symbol extraction for gap content this.extractBasicSymbols(content, symbols, startLine); } catch (error) { // Continue without symbols if extraction fails } return { id: this.generateShortId(filePath, `${startLine}-${endLine}`), content: content.trim(), filePath, relativePath, startLine, endLine, language, chunkType: chunkType, size: content.length, complexity: 'low', // Simple default complexity symbols: [], // Will be populated by IndexingOrchestrator imports: [] // TODO: Implement import extraction if needed }; } /** * Extract basic symbols using simple patterns for gap content */ extractBasicSymbols(content, symbols, baseLineNumber) { const lines = content.split('\n'); lines.forEach((line, i) => { const lineNumber = baseLineNumber + i; const trimmed = line.trim(); // Function declarations (TypeScript/JavaScript/Python) const funcMatch = trimmed.match(/^(?:export\s+)?(?:async\s+)?function\s+(\w+)/) || trimmed.match(/^(?:async\s+)?def\s+(\w+)\s*\(/); // Python if (funcMatch) { symbols.push({ name: funcMatch[1], type: 'function', startLine: lineNumber, endLine: lineNumber }); } // Class declarations (TypeScript/JavaScript/Python) const classMatch = trimmed.match(/^(?:export\s+)?class\s+(\w+)/) || trimmed.match(/^class\s+(\w+)\s*(?:\(.*\))?:/); // Python if (classMatch) { symbols.push({ name: classMatch[1], type: 'class', startLine: lineNumber, endLine: lineNumber }); } // Interface declarations const interfaceMatch = trimmed.match(/^(?:export\s+)?interface\s+(\w+)/); if (interfaceMatch) { symbols.push({ name: interfaceMatch[1], type: 'interface', startLine: lineNumber, endLine: lineNumber }); } // Type declarations const typeMatch = trimmed.match(/^(?:export\s+)?type\s+(\w+)/); if (typeMatch) { symbols.push({ name: typeMatch[1], type: 'type', startLine: lineNumber, endLine: lineNumber }); } }); } findSemanticBoundaries(content) { const lines = content.split('\n'); const boundaries = []; for (let i = 0; i < lines.length; i++) { const line = lines[i].trim(); // Major semantic boundaries for TypeScript/JavaScript if (line.match(/^(export\s+)?(class|interface|enum)\s+\w+/)) { const match = line.match(/^(export\s+)?(class|interface|enum)\s+(\w+)/); boundaries.push({ line: i, type: match?.[2] || 'class', name: match?.[3] }); } else if (line.match(/^(export\s+)?(async\s+)?function\s+\w+/)) { const match = line.match(/^(export\s+)?(async\s+)?function\s+(\w+)/); boundaries.push({ line: i, type: 'function', name: match?.[3] }); } else if (line.match(/^(export\s+)?(const|let|var)\s+\w+\s*=/)) { const match = line.match(/^(export\s+)?(const|let|var)\s+(\w+)/); boundaries.push({ line: i, type: 'variable', name: match?.[3] }); } // Python semantic boundaries else if (line.match(/^class\s+\w+\s*(?:\(.*\))?:/)) { const match = line.match(/^class\s+(\w+)/); boundaries.push({ line: i, type: 'class', name: match?.[1] }); } else if (line.match(/^(?:async\s+)?def\s+\w+\s*\(/)) { const match = line.match(/^(?:async\s+)?def\s+(\w+)/); boundaries.push({ line: i, type: 'function', name: match?.[1] }); } else if (line.match(/^@\w+/)) { // Python decorators - often mark semantic boundaries boundaries.push({ line: i, type: 'decorator' }); } } return boundaries; } /** * Create intelligent windows that respect semantic boundaries */ createIntelligentWindows(content, boundaries, windowSize, overlapSize) { const lines = content.split('\n'); const windows = []; let currentStart = 0; while (currentStart < lines.length) { // Find optimal end point respecting semantic boundaries let currentEnd = Math.min(currentStart + Math.floor(windowSize / 50), lines.length); // ~50 chars per line estimate // Adjust end to semantic boundary if possible const nearbyBoundary = boundaries.find(b => b.line > currentEnd - 10 && b.line < currentEnd + 10); if (nearbyBoundary && nearbyBoundary.line < lines.length - 5) { currentEnd = nearbyBoundary.line; } const windowLines = lines.slice(currentStart, currentEnd); const windowContent = windowLines.join('\n'); // Ensure window is under size limit if (windowContent.length > windowSize) { // Trim to size while preserving semantic integrity currentEnd = this.findSafeTrimPoint(lines, currentStart, windowSize); const trimmedContent = lines.slice(currentStart, currentEnd).join('\n'); if (trimmedContent.length > 0) { windows.push({ content: trimmedContent, startLine: currentStart, endLine: currentEnd, startByte: this.calculateByteOffset(content, currentStart), endByte: this.calculateByteOffset(content, currentEnd) }); } } else if (windowContent.length > 0) { windows.push({ content: windowContent, startLine: currentStart, endLine: currentEnd, startByte: this.calculateByteOffset(content, currentStart), endByte: this.calculateByteOffset(content, currentEnd) }); } // Move to next window with meaningful overlap const overlapLines = Math.floor(overlapSize / 50); // ~40 lines for 2KB overlap const minIncrement = Math.max(50, Math.floor((currentEnd - currentStart) / 2)); // At least 50 lines or half window currentStart = Math.max(currentStart + minIncrement, currentEnd - overlapLines); // Prevent infinite loop and tiny windows at end if (currentStart >= currentEnd - 10 || currentEnd >= lines.length - 10) { break; // End processing to avoid tiny windows } } return windows; } findSafeTrimPoint(lines, start, maxSize) { let size = 0; let lastSafeTrim = start; for (let i = start; i < lines.length; i++) { const lineSize = lines[i].length + 1; // +1 for newline if (size + lineSize > maxSize) break; size += lineSize; // Safe trim points: end of functions, classes, or natural breaks const line = lines[i].trim(); if (line === '}' || line === '' || line.startsWith('//')) { lastSafeTrim = i + 1; } } return Math.max(lastSafeTrim, start + 1); } calculateByteOffset(content, lineNumber) { const lines = content.split('\n'); let offset = 0; for (let i = 0; i < Math.min(lineNumber, lines.length); i++) { offset += lines[i].length + 1; // +1 for newline } return offset; } /** * Create a semantic fallback chunk when TreeSitter fails */ createSemanticFallbackChunk(window, filePath, relativePath, language, windowIndex) { // Symbol extraction removed - handled by IndexingOrchestrator return { id: this.generateShortId(filePath, `semantic_fallback_w${windowIndex}`), content: window.content, filePath, relativePath, startLine: window.startLine + 1, endLine: window.endLine, language, chunkType: 'mixed', symbols: [], // Will be populated by IndexingOrchestrator imports: this.extractImportsFromContent(window.content), size: window.content.length, complexity: this.calculateComplexity(window.content) }; } /** * Extract imports from content */ extractImportsFromContent(content) { const imports = []; const lines = content.split('\n'); for (let i = 0; i < lines.length; i++) { const line = lines[i].trim(); // TypeScript/JavaScript imports if (line.startsWith('import ')) { const moduleMatch = line.match(/from\s+['"]([^'"]+)['"]/); const symbolsMatch = line.match(/import\s+\{([^}]+)\}/); imports.push({ module: moduleMatch?.[1] || 'unknown', symbols: symbolsMatch?.[1]?.split(',').map(s => s.trim()) || [], line: i + 1 }); } // Python imports else if (line.startsWith('from ') && line.includes(' import ')) { const match = line.match(/from\s+([^\s]+)\s+import\s+(.+)/); if (match) { const module = match[1]; const symbolsStr = match[2]; const symbols = symbolsStr.split(',').map(s => s.trim().split(' as ')[0]); imports.push({ module, symbols: [], // Will be populated by IndexingOrchestrator line: i + 1 }); } } else if (line.startsWith('import ') && !line.includes(' from ')) { const match = line.match(/import\s+([^\s]+)(?:\s+as\s+\w+)?/); if (match) { imports.push({ module: match[1], symbols: [], line: i + 1 }); } } } return imports; } /** * Remove duplicate chunks from overlapping windows */ removeDuplicateChunks(chunks) { const uniqueChunks = []; const seenRanges = new Set(); for (const chunk of chunks) { const rangeKey = `${chunk.startLine}-${chunk.endLine}-${chunk.chunkType}`; if (!seenRanges.has(rangeKey)) { seenRanges.add(rangeKey); uniqueChunks.push(chunk); } } return uniqueChunks; } fallbackToSimpleChunking(content, filePath, relativePath, language) { this.logger.warn(`Falling back to simple chunking for ${filePath}`); // Simple line-based chunking as fallback const lines = content.split('\n'); const chunks = []; const chunkSize = 50; // lines per chunk for (let i = 0; i < lines.length; i += chunkSize) { const chunkLines = lines.slice(i, i + chunkSize); const chunkContent = chunkLines.join('\n'); // Symbol extraction removed - handled by IndexingOrchestrator chunks.push({ id: this.generateShortId(filePath, `fb_${i}`), content: chunkContent, filePath, relativePath, startLine: i + 1, endLine: i + chunkLines.length, language, chunkType: 'mixed', symbols: [], // Will be populated by IndexingOrchestrator imports: [], size: chunkContent.length, complexity: 'low' }); } return { chunks, parseErrors: ['Fallback chunking used'], metadata: { totalNodes: 0, totalChunks: chunks.length, averageChunkSize: chunks.reduce((sum, chunk) => sum + chunk.size, 0) / chunks.length || 0, processingTime: 0 } }; } } //# sourceMappingURL=TreeSitterChunkExtractor.js.map