UNPKG

@wildcard-ai/deepcodex

Version:

Advanced codebase indexing and semantic search MCP server

124 lines 3.77 kB
/**
 * TreeSitterChunkExtractor - AST-Based Semantic Chunking
 *
 * Creates meaningful code chunks based on AST structure rather than individual symbols.
 * Inspired by research from:
 * - the-dream-machine/ebdde5abc0e7432d66ca16bc48c8108d
 * - CintraAI/code-chunker
 * - yilinjz/astchunk
 *
 * Key Principle: Extract complete semantic units (full classes, functions, interfaces)
 * not individual symbol metadata.
 */

/**
 * A single chunk of source code together with its location, the symbols and
 * imports recorded for it, and size/complexity metadata.
 */
export interface SemanticChunk {
  /** Identifier of the chunk. */
  id: string;
  /** Source text of the chunk. */
  content: string;
  filePath: string;
  relativePath: string;
  /** First line of the chunk (NOTE(review): 0- vs 1-based is not visible here — confirm). */
  startLine: number;
  /** Last line of the chunk (NOTE(review): inclusive/exclusive is not visible here — confirm). */
  endLine: number;
  language: string;
  /** Kind of semantic unit the chunk represents. */
  chunkType:
    | 'class'
    | 'function'
    | 'interface'
    | 'type'
    | 'module'
    | 'mixed';
  /** Symbols recorded for this chunk. */
  symbols: {
    name: string;
    type:
      | 'function'
      | 'class'
      | 'interface'
      | 'type'
      | 'variable'
      | 'constant';
    line: number;
    scope?: string;
  }[];
  /** Import statements recorded for this chunk. */
  imports: {
    module: string;
    symbols: string[];
    line: number;
  }[];
  /** Size of the chunk (NOTE(review): unit — characters vs. bytes — is not visible here). */
  size: number;
  complexity: 'low' | 'medium' | 'high';
}

/**
 * Value resolved by {@link TreeSitterChunkExtractor.extractSemanticChunks}:
 * the extracted chunks, any parse error messages, and summary metadata.
 */
export interface ChunkExtractionResult {
  chunks: SemanticChunk[];
  parseErrors: string[];
  metadata: {
    totalNodes: number;
    totalChunks: number;
    averageChunkSize: number;
    /** NOTE(review): time unit is not visible in this declaration — confirm. */
    processingTime: number;
  };
}

/**
 * Extracts {@link SemanticChunk}s from source code.
 *
 * Only `initialize` and `extractSemanticChunks` are public; every other member
 * is private and declared without a type, as is usual for generated
 * declaration files.
 */
export declare class TreeSitterChunkExtractor {
  private parsers;
  private initialized;
  private logger;
  // Chunk-size settings; their values live in the implementation, not in this declaration.
  private readonly MAX_CHUNK_SIZE;
  private readonly MIN_CHUNK_SIZE;
  private readonly PREFERRED_CHUNK_SIZE;

  constructor();

  /**
   * Generate a short, unique ID that fits within Turbopuffer's 64-byte limit
   */
  private generateShortId;

  /**
   * Asynchronous setup step.
   * NOTE(review): presumably must be awaited before extraction — confirm against the implementation.
   */
  initialize(): Promise<void>;

  /**
   * Extract semantic chunks from source code using AST structure
   *
   * @param content - Source text to chunk.
   * @param language - Language of `content`.
   * @param filePath - Path of the file the content came from.
   * @param relativePath - Optional relative path of the same file.
   * @returns The chunks plus parse errors and summary metadata.
   */
  extractSemanticChunks(
    content: string,
    language: string,
    filePath: string,
    relativePath?: string,
  ): Promise<ChunkExtractionResult>;

  /**
   * Find semantic units in the AST (complete classes, functions, interfaces, etc.)
   */
  private findSemanticUnits;
  private traverseForSemanticUnits;
  private optimizeSemanticUnits;
  private createChunkFromUnit;
  private extractSymbolsFromUnit;
  private extractSymbolsFromContent;
  private traverseNodeForSymbols;
  private calculateComplexity;
  private mapNodeTypeToChunkType;
  private mapNodeTypeToSymbolType;
  private getNodeName;
  private countNodes;
  private handleLargeFile;

  /**
   * Intelligent Range-Based TreeSitter Parsing
   * Splits large files into semantic ranges and parses each with TreeSitter
   */
  private intelligentRangeBasedParsing;

  /**
   * Create comprehensive chunks from a window ensuring full content coverage
   */
  private createComprehensiveWindowChunks;

  /**
   * Find gaps in line coverage
   */
  private findContentGaps;

  /**
   * Create chunk from content string
   */
  private createChunkFromContent;

  /**
   * Extract basic symbols using simple patterns for gap content
   */
  private extractBasicSymbols;

  /**
   * Find semantic boundaries in code (class/function/interface starts)
   */
  private findSemanticBoundaries;

  /**
   * Create intelligent windows that respect semantic boundaries
   */
  private createIntelligentWindows;
  private findSafeTrimPoint;
  private calculateByteOffset;

  /**
   * Create a semantic fallback chunk when TreeSitter fails
   */
  private createSemanticFallbackChunk;

  /**
   * Extract imports from content
   */
  private extractImportsFromContent;

  /**
   * Remove duplicate chunks from overlapping windows
   */
  private removeDuplicateChunks;
  private fallbackToSimpleChunking;
}
//# sourceMappingURL=TreeSitterChunkExtractor.d.ts.map