// @wildcard-ai/deepcodex
// Version:
// Advanced codebase indexing and semantic search MCP server
// 124 lines • 3.77 kB
// TypeScript
/**
* TreeSitterChunkExtractor - AST-Based Semantic Chunking
*
* Creates meaningful code chunks based on AST structure rather than individual symbols.
* Inspired by research from:
* - the-dream-machine/ebdde5abc0e7432d66ca16bc48c8108d
* - CintraAI/code-chunker
* - yilinjz/astchunk
*
* Key Principle: Extract complete semantic units (full classes, functions, interfaces)
* not individual symbol metadata.
*/
/**
 * One chunk of source text together with metadata describing where it came
 * from and which symbols and imports are associated with it.
 *
 * Instances are carried in `ChunkExtractionResult.chunks`.
 */
export interface SemanticChunk {
  /** Identifier of the chunk. */
  id: string;
  /** Text of the chunk. */
  content: string;
  /** Path of the originating file. NOTE(review): presumably absolute — confirm against callers. */
  filePath: string;
  /** Path of the originating file relative to some root; the root is not visible in this declaration. */
  relativePath: string;
  /** First line covered by the chunk. NOTE(review): 0- vs 1-based indexing is not visible here. */
  startLine: number;
  /** Last line covered by the chunk. NOTE(review): inclusive vs exclusive is not visible here. */
  endLine: number;
  /** Language label of the chunk; the set of accepted values is not visible here. */
  language: string;
  /** Coarse classification of what the chunk contains. */
  chunkType:
    | 'class'
    | 'function'
    | 'interface'
    | 'type'
    | 'module'
    | 'mixed';
  /** Symbols recorded for the chunk. */
  symbols: {
    name: string;
    type:
      | 'function'
      | 'class'
      | 'interface'
      | 'type'
      | 'variable'
      | 'constant';
    /** Line associated with the symbol (same indexing caveat as `startLine`). */
    line: number;
    /** Optional scope label; its format is not visible in this declaration. */
    scope?: string;
  }[];
  /** Imports recorded for the chunk. */
  imports: {
    module: string;
    symbols: string[];
    line: number;
  }[];
  /** Size of the chunk. NOTE(review): unit (characters, bytes, lines) is not visible here. */
  size: number;
  /** Coarse complexity rating; the criteria are defined by the implementation. */
  complexity: 'low' | 'medium' | 'high';
}
/**
 * Value resolved by `TreeSitterChunkExtractor.extractSemanticChunks`:
 * the chunks, any parse error messages, and summary figures for the run.
 */
export interface ChunkExtractionResult {
  /** Chunks produced for the input. */
  chunks: SemanticChunk[];
  /** Parse error messages as strings. NOTE(review): presumably empty on a clean parse — confirm. */
  parseErrors: string[];
  /** Summary figures for the extraction. */
  metadata: {
    totalNodes: number;
    totalChunks: number;
    /** Average chunk size; the unit is whatever `SemanticChunk.size` uses. */
    averageChunkSize: number;
    /** Processing time. NOTE(review): unit (presumably milliseconds) is not visible here. */
    processingTime: number;
  };
}
/**
* Type declaration for the chunk extractor class.
*
* The public surface visible here is the constructor, `initialize()` and
* `extractSemanticChunks()`. Every other member is declared `private`, and
* this declaration file does not show the types of those members.
*/
export declare class TreeSitterChunkExtractor {
private parsers;
private initialized;
private logger;
/*
* Chunk size settings. Their values and units are set in the implementation
* and are not visible in this declaration.
*/
private readonly MAX_CHUNK_SIZE;
private readonly MIN_CHUNK_SIZE;
private readonly PREFERRED_CHUNK_SIZE;
constructor();
/**
* Generate a short, unique ID that fits within Turbopuffer's 64-byte limit
*/
private generateShortId;
/**
* Asynchronous setup step that resolves with no value.
* NOTE(review): presumably must complete before `extractSemanticChunks` is
* called — confirm in the implementation.
*/
initialize(): Promise<void>;
/**
* Extract semantic chunks from source code using AST structure
*
* @param content Source code to chunk.
* @param language Language of `content`; accepted identifiers are not visible here.
* @param filePath Path of the file the content came from.
* @param relativePath Optional relative path; the fallback used when omitted is
* not visible in this declaration.
* @returns A promise resolving to a `ChunkExtractionResult`.
*/
extractSemanticChunks(content: string, language: string, filePath: string, relativePath?: string): Promise<ChunkExtractionResult>;
/**
* Find semantic units in the AST (complete classes, functions, interfaces, etc.)
*/
private findSemanticUnits;
private traverseForSemanticUnits;
private optimizeSemanticUnits;
private createChunkFromUnit;
private extractSymbolsFromUnit;
private extractSymbolsFromContent;
private traverseNodeForSymbols;
private calculateComplexity;
private mapNodeTypeToChunkType;
private mapNodeTypeToSymbolType;
private getNodeName;
private countNodes;
private handleLargeFile;
/**
* Intelligent Range-Based TreeSitter Parsing
* Splits large files into semantic ranges and parses each with TreeSitter
*/
private intelligentRangeBasedParsing;
/**
* Create comprehensive chunks from a window ensuring full content coverage
*/
private createComprehensiveWindowChunks;
/**
* Find gaps in line coverage
*/
private findContentGaps;
/**
* Create chunk from content string
*/
private createChunkFromContent;
/**
* Extract basic symbols using simple patterns for gap content
*/
private extractBasicSymbols;
/**
* Find semantic boundaries in code (class/function/interface starts)
*/
private findSemanticBoundaries;
/**
* Create intelligent windows that respect semantic boundaries
*/
private createIntelligentWindows;
private findSafeTrimPoint;
private calculateByteOffset;
/**
* Create a semantic fallback chunk when TreeSitter fails
*/
private createSemanticFallbackChunk;
/**
* Extract imports from content
*/
private extractImportsFromContent;
/**
* Remove duplicate chunks from overlapping windows
*/
private removeDuplicateChunks;
private fallbackToSimpleChunking;
}
//# sourceMappingURL=TreeSitterChunkExtractor.d.ts.map