@wildcard-ai/deepcontext
Version: 
Advanced codebase indexing and semantic search MCP server
134 lines • 4.21 kB
TypeScript
/**
 * TreeSitterChunkExtractor - AST-Based Semantic Chunking
 *
 * Creates meaningful code chunks based on AST structure rather than individual symbols.
 * Inspired by research from:
 * - the-dream-machine/ebdde5abc0e7432d66ca16bc48c8108d
 * - CintraAI/code-chunker
 * - yilinjz/astchunk
 *
 * Key Principle: Extract complete semantic units (full classes, functions, interfaces)
 * not individual symbol metadata.
 */
import { SymbolInfo } from '../../types/core.js';
import { ConfigurationService } from '../../services/ConfigurationService.js';
/**
 * One chunk of source code produced by the extractor, together with the
 * location, symbol and import metadata that describe it.
 */
export interface SemanticChunk {
    /** Identifier of the chunk. NOTE(review): presumably the short ID built by the extractor's ID generator - confirm. */
    id: string;
    /** Text of the chunk. */
    content: string;
    /** Path of the file the chunk came from. NOTE(review): absolute vs. relative is decided by the caller - confirm. */
    filePath: string;
    /** Path of the same file in relative form. NOTE(review): the base it is relative to is not visible here. */
    relativePath: string;
    /** First line of the chunk in the file. NOTE(review): 0- or 1-based indexing is not visible here - confirm. */
    startLine: number;
    /** Last line of the chunk in the file. NOTE(review): inclusive/exclusive is not visible here - confirm. */
    endLine: number;
    /** Language of the chunk, as a free-form string. */
    language: string;
    /** Kind of construct the chunk represents; 'mixed' is one of the allowed values alongside the single-construct kinds. */
    chunkType: 'class' | 'function' | 'interface' | 'type' | 'module' | 'mixed';
    /** Symbols associated with the chunk. */
    symbols: SymbolInfo[];
    /** Import statements associated with the chunk. */
    imports: Array<{
        /** Module specifier of the import. */
        module: string;
        /** Names imported from that module. */
        symbols: string[];
        /** Line of the import. NOTE(review): indexing base not visible here - confirm. */
        line: number;
    }>;
    /** Size of the chunk. NOTE(review): unit (characters, bytes, lines) is not visible here - confirm. */
    size: number;
    /** Coarse complexity bucket for the chunk. */
    complexity: 'low' | 'medium' | 'high';
}
/**
 * Result of one chunk extraction: the chunks themselves, any parse errors
 * reported as strings, and summary statistics about the run.
 */
export interface ChunkExtractionResult {
    /** Chunks produced by the extraction. */
    chunks: SemanticChunk[];
    /** Parse error messages collected during the extraction. */
    parseErrors: string[];
    /** Summary statistics for the extraction. */
    metadata: {
        /** Number of nodes counted. NOTE(review): presumably AST nodes - confirm. */
        totalNodes: number;
        /** Number of chunks. NOTE(review): presumably equals `chunks.length` - confirm. */
        totalChunks: number;
        /** Average chunk size. NOTE(review): unit is not visible here; presumably the same as `SemanticChunk.size`. */
        averageChunkSize: number;
        /** Time spent processing. NOTE(review): unit is not visible here (likely milliseconds) - confirm. */
        processingTime: number;
    };
}
/**
 * Extracts semantic chunks from source code based on its AST structure.
 *
 * The only public members are the constructor, `initialize()` and
 * `extractSemanticChunks()`; everything else is private and appears here
 * without type information, as is usual for generated declarations.
 */
export declare class TreeSitterChunkExtractor {
    private configurationService;
    private parsers;
    private initialized;
    private logger;
    // Chunk size thresholds; their values and units are not visible in this declaration.
    private readonly MIN_CHUNK_SIZE;
    private readonly PREFERRED_CHUNK_SIZE;
    /**
     * @param configurationService Configuration service the extractor is constructed with.
     */
    constructor(configurationService: ConfigurationService);
    /**
     * Generate a short, unique ID that fits within Turbopuffer's 64-byte limit
     */
    private generateShortId;
    /**
     * Asynchronous setup step.
     * NOTE(review): presumably prepares the parsers and must complete before
     * extraction is used - confirm against the implementation.
     */
    initialize(): Promise<void>;
    /**
     * Extract semantic chunks from source code using AST structure
     *
     * @param content Source code to chunk.
     * @param language Language of `content`. NOTE(review): the accepted values are not visible here.
     * @param filePath Path of the file `content` belongs to.
     * @param relativePath Optional relative form of the path. NOTE(review): the fallback used when omitted is not visible here.
     * @returns A promise for the chunks, parse errors and run statistics.
     */
    extractSemanticChunks(content: string, language: string, filePath: string, relativePath?: string): Promise<ChunkExtractionResult>;
    /**
     * Find semantic units in the AST (complete classes, functions, interfaces, etc.)
     */
    private findSemanticUnits;
    private traverseForSemanticUnits;
    /**
     * Extract individual methods from large classes for better granularity
     */
    private extractMethodsFromLargeClass;
    private optimizeSemanticUnits;
    private createChunkFromUnit;
    /**
     * Split large semantic units (like huge classes) into manageable chunks
     * while preserving semantic boundaries
     */
    private splitLargeSemanticUnit;
    /**
     * Split a large class by its methods while preserving class context
     */
    private splitClassIntoMethods;
    /**
     * Fallback: Split by line boundaries while preserving semantic structure
     */
    private splitByLineBoundaries;
    private calculateComplexity;
    private mapNodeTypeToChunkType;
    private mapNodeTypeToSymbolType;
    private getNodeName;
    private countNodes;
    private handleLargeFile;
    /**
     * Intelligent Range-Based TreeSitter Parsing
     * Splits large files into semantic ranges and parses each with TreeSitter
     */
    private intelligentRangeBasedParsing;
    /**
     * Create comprehensive chunks from a window ensuring full content coverage
     */
    private createComprehensiveWindowChunks;
    // NOTE(review): a doc comment reading "Find gaps in line coverage" appeared
    // here with no matching member in this declaration; it is kept as a note so
    // it is not mistaken for documentation of the member below.
    /**
     * Create chunk from content string
     */
    private createChunkFromContent;
    /**
     * Extract basic symbols using simple patterns for gap content
     */
    private extractBasicSymbols;
    /**
     * Find semantic boundaries in code (class/function/interface starts)
     */
    private findSemanticBoundaries;
    /**
     * Create intelligent windows that respect semantic boundaries
     */
    private createIntelligentWindows;
    private findSafeTrimPoint;
    private calculateByteOffset;
    /**
     * Create a semantic fallback chunk when TreeSitter fails
     */
    private createSemanticFallbackChunk;
    /**
     * Extract imports from content
     */
    private extractImportsFromContent;
    /**
     * Remove duplicate chunks from overlapping windows
     */
    private removeDuplicateChunks;
    private fallbackToSimpleChunking;
}
//# sourceMappingURL=TreeSitterChunkExtractor.d.ts.map