
embedocs-mcp

Transform any GitHub repository into searchable vector embeddings. MCP server with smart indexing, voyage-context-3 embeddings, and semantic search for Claude/Cursor IDEs.

semantic-chunker.d.ts (105 lines, 3.2 kB)
/**
 * Advanced Semantic Chunking Service
 * Inspired by Harry-231/Contextual_RAG + curiousily/ragbase + research benchmarks
 *
 * Combines:
 * - Harry-231's voyage-context-3 integration patterns
 * - curiousily/ragbase hybrid chunking approach
 * - monami44's benchmark-proven methods
 */
interface ChunkingMetrics {
    totalChunks: number;
    averageChunkSize: number;
    semanticBoundaries: number;
    fallbackUsage: number;
    totalRequests: number;
}
export declare class AdvancedSemanticChunker {
    private embeddingService;
    private metricsCollected;
    private tokenEncoder;
    constructor();
    /**
     * Multi-strategy semantic chunking based on research
     * 1. Try interquartile method (highest benchmark score: 41.71)
     * 2. Fallback to gradient method (Harry-231's choice)
     * 3. Ultimate fallback to hybrid approach (curiousily/ragbase)
     */
    chunkContent(content: string, strategy?: 'auto' | 'interquartile' | 'gradient' | 'hybrid'): Promise<string[]>;
    /**
     * Content preprocessing inspired by Harry-231's approach
     */
    private preprocessContent;
    /**
     * Smart strategy selection based on content analysis
     */
    private selectOptimalStrategy;
    /**
     * Interquartile method - highest benchmark score (41.71)
     * Based on monami44/Langchain-Semantic-Chunking-Arena research
     */
    private interquartileChunking;
    /**
     * Gradient method - Harry-231's choice for production
     * Proven with policy documents in Contextual_RAG
     */
    private gradientChunking;
    /**
     * Hybrid approach inspired by curiousily/ragbase
     * Combines semantic + size-based chunking
     */
    private hybridChunking;
    /**
     * Advanced sentence splitting (Harry-231 + LangChain patterns)
     */
    private splitIntoSentences;
    /**
     * Calculate cosine similarities (same as Harry-231's approach)
     */
    private calculateSimilarities;
    /**
     * Create chunks from sentences and breakpoints
     */
    private createChunks;
    /**
     * Production constraints inspired by Harry-231's implementation + MongoDB Dev token validation
     * CRITICAL: voyage-context-3 has 32,000 token limit - ULTRA SAFE limits
     */
    private applyProductionConstraints;
    /**
     * Token counting using MongoDB Dev's proven js-tiktoken approach
     */
    private getTokenCount;
    /**
     * TOKEN-AWARE recursive chunk splitting with overlap
     * GUARANTEES no chunk exceeds Voyage API limits
     */
    private recursiveChunkSplit;
    /**
     * Fallback chunking with character-level splitting for oversized content
     */
    private fallbackChunking;
    /**
     * Character-level splitting for unsplittable content
     * Based on Microsoft Semantic Kernel's approach
     */
    private splitByCharacters;
    /**
     * Cosine similarity (same as Harry-231)
     */
    private cosineSimilarity;
    /**
     * Update performance metrics
     */
    private updateMetrics;
    /**
     * Get performance metrics for monitoring
     */
    getMetrics(): ChunkingMetrics;
    /**
     * Reset metrics
     */
    resetMetrics(): void;
}
export {};
//# sourceMappingURL=semantic-chunker.d.ts.map
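
A minimal usage sketch of the public surface above. The import path, sample text, and logging are illustrative assumptions; only chunkContent, getMetrics, and resetMetrics are confirmed by the declaration file.

// Hypothetical import path -- the package's real export layout may differ.
import { AdvancedSemanticChunker } from 'embedocs-mcp';

async function demo(): Promise<void> {
    const chunker = new AdvancedSemanticChunker();

    // Illustrative input; in practice this would be a file pulled from a repository.
    const doc = 'First topic sentence. More on the first topic. A new topic begins here. It continues.';

    // 'auto' defers to selectOptimalStrategy; an explicit strategy
    // ('interquartile' | 'gradient' | 'hybrid') can be forced instead.
    const chunks: string[] = await chunker.chunkContent(doc, 'auto');
    console.log(`Produced ${chunks.length} chunks`);

    // Metrics accumulate across calls; resetMetrics() clears them.
    const { averageChunkSize, fallbackUsage, totalRequests } = chunker.getMetrics();
    console.log({ averageChunkSize, fallbackUsage, totalRequests });
}

demo().catch(console.error);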
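The declaration file does not show how interquartileChunking finds breakpoints. A common formulation of the interquartile method (the family benchmarked in monami44/Langchain-Semantic-Chunking-Arena) flags a boundary wherever the embedding distance between adjacent sentences is an IQR outlier; the sketch below assumes that formulation, with illustrative function names and plain number[] embeddings standing in for the embedding service.

// Sketch of interquartile breakpoint detection over sentence embeddings.
// Assumes embeddings[i] is the embedding of sentences[i].

function cosineSimilarity(a: number[], b: number[]): number {
    let dot = 0, normA = 0, normB = 0;
    for (let i = 0; i < a.length; i++) {
        dot += a[i] * b[i];
        normA += a[i] * a[i];
        normB += b[i] * b[i];
    }
    return dot / (Math.sqrt(normA) * Math.sqrt(normB));
}

function interquartileBreakpoints(embeddings: number[][]): number[] {
    // Distance between each adjacent sentence pair: 1 - cosine similarity.
    const distances = embeddings
        .slice(1)
        .map((e, i) => 1 - cosineSimilarity(embeddings[i], e));

    // Quartiles of the distance distribution.
    const sorted = [...distances].sort((x, y) => x - y);
    const q1 = sorted[Math.floor(sorted.length * 0.25)];
    const q3 = sorted[Math.floor(sorted.length * 0.75)];
    const threshold = q3 + 1.5 * (q3 - q1); // classic IQR outlier fence

    // A semantic boundary sits after sentence i when the jump to i+1 is an outlier.
    return distances.flatMap((d, i) => (d > threshold ? [i] : []));
}

createChunks would then join the sentences between consecutive breakpoints into one chunk each.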
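Likewise, getTokenCount and recursiveChunkSplit are only declared here. The sketch below shows what token-aware splitting against the 32,000-token voyage-context-3 limit mentioned in the comments could look like, assuming js-tiktoken's cl100k_base encoding as an approximate tokenizer (Voyage's own tokenizer may count differently) and an illustrative 80% safety margin.

// Token-aware recursive splitting sketch, not the package's actual logic.
import { getEncoding } from 'js-tiktoken';

const enc = getEncoding('cl100k_base');
const MAX_TOKENS = 32000;                        // voyage-context-3 limit per the comments above
const SAFE_LIMIT = Math.floor(MAX_TOKENS * 0.8); // illustrative safety margin

function tokenCount(text: string): number {
    return enc.encode(text).length;
}

// Recursively halve oversized text on paragraph boundaries until every piece fits.
function splitToFit(text: string): string[] {
    if (tokenCount(text) <= SAFE_LIMIT) return [text];
    const paragraphs = text.split(/\n\n+/);
    if (paragraphs.length < 2) {
        // Unsplittable blob: fall back to character-level halving,
        // in the spirit of the declared splitByCharacters fallback.
        const mid = Math.floor(text.length / 2);
        return [...splitToFit(text.slice(0, mid)), ...splitToFit(text.slice(mid))];
    }
    const mid = Math.ceil(paragraphs.length / 2);
    return [
        ...splitToFit(paragraphs.slice(0, mid).join('\n\n')),
        ...splitToFit(paragraphs.slice(mid).join('\n\n')),
    ];
}

The margin exists because an approximate tokenizer can undercount relative to the API's own; a conservative ceiling trades slightly smaller chunks for never tripping the hard limit.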