embedocs-mcp
Transform any GitHub repository into searchable vector embeddings. MCP server with smart indexing, voyage-context-3 embeddings, and semantic search for Claude/Cursor IDEs.
JavaScript
/**
* Advanced Semantic Chunking Service
* Inspired by Harry-231/Contextual_RAG + curiousily/ragbase + research benchmarks
*
* Combines:
* - Harry-231's voyage-context-3 integration patterns
* - curiousily/ragbase hybrid chunking approach
* - monami44's benchmark-proven methods
*/
import { EmbeddingService } from './embeddings.js';
import { config } from '../config/index.js';
import { getEncoding } from 'js-tiktoken';
export class AdvancedSemanticChunker {
embeddingService;
metricsCollected;
tokenEncoder = getEncoding('gpt2'); // MongoDB Dev's proven approach
constructor() {
this.embeddingService = EmbeddingService.getInstance();
this.metricsCollected = {
totalChunks: 0,
averageChunkSize: 0,
semanticBoundaries: 0,
fallbackUsage: 0,
totalRequests: 0
};
}
/**
* Multi-strategy semantic chunking based on research
* 1. Try interquartile method (highest benchmark score: 41.71)
* 2. Fallback to gradient method (Harry-231's choice)
* 3. Ultimate fallback to hybrid approach (curiousily/ragbase)
*/
async chunkContent(content, strategy = 'auto') {
this.metricsCollected.totalRequests++;
try {
// Preprocess content like Harry-231's implementation
const cleanContent = this.preprocessContent(content);
// Split into sentences (following LangChain + Harry-231 patterns)
const sentences = this.splitIntoSentences(cleanContent);
if (sentences.length <= 3) {
return [cleanContent]; // Too short for semantic chunking
}
// Choose strategy based on content type and research
const chosenStrategy = strategy === 'auto' ? this.selectOptimalStrategy(content) : strategy;
let chunks;
switch (chosenStrategy) {
case 'interquartile':
chunks = await this.interquartileChunking(sentences);
break;
case 'gradient':
chunks = await this.gradientChunking(sentences);
break;
case 'hybrid':
chunks = await this.hybridChunking(sentences);
break;
default:
chunks = await this.interquartileChunking(sentences);
}
// Apply Harry-231's post-processing constraints
const finalChunks = this.applyProductionConstraints(chunks);
// Update metrics
this.updateMetrics(finalChunks);
return finalChunks;
}
catch (error) {
console.warn('Advanced semantic chunking failed, using fallback:', error);
this.metricsCollected.fallbackUsage++;
return this.fallbackChunking(content);
}
}
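// Usage sketch (illustrative): how a caller might drive the chunker end to
// end, assuming the config and EmbeddingService imported above are already
// initialized. `fileText` is a hypothetical variable holding any repository
// file's text content.
//
//   const chunker = new AdvancedSemanticChunker();
//   const chunks = await chunker.chunkContent(fileText); // 'auto' strategy
//   console.log(chunks.length, chunker.getMetrics());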
/**
* Content preprocessing inspired by Harry-231's approach
*/
preprocessContent(content) {
return content
.replace(/\s+/g, ' ') // Normalize whitespace
.replace(/([.!?])\s*\n/g, '$1 ') // Handle line breaks after sentences
.trim();
}
/**
* Smart strategy selection based on content analysis
*/
selectOptimalStrategy(content) {
// Technical documentation → interquartile (best benchmark performance)
if (content.includes('function') || content.includes('API') || content.includes('method')) {
return 'interquartile';
}
// Policy/legal documents → gradient (Harry-231's choice for policies)
if (content.includes('policy') || content.includes('requirement') || content.includes('shall')) {
return 'gradient';
}
// Mixed content → hybrid approach
return 'hybrid';
}
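// Illustrative inputs for the keyword heuristic above (hypothetical text):
//   "This function exposes the REST API method..."            -> 'interquartile'
//   "The retention policy shall satisfy each requirement..."  -> 'gradient'
//   "General prose containing none of the keywords"           -> 'hybrid'
// Note the checks are case-sensitive substring matches, so "Policy" alone
// would not trigger the gradient branch.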
/**
* Interquartile method - highest benchmark score (41.71)
* Based on monami44/Langchain-Semantic-Chunking-Arena research
*/
async interquartileChunking(sentences) {
// Get embeddings for all sentences using our voyage-context-3 service
const embeddings = await this.embeddingService.embedDocuments(sentences);
// Calculate similarity scores
const similarities = this.calculateSimilarities(embeddings);
// Interquartile breakpoint detection (research-proven)
const sorted = [...similarities].sort((a, b) => a - b);
const q1 = sorted[Math.floor(sorted.length * 0.25)];
const q3 = sorted[Math.floor(sorted.length * 0.75)];
const iqr = q3 - q1;
const threshold = q1 - (1.5 * iqr); // IQR outlier detection
const breakpoints = similarities.map(sim => sim < threshold);
return this.createChunks(sentences, breakpoints);
}
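// Worked example (hypothetical similarity scores between adjacent sentences):
//   similarities = [0.91, 0.88, 0.42, 0.90, 0.87]
//   sorted       = [0.42, 0.87, 0.88, 0.90, 0.91]
//   q1 = sorted[1] = 0.87, q3 = sorted[3] = 0.90, iqr = 0.03
//   threshold = 0.87 - 1.5 * 0.03 = 0.825
// Only 0.42 falls below the threshold, so a single semantic boundary is
// placed after the third sentence.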
/**
* Gradient method - Harry-231's choice for production
* Proven with policy documents in Contextual_RAG
*/
async gradientChunking(sentences) {
const embeddings = await this.embeddingService.embedDocuments(sentences);
const similarities = this.calculateSimilarities(embeddings);
// Gradient-based breakpoint detection (Harry-231's approach)
const gradients = [];
for (let i = 1; i < similarities.length; i++) {
gradients.push(similarities[i] - similarities[i - 1]);
}
const meanGradient = gradients.reduce((a, b) => a + b, 0) / gradients.length;
const stdGradient = Math.sqrt(gradients.reduce((sum, g) => sum + Math.pow(g - meanGradient, 2), 0) / gradients.length);
const threshold = meanGradient - stdGradient;
const breakpoints = gradients.map(g => g < threshold);
return this.createChunks(sentences, [false, ...breakpoints]); // Adjust for offset
}
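// Worked example (hypothetical similarity scores):
//   similarities = [0.90, 0.85, 0.40, 0.88]
//   gradients    = [-0.05, -0.45, 0.48]
//   mean ≈ -0.007, std ≈ 0.381, threshold ≈ -0.388
// Only the -0.45 drop falls below the threshold, so after the offset
// adjustment the boundary lands where similarity collapses, after the
// third sentence.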
/**
* Hybrid approach inspired by curiousily/ragbase
* Combines semantic + size-based chunking
*/
async hybridChunking(sentences) {
// First pass: semantic chunking
const semanticChunks = await this.interquartileChunking(sentences);
// Second pass: size-based adjustment (ragbase pattern)
const { chunkSize, chunkOverlap } = config.indexing;
const finalChunks = [];
for (const chunk of semanticChunks) {
if (chunk.length <= chunkSize) {
finalChunks.push(chunk);
}
else {
// Split oversized semantic chunks while preserving boundaries
const subChunks = this.recursiveChunkSplit(chunk, chunkSize, chunkOverlap);
finalChunks.push(...subChunks);
}
}
return finalChunks;
}
/**
* Advanced sentence splitting (Harry-231 + LangChain patterns)
*/
splitIntoSentences(text) {
// Advanced sentence splitting that handles technical documentation
const sentences = text
.split(/(?<=[.!?])\s+(?=[A-Z])/) // Basic sentence split
.filter(s => s.trim().length > 20) // Filter very short sentences
.map(s => s.trim());
return sentences;
}
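// Example behavior of the split regex (illustrative input):
//   "First sentence here. second stays attached. Third one splits here."
// The (?=[A-Z]) lookahead requires the next sentence to start with a
// capital letter, so "second stays attached." remains glued to the first
// sentence and only the break before "Third" is taken. The length filter
// then drops any fragment of 20 characters or fewer.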
/**
* Calculate cosine similarities (same as Harry-231's approach)
*/
calculateSimilarities(embeddings) {
const similarities = [];
for (let i = 0; i < embeddings.length - 1; i++) {
const sim = this.cosineSimilarity(embeddings[i].normalized, embeddings[i + 1].normalized);
similarities.push(sim);
}
return similarities;
}
/**
* Create chunks from sentences and breakpoints
*/
createChunks(sentences, breakpoints) {
const chunks = [];
let currentChunk = [sentences[0]];
for (let i = 0; i < breakpoints.length; i++) {
if (breakpoints[i] && currentChunk.length > 0) {
chunks.push(currentChunk.join(' '));
this.metricsCollected.semanticBoundaries++;
currentChunk = [sentences[i + 1]];
}
else {
currentChunk.push(sentences[i + 1]);
}
}
if (currentChunk.length > 0) {
chunks.push(currentChunk.join(' '));
}
return chunks;
}
/**
* Production constraints inspired by Harry-231's implementation + MongoDB Dev token validation
* CRITICAL: voyage-context-3 has a 32,000-token context limit, so ultra-safe
* per-chunk limits are enforced here
*/
applyProductionConstraints(chunks) {
const { chunkSize, chunkOverlap } = config.indexing;
const minChunkSize = 100;
const maxChunkSize = 2500; // EMERGENCY FIX: Increased to handle 2246 char chunks from retryable-reads.txt
const constrainedChunks = [];
for (let i = 0; i < chunks.length; i++) {
const chunk = chunks[i];
// CRITICAL: Hard token limit for voyage-context-3 (32,000 token limit)
const tokenCount = this.getTokenCount(chunk);
// EMERGENCY: Split chunks that exceed safe token limits
if (tokenCount > 6000) { // 20% of context window - ultra safe
console.warn(`🚨 Splitting oversized chunk (${tokenCount} tokens)`);
const subChunks = this.recursiveChunkSplit(chunk, 1000, 100);
// SAFETY: Re-split any chunks that are still too large
const finalSubChunks = [];
for (const subChunk of subChunks) {
const subTokens = this.getTokenCount(subChunk);
if (subTokens > 5000) {
// Force split by sentences if still too large
const sentences = subChunk.split(/[.!?]+/).filter(s => s.trim().length > 10);
let currentGroup = '';
for (const sentence of sentences) {
const testGroup = currentGroup + sentence + '. ';
if (this.getTokenCount(testGroup) > 4000 && currentGroup.length > 0) {
finalSubChunks.push(currentGroup.trim());
currentGroup = sentence + '. ';
}
else {
currentGroup = testGroup;
}
}
if (currentGroup.trim().length > 0) {
finalSubChunks.push(currentGroup.trim());
}
}
else {
finalSubChunks.push(subChunk);
}
}
constrainedChunks.push(...finalSubChunks);
continue; // Skip normal processing for this chunk
}
// Apply size constraints
if (chunk.length < minChunkSize) {
// Merge with the previous chunk if too small
if (constrainedChunks.length > 0) {
constrainedChunks[constrainedChunks.length - 1] += ' ' + chunk;
}
else {
constrainedChunks.push(chunk);
}
}
else if (chunk.length > maxChunkSize) {
// Split oversized chunks
const subChunks = this.recursiveChunkSplit(chunk, chunkSize, chunkOverlap);
constrainedChunks.push(...subChunks);
}
else {
constrainedChunks.push(chunk);
}
}
return constrainedChunks.filter(chunk => chunk.length >= minChunkSize);
}
/**
* Token counting using MongoDB Dev's proven js-tiktoken approach
*/
getTokenCount(text) {
return this.tokenEncoder.encode(text).length;
}
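// Note: js-tiktoken counts GPT-2 BPE tokens here, which only approximates
// whatever tokenizer voyage-context-3 applies internally; the conservative
// per-chunk limits used throughout this file absorb that mismatch.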
/**
* TOKEN-AWARE recursive chunk splitting with overlap; the _maxSize and
* _overlap parameters are accepted for caller compatibility but splitting
* is driven by token counts
* GUARANTEES no chunk exceeds Voyage API limits
*/
recursiveChunkSplit(chunk, _maxSize, _overlap) {
const words = chunk.split(' ');
const chunks = [];
const MAX_SAFE_TOKENS = 5000; // Ultra-safe limit for voyage-context-3
let currentChunk = [];
let currentTokens = 0;
for (let i = 0; i < words.length; i++) {
const word = words[i];
const wordTokens = this.getTokenCount(word);
// Check if adding this word would exceed token limit
if (currentTokens + wordTokens > MAX_SAFE_TOKENS && currentChunk.length > 0) {
// Save current chunk
chunks.push(currentChunk.join(' '));
// Start new chunk with overlap; guard against slice(-0), which would
// copy the whole array instead of taking no overlap
const overlapWords = Math.min(50, Math.floor(currentChunk.length * 0.1)); // 10% overlap
currentChunk = overlapWords > 0 ? currentChunk.slice(-overlapWords) : [];
currentTokens = this.getTokenCount(currentChunk.join(' '));
}
currentChunk.push(word);
currentTokens += wordTokens;
}
// Add final chunk
if (currentChunk.length > 0) {
chunks.push(currentChunk.join(' '));
}
// SAFETY CHECK: re-split any chunk that still exceeds the token limit so
// no content is silently dropped
const safeChunks = [];
for (const c of chunks) {
const tokens = this.getTokenCount(c);
if (tokens > MAX_SAFE_TOKENS) {
console.warn(`⚠️ Chunk still oversized (${tokens} tokens), splitting further`);
safeChunks.push(...this.splitByCharacters(c, MAX_SAFE_TOKENS));
}
else {
safeChunks.push(c);
}
}
return safeChunks;
}
/**
* Fallback chunking with character-level splitting for oversized content
*/
fallbackChunking(content) {
const { chunkSize, chunkOverlap } = config.indexing;
const chunks = [];
const sentences = content.match(/[^.!?]+[.!?]+/g) || [content];
let currentChunk = '';
let overlap = '';
for (const sentence of sentences) {
// Check if this sentence alone is oversized
if (this.getTokenCount(sentence) > 30000) {
// Push current chunk if exists
if (currentChunk.trim()) {
chunks.push(overlap + currentChunk);
currentChunk = '';
}
// Use character-level splitting for oversized sentence
console.log(`🔄 Sentence too large (${this.getTokenCount(sentence)} tokens), using character-level splitting`);
const characterChunks = this.splitByCharacters(sentence, 25000); // Conservative limit
chunks.push(...characterChunks);
continue;
}
if (currentChunk.length + sentence.length > chunkSize && currentChunk) {
chunks.push(overlap + currentChunk);
const words = currentChunk.split(' ');
const overlapCount = Math.max(1, Math.floor(chunkOverlap / 10));
overlap = words.slice(-overlapCount).join(' ') + ' ';
currentChunk = sentence;
}
else {
currentChunk += ' ' + sentence;
}
}
if (currentChunk.trim()) {
chunks.push(overlap + currentChunk);
}
return chunks.map(c => c.trim()).filter(c => c.length > 100);
}
/**
* Character-level splitting for unsplittable content
* Based on Microsoft Semantic Kernel's approach
*/
splitByCharacters(text, maxTokens) {
if (this.getTokenCount(text) <= maxTokens) {
return [text];
}
console.log(`🔧 Character-level splitting: ${this.getTokenCount(text)} tokens -> target: ${maxTokens}`);
const chunks = [];
let currentText = text;
while (currentText.length > 0) {
if (this.getTokenCount(currentText) <= maxTokens) {
chunks.push(currentText);
break;
}
// Find the split point (Microsoft SK pattern: split at halfway point)
const halfPoint = Math.floor(currentText.length / 2);
let splitPoint = halfPoint;
// Try to find a better split point near whitespace (within 10% of halfway)
const searchRange = Math.floor(currentText.length * 0.1);
const searchStart = Math.max(0, halfPoint - searchRange);
const searchEnd = Math.min(currentText.length, halfPoint + searchRange);
// Search backwards for whitespace first
for (let i = halfPoint; i >= searchStart; i--) {
if (/\s/.test(currentText[i])) {
splitPoint = i + 1; // Split after the whitespace
break;
}
}
// If no whitespace found backwards, try forwards
if (splitPoint === halfPoint) {
for (let i = halfPoint; i < searchEnd; i++) {
if (/\s/.test(currentText[i])) {
splitPoint = i + 1;
break;
}
}
}
// Extract the chunk and continue with the rest
const chunk = currentText.substring(0, splitPoint).trim();
if (chunk) {
chunks.push(chunk);
}
currentText = currentText.substring(splitPoint).trim();
}
const result = chunks.filter(chunk => chunk.length > 0);
console.log(`✅ Character-level split complete: ${result.length} chunks, avg tokens: ${Math.round(result.reduce((sum, chunk) => sum + this.getTokenCount(chunk), 0) / result.length)}`);
return result;
}
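// Example trace (illustrative): a ~40,000-token string with maxTokens of
// 25,000 is cut once near its midpoint: the first ~20,000-token half is
// emitted as a chunk, and the remainder then fits under the limit and is
// emitted whole on the next loop iteration.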
/**
* Cosine similarity (same as Harry-231)
*/
cosineSimilarity(a, b) {
let dotProduct = 0;
let normA = 0;
let normB = 0;
for (let i = 0; i < a.length; i++) {
dotProduct += a[i] * b[i];
normA += a[i] * a[i];
normB += b[i] * b[i];
}
return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB));
}
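// For reference: cosineSimilarity(a, b) = (a · b) / (||a|| · ||b||).
// Since the embedding service supplies `normalized` vectors, the norms are
// ~1 and the result is effectively the raw dot product.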
/**
* Update performance metrics
*/
updateMetrics(chunks) {
const previousTotal = this.metricsCollected.totalChunks;
this.metricsCollected.totalChunks += chunks.length;
const batchTotalSize = chunks.reduce((sum, chunk) => sum + chunk.length, 0);
// Running mean weighted by chunk count, rather than a midpoint of batch
// averages that would drift toward the most recent batch
this.metricsCollected.averageChunkSize =
(this.metricsCollected.averageChunkSize * previousTotal + batchTotalSize) /
this.metricsCollected.totalChunks;
}
/**
* Get performance metrics for monitoring
*/
getMetrics() {
return { ...this.metricsCollected };
}
/**
* Reset metrics
*/
resetMetrics() {
this.metricsCollected = {
totalChunks: 0,
averageChunkSize: 0,
semanticBoundaries: 0,
fallbackUsage: 0,
totalRequests: 0
};
}
}
//# sourceMappingURL=semantic-chunker.js.map