UNPKG

ultimate-mcp-server

Version:

The definitive all-in-one Model Context Protocol server for AI-assisted coding across 30+ platforms

185 lines 6.97 kB
import { randomUUID } from 'node:crypto';

/** Separators applied, in order, by the semantic strategy when none are configured. */
const DEFAULT_SEMANTIC_SEPARATORS = Object.freeze([
    '\n\n## ', '\n\n### ', '\n\n#### ', // Markdown headers
    '\n\n', // Paragraphs
    '. ', '! ', '? ', // Sentences
]);

/**
 * Resolves the sliding-window size and step for a windowed strategy.
 *
 * The step is always at least 1 so that the chunking loops terminate even
 * when the configured overlap is greater than or equal to the chunk size.
 *
 * @param {{chunkSize?: number, chunkOverlap?: number}} strategy - Strategy options.
 * @param {number} defaultSize - Window size used when `chunkSize` is missing or not >= 1.
 * @param {number} defaultOverlap - Overlap used when `chunkOverlap` is not provided.
 * @returns {{size: number, step: number}} Window size and distance between window starts.
 */
function resolveWindow(strategy, defaultSize, defaultOverlap) {
    const requestedSize = Math.floor(Number(strategy.chunkSize));
    const size = requestedSize >= 1 ? requestedSize : defaultSize;
    // The built-in default overlap only makes sense for windows larger than it;
    // for smaller windows fall back to no overlap instead of a non-positive step.
    const fallbackOverlap = defaultOverlap < size ? defaultOverlap : 0;
    // `??` (not `||`) so that an explicit `chunkOverlap: 0` is honoured.
    const overlap = Number(strategy.chunkOverlap ?? fallbackOverlap) || 0;
    const stride = size - overlap;
    return { size, step: stride >= 1 ? Math.floor(stride) : 1 };
}

/**
 * Yields the start index of every window needed to cover `length` items.
 * Stops as soon as a window reaches the end, so no window is emitted that
 * would be fully contained in the previous one.
 *
 * @param {number} length - Number of items (characters, sentences, paragraphs).
 * @param {number} size - Window size.
 * @param {number} step - Distance between consecutive window starts (>= 1).
 * @returns {Generator<number>} Window start indexes in ascending order.
 */
function* windowStarts(length, size, step) {
    for (let start = 0; start < length; start += step) {
        yield start;
        if (start + size >= length) {
            return;
        }
    }
}

/**
 * Appends `text.slice(start, end)` to `spans` unless it is whitespace-only.
 *
 * @param {Array<{text: string, start: number, end: number}>} spans - Accumulator.
 * @param {string} text - Source text.
 * @param {number} start - Inclusive start offset.
 * @param {number} end - Exclusive end offset.
 */
function addSpan(spans, text, start, end) {
    const piece = text.slice(start, end);
    if (piece.trim().length > 0) {
        spans.push({ text: piece, start, end });
    }
}

/**
 * Splits text into sentences, keeping each sentence's terminator and the
 * whitespace after it, and records where each sentence sits in the source.
 *
 * @param {string} text - Source text.
 * @returns {Array<{text: string, start: number, end: number}>} Non-blank sentences.
 */
function sentenceSpans(text) {
    const spans = [];
    let start = 0;
    for (const match of text.matchAll(/[.!?]+\s+/g)) {
        // A terminator with nothing before it (e.g. text starting with "... ")
        // is kept with the sentence that follows instead of shifting the pairing.
        if (text.slice(start, match.index).trim().length === 0) {
            continue;
        }
        const end = match.index + match[0].length;
        addSpan(spans, text, start, end);
        start = end;
    }
    addSpan(spans, text, start, text.length);
    return spans;
}

/**
 * Splits text on blank lines and records where each paragraph sits in the source.
 *
 * @param {string} text - Source text.
 * @returns {Array<{text: string, start: number, end: number}>} Non-blank paragraphs.
 */
function paragraphSpans(text) {
    const spans = [];
    let start = 0;
    for (const match of text.matchAll(/\n\s*\n/g)) {
        addSpan(spans, text, start, match.index);
        start = match.index + match[0].length;
    }
    addSpan(spans, text, start, text.length);
    return spans;
}

/**
 * Yields the offsets at which `separator` cuts the region `[span.start, span.end)`.
 *
 * Leading punctuation of a separator (the part before its first whitespace
 * character, e.g. the "." of ". ") stays with the preceding segment; the rest
 * starts the following segment. Separators that begin with whitespace (e.g.
 * "\n\n## ") or contain no whitespace go entirely to the following segment.
 *
 * @param {string} text - Full source text.
 * @param {{start: number, end: number}} span - Region to search.
 * @param {string} separator - Literal separator.
 * @returns {Generator<number>} Cut offsets in ascending order.
 */
function* cutPoints(text, span, separator) {
    if (separator === '') {
        // Same granularity as String.prototype.split(''): one cut per UTF-16 unit.
        for (let cut = span.start + 1; cut < span.end; cut += 1) {
            yield cut;
        }
        return;
    }
    const firstWhitespace = separator.search(/\s/);
    const lead = firstWhitespace > 0 ? firstWhitespace : 0;
    let from = span.start;
    for (;;) {
        const found = text.indexOf(separator, from);
        // Matches must lie entirely inside the region being split.
        if (found === -1 || found + separator.length > span.end) {
            return;
        }
        yield found + lead;
        from = found + separator.length;
    }
}

/**
 * Splits text by each separator in turn (every separator is applied to the
 * segments produced by the previous ones) and records source offsets.
 *
 * @param {string} text - Source text.
 * @param {string[]} separators - Literal separators, applied in order.
 * @returns {Array<{text: string, start: number, end: number}>} Non-blank segments in source order.
 */
function separatorSpans(text, separators) {
    let regions = [{ start: 0, end: text.length }];
    for (const separator of separators) {
        const next = [];
        for (const region of regions) {
            let pieceStart = region.start;
            for (const cut of cutPoints(text, region, separator)) {
                if (cut > pieceStart) {
                    next.push({ start: pieceStart, end: cut });
                    pieceStart = cut;
                }
            }
            if (region.end > pieceStart) {
                next.push({ start: pieceStart, end: region.end });
            }
        }
        regions = next;
    }
    const spans = [];
    for (const region of regions) {
        addSpan(spans, text, region.start, region.end);
    }
    return spans;
}

/**
 * Builds overlapping chunks out of consecutive units (sentences or paragraphs).
 *
 * `startIndex`/`endIndex` are the offsets in the source document of the first
 * unit's start and the last unit's end. `content` is the units joined with
 * `joiner`, so its length may differ from `endIndex - startIndex`.
 *
 * @param {{id: *, content: string}} document - Document being chunked.
 * @param {Array<{text: string, start: number, end: number}>} units - Units in source order.
 * @param {{size: number, step: number}} window - Units per chunk and step between chunks.
 * @param {string} joiner - String placed between units in `content`.
 * @param {string} countKey - Metadata key that receives the number of units in the chunk.
 * @returns {object[]} Chunks in document order.
 */
function groupUnits(document, units, window, joiner, countKey) {
    const chunks = [];
    for (const first of windowStarts(units.length, window.size, window.step)) {
        const group = units.slice(first, first + window.size);
        chunks.push({
            id: randomUUID(),
            documentId: document.id,
            content: group.map((unit) => unit.text).join(joiner),
            startIndex: group[0].start,
            endIndex: group[group.length - 1].end,
            metadata: {
                chunkIndex: chunks.length,
                [countKey]: group.length,
            },
        });
    }
    return chunks;
}

/**
 * Splits a document (`{ id, content }`) into chunks using one of four
 * strategies selected by `strategy.type`: 'fixed', 'sentence', 'paragraph'
 * or 'semantic'.
 *
 * Every chunk has the shape
 * `{ id, documentId, content, startIndex, endIndex, metadata }`, where `id` is
 * a freshly generated version-4 UUID string and `startIndex`/`endIndex` are
 * offsets into `document.content`.
 */
export class DocumentChunker {
    strategy;

    /**
     * @param {{type: string, chunkSize?: number, chunkOverlap?: number, separators?: string[]}} strategy
     *   Chunking options. `chunkSize` is measured in characters ('fixed',
     *   'semantic'), sentences ('sentence') or paragraphs ('paragraph').
     */
    constructor(strategy) {
        this.strategy = strategy;
    }

    /**
     * Chunks `document` with the configured strategy.
     *
     * @param {{id: *, content: string}} document - Document to split.
     * @returns {object[]} Chunks in document order.
     * @throws {Error} If `strategy.type` is not a known strategy.
     */
    chunk(document) {
        switch (this.strategy.type) {
            case 'fixed':
                return this.fixedSizeChunking(document);
            case 'sentence':
                return this.sentenceChunking(document);
            case 'paragraph':
                return this.paragraphChunking(document);
            case 'semantic':
                return this.semanticChunking(document);
            default:
                throw new Error(`Unknown chunking strategy: ${this.strategy.type}`);
        }
    }

    /**
     * Cuts the content into windows of `chunkSize` characters (default 1000)
     * that overlap by `chunkOverlap` characters (default 200).
     *
     * @param {{id: *, content: string}} document - Document to split.
     * @returns {object[]} Chunks whose metadata holds `chunkIndex` and `totalChunks`.
     */
    fixedSizeChunking(document) {
        const { content } = document;
        const { size, step } = resolveWindow(this.strategy, 1000, 200);
        const starts = [...windowStarts(content.length, size, step)];
        return starts.map((start, chunkIndex) => {
            const end = Math.min(start + size, content.length);
            return {
                id: randomUUID(),
                documentId: document.id,
                content: content.slice(start, end),
                startIndex: start,
                endIndex: end,
                metadata: {
                    chunkIndex,
                    totalChunks: starts.length,
                },
            };
        });
    }

    /**
     * Groups `chunkSize` sentences (default 5) per chunk, overlapping by
     * `chunkOverlap` sentences (default 1). Sentences are joined with a space.
     *
     * @param {{id: *, content: string}} document - Document to split.
     * @returns {object[]} Chunks whose metadata holds `chunkIndex` and `sentenceCount`.
     */
    sentenceChunking(document) {
        return groupUnits(
            document,
            sentenceSpans(document.content),
            resolveWindow(this.strategy, 5, 1),
            ' ',
            'sentenceCount',
        );
    }

    /**
     * Groups `chunkSize` paragraphs (default 3) per chunk, overlapping by
     * `chunkOverlap` paragraphs (default 1). Paragraphs are joined with a
     * blank line.
     *
     * @param {{id: *, content: string}} document - Document to split.
     * @returns {object[]} Chunks whose metadata holds `chunkIndex` and `paragraphCount`.
     */
    paragraphChunking(document) {
        return groupUnits(
            document,
            paragraphSpans(document.content),
            resolveWindow(this.strategy, 3, 1),
            '\n\n',
            'paragraphCount',
        );
    }

    /**
     * Splits on structural separators (`strategy.separators`, defaulting to
     * Markdown headers, blank lines and sentence ends) and greedily merges
     * consecutive segments until adding the next one would exceed `chunkSize`
     * characters (default 1000). A single segment longer than the target is
     * emitted as one oversized chunk.
     *
     * `content` is trimmed and always equals
     * `document.content.slice(startIndex, endIndex)`.
     *
     * @param {{id: *, content: string}} document - Document to split.
     * @returns {object[]} Chunks whose metadata holds `chunkIndex` and `semantic: true`.
     */
    semanticChunking(document) {
        const { content } = document;
        const separators = this.strategy.separators || DEFAULT_SEMANTIC_SEPARATORS;
        const targetSize = this.strategy.chunkSize || 1000;
        const chunks = [];
        let open = null;
        const flush = () => {
            if (open === null) {
                return;
            }
            const raw = content.slice(open.start, open.end);
            const trimmed = raw.trim();
            // Report offsets of the trimmed text so they match `content` exactly.
            const startIndex = open.start + (raw.length - raw.trimStart().length);
            chunks.push({
                id: randomUUID(),
                documentId: document.id,
                content: trimmed,
                startIndex,
                endIndex: startIndex + trimmed.length,
                metadata: {
                    chunkIndex: chunks.length,
                    semantic: true,
                },
            });
            open = null;
        };
        for (const segment of separatorSpans(content, separators)) {
            if (open !== null && segment.end - open.start > targetSize) {
                flush();
            }
            if (open === null) {
                open = { start: segment.start, end: segment.end };
            } else {
                open.end = segment.end;
            }
        }
        flush();
        return chunks;
    }

    /**
     * Simple punctuation-based sentence splitter (no NLP). Each sentence keeps
     * its terminator and the whitespace that follows it.
     *
     * @param {string} text - Text to split.
     * @returns {string[]} Non-blank sentences in order.
     */
    splitIntoSentences(text) {
        return sentenceSpans(text).map((span) => span.text);
    }

    /**
     * Splits text on blank lines.
     *
     * @param {string} text - Text to split.
     * @returns {string[]} Non-blank paragraphs in order.
     */
    splitIntoParagraphs(text) {
        return paragraphSpans(text).map((span) => span.text);
    }

    /**
     * Splits text by every separator in turn. Leading punctuation of a
     * separator stays at the end of the preceding segment; the remainder of
     * the separator starts the following segment.
     *
     * @param {string} text - Text to split.
     * @param {string[]} separators - Literal separators, applied in order.
     * @returns {string[]} Non-blank segments in order.
     */
    splitBySeparators(text, separators) {
        return separatorSpans(text, separators).map((span) => span.text);
    }
}
//# sourceMappingURL=chunking.js.map