UNPKG

@juspay/neurolink

Version:

Universal AI Development Platform with working MCP integration, multi-provider support, voice (TTS/STT/realtime), and professional CLI. 58+ external MCP servers discoverable, multimodal file processing, RAG pipelines. Build, test, and deploy AI applications…

62 lines 2.4 kB
/**
 * Token Chunker
 *
 * Splits text by token count using a tokenizer.
 * Useful for precise token budget management.
 */
import { BaseChunker, DEFAULT_CHUNKER_CONFIG } from "./BaseChunker.js";

/**
 * Token Chunker
 *
 * Approximates token-based splitting using word count.
 * For production, integrate with a proper tokenizer (tiktoken, etc.)
 */
export class TokenChunker extends BaseChunker {
  strategy = "token";

  /**
   * Default configuration: the shared chunker defaults with a 512-token
   * chunk budget and a 50-token overlap between consecutive chunks.
   *
   * @returns {object} Configuration object for this strategy.
   */
  getDefaultConfig() {
    return {
      ...DEFAULT_CHUNKER_CONFIG,
      maxSize: 512, // Tokens
      overlap: 50, // Tokens
    };
  }

  /**
   * Split `content` into chunks of whitespace-delimited words whose
   * estimated token count reaches `config.maxSize`, carrying roughly
   * `config.overlap` tokens of trailing words into the next chunk.
   *
   * Chunk text is the chunk's words joined by single spaces. The offsets
   * passed to `createChunk` are the position of the first word and the end
   * of the last word in the original `content`, so they stay exact even when
   * the source uses newlines, tabs or repeated spaces between words.
   *
   * @param {string} content - Text to split.
   * @param {{ maxSize?: number, overlap?: number }} config - Token budget
   *   per chunk (default 512) and token overlap (default 50).
   * @returns {Promise<Array>} Chunks in document order, as built by
   *   `this.createChunk` (not defined in this class; presumably inherited
   *   from BaseChunker).
   */
  async doChunk(content, config) {
    const maxTokens = config.maxSize ?? 512;
    const overlapTokens = config.overlap ?? 50;

    // Approximate tokenization using words (roughly 1.3 tokens per word on
    // average). In production, use a proper tokenizer like tiktoken.
    const tokensPerWord = 1.3;
    const overlapWordCount = Math.max(0, Math.ceil(overlapTokens / tokensPerWord));

    // Record each word together with its exact position in `content`, so
    // offsets never have to be re-discovered by substring search (which can
    // match inside an earlier word) or inferred from the joined text length.
    const words = Array.from(content.matchAll(/\S+/g), (match) => ({
      text: match[0],
      start: match.index,
      end: match.index + match[0].length,
    }));

    const chunks = [];
    let chunkIndex = 0;
    let current = [];

    const emitCurrent = () => {
      const chunkText = current.map((entry) => entry.text).join(" ");
      const startOffset = current[0].start;
      const endOffset = current[current.length - 1].end;
      chunks.push(this.createChunk(chunkText, chunkIndex++, startOffset, endOffset));
    };

    for (const word of words) {
      const estimatedTokens = Math.ceil(current.length * tokensPerWord);
      // Never flush an empty window: a non-positive budget would otherwise
      // produce empty chunks.
      if (current.length > 0 && estimatedTokens >= maxTokens) {
        emitCurrent();
        // Keep overlap words, but always drop at least one word so the
        // window advances. `slice(-0)` would keep *everything*, so a zero
        // overlap is handled explicitly.
        const keep = Math.min(overlapWordCount, current.length - 1);
        current = keep > 0 ? current.slice(-keep) : [];
      }
      current.push(word);
    }

    // Add remaining chunk
    if (current.length > 0) {
      emitCurrent();
    }

    return chunks;
  }
}
//# sourceMappingURL=TokenChunker.js.map