@juspay/neurolink
Version:
Universal AI Development Platform with working MCP integration, multi-provider support, voice (TTS/STT/realtime), and professional CLI. 58+ external MCP servers discoverable, multimodal file processing, RAG pipelines. Build, test, and deploy AI applicatio
62 lines • 2.4 kB
JavaScript
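/**
* Token Chunker
*
* Splits text into chunks sized by an estimated token count.
* No real tokenizer is used: tokens are approximated from whitespace-delimited
* words (roughly 1.3 tokens per word), so token budgets are approximate.
*/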
import { BaseChunker, DEFAULT_CHUNKER_CONFIG } from "./BaseChunker.js";
/**
* Token Chunker
*
* Approximates token-based splitting using word count.
* For production, integrate with a proper tokenizer (tiktoken, etc.)
*/
export class TokenChunker extends BaseChunker {
  strategy = "token";

  /**
   * Default configuration for token-based chunking.
   *
   * @returns {object} `DEFAULT_CHUNKER_CONFIG` with `maxSize` overridden to
   *   512 tokens and `overlap` overridden to 50 tokens.
   */
  getDefaultConfig() {
    return {
      ...DEFAULT_CHUNKER_CONFIG,
      maxSize: 512, // Tokens
      overlap: 50, // Tokens
    };
  }

  /**
   * Split `content` into overlapping chunks of whitespace-delimited words,
   * sized by an estimated token count (about 1.3 tokens per word).
   * In production, use a proper tokenizer like tiktoken.
   *
   * Each chunk's text is its words joined by single spaces, while the offsets
   * are the real positions of the chunk's first and last word in `content`
   * (end offset exclusive). When `content` contains runs of whitespace the
   * offset span can therefore be longer than the chunk text.
   *
   * @param {string} content - Text to split.
   * @param {{ maxSize?: number, overlap?: number }} config - `maxSize` is the
   *   estimated-token budget per chunk (default 512); `overlap` is the
   *   estimated-token overlap between consecutive chunks (default 50).
   * @returns {Promise<Array>} One `this.createChunk(text, index, startOffset,
   *   endOffset)` result per chunk, in document order; empty when `content`
   *   has no non-whitespace characters.
   */
  async doChunk(content, config) {
    // Rough average used to convert between words and tokens.
    const TOKENS_PER_WORD = 1.3;
    const numberOr = (value, fallback) => {
      const parsed = Number(value ?? fallback);
      return Number.isNaN(parsed) ? fallback : parsed;
    };
    const maxTokens = numberOr(config?.maxSize, 512);
    const overlapTokens = numberOr(config?.overlap, 50);

    // Largest word count whose estimate still fits the budget. At least one
    // word per chunk, so a zero or negative budget cannot yield empty chunks.
    const wordsPerChunk = Math.max(1, Math.floor(maxTokens / TOKENS_PER_WORD));
    // Clamp the overlap below the chunk size so each chunk starts at least one
    // word after the previous one (no repeated or ever-growing chunks).
    const overlapWords = Math.min(
      Math.max(0, Math.ceil(overlapTokens / TOKENS_PER_WORD)),
      wordsPerChunk - 1,
    );
    const stride = wordsPerChunk - overlapWords;

    // Record every word with its true position instead of re-searching the
    // text later, which breaks on repeated words and irregular whitespace.
    const words = Array.from(content.matchAll(/\S+/g), (match) => ({
      text: match[0],
      start: match.index,
    }));

    const chunks = [];
    for (let first = 0; first < words.length; first += stride) {
      const chunkWords = words.slice(first, first + wordsPerChunk);
      const lastWord = chunkWords[chunkWords.length - 1];
      const chunkText = chunkWords.map((word) => word.text).join(" ");
      const startOffset = chunkWords[0].start;
      const endOffset = lastWord.start + lastWord.text.length;
      chunks.push(this.createChunk(chunkText, chunks.length, startOffset, endOffset));
      if (first + wordsPerChunk >= words.length) {
        // This window already reached the final word; another pass would only
        // emit a chunk made entirely of overlap.
        break;
      }
    }
    return chunks;
  }
}
//# sourceMappingURL=TokenChunker.js.map