@juspay/neurolink
Version:
Universal AI Development Platform with working MCP integration, multi-provider support, voice (TTS/STT/realtime), and professional CLI. 58+ external MCP servers discoverable, multimodal file processing, RAG pipelines. Build, test, and deploy AI applications.
184 lines (183 loc) • 7.2 kB
JavaScript
/**
* Token-based Chunker
*
* Splits text based on token counts using simple tokenization.
* Best for controlling context window usage with LLMs.
*/
import { randomUUID } from "crypto";
/**
 * Token-aware chunker implementation.
 *
 * Splits text into chunks whose *estimated* token count stays within a
 * budget. Tokens are approximated per word as
 * `ceil(word.length / charsPerToken)`; no real tokenizer is invoked.
 * For exact token counts, integrate with tiktoken or a model-specific
 * tokenizer.
 */
export class TokenChunker {
    strategy = "token";
    // Approximate characters per token for different tokenizers
    CHARS_PER_TOKEN = {
        cl100k_base: 4, // GPT-4, GPT-3.5-turbo
        p50k_base: 4, // Codex
        r50k_base: 4, // GPT-3
        default: 4,
    };
    /**
     * Split `text` into chunks of at most `maxTokens` estimated tokens.
     *
     * Chunk text is the chunk's words re-joined with single spaces, so
     * whitespace is normalised. `startPosition` / `endPosition` are offsets
     * into the ORIGINAL `text` (end exclusive) covering the chunk's first
     * through last word, so `text.slice(startPosition, endPosition)` yields
     * the un-normalised source span.
     *
     * Words are never split: a single word whose estimate exceeds
     * `maxTokens` is emitted as a chunk of its own.
     *
     * @param {string} text - Text to split. Empty / nullish input yields `[]`.
     * @param {object} [config]
     * @param {number} [config.maxTokens=512] - Estimated-token budget per chunk.
     * @param {number} [config.tokenOverlap] - Tokens of trailing context repeated
     *   at the start of the next chunk. Takes precedence over `overlap`.
     * @param {number} [config.overlap=0] - Overlap in characters; converted to
     *   tokens with the tokenizer's chars-per-token ratio when `tokenOverlap`
     *   is not given.
     * @param {string} [config.tokenizer="cl100k_base"] - Key into CHARS_PER_TOKEN;
     *   unknown names fall back to the `default` ratio.
     * @param {boolean} [config.trimWhitespace=true] - Trim each chunk's text.
     * @param {object} [config.metadata={}] - Copied into `metadata.custom` of
     *   every chunk.
     * @returns {Promise<Array<object>>} Chunks in document order.
     *
     * Note: `config.maxSize` is checked by `validateConfig` but does not
     * influence splitting here; only `maxTokens` bounds a chunk.
     */
    async chunk(text, config) {
        const { overlap = 0, tokenizer = "cl100k_base", maxTokens = 512, tokenOverlap, trimWhitespace = true, metadata = {}, } = config || {};
        const chunks = [];
        if (!text || text.length === 0) {
            return chunks;
        }
        const documentId = randomUUID();
        // Loop-invariant: the ratio only depends on the tokenizer name.
        const charsPerToken = this.getCharsPerToken(tokenizer);
        const effectiveOverlap = tokenOverlap ?? Math.floor(overlap / charsPerToken);
        // Builds one chunk from word entries and appends it (skips empty text).
        const emitChunk = (entries, tokenCount) => {
            const joined = entries.map((entry) => entry.word).join(" ");
            const finalText = trimWhitespace ? joined.trim() : joined;
            if (finalText.length === 0) {
                return;
            }
            chunks.push({
                id: randomUUID(),
                text: finalText,
                metadata: {
                    documentId,
                    chunkIndex: chunks.length,
                    startPosition: entries[0].start,
                    endPosition: entries[entries.length - 1].end,
                    documentType: "text",
                    custom: {
                        ...metadata,
                        estimatedTokens: tokenCount,
                    },
                },
            });
        };
        let current = [];
        let currentTokens = 0;
        // Offset in `text` just past the previously located word.
        let cursor = 0;
        for (const word of this.tokenize(text)) {
            // Locate the word in the source so positions survive runs of
            // whitespace. If it cannot be found (e.g. a tokenizer override
            // that rewrites words), fall back to the cursor.
            const found = text.indexOf(word, cursor);
            const start = found === -1 ? cursor : found;
            const end = Math.min(text.length, start + word.length);
            cursor = end;
            const tokens = Math.ceil(word.length / charsPerToken);
            if (current.length > 0 && currentTokens + tokens > maxTokens) {
                emitChunk(current, currentTokens);
                // Carry trailing words over as overlap, but never so many
                // that overlap + the incoming word would exceed maxTokens.
                // This also guarantees at least the first word is dropped,
                // so chunks cannot grow without bound when the requested
                // overlap is >= maxTokens.
                const budget = Math.min(effectiveOverlap, maxTokens - tokens);
                const kept = [];
                let keptTokens = 0;
                for (let i = current.length - 1; i >= 0 && budget > 0; i--) {
                    const entry = current[i];
                    if (keptTokens + entry.tokens > budget) {
                        break;
                    }
                    kept.unshift(entry);
                    keptTokens += entry.tokens;
                }
                current = kept;
                currentTokens = keptTokens;
            }
            current.push({ word, tokens, start, end });
            currentTokens += tokens;
        }
        // Flush whatever is left as the final chunk.
        if (current.length > 0) {
            emitChunk(current, currentTokens);
        }
        // totalChunks is only known once every chunk has been produced.
        for (const chunk of chunks) {
            chunk.metadata.totalChunks = chunks.length;
        }
        return chunks;
    }
    /**
     * Simple word-based tokenization: split on whitespace, drop empty pieces.
     *
     * @param {string} text
     * @returns {string[]} Words in document order.
     */
    tokenize(text) {
        return text.split(/\s+/).filter((w) => w.length > 0);
    }
    /**
     * Get the approximate characters-per-token ratio for a tokenizer.
     *
     * @param {string} tokenizer - Key into CHARS_PER_TOKEN.
     * @returns {number} The configured ratio, or the `default` entry when the
     *   lookup is nullish.
     */
    getCharsPerToken(tokenizer) {
        return this.CHARS_PER_TOKEN[tokenizer] ?? this.CHARS_PER_TOKEN.default;
    }
    /**
     * Estimate average tokens per word.
     *
     * @param {string} _tokenizer - Currently ignored.
     * @returns {number} Always 1.25.
     */
    estimateTokensPerWord(_tokenizer) {
        // Average English word is ~5 characters, so roughly 1.25 tokens
        return 1.25;
    }
    /**
     * Estimate the token count of `text` as ceil(length / charsPerToken).
     *
     * @param {string} text
     * @param {string} [tokenizer="cl100k_base"]
     * @returns {number}
     */
    estimateTokenCount(text, tokenizer = "cl100k_base") {
        return Math.ceil(text.length / this.getCharsPerToken(tokenizer));
    }
    /**
     * Check a chunking config for invalid numeric settings.
     *
     * @param {object} [config] - Same shape as the `chunk` config; a missing
     *   config is treated as empty.
     * @returns {{valid: boolean, errors: string[], warnings: string[]}}
     *   `valid` is true when no errors were collected. `warnings` always
     *   carries the token-approximation notice.
     */
    validateConfig(config) {
        const errors = [];
        const warnings = [];
        const tokenConfig = config ?? {};
        if (tokenConfig.maxTokens !== undefined && tokenConfig.maxTokens <= 0) {
            errors.push("maxTokens must be greater than 0");
        }
        if (tokenConfig.tokenOverlap !== undefined &&
            tokenConfig.tokenOverlap < 0) {
            errors.push("tokenOverlap must be non-negative");
        }
        if (tokenConfig.tokenOverlap !== undefined &&
            tokenConfig.maxTokens !== undefined &&
            tokenConfig.tokenOverlap >= tokenConfig.maxTokens) {
            errors.push("tokenOverlap must be less than maxTokens");
        }
        if (tokenConfig.maxSize !== undefined && tokenConfig.maxSize <= 0) {
            errors.push("maxSize must be greater than 0");
        }
        // Warn about tokenizer approximation
        warnings.push("Token counts are approximated. For exact counts, integrate with tiktoken.");
        return {
            valid: errors.length === 0,
            errors,
            warnings,
        };
    }
}