@juspay/neurolink
Version:
Universal AI Development Platform with working MCP integration, multi-provider support, voice (TTS/STT/realtime), and professional CLI. 58+ external MCP servers discoverable, multimodal file processing, RAG pipelines. Build, test, and deploy AI applicatio
149 lines (148 loc) • 5.68 kB
JavaScript
/**
* Recursive Chunker
*
* Smart text splitting using hierarchical separators.
* Tries each separator in order, recursively splitting chunks that are too large.
* Best for general-purpose text that has natural boundaries.
*/
import { randomUUID } from "crypto";
/**
 * Recursive chunker implementation.
 *
 * Splits text on the first separator (in priority order) that occurs in it,
 * greedily merges the resulting pieces into chunks of at most `maxSize`
 * characters, and recurses with the remaining separators into any piece that
 * is still too large. When no separator applies (none matches, or the list is
 * exhausted) it falls back to character-level splitting, so the recursion
 * always terminates.
 */
export class RecursiveChunker {
    /** Defaults applied by `chunk()` and assumed by `validateConfig()`. */
    static #DEFAULT_MAX_SIZE = 1000;
    static #DEFAULT_OVERLAP = 200;

    /** Identifier of this chunking strategy. */
    strategy = "recursive";

    /** Separators tried in order, from coarse (paragraph) to fine (character). */
    defaultSeparators = ["\n\n", "\n", ". ", " ", ""];

    /**
     * Split `text` into chunk objects.
     *
     * @param {string} text - Text to split. Empty or missing text yields `[]`.
     * @param {object} [config]
     * @param {number} [config.maxSize=1000] - Maximum chunk length in UTF-16 code units.
     * @param {number} [config.overlap=200] - Maximum number of trailing characters of the
     *   previous chunk repeated at the start of the next one.
     * @param {string[]} [config.separators] - Separators in priority order.
     * @param {boolean} [config.isSeparatorRegex=false] - Treat separators as regex sources.
     * @param {boolean} [config.trimWhitespace=true] - Trim each chunk's text; chunks that
     *   are empty after trimming are dropped.
     * @param {object} [config.metadata={}] - Stored as `metadata.custom` on every chunk.
     * @returns {Promise<object[]>} Chunks shaped `{ id, text, metadata }`.
     *   `startPosition`/`endPosition` describe the untrimmed span in `text`.
     */
    async chunk(text, config) {
        const {
            maxSize = RecursiveChunker.#DEFAULT_MAX_SIZE,
            overlap = RecursiveChunker.#DEFAULT_OVERLAP,
            separators = this.defaultSeparators,
            isSeparatorRegex = false,
            trimWhitespace = true,
            metadata = {},
        } = config ?? {};
        if (!text || text.length === 0) {
            return [];
        }
        const documentId = randomUUID();
        const pieces = this.recursiveSplit(text, separators, maxSize, overlap, isSeparatorRegex);
        const rewind = overlap > 0 ? overlap : 0;
        const chunks = [];
        let previousEnd = 0;
        for (const piece of pieces) {
            const chunkText = trimWhitespace ? piece.trim() : piece;
            if (chunkText.length === 0) {
                continue;
            }
            // A chunk may start before the previous chunk's end (overlap), but by at
            // most `overlap` characters and it must contribute at least one new
            // character. Searching from that bound keeps indexOf from binding to an
            // earlier duplicate of the same text, and never yields a negative offset.
            const maxRewind = Math.min(rewind, piece.length - 1);
            const searchFrom = Math.max(0, previousEnd - maxRewind);
            const found = text.indexOf(piece, searchFrom);
            const startPosition = found >= 0 ? found : searchFrom;
            chunks.push({
                id: randomUUID(),
                text: chunkText,
                metadata: {
                    documentId,
                    chunkIndex: chunks.length,
                    startPosition,
                    endPosition: startPosition + piece.length,
                    documentType: "text",
                    custom: metadata,
                },
            });
            if (found >= 0) {
                previousEnd = found + piece.length;
            }
        }
        // The total is only known once every chunk has been produced.
        for (const chunk of chunks) {
            chunk.metadata.totalChunks = chunks.length;
        }
        return chunks;
    }

    /**
     * Recursively split `text` into strings of at most `maxSize` characters.
     *
     * Every returned string is a contiguous substring of `text`: pieces are
     * re-joined with the delimiter text that actually separated them (for regex
     * separators, the matched text rather than the pattern source).
     *
     * @param {string} text - Text to split.
     * @param {string[]} separators - Separators still available, in priority order.
     * @param {number} maxSize - Maximum length of each returned string.
     * @param {number} overlap - Maximum overlap carried over from the previous chunk;
     *   it is shortened when needed so the resulting chunk still fits in `maxSize`.
     * @param {boolean} isRegex - Whether separators are regular-expression sources.
     * @returns {string[]} The split strings, in document order.
     */
    recursiveSplit(text, separators, maxSize, overlap, isRegex) {
        // The length <= 1 guard guarantees termination even for maxSize < 1.
        if (text.length <= maxSize || text.length <= 1) {
            return [text];
        }
        const candidates = Array.isArray(separators) ? separators : [];
        const matchIndex = candidates.findIndex((sep) => sep === "" ||
            (isRegex ? new RegExp(sep).test(text) : text.includes(sep)));
        // No usable separator: fall back to character-level splitting (same as the
        // "" separator) instead of recursing forever on an unsplittable text.
        const separator = matchIndex === -1 ? "" : candidates[matchIndex];
        const remaining = matchIndex === -1 ? [] : candidates.slice(matchIndex + 1);
        const { pieces, delimiters } = RecursiveChunker.#splitKeepingDelimiters(text, separator, isRegex);

        const results = [];
        let current = "";
        for (let i = 0; i < pieces.length; i++) {
            const piece = pieces[i];
            // Text that sat between the previous piece and this one.
            let joiner = "";
            if (i > 0) {
                joiner = delimiters ? delimiters[i - 1] : separator;
            }
            const candidate = current ? current + joiner + piece : piece;
            if (candidate.length <= maxSize) {
                current = candidate;
                continue;
            }
            if (current.length > 0) {
                results.push(current);
            }
            if (piece.length > maxSize) {
                // Still too large: split further with the finer separators and keep
                // the last fragment open so following pieces can be merged into it.
                const subSplits = this.recursiveSplit(piece, remaining, maxSize, overlap, isRegex);
                for (let j = 0; j < subSplits.length - 1; j++) {
                    results.push(subSplits[j]);
                }
                current = subSplits[subSplits.length - 1] || "";
                continue;
            }
            current = piece;
            if (results.length > 0 && overlap > 0) {
                // Prefix the tail of the previous chunk, but only as much as fits.
                const room = maxSize - joiner.length - piece.length;
                const take = Math.min(overlap, room);
                if (take > 0) {
                    current = results[results.length - 1].slice(-take) + joiner + piece;
                }
            }
        }
        if (current.length > 0) {
            results.push(current);
        }
        return results;
    }

    /**
     * Validate a chunker configuration without running it.
     *
     * @param {object} [config] - Same shape as the `chunk()` config; may be omitted.
     * @returns {{ valid: boolean, errors: string[], warnings: string[] }}
     *   `valid` is false only when `errors` is non-empty; warnings never affect it.
     */
    validateConfig(config) {
        const errors = [];
        const warnings = [];
        const { maxSize, overlap, separators, isSeparatorRegex } = config ?? {};
        if (maxSize !== undefined && maxSize <= 0) {
            errors.push("maxSize must be greater than 0");
        }
        if (overlap !== undefined && overlap < 0) {
            errors.push("overlap must be non-negative");
        }
        if (separators !== undefined && separators.length === 0) {
            errors.push("separators array must not be empty");
        }
        if (isSeparatorRegex && separators) {
            for (const sep of separators) {
                try {
                    new RegExp(sep);
                }
                catch {
                    errors.push(`Invalid regex separator: ${sep}`);
                }
            }
        }
        // Compare the values chunk() would actually use, defaults included.
        const effectiveMaxSize = maxSize ?? RecursiveChunker.#DEFAULT_MAX_SIZE;
        const effectiveOverlap = overlap ?? RecursiveChunker.#DEFAULT_OVERLAP;
        if (effectiveMaxSize > 0 && effectiveOverlap >= effectiveMaxSize) {
            warnings.push("overlap should be smaller than maxSize; the overlap is shortened so chunks do not exceed maxSize");
        }
        return {
            valid: errors.length === 0,
            errors,
            warnings,
        };
    }

    /**
     * Split `text` on `separator`, also reporting the delimiter text between pieces.
     *
     * @param {string} text
     * @param {string} separator - Literal separator, or regex source when `isRegex`.
     * @param {boolean} isRegex
     * @returns {{ pieces: string[], delimiters: string[] | null }} `delimiters[i]` sits
     *   between `pieces[i]` and `pieces[i + 1]`; `null` means every delimiter is the
     *   literal `separator`.
     */
    static #splitKeepingDelimiters(text, separator, isRegex) {
        if (separator === "" || !isRegex) {
            return { pieces: text.split(separator), delimiters: null };
        }
        const base = new RegExp(separator);
        // matchAll requires the global flag.
        const flags = base.flags.includes("g") ? base.flags : `${base.flags}g`;
        const pattern = new RegExp(base.source, flags);
        const pieces = [];
        const delimiters = [];
        let cursor = 0;
        for (const match of text.matchAll(pattern)) {
            const start = match.index;
            const end = start + match[0].length;
            // Like String.prototype.split, ignore empty matches that make no
            // progress or that sit at the very end of the text.
            if (end === cursor || start >= text.length) {
                continue;
            }
            pieces.push(text.slice(cursor, start));
            delimiters.push(match[0]);
            cursor = end;
        }
        pieces.push(text.slice(cursor));
        return { pieces, delimiters };
    }
}