@juspay/neurolink
Version:
Universal AI Development Platform with working MCP integration, multi-provider support, voice (TTS/STT/realtime), and professional CLI. 58+ external MCP servers discoverable, multimodal file processing, RAG pipelines. Build, test, and deploy AI applications.
231 lines (230 loc) • 9.4 kB
JavaScript
/**
* Sentence-based Chunker
*
* Splits text based on sentence boundaries while respecting size limits.
* Best for prose and natural language content where sentence integrity matters.
*/
import { randomUUID } from "crypto";
/**
 * Sentence-aware chunker.
 *
 * Splits text into sentences and greedily packs whole sentences into chunks
 * whose joined text (sentences separated by one space) stays within `maxSize`.
 * Sentences longer than `maxSize` are split on word boundaries instead.
 * `startPosition` / `endPosition` in the chunk metadata are offsets into the
 * text passed to `chunk()`, so `text.slice(startPosition, endPosition)` is the
 * source span a chunk was built from (its whitespace may differ from
 * `chunk.text`, which always uses single spaces between sentences/words).
 */
export class SentenceChunker {
  /** Strategy identifier for this chunker. */
  strategy = "sentence";

  /** Sentence terminators used when the caller supplies none. */
  defaultSentenceEnders = [".", "!", "?"];

  /**
   * Chunk `text` along sentence boundaries.
   *
   * @param {string} text - Text to split. Empty / nullish input yields `[]`.
   * @param {object} [config]
   * @param {number} [config.maxSize=1000] - Maximum length of a chunk's text.
   * @param {number} [config.overlap=0] - Maximum length of trailing sentences
   *   of a chunk that are repeated at the start of the next chunk.
   * @param {string[]} [config.sentenceEnders] - Literal sentence terminators;
   *   defaults to `defaultSentenceEnders` (also when the list is empty).
   * @param {number} [config.minSentences=1] - Buffers holding fewer sentences
   *   than this are discarded rather than emitted.
   * @param {number} [config.maxSentences] - Maximum sentences per chunk.
   * @param {boolean} [config.trimWhitespace=true] - Trim each chunk's text.
   * @param {object} [config.metadata={}] - Stored as `metadata.custom` on
   *   every chunk (the same object reference is shared by all chunks).
   * @returns {Promise<object[]>} Chunks of shape `{ id, text, metadata }`.
   */
  async chunk(text, config) {
    const {
      maxSize = 1000,
      overlap = 0,
      sentenceEnders = this.defaultSentenceEnders,
      minSentences = 1,
      maxSentences,
      trimWhitespace = true,
      metadata = {},
    } = config || {};
    const chunks = [];
    const documentId = randomUUID();
    if (!text || text.length === 0) {
      return chunks;
    }
    const sentences = this.#locateSentences(text, sentenceEnders);
    if (sentences.length === 0) {
      return chunks;
    }

    // Appends one chunk; chunkIndex is simply the chunk's position in the output.
    const emit = (body, startPosition, endPosition) => {
      const finalText = trimWhitespace ? body.trim() : body;
      if (finalText.length === 0) {
        return;
      }
      chunks.push({
        id: randomUUID(),
        text: finalText,
        metadata: {
          documentId,
          chunkIndex: chunks.length,
          startPosition,
          endPosition,
          documentType: "text",
          custom: metadata,
        },
      });
    };
    // NOTE(review): buffers below minSentences are dropped, which silently
    // loses text when minSentences > 1. Kept as-is because it may be an
    // intentional filter — confirm the intended semantics.
    const emitSentences = (spans) => {
      if (spans.length === 0 || spans.length < minSentences) {
        return;
      }
      emit(
        spans.map((span) => span.text).join(" "),
        spans[0].start,
        spans[spans.length - 1].end,
      );
    };
    // Leave room for the incoming sentence so overlap never breaks maxSentences.
    const carryLimit = maxSentences === undefined ? Infinity : maxSentences - 1;

    let pending = [];
    let pendingLength = 0; // length of `pending` joined with single spaces

    for (const sentence of sentences) {
      const sentenceLength = sentence.text.length;
      const isOversized = sentenceLength > maxSize;

      if (pending.length > 0) {
        const exceedsSize = pendingLength + 1 + sentenceLength > maxSize;
        const exceedsCount =
          maxSentences !== undefined && pending.length >= maxSentences;
        if (exceedsSize || exceedsCount) {
          emitSentences(pending);
          // Nothing is carried across an oversized sentence: its pieces are
          // emitted on their own, and a carried prefix would otherwise be glued
          // to the text that follows it, producing a non-contiguous chunk.
          const carried =
            overlap > 0 && !isOversized
              ? this.#selectOverlap(
                  pending,
                  // Cap by the room left once the incoming sentence is added.
                  Math.min(overlap, maxSize - sentenceLength - 1),
                  carryLimit,
                )
              : { sentences: [], length: 0 };
          pending = carried.sentences;
          pendingLength = carried.length;
        }
      }

      if (isOversized) {
        for (const piece of this.#packWords(sentence.text, maxSize)) {
          emit(
            piece.text,
            sentence.start + piece.start,
            sentence.start + piece.end,
          );
        }
      } else {
        pendingLength += sentenceLength + (pending.length > 0 ? 1 : 0);
        pending.push(sentence);
      }
    }

    emitSentences(pending);

    for (const chunk of chunks) {
      chunk.metadata.totalChunks = chunks.length;
    }
    return chunks;
  }

  /**
   * Split text into trimmed sentences.
   *
   * A sentence ends at a run of one or more enders that is followed by
   * whitespace or the end of the text; trailing text without an ender becomes
   * the last sentence.
   *
   * @param {string} text
   * @param {string[]} sentenceEnders - Literal terminators (defaults are used
   *   when the list is empty or not an array).
   * @returns {string[]} Non-empty, trimmed sentences in document order.
   */
  splitIntoSentences(text, sentenceEnders) {
    return this.#locateSentences(text, sentenceEnders).map((span) => span.text);
  }

  /**
   * Split a sentence into pieces of at most `maxSize` characters on word
   * boundaries. Words inside a piece are joined with a single space. A single
   * word longer than `maxSize` is kept whole as its own piece.
   *
   * @param {string} sentence
   * @param {number} maxSize
   * @returns {string[]}
   */
  splitLargeSentence(sentence, maxSize) {
    return this.#packWords(sentence, maxSize).map((piece) => piece.text);
  }

  /**
   * Validate a chunking configuration without throwing.
   *
   * @param {object} [config] - Same options accepted by `chunk()`.
   * @returns {{ valid: boolean, errors: string[], warnings: string[] }}
   */
  validateConfig(config) {
    const errors = [];
    const warnings = [];
    const { maxSize, overlap, minSentences, maxSentences, sentenceEnders } =
      config || {};
    if (maxSize !== undefined && maxSize <= 0) {
      errors.push("maxSize must be greater than 0");
    }
    if (overlap !== undefined && overlap < 0) {
      errors.push("overlap must be non-negative");
    }
    if (overlap !== undefined && maxSize !== undefined && overlap >= maxSize) {
      errors.push("overlap must be less than maxSize");
    }
    if (minSentences !== undefined && minSentences < 1) {
      errors.push("minSentences must be at least 1");
    }
    if (
      maxSentences !== undefined &&
      minSentences !== undefined &&
      maxSentences < minSentences
    ) {
      errors.push("maxSentences must be >= minSentences");
    }
    if (sentenceEnders !== undefined && sentenceEnders.length === 0) {
      warnings.push("No sentence enders specified, using defaults");
    }
    return {
      valid: errors.length === 0,
      errors,
      warnings,
    };
  }

  /**
   * Build the global regex that finds sentence terminators: one or more
   * enders followed by whitespace or end of input. Enders are matched as
   * literal strings, so only regex metacharacters are escaped.
   */
  #buildEnderPattern(sentenceEnders) {
    const usable = Array.isArray(sentenceEnders)
      ? sentenceEnders.filter(
          (ender) => typeof ender === "string" && ender.length > 0,
        )
      : [];
    const enders = usable.length > 0 ? usable : this.defaultSentenceEnders;
    const alternatives = enders
      .map((ender) => ender.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"))
      .join("|");
    return new RegExp(`(?:${alternatives})+(?=\\s|$)`, "g");
  }

  /**
   * Find every sentence together with its offsets in `text`.
   * Returns `{ text, start, end }` records where `text` is the trimmed
   * sentence and `[start, end)` is its span in the input.
   */
  #locateSentences(text, sentenceEnders) {
    const whitespace = /\s/;
    const spans = [];
    const addSpan = (from, to) => {
      const raw = text.slice(from, to);
      const body = raw.trim();
      if (body.length === 0) {
        return;
      }
      // Skip leading whitespace so `start` points at the first real character.
      const start = from + (raw.length - raw.trimStart().length);
      spans.push({ text: body, start, end: start + body.length });
    };

    let cursor = 0;
    for (const match of text.matchAll(this.#buildEnderPattern(sentenceEnders))) {
      const end = match.index + match[0].length;
      addSpan(cursor, end);
      cursor = end;
      while (cursor < text.length && whitespace.test(text[cursor])) {
        cursor++;
      }
    }
    // Trailing text that has no terminator.
    addSpan(cursor, text.length);
    return spans;
  }

  /**
   * Greedily pack the words of `sentence` into pieces of at most `maxSize`
   * characters. Returns `{ text, start, end }` records with offsets relative
   * to the start of `sentence`.
   */
  #packWords(sentence, maxSize) {
    const pieces = [];
    let current = null;
    for (const match of sentence.matchAll(/\S+/g)) {
      const word = match[0];
      const wordEnd = match.index + word.length;
      if (current !== null && current.text.length + 1 + word.length <= maxSize) {
        current.text += ` ${word}`;
        current.end = wordEnd;
      } else {
        if (current !== null) {
          pieces.push(current);
        }
        // A single word longer than maxSize is kept whole.
        current = { text: word, start: match.index, end: wordEnd };
      }
    }
    if (current !== null) {
      pieces.push(current);
    }
    return pieces;
  }

  /**
   * Pick the longest run of trailing sentences whose joined length fits in
   * `budget` characters and that has at most `maxCount` sentences.
   * Returns the kept sentences and their joined length.
   */
  #selectOverlap(sentences, budget, maxCount) {
    const kept = [];
    let length = 0;
    for (let i = sentences.length - 1; i >= 0 && kept.length < maxCount; i--) {
      const added = sentences[i].text.length + (kept.length > 0 ? 1 : 0);
      if (length + added > budget) {
        break;
      }
      kept.unshift(sentences[i]);
      length += added;
    }
    return { sentences: kept, length };
  }
}