@coworker-agency/rag
Version:
Retrieval Augmented Generation (RAG) library for document indexing, vector storage, and AI-powered question answering
55 lines (46 loc) • 1.71 kB
JavaScript
/**
 * Document splitter that breaks text into overlapping chunks, preferring
 * natural boundaries such as paragraph and sentence breaks. Despite the
 * "recursive" name, the implementation is an iterative sliding window.
 */
/**
 * Split text into overlapping chunks, preferring to end each chunk at a
 * paragraph break ("\n\n") or, failing that, a sentence break (". ").
 *
 * Despite its name the function is iterative: it slides a window of roughly
 * `chunkSize` characters over the text. For every chunk that does not reach
 * the end of the text, a boundary is searched for starting `chunkOverlap`
 * characters before the nominal end of the window; a paragraph break is
 * accepted up to 500 characters past the nominal end, a sentence break up to
 * 100 characters past it. The next chunk starts `chunkOverlap` characters
 * before the end of the previous one.
 *
 * If the chunk that would follow is shorter than a third of `chunkSize`, the
 * remaining text is merged into the current chunk instead of being emitted
 * as a tiny final chunk.
 *
 * Falsy input (empty string, null, undefined) and text no longer than
 * `chunkSize` are returned unchanged as a single-element array.
 *
 * @param {string} text - The text to split into chunks
 * @param {number} chunkSize - The target size for each chunk (must be >= 1)
 * @param {number} chunkOverlap - The amount of overlap between chunks;
 *   negative or NaN values are treated as 0
 * @returns {string[]} Array of text chunks
 * @throws {RangeError} If the text needs splitting and `chunkSize` is not a
 *   number >= 1
 */
export function recursiveSplit(text, chunkSize = 1000, chunkOverlap = 200) {
  if (!text || text.length <= chunkSize) {
    return [text];
  }
  // A non-positive or NaN chunk size can never make progress through the text.
  if (!(chunkSize >= 1)) {
    throw new RangeError('chunkSize must be a number greater than or equal to 1');
  }

  // How far past the nominal window end a natural boundary may still be used.
  const PARAGRAPH_LOOKAHEAD = 500;
  const SENTENCE_LOOKAHEAD = 100;

  // A negative overlap would silently skip text between chunks.
  const overlap = chunkOverlap > 0 ? chunkOverlap : 0;

  const chunks = [];
  let startPos = 0;

  while (startPos < text.length) {
    let endPos = Math.min(startPos + chunkSize, text.length);

    // Adjust the end position to avoid cutting mid-paragraph or mid-sentence.
    if (endPos < text.length) {
      // Never search at or before the chunk start; otherwise a large overlap
      // could select a boundary that yields an empty or backwards chunk.
      const searchFrom = Math.max(endPos - overlap, startPos + 1);
      const paragraphBreak = text.indexOf('\n\n', searchFrom);
      if (paragraphBreak !== -1 && paragraphBreak < endPos + PARAGRAPH_LOOKAHEAD) {
        endPos = paragraphBreak + 2;
      } else {
        const sentenceBreak = text.indexOf('. ', searchFrom);
        if (sentenceBreak !== -1 && sentenceBreak < endPos + SENTENCE_LOOKAHEAD) {
          endPos = sentenceBreak + 2;
        }
      }
    }

    // Reached the end of the text: emit the last chunk and stop. Stepping
    // back by the overlap here would re-emit the tail forever.
    if (endPos >= text.length) {
      chunks.push(text.slice(startPos));
      break;
    }

    // Step back by the overlap, but always move forward; if the overlap
    // would stall or rewind the window, continue without overlap.
    let nextStart = endPos - overlap;
    if (nextStart <= startPos) {
      nextStart = endPos;
    }

    // Avoid a tiny final chunk: fold the rest of the text into this chunk.
    // Slicing from startPos to the end appends only the text after endPos,
    // so the overlap region is not duplicated.
    if (text.length - nextStart < chunkSize / 3) {
      chunks.push(text.slice(startPos));
      break;
    }

    chunks.push(text.slice(startPos, endPos));
    startPos = nextStart;
  }

  return chunks;
}