UNPKG

@coworker-agency/rag

Version:

Retrieval Augmented Generation (RAG) library for document indexing, vector storage, and AI-powered question answering

55 lines (46 loc) 1.71 kB
/**
 * Document splitter that breaks text into overlapping chunks, preferring to
 * end each chunk on a natural boundary (paragraph break, then sentence break).
 */

/**
 * Locate the first occurrence of `separator` that starts at or after `from`
 * and strictly before `limit`, and return the index just past it.
 *
 * Only the relevant window of `text` is scanned, so a document without any
 * separators is not rescanned to its end for every chunk.
 *
 * @param {string} text - The full text being split
 * @param {string} separator - Boundary marker to look for (e.g. '\n\n')
 * @param {number} from - First index at which the separator may start
 * @param {number} limit - Exclusive upper bound for the separator's start index
 * @returns {number} Index just after the separator, or -1 when none qualifies
 */
function findBreakEnd(text, separator, from, limit) {
  // A separator starting at limit - 1 still needs its trailing characters.
  const searchWindow = text.slice(from, limit - 1 + separator.length);
  const offset = searchWindow.indexOf(separator);
  return offset === -1 ? -1 : from + offset + separator.length;
}

/**
 * Split text into overlapping chunks.
 *
 * Each chunk nominally spans `chunkSize` characters. When a chunk would end
 * before the end of the text, its end is moved to just after the first
 * paragraph break ('\n\n') found from `chunkOverlap` characters before the
 * nominal end up to 500 characters past it; failing that, to just after the
 * first sentence break ('. ') found up to 100 characters past it. Chunks can
 * therefore be somewhat shorter or longer than `chunkSize`. The next chunk
 * starts `chunkOverlap` characters before the previous chunk's end. If the
 * chunk that would follow is shorter than a third of `chunkSize`, the
 * remaining text is merged into the current chunk instead.
 *
 * Falsy text, or text no longer than `chunkSize`, is returned unchanged as a
 * single-element array.
 *
 * @param {string} text - The text to split into chunks
 * @param {number} chunkSize - The target size for each chunk (must be >= 1)
 * @param {number} chunkOverlap - The amount of overlap between chunks; values
 *   outside 0..chunkSize-1 are clamped into that range
 * @returns {string[]} Array of text chunks
 * @throws {RangeError} If the text needs splitting and chunkSize is not a
 *   positive number
 */
export function recursiveSplit(text, chunkSize = 1000, chunkOverlap = 200) {
  if (!text || text.length <= chunkSize) {
    return [text];
  }

  const PARAGRAPH_LOOKAHEAD = 500;
  const SENTENCE_LOOKAHEAD = 100;

  const size = Math.floor(Number(chunkSize));
  if (!(size >= 1)) {
    throw new RangeError(`chunkSize must be a positive number, received ${chunkSize}`);
  }

  // Clamp the overlap below the chunk size; an overlap as large as the chunk
  // itself would keep the window from ever advancing.
  const requestedOverlap = Math.trunc(Number(chunkOverlap));
  const overlap = Number.isNaN(requestedOverlap)
    ? 0
    : Math.min(Math.max(requestedOverlap, 0), size - 1);

  const chunks = [];
  let startPos = 0;

  while (startPos < text.length) {
    let endPos = Math.min(startPos + size, text.length);

    // Avoid cutting in the middle of a paragraph or sentence.
    if (endPos < text.length) {
      const searchFrom = endPos - overlap;
      const paragraphEnd = findBreakEnd(text, '\n\n', searchFrom, endPos + PARAGRAPH_LOOKAHEAD);
      if (paragraphEnd !== -1) {
        endPos = paragraphEnd;
      } else {
        const sentenceEnd = findBreakEnd(text, '. ', searchFrom, endPos + SENTENCE_LOOKAHEAD);
        if (sentenceEnd !== -1) {
          endPos = sentenceEnd;
        }
      }
    }

    // Always advance by at least one character so the loop terminates even
    // when a boundary is found close to the start of the current chunk.
    const nextStart = Math.max(endPos - overlap, startPos + 1);

    // Last chunk: either the text is exhausted, or the chunk that would follow
    // is too small to stand alone. Take everything from startPos to the end,
    // so the tail is appended without repeating the overlap region.
    if (endPos >= text.length || text.length - nextStart < size / 3) {
      chunks.push(text.slice(startPos));
      break;
    }

    chunks.push(text.slice(startPos, endPos));
    startPos = nextStart;
  }

  return chunks;
}