UNPKG

@forge-ml/rag

Version:

A RAG (Retrieval-Augmented Generation) package for Forge ML

94 lines (93 loc) 4.09 kB
import { ChunkingStrategy } from "../types";

const DEFAULT_CHUNK_SIZE = 1000;
const DEFAULT_CHUNK_OVERLAP = 200;
const DEFAULT_WORD_COUNT = 500;
const DEFAULT_DELIMITER = ",";

/**
 * TODO:
 * - [ ] Fix chunking strategies
 * - [ ] Generate unique document IDs and chunk IDs
 * - [ ] Actually add metadata to chunks
 */

/**
 * Builds a single chunk record.
 * @param {string} text The chunk's text.
 * @param {*} documentId Identifier of the source document.
 * @param {number} chunkId Sequential index of the chunk within the document.
 * @returns {{id: string, forgeMetadata: {documentId: *, chunkId: string}, metadata: object, text: string}}
 */
const createChunk = (text, documentId, chunkId) => {
  const id = `${documentId}-${chunkId}`;
  return {
    id,
    forgeMetadata: {
      documentId,
      chunkId: id,
    },
    metadata: {},
    text,
  };
};

/**
 * Validates the sliding-window options and returns the effective values.
 *
 * The window advances by `chunkSize - chunkOverlap` characters, so that step
 * must be at least 1; otherwise the window would never advance (infinite loop)
 * or, with a negative step, would jump to the tail and drop text.
 *
 * @param {object} options Raw chunking options.
 * @returns {{chunkSize: number, chunkOverlap: number}}
 * @throws {RangeError} If `chunkSize` is not >= 1, or an explicitly supplied
 *   `chunkOverlap` is negative or leaves a step smaller than 1.
 */
const resolveWindow = ({ chunkSize = DEFAULT_CHUNK_SIZE, chunkOverlap }) => {
  if (!(chunkSize >= 1)) {
    throw new RangeError(`chunkSize must be a number >= 1, received ${chunkSize}`);
  }

  if (chunkOverlap === undefined) {
    // The stock overlap only makes sense for chunk sizes larger than it. For
    // smaller sizes keep the same 1:5 overlap-to-size ratio as the defaults.
    const fallbackOverlap =
      DEFAULT_CHUNK_OVERLAP < chunkSize
        ? DEFAULT_CHUNK_OVERLAP
        : Math.floor(chunkSize / 5);
    return { chunkSize, chunkOverlap: fallbackOverlap };
  }

  if (!(chunkOverlap >= 0) || !(chunkSize - chunkOverlap >= 1)) {
    throw new RangeError(
      `chunkOverlap must be >= 0 and smaller than chunkSize (${chunkSize}), received ${chunkOverlap}`,
    );
  }

  return { chunkSize, chunkOverlap };
};

/**
 * Groups whitespace-separated words into space-joined runs of `wordCount` words.
 * @param {string} text Text to split.
 * @param {number} wordCount Number of words per group.
 * @returns {string[]} One string per group; empty when the text has no words.
 * @throws {RangeError} If `wordCount` is not >= 1.
 */
const groupWords = (text, wordCount) => {
  const groupSize = Math.floor(wordCount);
  if (!(groupSize >= 1)) {
    throw new RangeError(`wordCount must be a number >= 1, received ${wordCount}`);
  }

  // Drop the empty strings produced by leading/trailing whitespace so they
  // are not counted as words.
  const words = text.split(/\s+/).filter(Boolean);
  const groups = [];
  for (let start = 0; start < words.length; start += groupSize) {
    groups.push(words.slice(start, start + groupSize).join(" "));
  }
  return groups;
};

/**
 * Splits text on blank lines.
 * @param {string} text Text to split.
 * @returns {string[]} Non-empty paragraphs.
 */
const splitParagraphs = (text) => text.split(/\n\s*\n/).filter(Boolean);

/**
 * Splits text into segments according to the chunking strategy.
 * Unknown strategies fall back to paragraph splitting.
 * @param {string} text Text to split.
 * @param {*} strategy One of the `ChunkingStrategy` values.
 * @param {{delimiter: (string|RegExp|undefined), wordCount: number}} params Strategy-specific settings.
 * @returns {string[]} The text segments.
 */
const splitByStrategy = (text, strategy, { delimiter, wordCount }) => {
  switch (strategy) {
    case ChunkingStrategy.BY_PARAGRAPH:
      return splitParagraphs(text);
    case ChunkingStrategy.BY_SENTENCE:
      return text.split(/(?<=[.!?])\s+/).filter(Boolean);
    case ChunkingStrategy.BY_ITEM_IN_LIST:
      // `(?:^|\n)` also strips the bullet of an item at the very start of the
      // text, so the first item is treated like every other one.
      return text.split(/(?:^|\n)\s*[-•*]\s*/).filter(Boolean);
    case ChunkingStrategy.BY_CUSTOM_DELIMITER:
      // `||` (not `??`) on purpose: an empty-string delimiter falls back too.
      return text
        .split(delimiter || DEFAULT_DELIMITER)
        .map((part) => part.trim())
        .filter(Boolean);
    case ChunkingStrategy.BY_WORD_COUNT:
      return groupWords(text, wordCount);
    case ChunkingStrategy.BY_DOCUMENT:
      return [text];
    default:
      return splitParagraphs(text);
  }
};

/**
 * Cuts one segment into pieces of at most `chunkSize` characters, where each
 * piece after the first starts `chunkSize - chunkOverlap` characters after the
 * previous one.
 * @param {string} segment Text segment to cut.
 * @param {number} chunkSize Maximum piece length.
 * @param {number} chunkOverlap Characters shared by consecutive pieces.
 * @returns {string[]} The pieces; empty for an empty segment.
 */
const slideWindow = (segment, chunkSize, chunkOverlap) => {
  const pieces = [];
  let remaining = segment;
  while (remaining.length > 0) {
    pieces.push(remaining.slice(0, chunkSize));
    if (remaining.length <= chunkSize) {
      break;
    }
    remaining = remaining.slice(chunkSize - chunkOverlap);
  }
  return pieces;
};

/**
 * Splits a document's text into chunks suitable for RAG.
 *
 * The text is first split into segments by `strategy`; each segment is then
 * cut into overlapping windows of at most `chunkSize` characters. After a
 * segment is processed, the following segment is appended to the last chunk
 * when the combined text (including the separator) still fits in `chunkSize`.
 * At most one following segment is merged per processed segment, and no
 * merging happens for the word-count strategy.
 *
 * @param {object} document Object exposing `getText()` and
 *   `getForgeMetadata()` (the latter returning an object with `documentId`).
 * @param {object} [options] Chunking options.
 * @param {*} [options.strategy] A `ChunkingStrategy` value; defaults to `BY_PARAGRAPH`.
 * @param {number} [options.chunkSize=1000] Maximum chunk length in characters.
 * @param {number} [options.chunkOverlap] Characters shared between consecutive
 *   windows. Defaults to 200, or to a fifth of `chunkSize` when `chunkSize`
 *   is 200 or less.
 * @param {number} [options.wordCount=500] Words per segment for `BY_WORD_COUNT`.
 * @param {string|RegExp} [options.delimiter=","] Separator for `BY_CUSTOM_DELIMITER`.
 * @returns {Array<object>} The chunks, each with `id`, `forgeMetadata`, `metadata` and `text`.
 * @throws {RangeError} If `chunkSize`, `chunkOverlap` or (for `BY_WORD_COUNT`)
 *   `wordCount` is out of range.
 */
const chunkText = (document, options = {}) => {
  // The default parameter only covers `undefined`; tolerate `null` as well.
  const settings = options ?? {};
  const {
    strategy = ChunkingStrategy.BY_PARAGRAPH,
    wordCount = DEFAULT_WORD_COUNT,
    delimiter,
  } = settings;
  const { chunkSize, chunkOverlap } = resolveWindow(settings);

  const text = document.getText();
  const documentId = document.getForgeMetadata().documentId;
  const segments = splitByStrategy(text, strategy, { delimiter, wordCount });

  const separator = strategy === ChunkingStrategy.BY_SENTENCE ? ' ' : '\n\n';
  // Word-count segments already have a predefined size, so they are never merged.
  const canMerge = strategy !== ChunkingStrategy.BY_WORD_COUNT;

  const chunks = [];
  let chunkId = 0;

  for (let i = 0; i < segments.length; i++) {
    for (const piece of slideWindow(segments[i], chunkSize, chunkOverlap)) {
      chunks.push(createChunk(piece, documentId, chunkId++));
    }

    const lastChunk = chunks[chunks.length - 1];
    const nextSegment = segments[i + 1];
    if (!canMerge || lastChunk === undefined || nextSegment === undefined) {
      continue;
    }

    // Count the separator too, so a merged chunk never exceeds chunkSize.
    const mergedLength = lastChunk.text.length + separator.length + nextSegment.length;
    if (mergedLength <= chunkSize) {
      lastChunk.text += separator + nextSegment;
      // The next segment has been consumed by the merge; skip it.
      i++;
    }
  }

  return chunks;
};

export default chunkText;