UNPKG

@forge-ml/rag

Version:

A RAG (Retrieval-Augmented Generation) package for Forge ML

78 lines (77 loc) 2.93 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); const types_1 = require("../../types"); const DEFAULT_CHUNK_SIZE = 1000; const DEFAULT_CHUNK_OVERLAP = 200; /** * TODO: * - [ ] Fix chunking strategies * - [ ] Generate unique document IDs and chunk IDs * - [ ] Actually add metadata to chunks */ /** * Splits text into chunks suitable for RAG. * @param text The input text to be chunked. * @param options Chunking options. * @returns An array of Chunk objects. */ const chunkText = (document, options = {}) => { const { strategy = types_1.ChunkingStrategy.BY_PARAGRAPH, chunkSize = DEFAULT_CHUNK_SIZE, chunkOverlap = DEFAULT_CHUNK_OVERLAP, } = options; const text = document.getText(); const documentId = document.getForgeMetadata().documentId; const chunks = []; const splitText = (() => { switch (strategy) { case types_1.ChunkingStrategy.BY_PARAGRAPH: return text.split(/\n\s*\n/); case types_1.ChunkingStrategy.BY_SENTENCE: return text.split(/[.!?]+\s+/); case types_1.ChunkingStrategy.BY_ITEM_IN_LIST: return text.split(/\n\s*[-•*]\s*/); case types_1.ChunkingStrategy.BY_CUSTOM_DELIMITER: return text.split(options?.delimiter || ","); default: return text.split(/\n\s*\n/); // Default to paragraph splitting } })(); let currentChunk = ""; let chunkId = 0; for (const segment of splitText) { // TODO: chunk size should be "maximum" size, not "once exceeded" size if (currentChunk.length + segment.length > chunkSize) { if (currentChunk) { chunks.push(createChunk(currentChunk, documentId, chunkId++)); currentChunk = currentChunk.slice(-chunkOverlap); } } if (currentChunk.length + segment.length <= chunkSize) { currentChunk += (currentChunk ? " " : "") + segment; } else { // If adding the segment would exceed chunkSize, start a new chunk if (currentChunk) { chunks.push(createChunk(currentChunk, documentId, chunkId++)); } currentChunk = segment; } // Ensure the last chunk doesn't exceed chunkSize while (currentChunk.length > chunkSize) { chunks.push(createChunk(currentChunk.slice(0, chunkSize), documentId, chunkId++)); currentChunk = currentChunk.slice(chunkSize - chunkOverlap); } } if (currentChunk) { chunks.push(createChunk(currentChunk, documentId, chunkId)); } return chunks; }; const createChunk = (text, documentId, chunkId) => ({ id: `${documentId}-${chunkId}`, forgeMetadata: { documentId, chunkId: `${documentId}-${chunkId}`, }, metadata: {}, text, }); exports.default = chunkText;