@forge-ml/rag
Version:
A RAG (Retrieval-Augmented Generation) package for Forge ML
78 lines (77 loc) • 2.93 kB
JavaScript
;
Object.defineProperty(exports, "__esModule", { value: true });
const types_1 = require("../../types");
// Fallback for chunkText's `chunkSize` option when the caller omits it;
// it is compared against string `.length` values.
const DEFAULT_CHUNK_SIZE = 1000;
// Fallback for chunkText's `chunkOverlap` option when the caller omits it;
// it is the number of trailing characters of a chunk repeated in the next one.
const DEFAULT_CHUNK_OVERLAP = 200;
/**
* TODO:
* - [ ] Fix chunking strategies
* - [ ] Generate unique document IDs and chunk IDs
* - [ ] Actually add metadata to chunks
*/
/**
 * Splits a document's text into size-bounded, overlapping chunks for RAG.
 *
 * The text is first cut into segments according to `options.strategy`.
 * Consecutive non-empty segments are then packed, joined by a single space,
 * into chunks whose text never exceeds `chunkSize` (measured with
 * `String.prototype.length`). When a chunk is flushed, its last `chunkOverlap`
 * characters seed the next chunk, provided they still leave room for the next
 * segment. A single segment longer than `chunkSize` is hard-split into windows
 * of `chunkSize` characters that overlap by `chunkOverlap`.
 *
 * @param document Object exposing `getText()` (the text to chunk) and
 *   `getForgeMetadata()` (whose `documentId` is passed to every chunk).
 * @param options Chunking options.
 * @param options.strategy One of `ChunkingStrategy`; unknown values fall back
 *   to paragraph splitting. Defaults to `BY_PARAGRAPH`.
 * @param options.chunkSize Maximum chunk length; must be >= 1.
 * @param options.chunkOverlap Characters shared between consecutive chunks;
 *   clamped into the range `[0, chunkSize - 1]`.
 * @param options.delimiter Separator for `BY_CUSTOM_DELIMITER`; a missing or
 *   empty value falls back to `","`.
 * @returns An array of chunk objects built by `createChunk`, in text order,
 *   with sequential chunk indexes starting at 0.
 * @throws {RangeError} If `chunkSize` is not a number greater than or equal to 1.
 */
const chunkText = (document, options = {}) => {
    const {
        strategy = types_1.ChunkingStrategy.BY_PARAGRAPH,
        chunkSize = DEFAULT_CHUNK_SIZE,
        chunkOverlap = DEFAULT_CHUNK_OVERLAP,
        delimiter,
    } = options;

    // A size below 1 (or NaN) can never be satisfied and previously made the
    // hard-split loop spin forever, so reject it up front.
    if (!(chunkSize >= 1)) {
        throw new RangeError(`chunkSize must be a number >= 1, received ${chunkSize}`);
    }
    const maxSize = Math.floor(chunkSize);
    // The overlap must be strictly smaller than the chunk size, otherwise the
    // hard-split window below would never advance.
    const overlap = Math.min(Math.max(Math.trunc(chunkOverlap) || 0, 0), maxSize - 1);

    const text = document.getText();
    const documentId = document.getForgeMetadata().documentId;

    const splitText = (() => {
        switch (strategy) {
            case types_1.ChunkingStrategy.BY_SENTENCE:
                return text.split(/[.!?]+\s+/);
            case types_1.ChunkingStrategy.BY_ITEM_IN_LIST:
                return text.split(/\n\s*[-•*]\s*/);
            case types_1.ChunkingStrategy.BY_CUSTOM_DELIMITER:
                return text.split(delimiter || ",");
            case types_1.ChunkingStrategy.BY_PARAGRAPH:
            default:
                // Paragraph splitting doubles as the fallback strategy.
                return text.split(/\n\s*\n/);
        }
    })();

    const chunks = [];
    let chunkId = 0;
    const emitChunk = (chunkBody) => {
        chunks.push(createChunk(chunkBody, documentId, chunkId++));
    };

    let currentChunk = "";
    for (const segment of splitText) {
        // Empty segments (e.g. from adjacent delimiters) carry no content and
        // would only add stray separator spaces.
        if (!segment) {
            continue;
        }

        // Measure the joined text, separator included, so a packed chunk can
        // never end up one character over the limit.
        let candidate = currentChunk ? `${currentChunk} ${segment}` : segment;
        if (currentChunk && candidate.length > maxSize) {
            emitChunk(currentChunk);
            // `slice(-0)` returns the whole string, so a zero overlap needs an
            // explicit empty tail to avoid re-emitting the same chunk.
            const tail = overlap > 0 ? currentChunk.slice(-overlap) : "";
            const seeded = tail ? `${tail} ${segment}` : segment;
            // Keep the overlap only if it fits alongside the segment; never
            // emit a chunk that consists solely of repeated overlap text.
            candidate = seeded.length <= maxSize ? seeded : segment;
        }
        currentChunk = candidate;

        // Hard-split anything still too long (a single oversized segment).
        // Each step advances by `maxSize - overlap` >= 1 characters, so the
        // loop always terminates and leaves a non-empty remainder.
        while (currentChunk.length > maxSize) {
            emitChunk(currentChunk.slice(0, maxSize));
            currentChunk = currentChunk.slice(maxSize - overlap);
        }
    }

    if (currentChunk) {
        emitChunk(currentChunk);
    }
    return chunks;
};
/**
 * Builds one chunk record for a piece of document text.
 *
 * The chunk's identifier is the document id and the chunk index joined by a
 * hyphen; it is stored both as the top-level `id` and as
 * `forgeMetadata.chunkId`. `metadata` starts out as a fresh empty object.
 *
 * @param text The chunk's text content.
 * @param documentId Identifier of the document the chunk came from.
 * @param chunkId Index of the chunk within that document.
 * @returns The chunk object `{ id, forgeMetadata, metadata, text }`.
 */
const createChunk = (text, documentId, chunkId) => {
    const compositeId = `${documentId}-${chunkId}`;
    const forgeMetadata = {
        documentId,
        chunkId: compositeId,
    };
    return {
        id: compositeId,
        forgeMetadata,
        metadata: {},
        text,
    };
};
exports.default = chunkText;