/**
 * @forge-ml/rag
 * A RAG (Retrieval-Augmented Generation) package for Forge ML.
 * 153 lines (152 loc) • 6.48 kB • JavaScript
 */
import { Client } from "minio";
import Document from "../../../documents/documents";
/**
 * Appends the new document's text onto the existing one, keeping the
 * existing document's metadata and forge metadata.
 *
 * @param {Document} existingDocument - Document already in the store; its metadata wins.
 * @param {Document} newDocument - Document whose text is appended.
 * @returns {Document} A fresh Document with the concatenated text.
 */
const mergeDocuments = (existingDocument, newDocument) => {
    const { text, metadata, forgeMetadata } = existingDocument;
    const combinedText = text + newDocument.text;
    return new Document(combinedText, metadata, forgeMetadata);
};
/**
 * Document store backed by a MinIO bucket.
 *
 * Layout: each document is stored under `<documentId>/documents` with its
 * chunks alongside at `<documentId>/chunks`, all inside one bucket.
 */
class MinioDocStore {
    client;
    //@TODO: These should not be hardcoded or class properties - this is only because we are starting with one document for now - fix
    bucketName = "doc-store"; //@TODO make this constructor parameter
    documentName = "doc";
    static DOCUMENT_FILE = "documents";
    chunksName = "chunks";
    static CHUNKS_FILE = "chunks";
    /**
     * @param {object} opts - MinIO connection settings.
     * @param {string} opts.endpoint - MinIO server host.
     * @param {number} opts.port - MinIO server port.
     * @param {boolean} opts.useSSL - Whether to connect over TLS.
     * @param {string} opts.accessKey - MinIO access key.
     * @param {string} opts.secretKey - MinIO secret key.
     */
    constructor({ endpoint, port, useSSL, accessKey, secretKey, }) {
        this.client = new Client({
            endPoint: endpoint,
            port: port,
            useSSL: useSSL,
            accessKey: accessKey,
            secretKey: secretKey,
        });
        // A constructor cannot await. Previously initializeBucket() was fired
        // as a floating promise: callers could race the bucket creation and a
        // failure surfaced as an unhandled rejection. Keep the promise so
        // callers may `await store.ready`, and attach a logging handler so an
        // un-awaited failure is at least reported instead of crashing Node.
        this.ready = this.initializeBucket();
        this.ready.catch((error) => {
            console.error("Error initializing bucket:", error);
        });
    }
    /**
     * Creates the backing bucket if it does not exist yet.
     * Idempotent; safe to call more than once.
     * @returns {Promise<void>}
     */
    async initializeBucket() {
        const bucketExists = await this.client.bucketExists(this.bucketName);
        if (!bucketExists) {
            await this.client.makeBucket(this.bucketName, "us-east-1");
        }
    }
    /**
     * Stores a document and its chunks as two JSON objects under the
     * document's forge `documentId` prefix. Writes happen in parallel.
     *
     * @param {Document} document - Document to persist (JSON-serialized).
     * @param {object[]} chunks - Chunk records to persist next to it.
     * @returns {Promise<void>}
     * @throws Rethrows any MinIO error after logging it.
     */
    async storeDocument(document, chunks) {
        const docPath = `${document.getForgeMetadata().documentId}/${MinioDocStore.DOCUMENT_FILE}`;
        const chunksPath = `${document.getForgeMetadata().documentId}/${MinioDocStore.CHUNKS_FILE}`;
        try {
            await Promise.all([
                this.client.putObject(this.bucketName, docPath, Buffer.from(JSON.stringify(document))),
                this.client.putObject(this.bucketName, chunksPath, Buffer.from(JSON.stringify(chunks))),
            ]);
        }
        catch (error) {
            console.error("Error storing document:", error);
            throw error;
        }
    }
    /**
     * Fetches a stored document by id and rehydrates it into a Document.
     *
     * @param {string} documentId - Id used as the object-key prefix.
     * @returns {Promise<Document>} The rehydrated document.
     * @throws Rethrows any MinIO error (including "no such key") after logging.
     */
    async retrieveDocument(documentId) {
        const docPath = `${documentId}/${MinioDocStore.DOCUMENT_FILE}`;
        try {
            const document = await this.client.getObject(this.bucketName, docPath);
            const documentString = JSON.parse(await this.streamToString(document));
            return new Document(documentString.text, documentString.metadata, documentString.forgeMetadata);
        }
        catch (error) {
            console.error("Error retrieving document:", error);
            throw error;
        }
    }
    /**
     * Appends `document`'s text to the stored document and writes it back.
     *
     * @param {Document} document - Document whose text is merged in.
     * @param {string} documentId - Id of the stored document to update.
     * @returns {Promise<void>}
     * @throws Rethrows retrieval/storage errors after logging.
     */
    async updateDocument(document, documentId) {
        const docPath = `${documentId}/${MinioDocStore.DOCUMENT_FILE}`;
        //@TODO fix - this is broken
        // NOTE(review): the stored chunks are NOT regenerated here, so after an
        // update the chunks no longer match the document text — presumably what
        // the TODO above refers to. Confirm before relying on updated chunks.
        try {
            // retrieveDocument rejects when the object is missing, so reaching
            // the next line guarantees the document exists; the old falsy check
            // on its result was unreachable.
            const existingDocument = await this.retrieveDocument(documentId);
            const updatedDocument = mergeDocuments(existingDocument, document);
            await this.client.putObject(this.bucketName, docPath, Buffer.from(JSON.stringify(updatedDocument)));
        }
        catch (error) {
            console.error("Error updating document:", error);
            throw error;
        }
    }
    /**
     * Deletes a document and its chunks.
     *
     * @param {string} documentId - Id of the document to delete.
     * @returns {Promise<void>}
     * @throws {Error} A clear "does not exist" error when either object is
     *   missing; otherwise rethrows the MinIO error after logging.
     */
    async deleteDocument(documentId) {
        const docPath = `${documentId}/${MinioDocStore.DOCUMENT_FILE}`;
        const chunksPath = `${documentId}/${MinioDocStore.CHUNKS_FILE}`;
        try {
            // statObject rejects (error.code === "NotFound") for a missing
            // object rather than resolving to a falsy value, so the original
            // `if (!documentExists || !chunksExists)` branch could never fire.
            // Stat both objects in parallel and translate a miss into the
            // intended error in the catch below.
            await Promise.all([
                this.client.statObject(this.bucketName, docPath),
                this.client.statObject(this.bucketName, chunksPath),
            ]);
            await Promise.all([
                this.client.removeObject(this.bucketName, docPath),
                this.client.removeObject(this.bucketName, chunksPath),
            ]);
        }
        catch (error) {
            if (error?.code === "NotFound") {
                const notFound = new Error(`Document "${docPath}" does not exist.`);
                console.error("Error deleting document:", notFound);
                throw notFound;
            }
            console.error("Error deleting document:", error);
            throw error;
        }
    }
    /**
     * Retrieves and flattens the chunk arrays for the given document ids.
     * A failure for one document is logged and treated as "no chunks" rather
     * than failing the whole batch (deliberate best-effort; preserved).
     *
     * @param {string[]} documentIds - Ids whose chunks should be fetched.
     * @returns {Promise<object[]>} All chunks found, flattened.
     * @throws {Error} If the bucket is missing or no chunks were found at all.
     */
    async retrieveChunks(documentIds) {
        const bucketExists = await this.client.bucketExists(this.bucketName);
        if (!bucketExists) {
            throw new Error(`Bucket "${this.bucketName}" does not exist.`);
        }
        const chunkPromises = documentIds.map(async (documentId) => {
            const chunksPath = `${documentId}/${MinioDocStore.CHUNKS_FILE}`;
            try {
                const chunks = await this.client.getObject(this.bucketName, chunksPath);
                const chunksString = await this.streamToString(chunks);
                return JSON.parse(chunksString);
            }
            catch (error) {
                console.error(`Error retrieving chunks for document ${documentId}:`, error);
                return [];
            }
        });
        const chunksArrays = await Promise.all(chunkPromises);
        const allChunks = chunksArrays.flat();
        if (allChunks.length === 0) {
            throw new Error(`No chunks found for the provided document IDs in bucket "${this.bucketName}"`);
        }
        return allChunks;
    }
    /**
     * Joins embedding records with their chunk text by `chunkId`.
     * Embeddings whose chunk is not found get an empty `text`.
     *
     * @param {Array<{chunkId: *}>} embeddings - Embedding records to enrich.
     * @param {string[]} documentIds - Documents whose chunks to search.
     * @returns {Promise<object[]>} Embeddings with a `text` field attached.
     */
    async mergeChunksAndEmbeddings(embeddings, documentIds) {
        const chunks = await this.retrieveChunks(documentIds);
        // Chunks are looked up by id once into a Map to avoid an O(n*m)
        // find-in-loop over the chunk list.
        const textById = new Map(chunks.map((chunk) => [chunk.id, chunk.text]));
        return embeddings.map((embedding) => ({
            ...embedding,
            text: textById.get(embedding.chunkId) || "",
        }));
    }
    /**
     * Removes the backing bucket. Fails if the bucket is not empty
     * (MinIO semantics).
     * @returns {Promise<void>}
     */
    async deleteBucket() {
        await this.client.removeBucket(this.bucketName);
    }
    /**
     * Drains a readable stream into a UTF-8 string.
     * @param {import("stream").Readable} stream - Stream to consume.
     * @returns {Promise<string>} The concatenated contents.
     */
    async streamToString(stream) {
        return new Promise((resolve, reject) => {
            const chunks = [];
            stream.on("data", (chunk) => chunks.push(chunk));
            stream.on("error", reject);
            stream.on("end", () => resolve(Buffer.concat(chunks).toString("utf8")));
        });
    }
}
export default MinioDocStore;