UNPKG

@forge-ml/rag

Version:

A RAG (Retrieval-Augmented Generation) package for Forge ML

153 lines (152 loc) 6.48 kB
import { Client } from "minio";
import Document from "../../../documents/documents";

/**
 * Merges two documents by concatenating their text.
 *
 * NOTE(review): only `existingDocument`'s metadata and forgeMetadata are kept;
 * `newDocument`'s metadata is discarded. This mirrors the original behavior
 * (flagged broken by the in-code @TODO) — confirm the intended merge semantics
 * before changing it.
 *
 * @param {Document} existingDocument - document already in the store
 * @param {Document} newDocument - document whose text is appended
 * @returns {Document} a new Document with concatenated text
 */
const mergeDocuments = (existingDocument, newDocument) => {
  return new Document(
    existingDocument.text + newDocument.text,
    existingDocument.metadata,
    existingDocument.forgeMetadata
  );
};

/**
 * MinIO-backed document store.
 *
 * Each document is stored under two objects in a single bucket:
 *   `${documentId}/documents` — the serialized Document
 *   `${documentId}/chunks`    — the serialized chunk array
 */
class MinioDocStore {
  client;
  //@TODO: These should not be hardcoded or class properties - this is only because we are starting with one document for now - fix
  bucketName = "doc-store"; //@TODO make this constructor parameter
  documentName = "doc";
  static DOCUMENT_FILE = "documents";
  chunksName = "chunks";
  static CHUNKS_FILE = "chunks";
  /**
   * Promise that settles when bucket initialization finishes.
   * Callers may `await store.ready` before first use.
   * (Fix: the original fired initializeBucket() as a floating promise.)
   */
  ready;

  /**
   * @param {object} opts
   * @param {string} opts.endpoint - MinIO server host
   * @param {number} opts.port - MinIO server port
   * @param {boolean} opts.useSSL - whether to connect over TLS
   * @param {string} opts.accessKey
   * @param {string} opts.secretKey
   */
  constructor({ endpoint, port, useSSL, accessKey, secretKey }) {
    this.client = new Client({
      endPoint: endpoint,
      port: port,
      useSSL: useSSL,
      accessKey: accessKey,
      secretKey: secretKey,
    });
    // Keep a handle on the initialization promise instead of fire-and-forget,
    // so callers can await readiness and rejections are observable.
    this.ready = this.initializeBucket();
  }

  /**
   * Ensures the backing bucket exists, creating it in us-east-1 if needed.
   * @returns {Promise<void>}
   */
  async initializeBucket() {
    const bucketExists = await this.client.bucketExists(this.bucketName);
    if (!bucketExists) {
      await this.client.makeBucket(this.bucketName, "us-east-1");
    }
  }

  /**
   * Stores a document and its chunks as two parallel puts.
   * @param {Document} document - must expose getForgeMetadata().documentId
   * @param {Array} chunks - chunk records to persist alongside the document
   * @returns {Promise<void>}
   * @throws rethrows any MinIO put error after logging it
   */
  async storeDocument(document, chunks) {
    const documentId = document.getForgeMetadata().documentId;
    const docPath = `${documentId}/${MinioDocStore.DOCUMENT_FILE}`;
    const chunksPath = `${documentId}/${MinioDocStore.CHUNKS_FILE}`;
    try {
      // The two objects are independent, so write them in parallel.
      await Promise.all([
        this.client.putObject(this.bucketName, docPath, Buffer.from(JSON.stringify(document))),
        this.client.putObject(this.bucketName, chunksPath, Buffer.from(JSON.stringify(chunks))),
      ]);
    } catch (error) {
      console.error("Error storing document:", error);
      throw error;
    }
  }

  /**
   * Retrieves and deserializes a stored document.
   * @param {string} documentId
   * @returns {Promise<Document>}
   * @throws rethrows any MinIO/parse error after logging it
   */
  async retrieveDocument(documentId) {
    const docPath = `${documentId}/${MinioDocStore.DOCUMENT_FILE}`;
    try {
      const stream = await this.client.getObject(this.bucketName, docPath);
      const parsed = JSON.parse(await this.streamToString(stream));
      return new Document(parsed.text, parsed.metadata, parsed.forgeMetadata);
    } catch (error) {
      console.error("Error retrieving document:", error);
      throw error;
    }
  }

  /**
   * Appends `document` onto the stored document with the given id
   * (via mergeDocuments) and writes the result back.
   * @param {Document} document - new content to merge in
   * @param {string} documentId - id of the existing stored document
   * @returns {Promise<void>}
   * @throws if the document does not exist or the write fails
   */
  async updateDocument(document, documentId) {
    const docPath = `${documentId}/${MinioDocStore.DOCUMENT_FILE}`;
    try {
      // retrieveDocument rejects when the object is missing, so reaching the
      // guard below means the document exists; the check is purely defensive.
      const existingDocument = await this.retrieveDocument(documentId);
      if (!existingDocument) {
        throw new Error(`Document with ID ${documentId} does not exist.`);
      }
      const updatedDocument = mergeDocuments(existingDocument, document);
      await this.client.putObject(this.bucketName, docPath, Buffer.from(JSON.stringify(updatedDocument)));
    } catch (error) {
      console.error("Error updating document:", error);
      throw error;
    }
  }

  /**
   * Deletes a document and its chunks.
   * @param {string} documentId
   * @returns {Promise<void>}
   * @throws Error with a descriptive message when the objects are missing;
   *         rethrows other MinIO errors after logging them
   */
  async deleteDocument(documentId) {
    const docPath = `${documentId}/${MinioDocStore.DOCUMENT_FILE}`;
    const chunksPath = `${documentId}/${MinioDocStore.CHUNKS_FILE}`;
    try {
      // Fix: statObject REJECTS for a missing object (it never resolves
      // falsy), so the original `if (!documentExists || !chunksExists)`
      // branch was unreachable and its descriptive error was never thrown.
      // Map the NotFound rejection to that error instead, and run the two
      // independent stat calls in parallel.
      try {
        await Promise.all([
          this.client.statObject(this.bucketName, docPath),
          this.client.statObject(this.bucketName, chunksPath),
        ]);
      } catch (statError) {
        if (statError?.code === "NotFound") {
          throw new Error(`Document "${docPath}" does not exist.`);
        }
        throw statError; // network/auth errors propagate unchanged
      }
      await Promise.all([
        this.client.removeObject(this.bucketName, docPath),
        this.client.removeObject(this.bucketName, chunksPath),
      ]);
    } catch (error) {
      console.error("Error deleting document:", error);
      throw error;
    }
  }

  /**
   * Retrieves and flattens the chunk arrays for the given document ids.
   * Per-document retrieval failures are logged and treated as "no chunks"
   * (deliberate best-effort, preserved from the original).
   * @param {string[]} documentIds
   * @returns {Promise<Array>} flattened chunk records
   * @throws if the bucket is missing or no chunks were found at all
   */
  async retrieveChunks(documentIds) {
    const allChunks = [];
    const bucketExists = await this.client.bucketExists(this.bucketName);
    if (!bucketExists) {
      throw new Error(`Bucket "${this.bucketName}" does not exist.`);
    }
    const chunkPromises = documentIds.map(async (documentId) => {
      const chunksPath = `${documentId}/${MinioDocStore.CHUNKS_FILE}`;
      try {
        const chunks = await this.client.getObject(this.bucketName, chunksPath);
        const chunksString = await this.streamToString(chunks);
        return JSON.parse(chunksString);
      } catch (error) {
        console.error(`Error retrieving chunks for document ${documentId}:`, error);
        return []; // best-effort: a missing/broken chunk file is skipped
      }
    });
    const chunksArrays = await Promise.all(chunkPromises);
    allChunks.push(...chunksArrays.flat());
    if (allChunks.length === 0) {
      throw new Error(`No chunks found for the provided document IDs in bucket "${this.bucketName}"`);
    }
    return allChunks;
  }

  /**
   * Attaches chunk text to each embedding by matching embedding.chunkId
   * to chunk.id; embeddings with no matching chunk get an empty string.
   * @param {Array<{chunkId: *}>} embeddings
   * @param {string[]} documentIds - documents whose chunks to search
   * @returns {Promise<Array>} embeddings copied with a `text` field added
   */
  async mergeChunksAndEmbeddings(embeddings, documentIds) {
    const chunks = await this.retrieveChunks(documentIds);
    // Fix: build the id→text lookup once instead of an O(n·m)
    // chunks.find(...) scan per embedding.
    const textById = new Map(chunks.map((c) => [c.id, c.text]));
    return embeddings.map((embedding) => ({
      ...embedding,
      text: textById.get(embedding.chunkId) || "",
    }));
  }

  /**
   * Removes the backing bucket (must already be empty).
   * @returns {Promise<void>}
   */
  async deleteBucket() {
    await this.client.removeBucket(this.bucketName);
  }

  /**
   * Helper: drains a readable stream into a UTF-8 string.
   * @param {NodeJS.ReadableStream} stream
   * @returns {Promise<string>}
   */
  async streamToString(stream) {
    return new Promise((resolve, reject) => {
      const chunks = [];
      stream.on("data", (chunk) => chunks.push(chunk));
      stream.on("error", reject);
      stream.on("end", () => resolve(Buffer.concat(chunks).toString("utf8")));
    });
  }
}

export default MinioDocStore;