UNPKG

rag-aiquest

Version:

### Aiquest is an npm package that streamlines the process of parsing websites, splitting content into manageable chunks, embedding these chunks into machine-friendly vectors, and subsequently storing and retrieving these embeddings from AWS.

198 lines (193 loc) 6.56 kB
// src/AWS/VectorStoreAWS.ts
import aws from "aws-sdk";

/**
 * Builds the S3 object key under which an embedding store is saved.
 *
 * @param fileName Logical name of the embedding store (without prefix or extension).
 * @returns The object key `embeddingRAG/embeded-<fileName>.txt`.
 */
const embeddingObjectKey = (fileName) => `embeddingRAG/embeded-${fileName}.txt`;

/**
 * Saves embedding stores to, and loads them from, a single S3 bucket as JSON text objects.
 */
var VectorStoreAWS = class {
  s3Bucket;
  bucketName;
  region;

  /**
   * @param accessKeyId AWS access key id, written into the global `aws.config`.
   * @param secretAccessKey AWS secret access key, written into the global `aws.config`.
   * @param bucketName Name of the bucket every operation of this instance targets.
   * @param region Optional AWS region. When given it is also written into the global
   *   `aws.config` and used in the URL returned by `uploadEmbededModeltoAWS`. When omitted the
   *   SDK configuration is left without an explicit region and the URL uses "ap-south-1",
   *   which is the value that used to be hard-coded.
   */
  constructor(accessKeyId, secretAccessKey, bucketName, region) {
    this.bucketName = bucketName;
    this.region = region || "ap-south-1";
    aws.config.update(
      region ? { accessKeyId, secretAccessKey, region } : { accessKeyId, secretAccessKey }
    );
    this.s3Bucket = new aws.S3();
  }

  /**
   * Serializes `embeddingStore` with `JSON.stringify` and uploads it with `putObject`.
   *
   * @param embeddingStore Value to serialize and store.
   * @param fileName Logical name used to derive the object key.
   * @returns A promise resolving to `{ embededFileLocation }`, a URL string built from the
   *   bucket name, region and object key. Rejects with the S3 error when the upload fails.
   */
  async uploadEmbededModeltoAWS(embeddingStore, fileName) {
    const key = embeddingObjectKey(fileName);
    const uploadParams = {
      Bucket: this.bucketName,
      Key: key,
      Body: JSON.stringify(embeddingStore)
    };
    return new Promise((resolve, reject) => {
      this.s3Bucket.putObject(uploadParams, (err) => {
        if (err) {
          reject(err);
          return;
        }
        const embededFileLocation = `https://${this.bucketName}.s3.${this.region}.amazonaws.com/${key}`;
        resolve({ embededFileLocation });
      });
    });
  }

  /**
   * Downloads a previously uploaded embedding store and parses it as JSON.
   *
   * @param fileName Logical name used to derive the object key.
   * @returns A promise resolving to the parsed JSON value. Rejects with the S3 error, with an
   *   `Error` when the response carries no body, or with the parse error when the body is not
   *   valid JSON.
   */
  getKnowledgeData(fileName) {
    const params = {
      Bucket: this.bucketName,
      Key: embeddingObjectKey(fileName)
    };
    return new Promise((resolve, reject) => {
      this.s3Bucket.getObject(params, (err, data) => {
        if (err) {
          console.log("Error: ", err);
          reject(err);
          return;
        }
        if (!data || !data.Body) {
          reject(new Error("No data returned from S3 or data.Body is undefined."));
          return;
        }
        // JSON.parse runs inside the SDK callback: without this try/catch a malformed object
        // would throw outside the promise and the caller would never see a rejection.
        try {
          resolve(JSON.parse(data.Body.toString("utf-8")));
        } catch (parseError) {
          reject(parseError);
        }
      });
    });
  }
};

// src/QnA/Retrival.ts
import { OpenAI } from "openai";

/**
 * Answers questions against an in-memory embedding store: embeds the question, picks the most
 * similar stored entries by cosine similarity and asks a chat model to answer from them.
 */
var Retrival = class {
  openai;

  /**
   * @param apiKey API key passed to the `OpenAI` client constructor.
   */
  constructor(apiKey) {
    this.openai = new OpenAI({ apiKey });
  }

  /**
   * Answers `question` using the three stored entries most similar to it as context.
   *
   * @param embeddingStore Array of entries exposing `content` and `embedding`.
   * @param question The user's question.
   * @returns The value returned by `openai.chat.completions.create` (not just the answer text).
   */
  async QnARetrival(embeddingStore, question) {
    const closestEntries = await this.semanticSearch(question, embeddingStore, 3);
    const closestParagraphs = closestEntries.map((entry) => entry.content);
    const completionData = await this.openai.chat.completions.create({
      model: "gpt-3.5-turbo-16k",
      messages: [
        {
          role: "user",
          content: this.Prompt(question, closestParagraphs)
        }
      ],
      temperature: 0
    });
    return completionData;
  }

  /**
   * Embeds `query` and returns the `topN` stored entries closest to it.
   *
   * @param query Text to embed.
   * @param embeddingStore Array of entries exposing `embedding`.
   * @param topN Maximum number of entries to return (default 5).
   * @returns The best-matching entries, most similar first.
   * @throws Error when the embeddings response carries no embedding for the query.
   */
  async semanticSearch(query, embeddingStore, topN = 5) {
    const response = await this.openai.embeddings.create({
      input: query,
      model: "text-embedding-ada-002"
    });
    const queryEmbedding = response.data?.[0]?.embedding;
    if (!queryEmbedding) {
      throw new Error("No embedding found in response data");
    }
    return this.findNearestParagraph(embeddingStore, queryEmbedding, topN);
  }

  /**
   * Ranks every stored entry by cosine similarity to `targetEmbedding`.
   *
   * @param embeddingStore Array of entries exposing `embedding`; it is not mutated.
   * @param targetEmbedding Embedding to compare against.
   * @param count Maximum number of entries to return.
   * @returns Up to `count` entries, highest similarity first.
   */
  findNearestParagraph(embeddingStore, targetEmbedding, count) {
    const scoredEntries = embeddingStore.map((entry) => ({
      entry,
      score: this.cosineSimilarity(targetEmbedding, entry.embedding)
    }));
    // `scoredEntries` is a fresh array, so sorting in place leaves the caller's store untouched.
    return scoredEntries
      .sort((a, b) => b.score - a.score)
      .slice(0, count)
      .map((item) => item.entry);
  }

  /**
   * Builds the chat prompt from the question and the context paragraphs.
   *
   * @param question The user's question; a "?" is appended to it.
   * @param paragraphs Context paragraphs, joined with blank lines.
   * @returns The full prompt string.
   */
  Prompt(question, paragraphs) {
    return "You are AI Assistant, your are RAG ChatBot . Developed by Apurv Krishn Jha. Answer the following question from the context, if the answer cannot be deduced from the context, say 'Sorry! I didn't Understand the Question, Please explain it in detail':\n\nContext :\n" + paragraphs.join("\n\n") + "\n\nQuestion :\n" + question + "?\n\nAnswer :";
  }

  /**
   * Dot product of two vectors over their common prefix; missing or falsy components count as 0.
   *
   * @param embedding1 First vector.
   * @param embedding2 Second vector.
   * @returns The dot product.
   */
  compareEmbeddings(embedding1, embedding2) {
    const length = Math.min(embedding1.length, embedding2.length);
    let dotprod = 0;
    for (let i = 0; i < length; i++) {
      dotprod += (embedding1[i] || 0) * (embedding2[i] || 0);
    }
    return dotprod;
  }

  /**
   * Cosine similarity of two vectors.
   *
   * @param embedding1 First vector.
   * @param embedding2 Second vector.
   * @returns dot / (|a| * |b|), or 0 when that denominator is zero or NaN.
   */
  cosineSimilarity(embedding1, embedding2) {
    const dotProduct = this.compareEmbeddings(embedding1, embedding2);
    const magnitudeA = Math.sqrt(embedding1.reduce((sum, value) => sum + value * value, 0));
    const magnitudeB = Math.sqrt(embedding2.reduce((sum, value) => sum + value * value, 0));
    const denominator = magnitudeA * magnitudeB;
    // A zero-length or all-zero vector would give 0/0 = NaN, and a NaN score makes the sort
    // comparator in findNearestParagraph inconsistent. Score such vectors as "not similar".
    if (!(denominator > 0)) {
      return 0;
    }
    return dotProduct / denominator;
  }
};

// src/parsers/UnifiedParser.ts
import fs from "fs";
import path from "path";
import axios from "axios";
import cheerio from "cheerio";
import pdf from "pdf-parse";

/**
 * Extracts plain text from a buffer, an http(s) URL, or a local `.txt` / `.pdf` file path.
 */
var UnifiedParser = class {
  /**
   * @param input One of:
   *   - a `Buffer`: parsed with `pdf-parse` when it starts with "%PDF", otherwise decoded as UTF-8;
   *   - a string starting with "http://" or "https://": fetched with axios and loaded into cheerio;
   *   - any other string: treated as a file path and dispatched on its lower-cased extension.
   * @returns The extracted text.
   * @throws Error "Unsupported file type" for a path whose extension is neither `.txt` nor `.pdf`.
   * @throws Error "Unsupported input type for UnifiedParser" for any other kind of input.
   */
  async parse(input) {
    if (Buffer.isBuffer(input)) {
      // `subarray` is the non-deprecated equivalent of Buffer#slice (same bytes, no copy).
      if (input.subarray(0, 4).toString() === "%PDF") {
        const pdfContent = await pdf(input);
        return pdfContent.text;
      }
      return input.toString("utf-8");
    }

    if (typeof input !== "string") {
      throw new Error("Unsupported input type for UnifiedParser");
    }

    if (input.startsWith("http://") || input.startsWith("https://")) {
      const response = await axios.get(input);
      const $ = cheerio.load(response.data);
      return $.text();
    }

    const extension = path.extname(input).toLowerCase();
    switch (extension) {
      case ".txt":
        // Async read: this method is already async, so there is no reason to block the event loop.
        return fs.promises.readFile(input, "utf-8");
      case ".pdf": {
        const absolutePath = path.resolve(input);
        const pdfData = await fs.promises.readFile(absolutePath);
        const pdfContent = await pdf(pdfData);
        return pdfContent.text;
      }
      default:
        throw new Error("Unsupported file type");
    }
  }
};

// src/utilities/ChunkUtility.ts
/**
 * Word-based text chunking helpers.
 */
var ChunkUtility = class {
  /**
   * Splits `text` into whitespace-delimited words and groups them into overlapping chunks.
   *
   * The chunk size is `ceil(wordCount / numOfChunks)` words and each chunk starts
   * `chunkSize - overlap` words after the previous one, so with a non-zero overlap more than
   * `numOfChunks` chunks can be produced.
   *
   * @param text Text to split. Leading/trailing whitespace does not produce empty words.
   * @param numOfChunks Target number of chunks; values that are not finite or below 1 are
   *   treated as 1.
   * @param overlapSplitChunks Number of words shared by consecutive chunks (default 0). It is
   *   truncated to an integer and clamped to `[0, chunkSize - 1]` so the loop always advances.
   * @returns The chunks, each a string of words joined by single spaces; `[]` for blank text.
   */
  static splitIntoChunks(text, numOfChunks, overlapSplitChunks = 0) {
    const words = text.split(/\s+/).filter((word) => word.length > 0);
    if (words.length === 0) {
      return [];
    }

    const chunkCount = Number.isFinite(numOfChunks) && numOfChunks >= 1 ? numOfChunks : 1;
    const wordsPerChunk = Math.ceil(words.length / chunkCount);

    // An overlap >= chunk size would make the step zero or negative and loop forever.
    const requestedOverlap = Number.isFinite(overlapSplitChunks) ? Math.trunc(overlapSplitChunks) : 0;
    const overlap = Math.min(Math.max(requestedOverlap, 0), wordsPerChunk - 1);
    const step = wordsPerChunk - overlap;

    const chunks = [];
    for (let start = 0; start < words.length; start += step) {
      chunks.push(words.slice(start, start + wordsPerChunk).join(" "));
    }
    return chunks;
  }
};

// src/utilities/EmbeddingUtility.ts
import { OpenAI as OpenAI2 } from "openai";

/**
 * Turns text chunks into an embedding store using the "text-embedding-ada-002" model.
 */
var EmbeddingUtility = class {
  openai;

  /**
   * @param apiKey API key passed to the `OpenAI` client constructor.
   */
  constructor(apiKey) {
    this.openai = new OpenAI2({ apiKey });
  }

  /**
   * Requests one embedding per chunk in a single API call and pairs them up by position.
   *
   * @param chunks Array of text chunks.
   * @returns An array of `{ content, embedding }` objects in the same order as `chunks`;
   *   `[]` (without calling the API) when `chunks` is empty.
   * @throws Error when the response length differs from `chunks.length` or an item has no
   *   embedding.
   */
  async createEmbedding(chunks) {
    if (chunks.length === 0) {
      return [];
    }
    const response = await this.openai.embeddings.create({
      input: chunks,
      model: "text-embedding-ada-002"
    });
    const embeddings = response.data;
    if (embeddings.length !== chunks.length) {
      throw new Error("Embedding response length mismatch.");
    }
    return chunks.map((chunk, index) => {
      const embedding = embeddings[index]?.embedding;
      if (!embedding) {
        throw new Error(`Embedding missing for chunk at index ${index}`);
      }
      return { content: chunk, embedding };
    });
  }
};

export { ChunkUtility, EmbeddingUtility, Retrival, UnifiedParser, VectorStoreAWS };