UNPKG

rag-aiquest

Version:

### Aiquest is an npm package that streamlines the process of parsing websites, splitting content into manageable chunks, embedding these chunks into machine-friendly vectors, and subsequently storing and retrieving these embeddings from AWS. This documen

239 lines (232 loc) 8.66 kB
"use strict";
// Bundled CommonJS build of the package. The helpers below are the bundler's
// ESM <-> CommonJS interop shims; the library code follows, one section per
// original source file.
var __create = Object.create;
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __getProtoOf = Object.getPrototypeOf;
var __hasOwnProp = Object.prototype.hasOwnProperty;

// Defines every entry of `all` on `target` as an enumerable getter.
var __export = (target, all) => {
  for (var name in all)
    __defProp(target, name, { get: all[name], enumerable: true });
};

// Copies own properties of `from` onto `to` as getters, skipping keys that
// `to` already has and the single key named by `except`.
var __copyProps = (to, from, except, desc) => {
  if (from && typeof from === "object" || typeof from === "function") {
    for (let key of __getOwnPropNames(from))
      if (!__hasOwnProp.call(to, key) && key !== except)
        __defProp(to, key, {
          get: () => from[key],
          enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable
        });
  }
  return to;
};

var __toESM = (mod, isNodeMode, target) => (
  target = mod != null ? __create(__getProtoOf(mod)) : {},
  __copyProps(
    // If the importer is in node compatibility mode or this is not an ESM
    // file that has been converted to a CommonJS file using a Babel-
    // compatible transform (i.e. "__esModule" has not been set), then set
    // "default" to the CommonJS "module.exports" for node compatibility.
    isNodeMode || !mod || !mod.__esModule
      ? __defProp(target, "default", { value: mod, enumerable: true })
      : target,
    mod
  )
);

var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);

// src/index.ts
var src_exports = {};
__export(src_exports, {
  ChunkUtility: () => ChunkUtility,
  EmbeddingUtility: () => EmbeddingUtility,
  Retrival: () => Retrival,
  UnifiedParser: () => UnifiedParser,
  VectorStoreAWS: () => VectorStoreAWS
});
module.exports = __toCommonJS(src_exports);

// src/AWS/VectorStoreAWS.ts
var import_aws_sdk = __toESM(require("aws-sdk"));

// Region used in returned object URLs when the caller does not pass one.
// This is the value that used to be hard-coded in the URL.
var DEFAULT_S3_REGION = "ap-south-1";

/**
 * Builds the S3 object key under which an embedding store is saved.
 * Shared by upload and download so the two can never drift apart.
 *
 * @param {string} fileName - Logical name of the embedding store.
 * @returns {string} The S3 object key.
 */
function embeddingObjectKey(fileName) {
  return `embeddingRAG/embeded-${fileName}.txt`;
}

/**
 * Stores embedding stores in, and loads them from, a single S3 bucket as JSON.
 */
var VectorStoreAWS = class {
  s3Bucket;
  bucketName;
  region;

  /**
   * @param {string} accessKeyId - AWS access key id.
   * @param {string} secretAccessKey - AWS secret access key.
   * @param {string} bucketName - Bucket that holds the embedding files.
   * @param {string} [region] - Optional bucket region. When given it is passed
   *   to the S3 client and used in returned URLs; when omitted the client is
   *   created without an explicit region and URLs use "ap-south-1".
   */
  constructor(accessKeyId, secretAccessKey, bucketName, region) {
    this.bucketName = bucketName;
    this.region = region ?? DEFAULT_S3_REGION;
    // NOTE: this updates the SDK's global configuration object, not just this
    // instance's client.
    import_aws_sdk.default.config.update({ accessKeyId, secretAccessKey });
    this.s3Bucket = region
      ? new import_aws_sdk.default.S3({ region })
      : new import_aws_sdk.default.S3();
  }

  /**
   * Serializes `embeddingStore` to JSON and uploads it to the bucket.
   *
   * @param {unknown} embeddingStore - JSON-serializable embedding store.
   * @param {string} fileName - Logical name used to build the object key.
   * @returns {Promise<{ embededFileLocation: string }>} Resolves with the
   *   URL string built from bucket name, region and object key; rejects with
   *   the error reported by `putObject`.
   */
  async uploadEmbededModeltoAWS(embeddingStore, fileName) {
    const key = embeddingObjectKey(fileName);
    const uploadParams = {
      Bucket: this.bucketName,
      Key: key,
      Body: JSON.stringify(embeddingStore)
    };
    return new Promise((resolve, reject) => {
      this.s3Bucket.putObject(uploadParams, (err) => {
        if (err) {
          reject(err);
          return;
        }
        const embededFileLocation = `https://${this.bucketName}.s3.${this.region}.amazonaws.com/${key}`;
        resolve({ embededFileLocation });
      });
    });
  }

  /**
   * Downloads a previously uploaded embedding store and parses it as JSON.
   *
   * @param {string} fileName - Logical name used to build the object key.
   * @returns {Promise<unknown>} Resolves with the parsed JSON; rejects when
   *   `getObject` fails, when no body is returned, or when the body is not
   *   valid JSON.
   */
  getKnowledgeData(fileName) {
    const params = {
      Bucket: this.bucketName,
      Key: embeddingObjectKey(fileName)
    };
    return new Promise((resolve, reject) => {
      this.s3Bucket.getObject(params, (err, data) => {
        if (err) {
          console.log("Error: ", err);
          reject(err);
          return;
        }
        if (!data || !data.Body) {
          reject(new Error("No data returned from S3 or data.Body is undefined."));
          return;
        }
        // A throw inside this callback would escape the promise and leave it
        // pending forever, so parse failures are converted to rejections.
        try {
          resolve(JSON.parse(data.Body.toString("utf-8")));
        } catch (parseError) {
          reject(parseError);
        }
      });
    });
  }
};

// src/QnA/Retrival.ts
var import_openai = require("openai");

/**
 * Answers questions by finding the stored chunks most similar to the question
 * and passing them as context to a chat completion.
 */
var Retrival = class {
  openai;

  /**
   * @param {string} apiKey - API key handed to the OpenAI client.
   */
  constructor(apiKey) {
    this.openai = new import_openai.OpenAI({ apiKey });
  }

  /**
   * Runs retrieval + completion for one question.
   *
   * @param {Array<{ content: string, embedding: number[] }>} embeddingStore
   *   Entries to search; each needs `content` and `embedding`.
   * @param {string} question - The user's question.
   * @returns {Promise<object>} The raw chat completion response.
   */
  async QnARetrival(embeddingStore, question) {
    // The three best-matching entries are used as context.
    const closestEntries = await this.semanticSearch(question, embeddingStore, 3);
    const closestParagraphs = closestEntries.map((entry) => entry.content);
    const completionData = await this.openai.chat.completions.create({
      model: "gpt-3.5-turbo-16k",
      messages: [
        {
          role: "user",
          content: this.Prompt(question, closestParagraphs)
        }
      ],
      temperature: 0
    });
    return completionData;
  }

  /**
   * Embeds `query` and returns the `topN` most similar store entries.
   *
   * @param {string} query - Text to embed and search for.
   * @param {Array<{ content: string, embedding: number[] }>} embeddingStore
   * @param {number} [topN=5] - Maximum number of entries to return.
   * @returns {Promise<Array<{ content: string, embedding: number[] }>>}
   * @throws {Error} When the embeddings response has no embedding.
   */
  async semanticSearch(query, embeddingStore, topN = 5) {
    const response = await this.openai.embeddings.create({
      input: query,
      model: "text-embedding-ada-002"
    });
    const queryEmbedding = response.data?.[0]?.embedding;
    if (!queryEmbedding) {
      throw new Error("No embedding found in response data");
    }
    return this.findNearestParagraph(embeddingStore, queryEmbedding, topN);
  }

  /**
   * Ranks entries by cosine similarity to `targetEmbedding`, best first.
   * The input array is not reordered (scoring works on a mapped copy).
   *
   * @param {Array<{ embedding: number[] }>} embeddingStore
   * @param {number[]} targetEmbedding
   * @param {number} count - Maximum number of entries to return.
   * @returns {Array<object>} Up to `count` entries from `embeddingStore`.
   */
  findNearestParagraph(embeddingStore, targetEmbedding, count) {
    const scoredEntries = embeddingStore.map((entry) => ({
      entry,
      score: this.cosineSimilarity(targetEmbedding, entry.embedding)
    }));
    return scoredEntries
      .sort((a, b) => b.score - a.score)
      .slice(0, count)
      .map((item) => item.entry);
  }

  /**
   * Builds the prompt text sent to the chat model. A "?" is always appended
   * after the question.
   *
   * @param {string} question
   * @param {string[]} paragraphs - Context paragraphs, joined by blank lines.
   * @returns {string}
   */
  Prompt(question, paragraphs) {
    return (
      "You are AI Assistant, your are RAG ChatBot . Developed by Apurv Krishn Jha. " +
      "Answer the following question from the context, if the answer cannot be deduced from the context, " +
      "say 'Sorry! I didn't Understand the Question, Please explain it in detail':\n\nContext :\n" +
      paragraphs.join("\n\n") +
      "\n\nQuestion :\n" +
      question +
      "?\n\nAnswer :"
    );
  }

  /**
   * Dot product of two vectors over their common prefix; missing or falsy
   * components count as 0.
   *
   * @param {number[]} embedding1
   * @param {number[]} embedding2
   * @returns {number}
   */
  compareEmbeddings(embedding1, embedding2) {
    const length = Math.min(embedding1.length, embedding2.length);
    let dotprod = 0;
    for (let i = 0; i < length; i++) {
      dotprod += (embedding1[i] || 0) * (embedding2[i] || 0);
    }
    return dotprod;
  }

  /**
   * Cosine similarity of two vectors.
   *
   * @param {number[]} embedding1
   * @param {number[]} embedding2
   * @returns {number} The similarity, or 0 when it is undefined (a vector has
   *   zero magnitude or the magnitudes are not finite).
   */
  cosineSimilarity(embedding1, embedding2) {
    const dotProduct = this.compareEmbeddings(embedding1, embedding2);
    const magnitudeA = Math.sqrt(embedding1.reduce((sum, value) => sum + value * value, 0));
    const magnitudeB = Math.sqrt(embedding2.reduce((sum, value) => sum + value * value, 0));
    const denominator = magnitudeA * magnitudeB;
    // 0/0 would yield NaN, and a NaN score makes the ranking comparator
    // inconsistent; treat an undefined similarity as "no similarity".
    if (!Number.isFinite(denominator) || denominator === 0) {
      return 0;
    }
    return dotProduct / denominator;
  }
};

// src/parsers/UnifiedParser.ts
var import_fs = __toESM(require("fs"));
var import_path = __toESM(require("path"));
var import_axios = __toESM(require("axios"));
var import_cheerio = __toESM(require("cheerio"));
var import_pdf_parse = __toESM(require("pdf-parse"));

/**
 * Extracts plain text from a Buffer, an http(s) URL, or a local .txt/.pdf path.
 */
var UnifiedParser = class {
  /**
   * @param {Buffer|string} input - A Buffer (PDF if it starts with "%PDF",
   *   otherwise decoded as UTF-8), an "http://"/"https://" URL, or a file
   *   path ending in ".txt" or ".pdf" (extension compared case-insensitively).
   * @returns {Promise<string>} The extracted text.
   * @throws {Error} "Unsupported file type" for other file extensions and
   *   "Unsupported input type for UnifiedParser" for other input types.
   */
  async parse(input) {
    if (Buffer.isBuffer(input)) {
      // PDF files begin with the magic bytes "%PDF".
      if (input.subarray(0, 4).toString() === "%PDF") {
        const pdfContent = await (0, import_pdf_parse.default)(input);
        return pdfContent.text;
      }
      return input.toString("utf-8");
    }

    if (typeof input !== "string") {
      throw new Error("Unsupported input type for UnifiedParser");
    }

    if (input.startsWith("http://") || input.startsWith("https://")) {
      const response = await import_axios.default.get(input);
      const $ = import_cheerio.default.load(response.data);
      return $.text();
    }

    const extension = import_path.default.extname(input).toLowerCase();
    switch (extension) {
      case ".txt":
        // Async read so a large file does not block the event loop.
        return import_fs.default.promises.readFile(input, "utf-8");
      case ".pdf": {
        const absolutePath = import_path.default.resolve(input);
        const pdfData = await import_fs.default.promises.readFile(absolutePath);
        const pdfContent = await (0, import_pdf_parse.default)(pdfData);
        return pdfContent.text;
      }
      default:
        throw new Error("Unsupported file type");
    }
  }
};

// src/utilities/ChunkUtility.ts
var ChunkUtility = class {
  /**
   * Splits text into word-based chunks of equal target size, where
   * consecutive chunks share `overlapSplitChunks` words.
   *
   * The chunk size is ceil(wordCount / numOfChunks); each chunk starts
   * (chunkSize - overlapSplitChunks) words after the previous one, so with a
   * non-zero overlap more than `numOfChunks` chunks can be returned.
   *
   * @param {string} text - Text to split on whitespace.
   * @param {number} numOfChunks - Target number of chunks; must be > 0.
   * @param {number} [overlapSplitChunks=0] - Words shared between neighbours;
   *   must be >= 0 and smaller than the computed chunk size.
   * @returns {string[]} Chunks with words joined by single spaces; an empty
   *   array when `text` contains no words.
   * @throws {RangeError} When the arguments would produce a non-advancing
   *   (endless) split.
   */
  static splitIntoChunks(text, numOfChunks, overlapSplitChunks = 0) {
    if (!(numOfChunks > 0)) {
      throw new RangeError("numOfChunks must be a positive number");
    }
    if (!(overlapSplitChunks >= 0)) {
      throw new RangeError("overlapSplitChunks must be a non-negative number");
    }

    // Leading/trailing whitespace makes split() emit empty strings; drop them
    // so they are not counted as words.
    const words = text.split(/\s+/).filter((word) => word !== "");
    if (words.length === 0) {
      return [];
    }

    const wordsPerChunk = Math.ceil(words.length / numOfChunks);
    const step = wordsPerChunk - overlapSplitChunks;
    // A step of zero or less would never advance through the words.
    if (!(step > 0)) {
      throw new RangeError(
        `overlapSplitChunks (${overlapSplitChunks}) must be smaller than the chunk size (${wordsPerChunk} words)`
      );
    }

    const chunks = [];
    for (let start = 0; start < words.length; start += step) {
      chunks.push(words.slice(start, start + wordsPerChunk).join(" "));
    }
    return chunks;
  }
};

// src/utilities/EmbeddingUtility.ts
var import_openai2 = require("openai");

/**
 * Turns text chunks into `{ content, embedding }` records via the OpenAI
 * embeddings endpoint.
 */
var EmbeddingUtility = class {
  openai;

  /**
   * @param {string} apiKey - API key handed to the OpenAI client.
   */
  constructor(apiKey) {
    this.openai = new import_openai2.OpenAI({ apiKey });
  }

  /**
   * Embeds all chunks in a single request and pairs each chunk with the
   * embedding at the same position in the response.
   *
   * @param {string[]} chunks - Texts to embed.
   * @returns {Promise<Array<{ content: string, embedding: number[] }>>}
   * @throws {Error} When the response length differs from `chunks.length` or
   *   an entry has no embedding.
   */
  async createEmbedding(chunks) {
    // Nothing to embed: skip the API request entirely.
    if (chunks.length === 0) {
      return [];
    }
    const response = await this.openai.embeddings.create({
      input: chunks,
      model: "text-embedding-ada-002"
    });
    const embeddings = response.data;
    if (embeddings.length !== chunks.length) {
      throw new Error("Embedding response length mismatch.");
    }
    return chunks.map((chunk, index) => {
      const embedding = embeddings[index]?.embedding;
      if (!embedding) {
        throw new Error(`Embedding missing for chunk at index ${index}`);
      }
      return { content: chunk, embedding };
    });
  }
};

// Annotate the CommonJS export names for ESM import in node:
0 && (module.exports = {
  ChunkUtility,
  EmbeddingUtility,
  Retrival,
  UnifiedParser,
  VectorStoreAWS
});