rag-aiquest
Version:
### Aiquest is an npm package that streamlines the process of parsing websites, splitting content into manageable chunks, embedding these chunks into machine-friendly vectors, and subsequently storing and retrieving these embeddings from AWS.
239 lines (232 loc) • 8.66 kB
JavaScript
;
// Short aliases for the reflection primitives used by the interop helpers below.
var __create = Object.create;
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __getProtoOf = Object.getPrototypeOf;
var __hasOwnProp = Object.prototype.hasOwnProperty;

// Installs one enumerable getter on `target` for every key of `all`; the
// getter functions are supplied by the caller and evaluated on each access.
var __export = (target, all) => {
  for (var exportName in all) {
    __defProp(target, exportName, { get: all[exportName], enumerable: true });
  }
};

// Mirrors the own properties of `from` onto `to` as getters that read through
// to `from`. Keys already present on `to`, and the key equal to `except`, are
// left alone. Enumerability follows the source property descriptor.
var __copyProps = (to, from, except, desc) => {
  const isCopyable = from && typeof from === "object" || typeof from === "function";
  if (!isCopyable) {
    return to;
  }
  for (const key of __getOwnPropNames(from)) {
    if (__hasOwnProp.call(to, key) || key === except) {
      continue;
    }
    desc = __getOwnPropDesc(from, key);
    __defProp(to, key, {
      get: () => from[key],
      enumerable: !desc || desc.enumerable
    });
  }
  return to;
};

// Wraps a required module in an ESM-shaped namespace object.
var __toESM = (mod, isNodeMode, target) => {
  target = mod != null ? __create(__getProtoOf(mod)) : {};
  // When the importer runs in node compatibility mode, or the module was not
  // produced by a Babel-compatible ESM-to-CommonJS transform ("__esModule" is
  // not set), "default" must be the CommonJS "module.exports" value itself.
  const needsDefault = isNodeMode || !mod || !mod.__esModule;
  if (needsDefault) {
    __defProp(target, "default", { value: mod, enumerable: true });
  }
  return __copyProps(target, mod);
};

// Produces a CommonJS export object flagged with a (non-enumerable) "__esModule".
var __toCommonJS = (mod) => {
  const flagged = __defProp({}, "__esModule", { value: true });
  return __copyProps(flagged, mod);
};
// src/index.ts
// Export map for the package entry point. It starts empty and is filled with
// getters by __export below.
var src_exports = {};
// Each entry is a getter function, so the class it names is only read when the
// property is accessed. This lets the map be registered before the `var`
// assignments of the classes further down in this file have run.
__export(src_exports, {
ChunkUtility: () => ChunkUtility,
EmbeddingUtility: () => EmbeddingUtility,
Retrival: () => Retrival,
UnifiedParser: () => UnifiedParser,
VectorStoreAWS: () => VectorStoreAWS
});
// Publish a CommonJS export object (flagged with "__esModule") whose
// properties read through to the export map above.
module.exports = __toCommonJS(src_exports);
// src/AWS/VectorStoreAWS.ts
var import_aws_sdk = __toESM(require("aws-sdk"));
/**
 * Stores and retrieves embedding stores as JSON text objects in an S3 bucket.
 *
 * Every object lives under the key `embeddingRAG/embeded-<fileName>.txt`.
 */
var VectorStoreAWS = class {
  s3Bucket;
  bucketName;

  /**
   * @param accessKeyId AWS access key id.
   * @param secretAccessKey AWS secret access key.
   * @param bucketName Name of the bucket used for uploads and downloads.
   */
  constructor(accessKeyId, secretAccessKey, bucketName) {
    this.bucketName = bucketName;
    // NOTE: the credentials are written to the SDK's shared `config` object,
    // not to a configuration private to this instance.
    import_aws_sdk.default.config.update({
      accessKeyId,
      secretAccessKey
    });
    this.s3Bucket = new import_aws_sdk.default.S3();
  }

  /**
   * Serialises `embeddingStore` with JSON.stringify and uploads it.
   *
   * @param embeddingStore Value to serialise and store.
   * @param fileName Name fragment used to build the object key.
   * @returns Promise resolving to `{ embededFileLocation }`, a URL string for
   *   the uploaded object. The URL always uses the `ap-south-1` host name,
   *   whatever region the client is actually configured for.
   *   Rejects with the error reported by `putObject`.
   */
  async uploadEmbededModeltoAWS(embeddingStore, fileName) {
    const objectKey = `embeddingRAG/embeded-${fileName}.txt`;
    const uploadParams = {
      Bucket: this.bucketName,
      Key: objectKey,
      Body: JSON.stringify(embeddingStore)
    };
    return new Promise((resolve, reject) => {
      this.s3Bucket.putObject(uploadParams, (err) => {
        if (err) {
          reject(err);
          return;
        }
        const embededFileLocation = `https://${this.bucketName}.s3.ap-south-1.amazonaws.com/${objectKey}`;
        resolve({ embededFileLocation });
      });
    });
  }

  /**
   * Downloads a previously uploaded embedding store and parses it as JSON.
   *
   * @param fileName Name fragment used to build the object key.
   * @returns Promise resolving to the parsed JSON value. Rejects with the
   *   `getObject` error, with an Error when no body is returned, or with the
   *   parse error when the stored text is not valid JSON.
   */
  getKnowledgeData(fileName) {
    const params = {
      Bucket: this.bucketName,
      Key: `embeddingRAG/embeded-${fileName}.txt`
    };
    return new Promise((resolve, reject) => {
      this.s3Bucket.getObject(params, (err, data) => {
        if (err) {
          console.log("Error: ", err);
          reject(err);
          return;
        }
        if (!data || !data.Body) {
          reject(new Error("No data returned from S3 or data.Body is undefined."));
          return;
        }
        // JSON.parse runs inside the SDK callback, outside the promise
        // executor. A throw here would escape the promise entirely, so it is
        // converted into a rejection instead.
        try {
          resolve(JSON.parse(data.Body.toString("utf-8")));
        } catch (parseError) {
          reject(parseError);
        }
      });
    });
  }
};
// src/QnA/Retrival.ts
var import_openai = require("openai");
/**
 * Answers questions against an embedding store: embeds the question, picks the
 * most similar stored entries and asks a chat model to answer from them.
 */
var Retrival = class {
  openai;

  /**
   * @param apiKey API key handed to the OpenAI client.
   */
  constructor(apiKey) {
    this.openai = new import_openai.OpenAI({ apiKey });
  }

  /**
   * Answers `question` using the three entries of `embeddingStore` that are
   * most similar to it.
   *
   * @param embeddingStore Array of entries exposing `content` and `embedding`.
   * @param question Question text.
   * @returns The chat completion response object returned by the client.
   */
  async QnARetrival(embeddingStore, question) {
    const closestEntries = await this.semanticSearch(question, embeddingStore, 3);
    const closestParagraphs = closestEntries.map((entry) => entry.content);
    return this.openai.chat.completions.create({
      model: "gpt-3.5-turbo-16k",
      messages: [
        {
          role: "user",
          content: this.Prompt(question, closestParagraphs)
        }
      ],
      temperature: 0
    });
  }

  /**
   * Embeds `query` and returns the `topN` most similar entries.
   *
   * @param query Text to embed.
   * @param embeddingStore Array of entries exposing `embedding`.
   * @param topN Number of entries to return (default 5).
   * @throws {Error} When the embeddings response carries no embedding.
   */
  async semanticSearch(query, embeddingStore, topN = 5) {
    const response = await this.openai.embeddings.create({
      input: query,
      model: "text-embedding-ada-002"
    });
    const queryEmbedding = response.data?.[0]?.embedding;
    if (!queryEmbedding) {
      throw new Error("No embedding found in response data");
    }
    return this.findNearestParagraph(embeddingStore, queryEmbedding, topN);
  }

  /**
   * Ranks the entries by cosine similarity to `targetEmbedding`, highest
   * first, and returns the first `count` of them. `embeddingStore` itself is
   * not reordered.
   */
  findNearestParagraph(embeddingStore, targetEmbedding, count) {
    const scoredEntries = embeddingStore.map((entry) => ({
      entry,
      score: this.cosineSimilarity(targetEmbedding, entry.embedding)
    }));
    return scoredEntries
      .sort((a, b) => b.score - a.score)
      .slice(0, count)
      .map((item) => item.entry);
  }

  /**
   * Builds the prompt text sent to the chat model. The paragraphs are joined
   * with blank lines and a "?" is appended after the question.
   */
  Prompt(question, paragraphs) {
    return "You are AI Assistant, your are RAG ChatBot . Developed by Apurv Krishn Jha. Answer the following question from the context, if the answer cannot be deduced from the context, say 'Sorry! I didn't Understand the Question, Please explain it in detail':\n\nContext :\n" + paragraphs.join("\n\n") + "\n\nQuestion :\n" + question + "?\n\nAnswer :";
  }

  /**
   * Dot product over the shared prefix of the two vectors. Missing or falsy
   * components count as 0.
   */
  compareEmbeddings(embedding1, embedding2) {
    const length = Math.min(embedding1.length, embedding2.length);
    let dotprod = 0;
    for (let i = 0; i < length; i++) {
      dotprod += (embedding1[i] || 0) * (embedding2[i] || 0);
    }
    return dotprod;
  }

  /**
   * Cosine similarity of the two vectors.
   *
   * @returns The similarity, or 0 when either vector has zero magnitude.
   */
  cosineSimilarity(embedding1, embedding2) {
    const sumOfSquares = (sum, value) => sum + value * value;
    const magnitudeA = Math.sqrt(embedding1.reduce(sumOfSquares, 0));
    const magnitudeB = Math.sqrt(embedding2.reduce(sumOfSquares, 0));
    const denominator = magnitudeA * magnitudeB;
    // A zero vector has no direction. Dividing would give NaN, and NaN scores
    // make the ranking comparator inconsistent, so score it as "no similarity".
    if (denominator === 0) {
      return 0;
    }
    return this.compareEmbeddings(embedding1, embedding2) / denominator;
  }
};
// src/parsers/UnifiedParser.ts
var import_fs = __toESM(require("fs"));
var import_path = __toESM(require("path"));
var import_axios = __toESM(require("axios"));
var import_cheerio = __toESM(require("cheerio"));
var import_pdf_parse = __toESM(require("pdf-parse"));
/**
 * Extracts plain text from a Buffer, an http(s) URL, or a `.txt` / `.pdf`
 * file path.
 */
var UnifiedParser = class {
  /**
   * @param input One of:
   *   - a Buffer: parsed as a PDF when it starts with "%PDF", otherwise
   *     decoded as UTF-8 text;
   *   - a string starting with "http://" or "https://": fetched, loaded with
   *     cheerio and reduced to its text;
   *   - any other string: treated as a file path and dispatched on its
   *     lower-cased extension (".txt" or ".pdf").
   * @returns Promise resolving to the extracted text.
   * @throws {Error} "Unsupported file type" for other extensions and
   *   "Unsupported input type for UnifiedParser" for other input types.
   */
  async parse(input) {
    if (Buffer.isBuffer(input)) {
      // subarray is the non-deprecated equivalent of Buffer#slice.
      const startsWithPdfMagic = input.subarray(0, 4).toString() === "%PDF";
      if (!startsWithPdfMagic) {
        return input.toString("utf-8");
      }
      const pdfContent = await (0, import_pdf_parse.default)(input);
      return pdfContent.text;
    }

    if (typeof input !== "string") {
      throw new Error("Unsupported input type for UnifiedParser");
    }

    if (input.startsWith("http://") || input.startsWith("https://")) {
      const response = await import_axios.default.get(input);
      const $ = import_cheerio.default.load(response.data);
      return $.text();
    }

    const extension = import_path.default.extname(input).toLowerCase();
    switch (extension) {
      case ".txt":
        // Asynchronous reads keep the event loop free while the file loads.
        return import_fs.default.promises.readFile(input, "utf-8");
      case ".pdf": {
        const absolutePath = import_path.default.resolve(input);
        const pdfData = await import_fs.default.promises.readFile(absolutePath);
        const pdfContent = await (0, import_pdf_parse.default)(pdfData);
        return pdfContent.text;
      }
      default:
        throw new Error("Unsupported file type");
    }
  }
};
// src/utilities/ChunkUtility.ts
/**
 * Helpers for cutting text into word-based chunks.
 */
var ChunkUtility = class {
  /**
   * Splits `text` on whitespace and groups the words into chunks of
   * `ceil(wordCount / numOfChunks)` words. Each chunk starts
   * `chunkSize - overlapSplitChunks` words after the previous one, so
   * consecutive chunks share `overlapSplitChunks` words.
   *
   * @param text Text to split; leading and trailing whitespace is ignored.
   * @param numOfChunks Divisor used to derive the chunk size; must be > 0.
   * @param overlapSplitChunks Words shared by consecutive chunks (default 0);
   *   must be smaller than the derived chunk size.
   * @returns Array of chunk strings with words joined by single spaces; empty
   *   when `text` contains no words.
   * @throws {RangeError} When `numOfChunks` is not positive, or when the
   *   overlap leaves no forward progress between chunks.
   */
  static splitIntoChunks(text, numOfChunks, overlapSplitChunks = 0) {
    if (!(numOfChunks > 0)) {
      throw new RangeError("numOfChunks must be a positive number");
    }

    // Trimming first avoids the empty-string "words" that split() yields for
    // leading or trailing whitespace (and for blank input).
    const trimmed = text.trim();
    if (trimmed === "") {
      return [];
    }

    const words = trimmed.split(/\s+/);
    const wordsPerChunk = Math.ceil(words.length / numOfChunks);
    const step = wordsPerChunk - overlapSplitChunks;
    // A step of zero or less (or NaN) would never reach the end of the words.
    if (!(step > 0)) {
      throw new RangeError("overlapSplitChunks must be smaller than the number of words per chunk");
    }

    const chunks = [];
    for (let start = 0; start < words.length; start += step) {
      chunks.push(words.slice(start, start + wordsPerChunk).join(" "));
    }
    return chunks;
  }
};
// src/utilities/EmbeddingUtility.ts
var import_openai2 = require("openai");
/**
 * Turns text chunks into `{ content, embedding }` records using the OpenAI
 * embeddings endpoint.
 */
var EmbeddingUtility = class {
  openai;

  /**
   * @param apiKey API key handed to the OpenAI client.
   */
  constructor(apiKey) {
    this.openai = new import_openai2.OpenAI({ apiKey });
  }

  /**
   * Requests one embedding per chunk in a single call and pairs each chunk
   * with the embedding found at the same position in the response.
   *
   * @param chunks Array of text chunks.
   * @returns Promise resolving to an array of `{ content, embedding }`
   *   records in the order of `chunks`; an empty array when `chunks` is empty.
   * @throws {Error} When the response length differs from `chunks.length` or
   *   an entry carries no embedding.
   */
  async createEmbedding(chunks) {
    // Nothing to embed: skip the request instead of sending an empty input.
    if (chunks.length === 0) {
      return [];
    }

    const response = await this.openai.embeddings.create({
      input: chunks,
      model: "text-embedding-ada-002"
    });
    const embeddings = response.data;
    if (embeddings.length !== chunks.length) {
      throw new Error("Embedding response length mismatch.");
    }

    return chunks.map((chunk, index) => {
      const embedding = embeddings[index]?.embedding;
      if (!embedding) {
        throw new Error(`Embedding missing for chunk at index ${index}`);
      }
      return { content: chunk, embedding };
    });
  }
};
// Annotate the CommonJS export names for ESM import in node:
// The leading `0 &&` short-circuits, so the assignment below never executes
// and the real module.exports set earlier in this file is left untouched.
// The statement is present only so the export names are spelled out
// literally in the source text.
0 && (module.exports = {
ChunkUtility,
EmbeddingUtility,
Retrival,
UnifiedParser,
VectorStoreAWS
});