rag-aiquest
Version:
### Aiquest is an npm package that streamlines the process of parsing websites, splitting content into manageable chunks, embedding these chunks into machine-friendly vectors, and subsequently storing and retrieving these embeddings from AWS.
198 lines (193 loc) • 6.56 kB
JavaScript
// src/AWS/VectorStoreAWS.ts
import aws from "aws-sdk";
/**
 * Persists embedding stores as JSON text objects in an S3 bucket and reads
 * them back.
 *
 * Every object lives under the key `embeddingRAG/embeded-<fileName>.txt`.
 */
var VectorStoreAWS = class {
  s3Bucket;
  bucketName;
  region;

  /**
   * @param accessKeyId AWS access key id.
   * @param secretAccessKey AWS secret access key.
   * @param bucketName Bucket that holds the embedding objects.
   * @param region Region used only to build the URL returned by
   *   `uploadEmbededModeltoAWS`. Defaults to "ap-south-1", the value that
   *   used to be hard-coded.
   */
  constructor(accessKeyId, secretAccessKey, bucketName, region = "ap-south-1") {
    this.bucketName = bucketName;
    this.region = region;
    // This updates the SDK-wide configuration object, not just this instance.
    aws.config.update({
      accessKeyId,
      secretAccessKey
    });
    this.s3Bucket = new aws.S3();
  }

  /**
   * Builds the S3 object key for a given logical file name.
   *
   * @param fileName Logical name of the embedding store.
   * @returns The object key inside the bucket.
   */
  objectKey(fileName) {
    return `embeddingRAG/embeded-${fileName}.txt`;
  }

  /**
   * Serializes `embeddingStore` with `JSON.stringify` and uploads it.
   *
   * @param embeddingStore Value to serialize and store.
   * @param fileName Logical name used to derive the object key.
   * @returns Promise resolving to `{ embededFileLocation }`, the
   *   virtual-hosted-style URL of the uploaded object. Rejects with the
   *   error reported by `putObject`.
   */
  async uploadEmbededModeltoAWS(embeddingStore, fileName) {
    const key = this.objectKey(fileName);
    const uploadParams = {
      Bucket: this.bucketName,
      Key: key,
      Body: JSON.stringify(embeddingStore)
    };
    return new Promise((resolve, reject) => {
      this.s3Bucket.putObject(uploadParams, (err) => {
        if (err) {
          reject(err);
          return;
        }
        const embededFileLocation = `https://${this.bucketName}.s3.${this.region}.amazonaws.com/${key}`;
        resolve({ embededFileLocation });
      });
    });
  }

  /**
   * Downloads a previously uploaded embedding store and parses it as JSON.
   *
   * @param fileName Logical name used to derive the object key.
   * @returns Promise resolving to the parsed JSON value. Rejects with the
   *   `getObject` error, with an `Error` when the response has no body, or
   *   with the `SyntaxError` raised when the body is not valid JSON.
   */
  getKnowledgeData(fileName) {
    const params = {
      Bucket: this.bucketName,
      Key: this.objectKey(fileName)
    };
    return new Promise((resolve, reject) => {
      this.s3Bucket.getObject(params, (err, data) => {
        if (err) {
          reject(err);
          return;
        }
        if (!data || !data.Body) {
          reject(new Error("No data returned from S3 or data.Body is undefined."));
          return;
        }
        // JSON.parse runs inside the SDK callback, outside the promise
        // executor; without this try/catch a malformed object would throw
        // out of the callback and leave the promise pending forever.
        try {
          resolve(JSON.parse(data.Body.toString("utf-8")));
        } catch (parseError) {
          reject(parseError);
        }
      });
    });
  }
};
// src/QnA/Retrival.ts
import { OpenAI } from "openai";
/**
 * Answers questions against an in-memory embedding store: embeds the
 * question, ranks stored entries by cosine similarity, and asks a chat model
 * to answer from the best-matching entries.
 */
var Retrival = class {
  openai;

  /**
   * @param apiKey API key passed to the OpenAI client.
   */
  constructor(apiKey) {
    this.openai = new OpenAI({ apiKey });
  }

  /**
   * Answers `question` using the three entries of `embeddingStore` closest to it.
   *
   * @param embeddingStore Array of `{ content, embedding }` entries.
   * @param question The user's question.
   * @returns The chat completion response as returned by the OpenAI client.
   */
  async QnARetrival(embeddingStore, question) {
    const closestEntries = await this.semanticSearch(question, embeddingStore, 3);
    const closestParagraphs = closestEntries.map((entry) => entry.content);
    const completionData = await this.openai.chat.completions.create({
      model: "gpt-3.5-turbo-16k",
      messages: [
        {
          role: "user",
          content: this.Prompt(question, closestParagraphs)
        }
      ],
      temperature: 0
    });
    return completionData;
  }

  /**
   * Embeds `query` and returns the `topN` most similar store entries.
   *
   * @param query Text to embed.
   * @param embeddingStore Array of `{ content, embedding }` entries.
   * @param topN Maximum number of entries to return (default 5).
   * @returns The closest entries, most similar first.
   * @throws Error when the embeddings response carries no embedding.
   */
  async semanticSearch(query, embeddingStore, topN = 5) {
    const response = await this.openai.embeddings.create({
      input: query,
      model: "text-embedding-ada-002"
    });
    const queryEmbedding = response.data && response.data[0] && response.data[0].embedding;
    if (!queryEmbedding) {
      throw new Error("No embedding found in response data");
    }
    return this.findNearestParagraph(embeddingStore, queryEmbedding, topN);
  }

  /**
   * Ranks entries by cosine similarity to `targetEmbedding`.
   *
   * @param embeddingStore Array of entries with an `embedding` property.
   * @param targetEmbedding Vector to compare against.
   * @param count Maximum number of entries to return.
   * @returns Up to `count` entries, highest score first. The input array is
   *   not reordered because sorting happens on the mapped copy.
   */
  findNearestParagraph(embeddingStore, targetEmbedding, count) {
    const scoredEntries = embeddingStore.map((entry) => ({
      entry,
      score: this.cosineSimilarity(targetEmbedding, entry.embedding)
    }));
    return scoredEntries
      .sort((a, b) => b.score - a.score)
      .slice(0, count)
      .map((item) => item.entry);
  }

  /**
   * Builds the prompt sent to the chat model.
   *
   * @param question The user's question; a "?" is always appended to it.
   * @param paragraphs Context paragraphs, joined with blank lines.
   * @returns The full prompt text.
   */
  Prompt(question, paragraphs) {
    return "You are AI Assistant, your are RAG ChatBot . Developed by Apurv Krishn Jha. Answer the following question from the context, if the answer cannot be deduced from the context, say 'Sorry! I didn't Understand the Question, Please explain it in detail':\n\nContext :\n" + paragraphs.join("\n\n") + "\n\nQuestion :\n" + question + "?\n\nAnswer :";
  }

  /**
   * Dot product over the common prefix of two vectors.
   *
   * @param embedding1 First vector.
   * @param embedding2 Second vector.
   * @returns The sum of element-wise products; falsy elements count as 0.
   */
  compareEmbeddings(embedding1, embedding2) {
    const length = Math.min(embedding1.length, embedding2.length);
    let dotprod = 0;
    for (let i = 0; i < length; i++) {
      dotprod += (embedding1[i] || 0) * (embedding2[i] || 0);
    }
    return dotprod;
  }

  /**
   * Cosine similarity of two vectors.
   *
   * @param embedding1 First vector.
   * @param embedding2 Second vector.
   * @returns The dot product divided by the product of both magnitudes, or 0
   *   when that product is not a positive number.
   */
  cosineSimilarity(embedding1, embedding2) {
    const dotProduct = this.compareEmbeddings(embedding1, embedding2);
    const magnitudeA = Math.sqrt(embedding1.reduce((sum, value) => sum + value * value, 0));
    const magnitudeB = Math.sqrt(embedding2.reduce((sum, value) => sum + value * value, 0));
    const denominator = magnitudeA * magnitudeB;
    // A zero (or NaN) denominator would yield NaN, which makes the sort
    // comparator in findNearestParagraph ill-defined. Score such pairs as 0.
    if (!(denominator > 0)) {
      return 0;
    }
    return dotProduct / denominator;
  }
};
// src/parsers/UnifiedParser.ts
import fs from "fs";
import path from "path";
import axios from "axios";
import cheerio from "cheerio";
import pdf from "pdf-parse";
/**
 * Extracts plain text from in-memory buffers, http(s) URLs, and local
 * `.txt` / `.pdf` file paths.
 */
var UnifiedParser = class {
  /**
   * Converts `input` to text.
   *
   * - Buffer: handed to the PDF parser when its first four bytes decode to
   *   "%PDF", otherwise decoded as UTF-8.
   * - String starting with "http://" or "https://": fetched with axios and
   *   reduced to text with cheerio.
   * - Any other string: treated as a file path and dispatched on its
   *   lower-cased extension (`.txt` or `.pdf`).
   *
   * @param input Buffer, URL string, or file path string.
   * @returns Promise resolving to the extracted text.
   * @throws Error "Unsupported file type" for other file extensions.
   * @throws Error "Unsupported input type for UnifiedParser" when `input` is
   *   neither a Buffer nor a string.
   */
  async parse(input) {
    if (Buffer.isBuffer(input)) {
      if (input.subarray(0, 4).toString() === "%PDF") {
        const pdfContent = await pdf(input);
        return pdfContent.text;
      }
      return input.toString("utf-8");
    }

    if (typeof input !== "string") {
      throw new Error("Unsupported input type for UnifiedParser");
    }

    if (input.startsWith("http://") || input.startsWith("https://")) {
      const response = await axios.get(input);
      const $ = cheerio.load(response.data);
      return $.text();
    }

    const extension = path.extname(input).toLowerCase();
    switch (extension) {
      case ".txt":
        // Read asynchronously so this async method does not block the event loop.
        return await fs.promises.readFile(input, "utf-8");
      case ".pdf": {
        const pdfData = await fs.promises.readFile(path.resolve(input));
        const pdfContent = await pdf(pdfData);
        return pdfContent.text;
      }
      default:
        throw new Error("Unsupported file type");
    }
  }
};
// src/utilities/ChunkUtility.ts
/**
 * Helpers for cutting text into word-based chunks.
 */
var ChunkUtility = class {
  /**
   * Splits `text` on whitespace and groups the words into chunks of
   * `ceil(wordCount / numOfChunks)` words, each chunk starting
   * `overlapSplitChunks` words before the previous one ended.
   *
   * @param text Text to split. Leading and trailing whitespace is ignored.
   * @param numOfChunks Target number of chunks used to derive the chunk size.
   * @param overlapSplitChunks Number of words shared between consecutive
   *   chunks (default 0).
   * @returns Chunks with words re-joined by single spaces; an empty array
   *   when `text` contains no words.
   * @throws RangeError when the chunk size minus the overlap is not a
   *   positive number, which would otherwise never advance the loop.
   */
  static splitIntoChunks(text, numOfChunks, overlapSplitChunks = 0) {
    // Trim first: splitting " a b " on /\s+/ would yield empty-string "words".
    const trimmed = text.trim();
    if (trimmed === "") {
      return [];
    }
    const words = trimmed.split(/\s+/);
    const wordsPerChunk = Math.ceil(words.length / numOfChunks);
    const step = wordsPerChunk - overlapSplitChunks;
    // Also catches NaN (non-numeric arguments), which fails every comparison.
    if (!(step > 0)) {
      throw new RangeError(
        `Invalid chunking parameters: chunk size (${wordsPerChunk} words) minus overlap (${overlapSplitChunks}) must be positive.`
      );
    }
    const chunks = [];
    for (let start = 0; start < words.length; start += step) {
      chunks.push(words.slice(start, start + wordsPerChunk).join(" "));
    }
    return chunks;
  }
};
// src/utilities/EmbeddingUtility.ts
import { OpenAI as OpenAI2 } from "openai";
/**
 * Turns text chunks into `{ content, embedding }` entries using the OpenAI
 * embeddings endpoint.
 */
var EmbeddingUtility = class {
  openai;

  /**
   * @param apiKey API key passed to the OpenAI client.
   */
  constructor(apiKey) {
    this.openai = new OpenAI2({ apiKey });
  }

  /**
   * Embeds every chunk in a single request with "text-embedding-ada-002".
   *
   * @param chunks Array of text chunks.
   * @returns Promise resolving to one `{ content, embedding }` entry per
   *   chunk, in the same order as `chunks`. Resolves to an empty array,
   *   without calling the API, when `chunks` is empty.
   * @throws Error when the response length differs from `chunks.length` or
   *   an item has no embedding.
   */
  async createEmbedding(chunks) {
    // Nothing to embed: skip the request instead of sending an empty input.
    if (chunks.length === 0) {
      return [];
    }
    const response = await this.openai.embeddings.create({
      input: chunks,
      model: "text-embedding-ada-002"
    });
    const embeddings = response.data;
    if (embeddings.length !== chunks.length) {
      throw new Error("Embedding response length mismatch.");
    }
    return chunks.map((chunk, index) => {
      const item = embeddings[index];
      if (!item || !item.embedding) {
        throw new Error(`Embedding missing for chunk at index ${index}`);
      }
      return {
        content: chunk,
        embedding: item.embedding
      };
    });
  }
};
export {
ChunkUtility,
EmbeddingUtility,
Retrival,
UnifiedParser,
VectorStoreAWS
};