/*
 * Package: @langchain/community
 * Version: (not given)
 * Description: Third-party integrations for LangChain.js
 * File: 520 lines (519 loc), 23.3 kB, JavaScript
 */
;
Object.defineProperty(exports, "__esModule", { value: true });
exports.HanaDB = void 0;
const vectorstores_1 = require("@langchain/core/vectorstores");
const documents_1 = require("@langchain/core/documents");
const math_1 = require("@langchain/core/utils/math");
/**
 * Supported distance strategies. Each entry is a pair:
 * [name of the HANA SQL function, sort direction used in ORDER BY].
 */
const HANA_DISTANCE_FUNCTION = {
    cosine: [
        "COSINE_SIMILARITY",
        "DESC",
    ],
    euclidean: [
        "L2DISTANCE",
        "ASC",
    ],
};

// Fallback values the constructor applies when an option is not supplied.
const defaultDistanceStrategy = "cosine";
const defaultTableName = "EMBEDDINGS";
const defaultContentColumn = "VEC_TEXT";
const defaultMetadataColumn = "VEC_META";
const defaultVectorColumn = "VEC_VECTOR";
// -1 means dynamic length
const defaultVectorColumnLength = -1;
class HanaDB extends vectorstores_1.VectorStore {
_vectorstoreType() {
return "hanadb";
}
constructor(embeddings, args) {
super(embeddings, args);
// eslint-disable-next-line @typescript-eslint/no-explicit-any
Object.defineProperty(this, "connection", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "distanceStrategy", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "tableName", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "contentColumn", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "metadataColumn", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "vectorColumn", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "vectorColumnLength", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
this.distanceStrategy = args.distanceStrategy || defaultDistanceStrategy;
this.tableName = HanaDB.sanitizeName(args.tableName || defaultTableName);
this.contentColumn = HanaDB.sanitizeName(args.contentColumn || defaultContentColumn);
this.metadataColumn = HanaDB.sanitizeName(args.metadataColumn || defaultMetadataColumn);
this.vectorColumn = HanaDB.sanitizeName(args.vectorColumn || defaultVectorColumn);
this.vectorColumnLength = HanaDB.sanitizeInt(args.vectorColumnLength || defaultVectorColumnLength); // Using '??' to allow 0 as a valid value
this.connection = args.connection;
}
// eslint-disable-next-line @typescript-eslint/no-explicit-any
executeQuery(client, query) {
return new Promise((resolve, reject) => {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
client.exec(query, (err, result) => {
if (err) {
reject(err);
}
else {
resolve(result);
}
});
});
}
// eslint-disable-next-line @typescript-eslint/no-explicit-any
prepareQuery(client, query) {
return new Promise((resolve, reject) => {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
client.prepare(query, (err, statement) => {
if (err) {
reject(err);
}
else {
resolve(statement);
}
});
});
}
// eslint-disable-next-line @typescript-eslint/no-explicit-any
executeStatement(statement, params) {
return new Promise((resolve, reject) => {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
statement.exec(params, (err, res) => {
if (err) {
reject(err);
}
else {
resolve(res);
}
});
});
}
async initialize() {
let valid_distance = false;
for (const key in HANA_DISTANCE_FUNCTION) {
if (key === this.distanceStrategy) {
valid_distance = true;
break; // Added to exit loop once a match is found
}
}
if (!valid_distance) {
throw new Error(`Unsupported distance_strategy: ${this.distanceStrategy}`);
}
await this.createTableIfNotExists();
await this.checkColumn(this.tableName, this.contentColumn, [
"NCLOB",
"NVARCHAR",
]);
await this.checkColumn(this.tableName, this.metadataColumn, [
"NCLOB",
"NVARCHAR",
]);
await this.checkColumn(this.tableName, this.vectorColumn, ["REAL_VECTOR"], this.vectorColumnLength);
}
/**
* Sanitizes the input string by removing characters that are not alphanumeric or underscores.
* @param inputStr The string to be sanitized.
* @returns The sanitized string.
*/
static sanitizeName(inputStr) {
return inputStr.replace(/[^a-zA-Z0-9_]/g, "");
}
/**
* Sanitizes the input to integer. Throws an error if the value is less than -1.
* @param inputInt The input to be sanitized.
* @returns The sanitized integer.
*/
// eslint-disable-next-line @typescript-eslint/no-explicit-any
static sanitizeInt(inputInt) {
const value = parseInt(inputInt.toString(), 10);
if (Number.isNaN(value) || value < -1) {
throw new Error(`Value (${value}) must not be smaller than -1`);
}
return value;
}
/**
* Sanitizes a list to ensure all elements are floats (numbers in TypeScript).
* Throws an error if any element is not a number.
*
* @param {number[]} embedding - The array of numbers (floats) to be sanitized.
* @returns {number[]} The sanitized array of numbers (floats).
* @throws {Error} Throws an error if any element is not a number.
*/
static sanitizeListFloat(embedding) {
if (!Array.isArray(embedding)) {
throw new Error(`Expected 'embedding' to be an array, but received ${typeof embedding}`);
}
embedding.forEach((value) => {
if (typeof value !== "number") {
throw new Error(`Value (${value}) does not have type number`);
}
});
return embedding;
}
/**
* Sanitizes the keys of the metadata object to ensure they match the required pattern.
* Throws an error if any key does not match the pattern.
*
* @param {Record<string, any>} metadata - The metadata object with keys to be validated.
* @returns {object[] | object} The original metadata object if all keys are valid.
* @throws {Error} Throws an error if any metadata key is invalid.
*/
sanitizeMetadataKeys(metadata) {
if (!metadata) {
return {};
}
Object.keys(metadata).forEach((key) => {
if (!HanaDB.compiledPattern.test(key)) {
throw new Error(`Invalid metadata key ${key}`);
}
});
return metadata;
}
/**
* Parses a string representation of a float array and returns an array of numbers.
* @param {string} arrayAsString - The string representation of the array.
* @returns {number[]} An array of floats parsed from the string.
*/
static parseFloatArrayFromString(arrayAsString) {
const arrayWithoutBrackets = arrayAsString.slice(1, -1);
return arrayWithoutBrackets.split(",").map((x) => parseFloat(x));
}
/**
* Checks if the specified column exists in the table and validates its data type and length.
* @param tableName The name of the table.
* @param columnName The name of the column to check.
* @param columnType The expected data type(s) of the column.
* @param columnLength The expected length of the column. Optional.
*/
async checkColumn(tableName, columnName, columnType, columnLength) {
const sqlStr = `
SELECT DATA_TYPE_NAME, LENGTH
FROM SYS.TABLE_COLUMNS
WHERE SCHEMA_NAME = CURRENT_SCHEMA
AND TABLE_NAME = ?
AND COLUMN_NAME = ?`;
const client = this.connection; // Get the connection object
// Prepare the statement with parameter placeholders
const stm = await this.prepareQuery(client, sqlStr);
// Execute the query with actual parameters to avoid SQL injection
const resultSet = await this.executeStatement(stm, [tableName, columnName]);
if (resultSet.length === 0) {
throw new Error(`Column ${columnName} does not exist`);
}
else {
const dataType = resultSet[0].DATA_TYPE_NAME;
const length = resultSet[0].LENGTH;
// Check if dataType is within columnType
const isValidType = Array.isArray(columnType)
? columnType.includes(dataType)
: columnType === dataType;
if (!isValidType) {
throw new Error(`Column ${columnName} has the wrong type: ${dataType}`);
}
// Check length, if parameter was provided
if (columnLength !== undefined && length !== columnLength) {
throw new Error(`Column ${columnName} has the wrong length: ${length}`);
}
}
}
async createTableIfNotExists() {
const tableExists = await this.tableExists(this.tableName);
if (!tableExists) {
let sqlStr = `CREATE TABLE "${this.tableName}" (` +
`"${this.contentColumn}" NCLOB, ` +
`"${this.metadataColumn}" NCLOB, ` +
`"${this.vectorColumn}" REAL_VECTOR`;
sqlStr +=
this.vectorColumnLength === -1
? ");"
: `(${this.vectorColumnLength}));`;
const client = this.connection;
await this.executeQuery(client, sqlStr);
}
}
async tableExists(tableName) {
const tableExistsSQL = `SELECT COUNT(*) AS COUNT FROM SYS.TABLES WHERE SCHEMA_NAME = CURRENT_SCHEMA AND TABLE_NAME = ?`;
const client = this.connection; // Get the connection object
const stm = await this.prepareQuery(client, tableExistsSQL);
const resultSet = await this.executeStatement(stm, [tableName]);
if (resultSet[0].COUNT === 1) {
// Table does exist
return true;
}
return false;
}
/**
* Creates a WHERE clause based on the provided filter object.
* @param filter - A filter object with keys as metadata fields and values as filter values.
* @returns A tuple containing the WHERE clause string and an array of query parameters.
*/
createWhereByFilter(filter) {
const queryTuple = [];
let whereStr = "";
if (filter) {
Object.keys(filter).forEach((key, i) => {
whereStr += i === 0 ? " WHERE " : " AND ";
whereStr += ` JSON_VALUE(${this.metadataColumn}, '$.${key}') = ?`;
const value = filter[key];
if (typeof value === "number") {
if (Number.isInteger(value)) {
// hdb requires string while sap/hana-client doesn't
queryTuple.push(value.toString());
}
else {
throw new Error(`Unsupported filter data-type: wrong number type for key ${key}`);
}
}
else if (typeof value === "string") {
queryTuple.push(value);
}
else if (typeof value === "boolean") {
queryTuple.push(value.toString());
}
else {
throw new Error(`Unsupported filter data-type: ${typeof value} for key ${key}`);
}
});
}
return [whereStr, queryTuple];
}
/**
* Deletes entries from the table based on the provided filter.
* @param ids - Optional. Deletion by ids is not supported and will throw an error.
* @param filter - Optional. A filter object to specify which entries to delete.
* @throws Error if 'ids' parameter is provided, as deletion by ids is not supported.
* @throws Error if 'filter' parameter is not provided, as it is required for deletion.
* to do: adjust the call signature
*/
async delete(options) {
const { ids, filter } = options;
if (ids) {
throw new Error("Deletion via IDs is not supported");
}
if (!filter) {
throw new Error("Parameter 'filter' is required when calling 'delete'");
}
const [whereStr, queryTuple] = this.createWhereByFilter(filter);
const sqlStr = `DELETE FROM "${this.tableName}" ${whereStr}`;
const client = this.connection;
const stm = await this.prepareQuery(client, sqlStr);
await this.executeStatement(stm, queryTuple);
}
/**
* Static method to create a HanaDB instance from raw texts. This method embeds the documents,
* creates a table if it does not exist, and adds the documents to the table.
* @param texts Array of text documents to add.
* @param metadatas metadata for each text document.
* @param embedding EmbeddingsInterface instance for document embedding.
* @param dbConfig Configuration for the HanaDB.
* @returns A Promise that resolves to an instance of HanaDB.
*/
static async fromTexts(texts, metadatas, embeddings, dbConfig) {
const docs = [];
for (let i = 0; i < texts.length; i += 1) {
const metadata = Array.isArray(metadatas) ? metadatas[i] : metadatas;
const newDoc = new documents_1.Document({
pageContent: texts[i],
metadata,
});
docs.push(newDoc);
}
return HanaDB.fromDocuments(docs, embeddings, dbConfig);
}
/**
* Creates an instance of `HanaDB` from an array of
* Document instances. The documents are added to the database.
* @param docs List of documents to be converted to vectors.
* @param embeddings Embeddings instance used to convert the documents to vectors.
* @param dbConfig Configuration for the HanaDB.
* @returns Promise that resolves to an instance of `HanaDB`.
*/
static async fromDocuments(docs, embeddings, dbConfig) {
const instance = new HanaDB(embeddings, dbConfig);
await instance.initialize();
await instance.addDocuments(docs);
return instance;
}
/**
* Adds an array of documents to the table. The documents are first
* converted to vectors using the `embedDocuments` method of the
* `embeddings` instance.
* @param documents Array of Document instances to be added to the table.
* @returns Promise that resolves when the documents are added.
*/
async addDocuments(documents) {
const texts = documents.map(({ pageContent }) => pageContent);
return this.addVectors(await this.embeddings.embedDocuments(texts), documents);
}
/**
* Adds an array of vectors and corresponding documents to the database.
* The vectors and documents are batch inserted into the database.
* @param vectors Array of vectors to be added to the table.
* @param documents Array of Document instances corresponding to the vectors.
* @returns Promise that resolves when the vectors and documents are added.
*/
async addVectors(vectors, documents) {
if (vectors.length !== documents.length) {
throw new Error(`Vectors and metadatas must have the same length`);
}
const texts = documents.map((doc) => doc.pageContent);
const metadatas = documents.map((doc) => doc.metadata);
const client = this.connection;
const sqlParams = texts.map((text, i) => {
const metadata = Array.isArray(metadatas) ? metadatas[i] : metadatas;
// Ensure embedding is generated or provided
const embeddingString = `[${vectors[i].join(", ")}]`;
// Prepare the SQL parameters
return [
text,
JSON.stringify(this.sanitizeMetadataKeys(metadata)),
embeddingString,
];
});
// Insert data into the table, bulk insert.
const sqlStr = `INSERT INTO "${this.tableName}" ("${this.contentColumn}", "${this.metadataColumn}", "${this.vectorColumn}")
VALUES (?, ?, TO_REAL_VECTOR(?));`;
const stm = await this.prepareQuery(client, sqlStr);
await this.executeStatement(stm, sqlParams);
// stm.execBatch(sqlParams);
}
/**
* Return docs most similar to query.
* @param query Query text for the similarity search.
* @param k Number of Documents to return. Defaults to 4.
* @param filter A dictionary of metadata fields and values to filter by.
Defaults to None.
* @returns Promise that resolves to a list of documents and their corresponding similarity scores.
*/
async similaritySearch(query, k, filter) {
const results = await this.similaritySearchWithScore(query, k, filter);
return results.map((result) => result[0]);
}
/**
* Return documents and score values most similar to query.
* @param query Query text for the similarity search.
* @param k Number of Documents to return. Defaults to 4.
* @param filter A dictionary of metadata fields and values to filter by.
Defaults to None.
* @returns Promise that resolves to a list of documents and their corresponding similarity scores.
*/
async similaritySearchWithScore(query, k, filter) {
const queryEmbedding = await this.embeddings.embedQuery(query);
return this.similaritySearchVectorWithScore(queryEmbedding, k, filter);
}
/**
* Return docs most similar to the given embedding.
* @param query Query embedding for the similarity search.
* @param k Number of Documents to return. Defaults to 4.
* @param filter A dictionary of metadata fields and values to filter by.
Defaults to None.
* @returns Promise that resolves to a list of documents and their corresponding similarity scores.
*/
async similaritySearchVectorWithScore(queryEmbedding, k, filter) {
const wholeResult = await this.similaritySearchWithScoreAndVectorByVector(queryEmbedding, k, filter);
// Return documents and scores, discarding the vectors
return wholeResult.map(([doc, score]) => [doc, score]);
}
/**
* Performs a similarity search based on vector comparison and returns documents along with their similarity scores and vectors.
* @param embedding The vector representation of the query for similarity comparison.
* @param k The number of top similar documents to return.
* @param filter Optional filter criteria to apply to the search query.
* @returns A promise that resolves to an array of tuples, each containing a Document, its similarity score, and its vector.
*/
async similaritySearchWithScoreAndVectorByVector(embedding, k, filter) {
// const result: Array<[Document, number, number[]]> = [];
// Sanitize inputs
const sanitizedK = HanaDB.sanitizeInt(k);
const sanitizedEmbedding = HanaDB.sanitizeListFloat(embedding);
// Determine the distance function based on the configured strategy
const distanceFuncName = HANA_DISTANCE_FUNCTION[this.distanceStrategy][0];
// Convert the embedding vector to a string for SQL query
const embeddingAsString = sanitizedEmbedding.join(",");
let sqlStr = `SELECT TOP ${sanitizedK}
"${this.contentColumn}",
"${this.metadataColumn}",
TO_NVARCHAR("${this.vectorColumn}") AS VECTOR,
${distanceFuncName}("${this.vectorColumn}", TO_REAL_VECTOR('[${embeddingAsString}]')) AS CS
FROM "${this.tableName}"`;
// Add order by clause to sort by similarity
const orderStr = ` ORDER BY CS ${HANA_DISTANCE_FUNCTION[this.distanceStrategy][1]}`;
// Prepare and execute the SQL query
const [whereStr, queryTuple] = this.createWhereByFilter(filter);
sqlStr += whereStr + orderStr;
const client = this.connection;
const stm = await this.prepareQuery(client, sqlStr);
const resultSet = await this.executeStatement(stm, queryTuple);
const result = resultSet.map(
// eslint-disable-next-line @typescript-eslint/no-explicit-any
(row) => {
const metadata = JSON.parse(row[this.metadataColumn].toString("utf8"));
const doc = {
pageContent: row[this.contentColumn].toString("utf8"),
metadata,
};
const resultVector = HanaDB.parseFloatArrayFromString(row.VECTOR);
const score = row.CS;
return [doc, score, resultVector];
});
return result;
}
/**
* Return documents selected using the maximal marginal relevance.
* Maximal marginal relevance optimizes for similarity to the query AND
* diversity among selected documents.
* @param query Text to look up documents similar to.
* @param options.k Number of documents to return.
* @param options.fetchK=20 Number of documents to fetch before passing to
* the MMR algorithm.
* @param options.lambda=0.5 Number between 0 and 1 that determines the
* degree of diversity among the results, where 0 corresponds to maximum
* diversity and 1 to minimum diversity.
* @returns List of documents selected by maximal marginal relevance.
*/
async maxMarginalRelevanceSearch(query, options) {
const { k, fetchK = 20, lambda = 0.5 } = options;
// console.log(options)
const queryEmbedding = await this.embeddings.embedQuery(query);
const docs = await this.similaritySearchWithScoreAndVectorByVector(queryEmbedding, fetchK);
// docs is an Array of tuples: [Document, number, number[]]
const embeddingList = docs.map((doc) => doc[2]); // Extracts the embedding from each tuple
// Re-rank the results using MMR
const mmrIndexes = (0, math_1.maximalMarginalRelevance)(queryEmbedding, embeddingList, lambda, k);
const mmrDocs = mmrIndexes.map((index) => docs[index][0]);
return mmrDocs;
}
}
exports.HanaDB = HanaDB;
// Static pattern used to validate metadata keys: a letter or underscore
// followed by letters, digits or underscores. Defined once on the class so
// the regular expression is not rebuilt for every validation.
Object.defineProperty(HanaDB, "compiledPattern", {
    value: /^[a-zA-Z_][a-zA-Z0-9_]*$/,
    writable: true,
    enumerable: true,
    configurable: true,
});