UNPKG

@langchain/community

Version:
632 lines (628 loc) 25.6 kB
"use strict"; var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; Object.defineProperty(exports, "__esModule", { value: true }); exports.PGVectorStore = void 0; const pg_1 = __importDefault(require("pg")); const vectorstores_1 = require("@langchain/core/vectorstores"); const documents_1 = require("@langchain/core/documents"); const env_1 = require("@langchain/core/utils/env"); /** * Class that provides an interface to a Postgres vector database. It * extends the `VectorStore` base class and implements methods for adding * documents and vectors, performing similarity searches, and ensuring the * existence of a table in the database. */ class PGVectorStore extends vectorstores_1.VectorStore { _vectorstoreType() { return "pgvector"; } constructor(embeddings, config) { super(embeddings, config); Object.defineProperty(this, "tableName", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "collectionTableName", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "collectionName", { enumerable: true, configurable: true, writable: true, value: "langchain" }); Object.defineProperty(this, "collectionMetadata", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "schemaName", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "idColumnName", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "vectorColumnName", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "contentColumnName", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "extensionSchemaName", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "metadataColumnName", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "filter", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "_verbose", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "pool", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "client", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "chunkSize", { enumerable: true, configurable: true, writable: true, value: 500 }); Object.defineProperty(this, "distanceStrategy", { enumerable: true, configurable: true, writable: true, value: "cosine" }); this.tableName = config.tableName; if (config.collectionName !== undefined && config.collectionTableName === undefined) { throw new Error(`If supplying a "collectionName", you must also supply a "collectionTableName".`); } this.collectionTableName = config.collectionTableName; this.collectionName = config.collectionName ?? "langchain"; this.collectionMetadata = config.collectionMetadata ?? null; this.schemaName = config.schemaName ?? null; this.extensionSchemaName = config.extensionSchemaName ?? null; this.filter = config.filter; this.vectorColumnName = config.columns?.vectorColumnName ?? "embedding"; this.contentColumnName = config.columns?.contentColumnName ?? "text"; this.idColumnName = config.columns?.idColumnName ?? "id"; this.metadataColumnName = config.columns?.metadataColumnName ?? "metadata"; if (!config.postgresConnectionOptions && !config.pool) { throw new Error("You must provide either a `postgresConnectionOptions` object or a `pool` instance."); } const pool = config.pool ?? new pg_1.default.Pool(config.postgresConnectionOptions); this.pool = pool; this.chunkSize = config.chunkSize ?? 500; this.distanceStrategy = config.distanceStrategy ?? this.distanceStrategy; this._verbose = (0, env_1.getEnvironmentVariable)("LANGCHAIN_VERBOSE") === "true" ?? !!config.verbose; } get computedTableName() { return this.schemaName == null ? `${this.tableName}` : `"${this.schemaName}"."${this.tableName}"`; } get computedCollectionTableName() { return this.schemaName == null ? `${this.collectionTableName}` : `"${this.schemaName}"."${this.collectionTableName}"`; } get computedOperatorString() { let operator; switch (this.distanceStrategy) { case "cosine": operator = "<=>"; break; case "innerProduct": operator = "<#>"; break; case "euclidean": operator = "<->"; break; default: throw new Error(`Unknown distance strategy: ${this.distanceStrategy}`); } return this.extensionSchemaName !== null ? `OPERATOR(${this.extensionSchemaName}.${operator})` : operator; } /** * Static method to create a new `PGVectorStore` instance from a * connection. It creates a table if one does not exist, and calls * `connect` to return a new instance of `PGVectorStore`. * * @param embeddings - Embeddings instance. * @param fields - `PGVectorStoreArgs` instance. * @returns A new instance of `PGVectorStore`. */ static async initialize(embeddings, config) { const postgresqlVectorStore = new PGVectorStore(embeddings, config); await postgresqlVectorStore._initializeClient(); await postgresqlVectorStore.ensureTableInDatabase(); if (postgresqlVectorStore.collectionTableName) { await postgresqlVectorStore.ensureCollectionTableInDatabase(); } return postgresqlVectorStore; } async _initializeClient() { this.client = await this.pool.connect(); } /** * Method to add documents to the vector store. It converts the documents into * vectors, and adds them to the store. * * @param documents - Array of `Document` instances. * @param options - Optional arguments for adding documents * @returns Promise that resolves when the documents have been added. */ async addDocuments(documents, options) { const texts = documents.map(({ pageContent }) => pageContent); return this.addVectors(await this.embeddings.embedDocuments(texts), documents, options); } /** * Inserts a row for the collectionName provided at initialization if it does not * exist and returns the collectionId. * * @returns The collectionId for the given collectionName. */ async getOrCreateCollection() { const queryString = ` SELECT uuid from ${this.computedCollectionTableName} WHERE name = $1; `; const queryResult = await this.pool.query(queryString, [ this.collectionName, ]); let collectionId = queryResult.rows[0]?.uuid; if (!collectionId) { const insertString = ` INSERT INTO ${this.computedCollectionTableName}( uuid, name, cmetadata ) VALUES ( uuid_generate_v4(), $1, $2 ) RETURNING uuid; `; const insertResult = await this.pool.query(insertString, [ this.collectionName, this.collectionMetadata, ]); collectionId = insertResult.rows[0]?.uuid; } return collectionId; } /** * Generates the SQL placeholders for a specific row at the provided index. * * @param index - The index of the row for which placeholders need to be generated. * @param numOfColumns - The number of columns we are inserting data into. * @returns The SQL placeholders for the row values. */ generatePlaceholderForRowAt(index, numOfColumns) { const placeholders = []; for (let i = 0; i < numOfColumns; i += 1) { placeholders.push(`$${index * numOfColumns + i + 1}`); } return `(${placeholders.join(", ")})`; } /** * Constructs the SQL query for inserting rows into the specified table. * * @param rows - The rows of data to be inserted, consisting of values and records. * @param chunkIndex - The starting index for generating query placeholders based on chunk positioning. * @returns The complete SQL INSERT INTO query string. */ async buildInsertQuery(rows) { let collectionId; if (this.collectionTableName) { collectionId = await this.getOrCreateCollection(); } const columns = [ this.contentColumnName, this.vectorColumnName, this.metadataColumnName, ]; if (collectionId) { columns.push("collection_id"); } // Check if we have added ids to the rows. if (rows.length !== 0 && columns.length === rows[0].length - 1) { columns.push(this.idColumnName); } const valuesPlaceholders = rows .map((_, j) => this.generatePlaceholderForRowAt(j, columns.length)) .join(", "); const text = ` INSERT INTO ${this.computedTableName}( ${columns.map((column) => `"${column}"`).join(", ")} ) VALUES ${valuesPlaceholders} `; return text; } /** * Method to add vectors to the vector store. It converts the vectors into * rows and inserts them into the database. * * @param vectors - Array of vectors. * @param documents - Array of `Document` instances. * @param options - Optional arguments for adding documents * @returns Promise that resolves when the vectors have been added. */ async addVectors(vectors, documents, options) { const ids = options?.ids; // Either all documents have ids or none of them do to avoid confusion. if (ids !== undefined && ids.length !== vectors.length) { throw new Error("The number of ids must match the number of vectors provided."); } const rows = []; let collectionId; if (this.collectionTableName) { collectionId = await this.getOrCreateCollection(); } for (let i = 0; i < vectors.length; i += 1) { const values = []; const embedding = vectors[i]; const embeddingString = `[${embedding.join(",")}]`; values.push(documents[i].pageContent.replace(/\0/g, ""), embeddingString.replace(/\0/g, ""), documents[i].metadata); if (collectionId) { values.push(collectionId); } if (ids) { values.push(ids[i]); } rows.push(values); } for (let i = 0; i < rows.length; i += this.chunkSize) { const chunk = rows.slice(i, i + this.chunkSize); const insertQuery = await this.buildInsertQuery(chunk); const flatValues = chunk.flat(); try { await this.pool.query(insertQuery, flatValues); } catch (e) { console.error(e); throw new Error(`Error inserting: ${e.message}`); } } } /** * Method to delete documents from the vector store. It deletes the * documents that match the provided ids. * * @param ids - Array of document ids. * @returns Promise that resolves when the documents have been deleted. */ async deleteById(ids) { let collectionId; if (this.collectionTableName) { collectionId = await this.getOrCreateCollection(); } // Set parameters of dynamically generated query const params = collectionId ? [ids, collectionId] : [ids]; const queryString = ` DELETE FROM ${this.computedTableName} WHERE ${collectionId ? "collection_id = $2 AND " : ""}${this.idColumnName} = ANY($1::uuid[]) `; await this.pool.query(queryString, params); } /** * Method to delete documents from the vector store. It deletes the * documents whose metadata contains the filter. * * @param filter - An object representing the Metadata filter. * @returns Promise that resolves when the documents have been deleted. */ async deleteByFilter(filter) { let collectionId; if (this.collectionTableName) { collectionId = await this.getOrCreateCollection(); } // Set parameters of dynamically generated query const params = collectionId ? [filter, collectionId] : [filter]; const queryString = ` DELETE FROM ${this.computedTableName} WHERE ${collectionId ? "collection_id = $2 AND " : ""}${this.metadataColumnName}::jsonb @> $1 `; return await this.pool.query(queryString, params); } /** * Method to delete documents from the vector store. It deletes the * documents that match the provided ids or metadata filter. Matches ids * exactly and metadata filter according to postgres jsonb containment. Ids and filter * are mutually exclusive. * * @param params - Object containing either an array of ids or a metadata filter object. * @returns Promise that resolves when the documents have been deleted. * @throws Error if neither ids nor filter are provided, or if both are provided. * @example <caption>Delete by ids</caption> * await vectorStore.delete({ ids: ["id1", "id2"] }); * @example <caption>Delete by filter</caption> * await vectorStore.delete({ filter: { a: 1, b: 2 } }); */ async delete(params) { const { ids, filter } = params; if (!(ids || filter)) { throw new Error("You must specify either ids or a filter when deleting documents."); } if (ids && filter) { throw new Error("You cannot specify both ids and a filter when deleting documents."); } if (ids) { await this.deleteById(ids); } else if (filter) { await this.deleteByFilter(filter); } } /** * Method to perform a similarity search in the vector store. It returns * the `k` most similar documents to the query vector, along with their * similarity scores. * * @param query - Query vector. * @param k - Number of most similar documents to return. * @param filter - Optional filter to apply to the search. * @returns Promise that resolves with an array of tuples, each containing a `Document` and its similarity score. */ async similaritySearchVectorWithScore(query, k, filter) { const embeddingString = `[${query.join(",")}]`; const _filter = filter ?? {}; let collectionId; if (this.collectionTableName) { collectionId = await this.getOrCreateCollection(); } // eslint-disable-next-line @typescript-eslint/no-explicit-any const parameters = [embeddingString, k]; const whereClauses = []; if (collectionId) { whereClauses.push("collection_id = $3"); parameters.push(collectionId); } let paramCount = parameters.length; for (const [key, value] of Object.entries(_filter)) { if (typeof value === "object" && value !== null) { // eslint-disable-next-line @typescript-eslint/no-explicit-any const _value = value; const currentParamCount = paramCount; if (Array.isArray(_value.in)) { const placeholders = _value.in .map((_, index) => `$${currentParamCount + index + 1}`) .join(","); whereClauses.push(`${this.metadataColumnName}->>'${key}' IN (${placeholders})`); parameters.push(..._value.in); paramCount += _value.in.length; } if (Array.isArray(_value.arrayContains)) { const placeholders = _value.arrayContains .map((_, index) => `$${currentParamCount + index + 1}`) .join(","); whereClauses.push(`${this.metadataColumnName}->'${key}' ?| array[${placeholders}]`); parameters.push(..._value.arrayContains); paramCount += _value.arrayContains.length; } } else { paramCount += 1; whereClauses.push(`${this.metadataColumnName}->>'${key}' = $${paramCount}`); parameters.push(value); } } const whereClause = whereClauses.length ? `WHERE ${whereClauses.join(" AND ")}` : ""; const queryString = ` SELECT *, "${this.vectorColumnName}" ${this.computedOperatorString} $1 as "_distance" FROM ${this.computedTableName} ${whereClause} ORDER BY "_distance" ASC LIMIT $2; `; const documents = (await this.pool.query(queryString, parameters)).rows; const results = []; for (const doc of documents) { if (doc._distance != null && doc[this.contentColumnName] != null) { const document = new documents_1.Document({ pageContent: doc[this.contentColumnName], metadata: doc[this.metadataColumnName], }); results.push([document, doc._distance]); } } return results; } /** * Method to ensure the existence of the table in the database. It creates * the table if it does not already exist. * * @returns Promise that resolves when the table has been ensured. */ async ensureTableInDatabase() { const vectorQuery = this.extensionSchemaName == null ? "CREATE EXTENSION IF NOT EXISTS vector;" : `CREATE EXTENSION IF NOT EXISTS vector WITH SCHEMA "${this.extensionSchemaName}";`; const uuidQuery = this.extensionSchemaName == null ? 'CREATE EXTENSION IF NOT EXISTS "uuid-ossp";' : `CREATE EXTENSION IF NOT EXISTS "uuid-ossp" WITH SCHEMA "${this.extensionSchemaName}";`; const extensionName = this.extensionSchemaName == null ? "vector" : `"${this.extensionSchemaName}"."vector"`; const tableQuery = ` CREATE TABLE IF NOT EXISTS ${this.computedTableName} ( "${this.idColumnName}" uuid NOT NULL DEFAULT uuid_generate_v4() PRIMARY KEY, "${this.contentColumnName}" text, "${this.metadataColumnName}" jsonb, "${this.vectorColumnName}" ${extensionName} ); `; await this.pool.query(vectorQuery); await this.pool.query(uuidQuery); await this.pool.query(tableQuery); } /** * Method to ensure the existence of the collection table in the database. * It creates the table if it does not already exist. * * @returns Promise that resolves when the collection table has been ensured. */ async ensureCollectionTableInDatabase() { try { const queryString = ` CREATE TABLE IF NOT EXISTS ${this.computedCollectionTableName} ( uuid uuid NOT NULL DEFAULT uuid_generate_v4() PRIMARY KEY, name character varying, cmetadata jsonb ); CREATE INDEX IF NOT EXISTS idx_${this.collectionTableName}_name ON ${this.computedCollectionTableName}(name); ALTER TABLE ${this.computedTableName} ADD COLUMN collection_id uuid; ALTER TABLE ${this.computedTableName} ADD CONSTRAINT ${this.tableName}_collection_id_fkey FOREIGN KEY (collection_id) REFERENCES ${this.computedCollectionTableName}(uuid) ON DELETE CASCADE; `; await this.pool.query(queryString); } catch (e) { if (!e.message.includes("already exists")) { console.error(e); throw new Error(`Error adding column or creating index: ${e.message}`); } } } /** * Static method to create a new `PGVectorStore` instance from an * array of texts and their metadata. It converts the texts into * `Document` instances and adds them to the store. * * @param texts - Array of texts. * @param metadatas - Array of metadata objects or a single metadata object. * @param embeddings - Embeddings instance. * @param dbConfig - `PGVectorStoreArgs` instance. * @returns Promise that resolves with a new instance of `PGVectorStore`. */ static async fromTexts(texts, metadatas, embeddings, dbConfig) { const docs = []; for (let i = 0; i < texts.length; i += 1) { const metadata = Array.isArray(metadatas) ? metadatas[i] : metadatas; const newDoc = new documents_1.Document({ pageContent: texts[i], metadata, }); docs.push(newDoc); } return PGVectorStore.fromDocuments(docs, embeddings, dbConfig); } /** * Static method to create a new `PGVectorStore` instance from an * array of `Document` instances. It adds the documents to the store. * * @param docs - Array of `Document` instances. * @param embeddings - Embeddings instance. * @param dbConfig - `PGVectorStoreArgs` instance. * @returns Promise that resolves with a new instance of `PGVectorStore`. */ static async fromDocuments(docs, embeddings, dbConfig) { const instance = await PGVectorStore.initialize(embeddings, dbConfig); await instance.addDocuments(docs, { ids: dbConfig.ids }); return instance; } /** * Closes all the clients in the pool and terminates the pool. * * @returns Promise that resolves when all clients are closed and the pool is terminated. */ async end() { this.client?.release(); return this.pool.end(); } /** * Method to create the HNSW index on the vector column. * * @param dimensions - Defines the number of dimensions in your vector data type, up to 2000. For example, use 1536 for OpenAI's text-embedding-ada-002 and Amazon's amazon.titan-embed-text-v1 models. * @param m - The max number of connections per layer (16 by default). Index build time improves with smaller values, while higher values can speed up search queries. * @param efConstruction - The size of the dynamic candidate list for constructing the graph (64 by default). A higher value can potentially improve the index quality at the cost of index build time. * @param distanceFunction - The distance function name you want to use, is automatically selected based on the distanceStrategy. * @returns Promise that resolves with the query response of creating the index. */ async createHnswIndex(config) { let idxDistanceFunction = config?.distanceFunction || "vector_cosine_ops"; switch (this.distanceStrategy) { case "cosine": idxDistanceFunction = "vector_cosine_ops"; break; case "innerProduct": idxDistanceFunction = "vector_ip_ops"; break; case "euclidean": idxDistanceFunction = "vector_l2_ops"; break; default: throw new Error(`Unknown distance strategy: ${this.distanceStrategy}`); } const createIndexQuery = `CREATE INDEX IF NOT EXISTS ${this.vectorColumnName}_embedding_hnsw_idx ON ${this.computedTableName} USING hnsw ((${this.vectorColumnName}::vector(${config.dimensions})) ${idxDistanceFunction}) WITH ( m=${config?.m || 16}, ef_construction=${config?.efConstruction || 64} );`; try { await this.pool.query(createIndexQuery); } catch (e) { console.error(`Failed to create HNSW index on table ${this.computedTableName}, error: ${e}`); } } } exports.PGVectorStore = PGVectorStore;