UNPKG

llamaindex

Version:

<p align="center"> <img height="100" width="100" alt="LlamaIndex logo" src="https://ts.llamaindex.ai/square.svg" /> </p> <h1 align="center">LlamaIndex.TS</h1> <h3 align="center"> Data framework for your LLM application. </h3>

315 lines (314 loc) 13.2 kB
import { ContextChatEngine } from "@llamaindex/core/chat-engine";
import { IndexDict, IndexStructType } from "@llamaindex/core/data-structs";
import { DEFAULT_SIMILARITY_TOP_K } from "@llamaindex/core/embeddings";
import { BaseRetriever } from "@llamaindex/core/retriever";
import { ImageNode, ModalityType, ObjectType, splitNodesByType } from "@llamaindex/core/schema";
import { extractText } from "@llamaindex/core/utils";
import { VectorStoreQueryMode } from "@llamaindex/core/vector-store";
import { Settings } from "../../Settings.js";
import { RetrieverQueryEngine } from "../../engines/query/RetrieverQueryEngine.js";
import { addNodesToVectorStores, runTransformations } from "../../ingestion/IngestionPipeline.js";
import { createDocStoreStrategy, DocStoreStrategy } from "../../ingestion/strategies/index.js";
import { storageContextFromDefaults } from "../../storage/StorageContext.js";
import { BaseIndex } from "../BaseIndex.js";

/**
 * The VectorStoreIndex, an index that stores the nodes only according to their vector embeddings.
 */
export class VectorStoreIndex extends BaseIndex {
  indexStore;
  embedModel;
  vectorStores;

  /**
   * @param init - Construction options forwarded to BaseIndex. This class reads
   *   `init.indexStore`, `init.vectorStores` and, when `vectorStores` is absent,
   *   `init.storageContext.vectorStores`.
   */
  constructor(init) {
    super(init);
    this.indexStore = init.indexStore;
    this.vectorStores = init.vectorStores ?? init.storageContext.vectorStores;
    // Fallback embed model, used whenever a vector store does not provide its own.
    this.embedModel = Settings.embedModel;
  }

  /**
   * The async init function creates a new VectorStoreIndex.
   *
   * @param options - Reads `storageContext`, `nodes`, `vectorStores`, `logProgress`
   *   and (via setupIndexStructFromStorage) `indexStruct` / `indexId`.
   * @returns The newly created index; if `options.nodes` was given, the nodes have
   *   already been inserted.
   * @throws {Error} If neither nodes nor an index struct are available.
   */
  static async init(options) {
    const storageContext = options.storageContext ?? (await storageContextFromDefaults({}));
    const { indexStore, docStore } = storageContext;

    let indexStruct = await VectorStoreIndex.setupIndexStructFromStorage(indexStore, options);
    if (!options.nodes && !indexStruct) {
      throw new Error("Cannot initialize VectorStoreIndex without nodes or indexStruct");
    }
    indexStruct = indexStruct ?? new IndexDict();

    const index = new this({
      storageContext,
      docStore,
      indexStruct,
      indexStore,
      vectorStores: options.vectorStores,
    });

    if (options.nodes) {
      // If nodes are passed in, then we need to update the index
      await index.buildIndexFromNodes(options.nodes, {
        logProgress: options.logProgress,
      });
    }
    return index;
  }

  /**
   * Resolves the index struct to use, either from the options or from the index store.
   *
   * Resolution order:
   *  1. `options.indexStruct` (only allowed when the index store is empty),
   *  2. the single struct held by the index store,
   *  3. the struct looked up by `options.indexId` when the store holds several.
   *
   * @param indexStore - Store queried through `getIndexStructs()` / `getIndexStruct(id)`.
   * @param options - Reads `indexStruct` and `indexId`.
   * @returns The resolved index struct, or `undefined` when none could be determined.
   * @throws {Error} If both an explicit struct and stored structs exist, or if the
   *   resolved struct is not of type `IndexStructType.SIMPLE_DICT`.
   */
  static async setupIndexStructFromStorage(indexStore, options) {
    const indexStructs = await indexStore.getIndexStructs();
    if (options.indexStruct && indexStructs.length > 0) {
      throw new Error("Cannot initialize index with both indexStruct and indexStore");
    }

    let indexStruct;
    if (options.indexStruct) {
      indexStruct = options.indexStruct;
    } else if (indexStructs.length === 1) {
      // The type of this struct is validated by the check below.
      indexStruct = indexStructs[0];
    } else if (indexStructs.length > 1 && options.indexId) {
      indexStruct = await indexStore.getIndexStruct(options.indexId);
    }

    // Check indexStruct type
    if (indexStruct && indexStruct.type !== IndexStructType.SIMPLE_DICT) {
      throw new Error("Attempting to initialize VectorStoreIndex with non-vector indexStruct");
    }
    return indexStruct;
  }

  /**
   * Calculates the embeddings for the given nodes.
   *
   * Nodes are grouped with `splitNodesByType`; each group is passed to the embed model
   * of the vector store registered under the same key, falling back to the index-level
   * embed model.
   *
   * @param nodes - An array of BaseNode objects representing the nodes for which embeddings are to be calculated.
   * @param {Object} [options] - An optional object containing additional parameters.
   * @param {boolean} [options.logProgress] - A boolean indicating whether to log progress to the console (useful for debugging).
   * @returns The same `nodes` array that was passed in. The embed model's return value
   *   is ignored, so embeddings are presumably attached to the nodes in place
   *   (NOTE(review): confirm against the embed model implementation).
   */
  async getNodeEmbeddingResults(nodes, options) {
    const nodeMap = splitNodesByType(nodes);
    for (const type in nodeMap) {
      const nodesOfType = nodeMap[type];
      const embedModel = this.vectorStores[type]?.embedModel ?? this.embedModel;
      if (embedModel && nodesOfType) {
        await embedModel(nodesOfType, {
          logProgress: options?.logProgress,
        });
      }
    }
    return nodes;
  }

  /**
   * Get embeddings for nodes and place them into the index.
   *
   * @param nodes - Nodes to embed and insert.
   * @param options - Forwarded unchanged to insertNodes.
   * @returns A promise that settles once the nodes have been inserted.
   */
  async buildIndexFromNodes(nodes, options) {
    await this.insertNodes(nodes, options);
  }

  /**
   * High level API: split documents, get embeddings, and build index.
   *
   * Note: this method fills in defaults on, and writes `nodes` to, the `args` object
   * it receives.
   *
   * @param documents - Documents to run through `Settings.nodeParser`.
   * @param args - Options; reads `storageContext`, `vectorStores`, `docStoreStrategy`
   *   and `logProgress`, and is finally passed to `init`.
   * @returns The index built from the parsed nodes.
   * @throws Rethrows any error raised by `init` after asking the doc store strategy
   *   to roll back.
   */
  static async fromDocuments(documents, args = {}) {
    args.storageContext = args.storageContext ?? (await storageContextFromDefaults({}));
    args.vectorStores = args.vectorStores ?? args.storageContext.vectorStores;
    args.docStoreStrategy =
      args.docStoreStrategy ??
      // set doc store strategy defaults to the same as for the IngestionPipeline
      (args.vectorStores ? DocStoreStrategy.UPSERTS : DocStoreStrategy.DUPLICATES_ONLY);
    const docStore = args.storageContext.docStore;

    if (args.logProgress) {
      console.log("Using node parser on documents...");
    }

    // use doc store strategy to avoid duplicates
    const vectorStores = Object.values(args.vectorStores ?? {});
    const docStoreStrategy = createDocStoreStrategy(args.docStoreStrategy, docStore, vectorStores);
    args.nodes = await runTransformations(documents, [Settings.nodeParser], {}, {
      docStoreStrategy,
    });

    if (args.logProgress) {
      console.log("Finished parsing documents.");
    }

    try {
      return await this.init(args);
    } catch (error) {
      // Give the strategy a chance to undo its doc store changes before propagating.
      await docStoreStrategy.rollback(args.storageContext.docStore, args.nodes);
      throw error;
    }
  }

  /**
   * Creates an index on top of existing vector stores, without inserting any nodes.
   *
   * @param vectorStores - Vector stores keyed by modality.
   * @returns The initialized index.
   * @throws {Error} If the `ModalityType.TEXT` store is missing or does not store text.
   */
  static async fromVectorStores(vectorStores) {
    if (!vectorStores[ModalityType.TEXT]?.storesText) {
      throw new Error("Cannot initialize from a vector store that does not store text");
    }
    const storageContext = await storageContextFromDefaults({
      vectorStores,
    });
    return await this.init({
      nodes: [],
      storageContext,
    });
  }

  /**
   * Convenience wrapper around fromVectorStores for a single text vector store.
   *
   * @param vectorStore - Store registered under `ModalityType.TEXT`.
   * @returns The initialized index.
   */
  static async fromVectorStore(vectorStore) {
    return this.fromVectorStores({
      [ModalityType.TEXT]: vectorStore,
    });
  }

  /**
   * Create a VectorIndexRetriever bound to this index.
   *
   * @param options - Retriever options, spread after `index`.
   * @returns A new VectorIndexRetriever.
   */
  asRetriever(options) {
    return new VectorIndexRetriever({
      index: this,
      ...options,
    });
  }

  /**
   * Create a RetrieverQueryEngine.
   * similarityTopK is only used if no existing retriever is provided.
   *
   * @param options - Optional `retriever`, `responseSynthesizer`, `preFilters`,
   *   `nodePostprocessors` and `similarityTopK`.
   * @returns A new RetrieverQueryEngine.
   */
  asQueryEngine(options) {
    const { retriever, responseSynthesizer, preFilters, nodePostprocessors, similarityTopK } =
      options ?? {};
    const effectiveRetriever =
      retriever ??
      this.asRetriever({
        similarityTopK,
        filters: preFilters,
      });
    return new RetrieverQueryEngine(effectiveRetriever, responseSynthesizer, nodePostprocessors);
  }

  /**
   * Convert the index to a chat engine.
   * @param options The options for creating the chat engine
   * @returns A ContextChatEngine that uses the index's retriever to get context for each query
   */
  asChatEngine(options = {}) {
    const { retriever, similarityTopK, preFilters, ...contextChatEngineOptions } = options;
    return new ContextChatEngine({
      retriever:
        retriever ??
        this.asRetriever({
          similarityTopK,
          filters: preFilters,
        }),
      ...contextChatEngineOptions,
    });
  }

  /**
   * Records nodes in the index struct and doc store where the vector store does not
   * already keep their content.
   *
   * @param newIds - Ids paired with `nodes` by position.
   * @param nodes - Nodes that were added to `vectorStore`.
   * @param vectorStore - The store the nodes were added to; only `storesText` is read.
   */
  async insertNodesToStore(newIds, nodes, vectorStore) {
    // NOTE: if the vector store doesn't store text,
    // we need to add the nodes to the index struct and document store
    // NOTE: if the vector store keeps text,
    // we only need to add image and index nodes
    for (let i = 0; i < nodes.length; ++i) {
      const { type } = nodes[i];
      if (!vectorStore.storesText || type === ObjectType.INDEX || type === ObjectType.IMAGE) {
        // Persist a copy without the embedding.
        const nodeWithoutEmbedding = nodes[i].clone();
        nodeWithoutEmbedding.embedding = undefined;
        this.indexStruct.addNode(nodeWithoutEmbedding, newIds[i]);
        await this.docStore.addDocuments([nodeWithoutEmbedding], true);
      }
    }
  }

  /**
   * Embeds the given nodes, adds them to the vector stores and persists the index struct.
   * Does nothing when `nodes` is empty or missing.
   *
   * @param nodes - Nodes to insert.
   * @param options - Forwarded to getNodeEmbeddingResults (e.g. `logProgress`).
   */
  async insertNodes(nodes, options) {
    if (!nodes || nodes.length === 0) {
      return;
    }
    const embeddedNodes = await this.getNodeEmbeddingResults(nodes, options);
    await addNodesToVectorStores(
      embeddedNodes,
      this.vectorStores,
      this.insertNodesToStore.bind(this),
    );
    await this.indexStore.addIndexStruct(this.indexStruct);
  }

  /**
   * Deletes a reference document from every vector store and, optionally, the doc store.
   *
   * @param refDocId - Id of the reference document.
   * @param deleteFromDocStore - When true (default), also calls
   *   `docStore.deleteDocument(refDocId, false)`.
   */
  async deleteRefDoc(refDocId, deleteFromDocStore = true) {
    for (const vectorStore of Object.values(this.vectorStores)) {
      await this.deleteRefDocFromStore(vectorStore, refDocId);
    }
    if (deleteFromDocStore) {
      await this.docStore.deleteDocument(refDocId, false);
    }
  }

  /**
   * Deletes a reference document from a single vector store.
   *
   * For stores that do not store text, the document's nodes (as reported by the doc
   * store) are also removed from the index struct and the vector store, and the
   * index struct is persisted again.
   *
   * @param vectorStore - Store to delete from.
   * @param refDocId - Id of the reference document.
   */
  async deleteRefDocFromStore(vectorStore, refDocId) {
    await vectorStore.delete(refDocId);
    if (vectorStore.storesText) {
      return;
    }
    const refDocInfo = await this.docStore.getRefDocInfo(refDocId);
    if (refDocInfo) {
      for (const nodeId of refDocInfo.nodeIds) {
        this.indexStruct.delete(nodeId);
        await vectorStore.delete(nodeId);
      }
    }
    await this.indexStore.addIndexStruct(this.indexStruct);
  }
}

/**
 * Retriever that queries the vector stores of a VectorStoreIndex.
 */
export class VectorIndexRetriever extends BaseRetriever {
  index;
  topK;
  filters;
  queryMode;

  /**
   * @param options - Reads `index`, `mode`, `filters` and either `topK` (used as-is)
   *   or `similarityTopK` (applied to the text modality only; the image modality
   *   gets DEFAULT_SIMILARITY_TOP_K).
   */
  constructor(options) {
    super();
    this.index = options.index;
    this.queryMode = options.mode ?? VectorStoreQueryMode.DEFAULT;
    if ("topK" in options && options.topK) {
      this.topK = options.topK;
    } else {
      const textTopK =
        "similarityTopK" in options && options.similarityTopK
          ? options.similarityTopK
          : DEFAULT_SIMILARITY_TOP_K;
      this.topK = {
        [ModalityType.TEXT]: textTopK,
        [ModalityType.IMAGE]: DEFAULT_SIMILARITY_TOP_K,
      };
    }
    this.filters = options.filters;
  }

  /**
   * @deprecated, pass similarityTopK or topK in constructor instead or directly modify topK
   */
  set similarityTopK(similarityTopK) {
    this.topK[ModalityType.TEXT] = similarityTopK;
  }

  /**
   * Queries every vector store of the index, one after another, and concatenates the results.
   *
   * @param params - Reads `params.query`.
   * @returns Array of `{ node, score }` entries from all stores, in store iteration order.
   */
  async _retrieve(params) {
    const { query } = params;
    const vectorStores = this.index.vectorStores;
    const nodesWithScores = [];
    for (const type in vectorStores) {
      const matches = await this.retrieveQuery(query, type, vectorStores[type]);
      nodesWithScores.push(...matches);
    }
    return nodesWithScores;
  }

  /**
   * Runs a query against a single vector store.
   *
   * @param query - Either a plain string or an array of content items.
   * @param type - Key of the store; selects the entry of `this.topK` to use.
   * @param vectorStore - Store to query.
   * @param filters - Used only when the retriever has no `filters` of its own.
   * @returns Array of `{ node, score }` entries for all content items of the query.
   */
  async retrieveQuery(query, type, vectorStore, filters) {
    // convert string message to multi-modal format
    const isPlainText = typeof query === "string";
    const queryStr = isPlainText ? query : extractText(query);
    const queryItems = isPlainText
      ? [
          {
            type: "text",
            text: query,
          },
        ]
      : query;

    // Use the same precedence as getNodeEmbeddingResults on the indexing side
    // (store-specific model first, index-level model as fallback), so that queries
    // are embedded with the same model as the stored nodes.
    const embedModel = vectorStore?.embedModel ?? this.index.embedModel;

    const nodes = [];
    // query each content item (e.g. text or image) separately
    for (const item of queryItems) {
      const queryEmbedding = await embedModel.getQueryEmbedding(item);
      if (!queryEmbedding) {
        continue;
      }
      const result = await vectorStore.query({
        queryStr,
        queryEmbedding,
        mode: this.queryMode ?? VectorStoreQueryMode.DEFAULT,
        similarityTopK: this.topK[type],
        filters: this.filters ?? filters ?? undefined,
      });
      nodes.push(...this.buildNodeListFromQueryResult(result));
    }
    return nodes;
  }

  /**
   * Converts a vector store query result into a list of `{ node, score }` entries.
   *
   * Nodes returned by the store that are not yet in the index struct's `nodesDict`
   * are written into it; the node is then always read back from `nodesDict`.
   *
   * @param result - Query result; reads `ids`, `similarities` and optional `nodes`.
   * @returns One entry per id in `result.ids`. `node` is `undefined` when the id is
   *   neither in `nodesDict` nor supplied in `result.nodes`.
   */
  buildNodeListFromQueryResult(result) {
    const nodesDict = this.index.indexStruct.nodesDict;
    const nodesWithScores = [];
    for (let i = 0; i < result.ids.length; i++) {
      const id = result.ids[i];
      const nodeFromResult = result.nodes?.[i];
      if (!nodesDict[id] && nodeFromResult) {
        nodesDict[id] = nodeFromResult;
      }
      const node = nodesDict[id];
      // XXX: Hack, if it's an image node, we reconstruct the image from the URL
      // Alternative: Store image in doc store and retrieve it here
      if (node instanceof ImageNode) {
        node.image = node.getUrl();
      }
      nodesWithScores.push({
        node,
        score: result.similarities[i],
      });
    }
    return nodesWithScores;
  }
}