llamaindex
Version:
<p align="center"> <img height="100" width="100" alt="LlamaIndex logo" src="https://ts.llamaindex.ai/square.svg" /> </p> <h1 align="center">LlamaIndex.TS</h1> <h3 align="center"> Data framework for your LLM application. </h3>
324 lines (313 loc) • 13 kB
JavaScript
Object.defineProperty(exports, '__esModule', { value: true });
var schema = require('@llamaindex/core/schema');
var docStore = require('@llamaindex/core/storage/doc-store');
var kvStore = require('@llamaindex/core/storage/kv-store');
var env = require('@llamaindex/env');
/**
 * Serializes a value to JSON while tolerating circular references.
 *
 * Each object is emitted at most once. When an object that was already
 * visited shows up again (a cycle, or simply a second reference to the same
 * object) the replacer yields `undefined`, so JSON.stringify omits the
 * property, or writes `null` when the value is an array element.
 *
 * @param {unknown} obj - Value to serialize.
 * @returns {string | undefined} JSON text, or `undefined` when `obj` itself
 *   has no JSON representation (for example `undefined` or a function).
 */
const transformToJSON = (obj) => {
  // A Set keeps the "already visited" check constant-time; the previous
  // array + indexOf scan was quadratic in the number of nested objects.
  const visited = new Set();
  const skipVisitedObjects = (key, value) => {
    if (value !== null && typeof value === "object") {
      if (visited.has(value)) {
        return undefined;
      }
      visited.add(value);
    }
    return value;
  };
  return JSON.stringify(obj, skipVisitedObjects);
};
/**
 * Builds the cache key for applying one transformation to a list of nodes.
 *
 * The digest covers, in this order: the concatenated content of every node
 * (requested with MetadataMode.ALL), the JSON form of the transformation,
 * and the transformation's `id`.
 *
 * @param nodes - Nodes about to be transformed; each must expose `getContent`.
 * @param transform - The transformation; its `id` property is part of the key.
 * @returns Whatever `digest()` of the hasher created by `env.createSHA256()` returns.
 */
function getTransformationHash(nodes, transform) {
  const contents = nodes.map((node) => node.getContent(schema.MetadataMode.ALL));
  const serializedTransform = transformToJSON(transform);
  const payload = contents.join("") + serializedTransform + transform.id;
  const hasher = env.createSHA256();
  hasher.update(payload);
  return hasher.digest();
}
/**
 * Cache of transformation results, keyed by transformation hash and backed
 * by a SimpleKVStore instance created in the constructor.
 */
class IngestionCache {
  /**
   * @param {string} [collection] - Collection name passed to the key-value
   *   store; any falsy value selects the default "llama_cache".
   */
  constructor(collection) {
    this.collection = collection || "llama_cache";
    this.nodesKey = "nodes";
    this.cache = new kvStore.SimpleKVStore();
  }

  /**
   * Serializes the nodes with `docToJson` and stores them under `hash`.
   *
   * @param {string} hash - Cache key.
   * @param nodes - Nodes to store.
   */
  async put(hash, nodes) {
    const serializedNodes = nodes.map((node) =>
      docStore.docToJson(node, docStore.jsonSerializer),
    );
    const entry = { [this.nodesKey]: serializedNodes };
    await this.cache.put(hash, entry, this.collection);
  }

  /**
   * Looks up the nodes cached under `hash`.
   *
   * @param {string} hash - Cache key.
   * @returns The nodes rebuilt with `jsonToDoc`, or `undefined` when there is
   *   no entry or the entry does not hold an array of nodes.
   */
  async get(hash) {
    const entry = await this.cache.get(hash, this.collection);
    const storedNodes = entry ? entry[this.nodesKey] : undefined;
    if (!Array.isArray(storedNodes)) {
      return undefined;
    }
    return storedNodes.map((doc) => docStore.jsonToDoc(doc, docStore.jsonSerializer));
  }
}
/**
 * Compares incoming nodes with the hashes recorded in the doc store.
 *
 * For every node the reference document id is `sourceNode.nodeId` when
 * present, otherwise the node's own `id_`.
 *
 * @param docStore - Store exposing `getAllDocumentHashes` and `getDocumentHash`.
 * @param nodes - Incoming nodes.
 * @returns An object with:
 *   - `dedupedNodes`: nodes whose reference document has no stored hash, or a
 *     stored hash different from the node's hash;
 *   - `missingDocs`: values of `getAllDocumentHashes()` that none of the
 *     incoming nodes refer to;
 *   - `unusedDocs`: reference document ids whose stored hash differs from the
 *     node's hash (one entry per such node, so ids may repeat).
 */
async function classify(docStore, nodes) {
  const storedHashes = await docStore.getAllDocumentHashes();
  const storedDocIds = Object.values(storedHashes);
  const referencedDocIds = new Set();
  const dedupedNodes = [];
  const unusedDocs = [];

  for (const node of nodes) {
    const refDocId = node.sourceNode?.nodeId || node.id_;
    referencedDocIds.add(refDocId);

    const storedHash = await docStore.getDocumentHash(refDocId);
    if (storedHash && storedHash === node.hash) {
      // Known document with identical content: nothing to do.
      continue;
    }
    if (storedHash) {
      // Known document whose content changed: the stored version is stale.
      unusedDocs.push(refDocId);
    }
    // New or changed document: it has to go through the pipeline.
    dedupedNodes.push(node);
  }

  const missingDocs = storedDocIds.filter((id) => !referencedDocIds.has(id));
  return { dedupedNodes, missingDocs, unusedDocs };
}
/**
 * Transform component that can undo its doc-store changes via `rollback`.
 */
class RollbackableTransformComponent extends schema.TransformComponent {
  /**
   * Remove unused docs from the doc store. It is useful in case
   * generating embeddings fails and we want to remove the unused docs.
   *
   * TODO: override this in UpsertsStrategy if we want to revert removed docs also
   *
   * @param docStore - Doc store to clean up; must expose `deleteDocument`
   *   and `persist`.
   * @param nodes - The nodes that were being ingested; they are classified
   *   again to find the documents reported as unused.
   * @returns {Promise<void>} Settles once the deletions and the persist call
   *   have completed; rejects if any of them fails.
   */
  async rollback(docStore, nodes) {
    const { unusedDocs } = await classify(docStore, nodes);
    for (const docId of unusedDocs) {
      await docStore.deleteDocument(docId, false);
    }
    // Await persistence so the rollback is really finished when this method
    // resolves and a failing persist is reported to the caller rather than
    // surfacing as an unhandled rejection. Awaiting is harmless when
    // `persist` is synchronous.
    await docStore.persist();
  }
}
/**
 * Handle doc store duplicates by checking all hashes.
 *
 * The transform keeps only nodes whose hash is neither recorded in the doc
 * store nor already seen earlier in the same batch, records the hash of each
 * kept node and adds the kept nodes to the doc store.
 */
class DuplicatesStrategy extends RollbackableTransformComponent {
  /**
   * @param docStore - Doc store consulted for known hashes and updated with
   *   the nodes that pass the filter.
   */
  constructor(docStore) {
    // `this` inside the arrow is only read when the transform runs, which is
    // after construction has finished.
    const keepUnseenNodes = async (nodes) => {
      const storedHashes = await this.docStore.getAllDocumentHashes();
      const batchHashes = new Set();
      const freshNodes = [];
      for (const node of nodes) {
        const alreadyStored = node.hash in storedHashes;
        if (alreadyStored || batchHashes.has(node.hash)) {
          continue;
        }
        await this.docStore.setDocumentHash(node.id_, node.hash);
        freshNodes.push(node);
        batchHashes.add(node.hash);
      }
      await this.docStore.addDocuments(freshNodes, true);
      return freshNodes;
    };
    super(keepUnseenNodes);
    this.docStore = docStore;
  }
}
/**
 * Handle docstore upserts by checking hashes and ids.
 * Identify missing docs and delete them from docstore and vector store.
 */
class UpsertsAndDeleteStrategy extends RollbackableTransformComponent {
  /**
   * @param docStore - Doc store that is classified against and updated.
   * @param [vectorStores] - Vector stores from which stale and missing
   *   documents are deleted as well; skipped when not provided.
   */
  constructor(docStore, vectorStores) {
    super(async (nodes) => {
      const {
        dedupedNodes: nodesToIngest,
        missingDocs: vanishedDocIds,
        unusedDocs: staleRefDocIds,
      } = await classify(this.docStore, nodes);

      // Deletes the given document id from every configured vector store.
      const purgeFromVectorStores = async (docId) => {
        if (!this.vectorStores) {
          return;
        }
        for (const store of this.vectorStores) {
          await store.delete(docId);
        }
      };

      // Documents whose content changed: drop the outdated version.
      for (const refDocId of staleRefDocIds) {
        await this.docStore.deleteRefDoc(refDocId, false);
        await purgeFromVectorStores(refDocId);
      }

      // Documents that are stored but absent from the incoming nodes.
      for (const docId of vanishedDocIds) {
        await this.docStore.deleteDocument(docId, true);
        await purgeFromVectorStores(docId);
      }

      await this.docStore.addDocuments(nodesToIngest, true);
      return nodesToIngest;
    });
    this.docStore = docStore;
    this.vectorStores = vectorStores;
  }
}
/**
 * Handles doc store upserts by checking hashes and ids.
 *
 * Outdated versions of changed documents are removed from the doc store and
 * from every configured vector store before the new nodes are added.
 */
class UpsertsStrategy extends RollbackableTransformComponent {
  /**
   * @param docStore - Doc store that is classified against and updated.
   * @param [vectorStores] - Vector stores from which outdated documents are
   *   deleted as well; skipped when not provided.
   */
  constructor(docStore, vectorStores) {
    super(async (nodes) => {
      const { dedupedNodes: nodesToIngest, unusedDocs: staleRefDocIds } = await classify(
        this.docStore,
        nodes,
      );

      // Drop the outdated version of every document whose content changed.
      for (const refDocId of staleRefDocIds) {
        await this.docStore.deleteRefDoc(refDocId, false);
        if (!this.vectorStores) {
          continue;
        }
        for (const store of this.vectorStores) {
          await store.delete(refDocId);
        }
      }

      // Store the new and the changed documents.
      await this.docStore.addDocuments(nodesToIngest, true);
      return nodesToIngest;
    });
    this.docStore = docStore;
    this.vectorStores = vectorStores;
  }
}
/**
 * Document de-duplication strategies work by comparing the hashes or ids stored in the document store.
 * They require a document store to be set which must be persisted across pipeline runs.
 */
const DocStoreStrategy = {
  // Use upserts to handle duplicates. Checks whether a document is already in
  // the doc store based on its id. If it is not, or if the hash of the
  // document has changed, the document is updated in the doc store and the
  // transformations are run.
  UPSERTS: "upserts",
  // Only handle duplicates. Checks whether the hash of a document is already
  // in the doc store. Only if it is not, the document is added to the doc
  // store and the transformations are run.
  DUPLICATES_ONLY: "duplicates_only",
  // Use upserts and delete to handle duplicates. Like the upsert strategy,
  // but documents that no longer exist are also deleted from the doc store.
  UPSERTS_AND_DELETE: "upserts_and_delete",
  // No doc-store based de-duplication.
  NONE: "none",
};
/**
 * Strategy that performs no de-duplication: the transform resolves to the
 * nodes it was given.
 */
class NoOpStrategy extends RollbackableTransformComponent {
  constructor() {
    const passThrough = async (nodes) => nodes;
    super(passThrough);
  }
}
/**
 * Creates the transform component implementing a doc-store strategy.
 *
 * @param {string} docStoreStrategy - One of "upserts", "duplicates_only",
 *   "upserts_and_delete" or "none".
 * @param [docStore] - Document store; required for every strategy but "none".
 * @param {Array} [vectorStores=[]] - Vector stores kept in sync by the upsert
 *   strategies. When empty, the upsert strategies fall back to
 *   DuplicatesStrategy after logging a warning.
 * @returns The strategy component.
 * @throws {Error} When a doc store is needed but missing, or when vector
 *   stores are given and the strategy name is not recognized.
 */
function createDocStoreStrategy(docStoreStrategy, docStore, vectorStores = []) {
  if (docStoreStrategy === "none") {
    return new NoOpStrategy();
  }
  if (!docStore) {
    throw new Error("docStore is required to create a doc store strategy.");
  }

  if (vectorStores.length === 0) {
    // Without a vector store there is nothing to upsert into, so every
    // strategy degrades to plain duplicate detection.
    switch (docStoreStrategy) {
      case "upserts":
        console.warn("Docstore strategy set to upserts, but no vector store. Switching to duplicates_only strategy.");
        break;
      case "upserts_and_delete":
        console.warn("Docstore strategy set to upserts and delete, but no vector store. Switching to duplicates_only strategy.");
        break;
      default:
        break;
    }
    return new DuplicatesStrategy(docStore);
  }

  switch (docStoreStrategy) {
    case "upserts":
      return new UpsertsStrategy(docStore, vectorStores);
    case "upserts_and_delete":
      return new UpsertsAndDeleteStrategy(docStore, vectorStores);
    case "duplicates_only":
      return new DuplicatesStrategy(docStore);
    default:
      throw new Error(`Invalid docstore strategy: ${docStoreStrategy}`);
  }
}
/**
 * Runs a list of transformations over nodes, one after another.
 *
 * @param nodesToRun - Input nodes.
 * @param transformations - Callables invoked as `transform(nodes, transformOptions)`.
 * @param [transformOptions={}] - Options forwarded to every transformation.
 * @param [options]
 * @param {boolean} [options.inPlace=true] - When false, the pipeline starts
 *   from a shallow copy of `nodesToRun` instead of the array itself.
 * @param [options.cache] - Cache with `get(hash)` / `put(hash, nodes)`; a
 *   truthy `get` result replaces running the transformation.
 * @param [options.docStoreStrategy] - Callable applied to the nodes before
 *   any transformation runs.
 * @returns The nodes produced by the last step.
 */
async function runTransformations(
  nodesToRun,
  transformations,
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  transformOptions = {},
  { inPlace = true, cache, docStoreStrategy } = {},
) {
  let current = inPlace ? nodesToRun : [...nodesToRun];

  if (docStoreStrategy) {
    current = await docStoreStrategy(current);
  }

  for (const transform of transformations) {
    if (!cache) {
      current = await transform(current, transformOptions);
      continue;
    }

    // The key depends on the current nodes and on the transformation.
    const cacheKey = getTransformationHash(current, transform);
    const cached = await cache.get(cacheKey);
    if (cached) {
      current = cached;
      continue;
    }

    current = await transform(current, transformOptions);
    await cache.put(cacheKey, current);
  }

  return current;
}
/**
 * Pipeline that collects input nodes, runs them through a list of
 * transformations and, when vector stores are configured, inserts the nodes
 * that carry an embedding into those stores.
 */
class IngestionPipeline {
  /**
   * @param [init] - Properties copied onto the pipeline with Object.assign
   *   (the code reads `transformations`, `docStore`, `docStoreStrategy`,
   *   `vectorStore`, `vectorStores`, `documents`, `reader`, `disableCache`).
   */
  constructor(init) {
    // Defaults; any of them can be overridden through `init`.
    this.transformations = [];
    this.docStoreStrategy = DocStoreStrategy.UPSERTS;
    this.disableCache = false;
    Object.assign(this, init);

    // Without a doc store no de-duplication strategy can operate.
    if (!this.docStore) {
      this.docStoreStrategy = DocStoreStrategy.NONE;
    }

    // A single `vectorStore` is treated as the store for text nodes.
    this.vectorStores =
      this.vectorStores ??
      (this.vectorStore
        ? {
            [schema.ModalityType.TEXT]: this.vectorStore,
          }
        : undefined);

    this._docStoreStrategy = createDocStoreStrategy(
      this.docStoreStrategy,
      this.docStore,
      this.vectorStores ? Object.values(this.vectorStores) : undefined,
    );

    if (!this.disableCache) {
      this.cache = new IngestionCache();
    }
  }

  /**
   * Gathers the nodes for a run from all configured sources, in this order:
   * `documents` argument, `nodes` argument, `this.documents`, then the
   * result of `this.reader.loadData()`.
   *
   * @param [documents] - Documents passed to `run`.
   * @param [nodes] - Nodes passed to `run`.
   * @returns {Promise<Array>} All inputs flattened by one level.
   */
  async prepareInput(documents, nodes) {
    const inputNodes = [];
    if (documents) {
      inputNodes.push(documents);
    }
    if (nodes) {
      inputNodes.push(nodes);
    }
    if (this.documents) {
      inputNodes.push(this.documents);
    }
    if (this.reader) {
      // fixme: empty parameter might cause error
      inputNodes.push(await this.reader.loadData());
    }
    return inputNodes.flat();
  }

  /**
   * Runs the pipeline.
   *
   * @param [args={}] - Run arguments: `documents`, `nodes`, `inPlace`,
   *   `cache` and `docStoreStrategy`. A missing `cache` or
   *   `docStoreStrategy` falls back to the one held by the pipeline. The
   *   object is not modified.
   * @param [transformOptions] - Options forwarded to every transformation.
   * @returns The transformed nodes.
   */
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  async run(args = {}, transformOptions) {
    // Build the effective options in a fresh object: writing the defaults
    // back onto `args` would leak pipeline internals into the caller's
    // object and throws when the caller passes a frozen object.
    const runOptions = {
      ...args,
      cache: args.cache ?? this.cache,
      docStoreStrategy: args.docStoreStrategy ?? this._docStoreStrategy,
    };
    const inputNodes = await this.prepareInput(args.documents, args.nodes);
    const nodes = await runTransformations(
      inputNodes,
      this.transformations,
      transformOptions,
      runOptions,
    );
    if (this.vectorStores) {
      // Only nodes that carry an embedding are inserted into vector stores.
      const nodesToAdd = nodes.filter((node) => node.embedding);
      await addNodesToVectorStores(nodesToAdd, this.vectorStores);
    }
    return nodes;
  }
}
/**
 * Inserts nodes into the vector store assigned to their type.
 *
 * The nodes are grouped with `schema.splitNodesByType`; each non-empty group
 * is added to `vectorStores[type]`.
 *
 * @param nodes - Nodes to insert.
 * @param vectorStores - Object mapping a node type to its vector store.
 * @param [nodesAdded] - Optional callback awaited after each insertion with
 *   `(newIds, nodes, vectorStore)`.
 * @throws {Error} When a group of nodes has no vector store for its type.
 */
async function addNodesToVectorStores(nodes, vectorStores, nodesAdded) {
  const nodesByType = schema.splitNodesByType(nodes);
  for (const type in nodesByType) {
    const typedNodes = nodesByType[type];
    if (!typedNodes) {
      continue;
    }
    const targetStore = vectorStores[type];
    if (!targetStore) {
      throw new Error(`Cannot insert nodes of type ${type} without assigned vector store`);
    }
    const insertedIds = await targetStore.add(typedNodes);
    if (nodesAdded) {
      await nodesAdded(insertedIds, typedNodes, targetStore);
    }
  }
}
// Public CommonJS API of this module. Each name is assigned individually as
// `exports.<name> = <name>`; the assigned values are the definitions above.
exports.DocStoreStrategy = DocStoreStrategy;
exports.DuplicatesStrategy = DuplicatesStrategy;
exports.IngestionCache = IngestionCache;
exports.IngestionPipeline = IngestionPipeline;
exports.RollbackableTransformComponent = RollbackableTransformComponent;
exports.UpsertsAndDeleteStrategy = UpsertsAndDeleteStrategy;
exports.UpsertsStrategy = UpsertsStrategy;
exports.addNodesToVectorStores = addNodesToVectorStores;
exports.classify = classify;
exports.createDocStoreStrategy = createDocStoreStrategy;
exports.getTransformationHash = getTransformationHash;
exports.runTransformations = runTransformations;