UNPKG

@langchain/core

Version:
1 lines 16.2 kB
{"version":3,"file":"base.cjs","names":["sha256","fields: HashedDocumentArgs","keyEncoderFn: HashKeyEncoder","Document","document: DocumentInterface","uid?: string","inputString: string","UUIDV5_NAMESPACE","data: Record<string, unknown>","size: number","iterable: T[]","batches: T[][]","currentBatch: T[]","hashedDocuments: HashedDocumentInterface[]","deduplicated: HashedDocumentInterface[]","sourceIdKey: StringOrDocFunc | null","_doc: DocumentInterface","doc: DocumentInterface","arg: any","args: IndexArgs","index","uids: string[]","docsToIndex: DocumentInterface[]","docsToUpdate: string[]"],"sources":["../../src/indexing/base.ts"],"sourcesContent":["import { v5 as uuidv5 } from \"uuid\";\nimport { VectorStore } from \"../vectorstores.js\";\nimport { RecordManagerInterface, UUIDV5_NAMESPACE } from \"./record_manager.js\";\nimport { sha256, type HashKeyEncoder } from \"../utils/hash.js\";\nimport { DocumentInterface, Document } from \"../documents/document.js\";\nimport { BaseDocumentLoader } from \"../document_loaders/base.js\";\n\ntype Metadata = Record<string, unknown>;\n\ntype IndexingResult = {\n numAdded: number;\n numDeleted: number;\n numUpdated: number;\n numSkipped: number;\n};\n\ntype StringOrDocFunc = string | ((doc: DocumentInterface) => string);\n\nexport interface HashedDocumentInterface extends DocumentInterface {\n uid: string;\n hash_?: string;\n contentHash?: string;\n metadataHash?: string;\n pageContent: string;\n metadata: Metadata;\n calculateHashes(): void;\n toDocument(): DocumentInterface;\n}\n\ninterface HashedDocumentArgs {\n pageContent: string;\n metadata: Metadata;\n uid: string;\n}\n\n/**\n * HashedDocument is a Document with hashes calculated.\n * Hashes are calculated based on page content and metadata.\n * It is used for indexing.\n */\nexport class _HashedDocument implements HashedDocumentInterface {\n uid: string;\n\n hash_?: string;\n\n contentHash?: string;\n\n metadataHash?: string;\n\n pageContent: string;\n\n metadata: Metadata;\n\n private keyEncoder: HashKeyEncoder = sha256;\n\n constructor(fields: HashedDocumentArgs) {\n this.uid = fields.uid;\n this.pageContent = fields.pageContent;\n this.metadata = fields.metadata;\n }\n\n makeDefaultKeyEncoder(keyEncoderFn: HashKeyEncoder): void {\n this.keyEncoder = keyEncoderFn;\n }\n\n calculateHashes(): void {\n const forbiddenKeys = [\"hash_\", \"content_hash\", \"metadata_hash\"];\n\n for (const key of forbiddenKeys) {\n if (key in this.metadata) {\n throw new Error(\n `Metadata cannot contain key ${key} as it is reserved for internal use. Restricted keys: [${forbiddenKeys.join(\n \", \"\n )}]`\n );\n }\n }\n\n const contentHash = this._hashStringToUUID(this.pageContent);\n\n try {\n const metadataHash = this._hashNestedDictToUUID(this.metadata);\n this.contentHash = contentHash;\n this.metadataHash = metadataHash;\n } catch (e) {\n throw new Error(\n `Failed to hash metadata: ${e}. Please use a dict that can be serialized using json.`\n );\n }\n\n this.hash_ = this._hashStringToUUID(this.contentHash + this.metadataHash);\n\n if (!this.uid) {\n this.uid = this.hash_;\n }\n }\n\n toDocument(): DocumentInterface {\n return new Document({\n pageContent: this.pageContent,\n metadata: this.metadata,\n });\n }\n\n static fromDocument(\n document: DocumentInterface,\n uid?: string\n ): _HashedDocument {\n const doc = new this({\n pageContent: document.pageContent,\n metadata: document.metadata,\n uid: uid || (document as DocumentInterface & { uid: string }).uid,\n });\n doc.calculateHashes();\n return doc;\n }\n\n private _hashStringToUUID(inputString: string): string {\n const hash_value = this.keyEncoder(inputString);\n return uuidv5(hash_value, UUIDV5_NAMESPACE);\n }\n\n private _hashNestedDictToUUID(data: Record<string, unknown>): string {\n const serialized_data = JSON.stringify(data, Object.keys(data).sort());\n const hash_value = this.keyEncoder(serialized_data);\n return uuidv5(hash_value, UUIDV5_NAMESPACE);\n }\n}\n\nexport type CleanupMode = \"full\" | \"incremental\";\n\nexport type IndexOptions = {\n /**\n * The number of documents to index in one batch.\n */\n batchSize?: number;\n /**\n * The cleanup mode to use. Can be \"full\", \"incremental\" or undefined.\n * - **Incremental**: Cleans up all documents that haven't been updated AND\n * that are associated with source ids that were seen\n * during indexing.\n * Clean up is done continuously during indexing helping\n * to minimize the probability of users seeing duplicated\n * content.\n * - **Full**: Delete all documents that haven to been returned by the loader.\n * Clean up runs after all documents have been indexed.\n * This means that users may see duplicated content during indexing.\n * - **undefined**: Do not delete any documents.\n */\n cleanup?: CleanupMode;\n /**\n * Optional key that helps identify the original source of the document.\n * Must either be a string representing the key of the source in the metadata\n * or a function that takes a document and returns a string representing the source.\n * **Required when cleanup is incremental**.\n */\n sourceIdKey?: StringOrDocFunc;\n /**\n * Batch size to use when cleaning up documents.\n */\n cleanupBatchSize?: number;\n /**\n * Force update documents even if they are present in the\n * record manager. Useful if you are re-indexing with updated embeddings.\n */\n forceUpdate?: boolean;\n};\n\nexport function _batch<T>(size: number, iterable: T[]): T[][] {\n const batches: T[][] = [];\n let currentBatch: T[] = [];\n\n iterable.forEach((item) => {\n currentBatch.push(item);\n\n if (currentBatch.length >= size) {\n batches.push(currentBatch);\n currentBatch = [];\n }\n });\n\n if (currentBatch.length > 0) {\n batches.push(currentBatch);\n }\n\n return batches;\n}\n\nexport function _deduplicateInOrder(\n hashedDocuments: HashedDocumentInterface[]\n): HashedDocumentInterface[] {\n const seen = new Set<string>();\n const deduplicated: HashedDocumentInterface[] = [];\n\n for (const hashedDoc of hashedDocuments) {\n if (!hashedDoc.hash_) {\n throw new Error(\"Hashed document does not have a hash\");\n }\n\n if (!seen.has(hashedDoc.hash_)) {\n seen.add(hashedDoc.hash_);\n deduplicated.push(hashedDoc);\n }\n }\n return deduplicated;\n}\n\nexport function _getSourceIdAssigner(\n sourceIdKey: StringOrDocFunc | null\n): (doc: DocumentInterface) => string | null {\n if (sourceIdKey === null) {\n return (_doc: DocumentInterface) => null;\n } else if (typeof sourceIdKey === \"string\") {\n return (doc: DocumentInterface) => doc.metadata[sourceIdKey];\n } else if (typeof sourceIdKey === \"function\") {\n return sourceIdKey;\n } else {\n throw new Error(\n `sourceIdKey should be null, a string or a function, got ${typeof sourceIdKey}`\n );\n }\n}\n\n// eslint-disable-next-line @typescript-eslint/no-explicit-any\nexport const _isBaseDocumentLoader = (arg: any): arg is BaseDocumentLoader => {\n if (\n \"load\" in arg &&\n typeof arg.load === \"function\" &&\n \"loadAndSplit\" in arg &&\n typeof arg.loadAndSplit === \"function\"\n ) {\n return true;\n }\n return false;\n};\n\ninterface IndexArgs {\n docsSource: BaseDocumentLoader | DocumentInterface[];\n recordManager: RecordManagerInterface;\n vectorStore: VectorStore;\n options?: IndexOptions;\n}\n\n/**\n * Index data from the doc source into the vector store.\n *\n * Indexing functionality uses a manager to keep track of which documents\n * are in the vector store.\n *\n * This allows us to keep track of which documents were updated, and which\n * documents were deleted, which documents should be skipped.\n *\n * For the time being, documents are indexed using their hashes, and users\n * are not able to specify the uid of the document.\n *\n * @param {IndexArgs} args\n * @param {BaseDocumentLoader | DocumentInterface[]} args.docsSource The source of documents to index. Can be a DocumentLoader or a list of Documents.\n * @param {RecordManagerInterface} args.recordManager The record manager to use for keeping track of indexed documents.\n * @param {VectorStore} args.vectorStore The vector store to use for storing the documents.\n * @param {IndexOptions | undefined} args.options Options for indexing.\n * @returns {Promise<IndexingResult>}\n */\nexport async function index(args: IndexArgs): Promise<IndexingResult> {\n const { docsSource, recordManager, vectorStore, options } = args;\n const {\n batchSize = 100,\n cleanup,\n sourceIdKey,\n cleanupBatchSize = 1000,\n forceUpdate = false,\n } = options ?? {};\n\n if (cleanup === \"incremental\" && !sourceIdKey) {\n throw new Error(\n \"sourceIdKey is required when cleanup mode is incremental. Please provide through 'options.sourceIdKey'.\"\n );\n }\n\n const docs = _isBaseDocumentLoader(docsSource)\n ? await docsSource.load()\n : docsSource;\n\n const sourceIdAssigner = _getSourceIdAssigner(sourceIdKey ?? null);\n\n const indexStartDt = await recordManager.getTime();\n let numAdded = 0;\n let numDeleted = 0;\n let numUpdated = 0;\n let numSkipped = 0;\n\n const batches = _batch<DocumentInterface>(batchSize ?? 100, docs);\n\n for (const batch of batches) {\n const hashedDocs = _deduplicateInOrder(\n batch.map((doc) => _HashedDocument.fromDocument(doc))\n );\n\n const sourceIds = hashedDocs.map((doc) => sourceIdAssigner(doc));\n\n if (cleanup === \"incremental\") {\n hashedDocs.forEach((_hashedDoc, index) => {\n const source = sourceIds[index];\n if (source === null) {\n throw new Error(\n \"sourceIdKey must be provided when cleanup is incremental\"\n );\n }\n });\n }\n\n const batchExists = await recordManager.exists(\n hashedDocs.map((doc) => doc.uid)\n );\n\n const uids: string[] = [];\n const docsToIndex: DocumentInterface[] = [];\n const docsToUpdate: string[] = [];\n const seenDocs = new Set<string>();\n hashedDocs.forEach((hashedDoc, i) => {\n const docExists = batchExists[i];\n if (docExists) {\n if (forceUpdate) {\n seenDocs.add(hashedDoc.uid);\n } else {\n docsToUpdate.push(hashedDoc.uid);\n return;\n }\n }\n uids.push(hashedDoc.uid);\n docsToIndex.push(hashedDoc.toDocument());\n });\n\n if (docsToUpdate.length > 0) {\n await recordManager.update(docsToUpdate, { timeAtLeast: indexStartDt });\n numSkipped += docsToUpdate.length;\n }\n\n if (docsToIndex.length > 0) {\n await vectorStore.addDocuments(docsToIndex, { ids: uids });\n numAdded += docsToIndex.length - seenDocs.size;\n numUpdated += seenDocs.size;\n }\n\n await recordManager.update(\n hashedDocs.map((doc) => doc.uid),\n { timeAtLeast: indexStartDt, groupIds: sourceIds }\n );\n\n if (cleanup === \"incremental\") {\n sourceIds.forEach((sourceId) => {\n if (!sourceId) throw new Error(\"Source id cannot be null\");\n });\n const uidsToDelete = await recordManager.listKeys({\n before: indexStartDt,\n groupIds: sourceIds,\n });\n\n if (uidsToDelete.length > 0) {\n await vectorStore.delete({ ids: uidsToDelete });\n await recordManager.deleteKeys(uidsToDelete);\n numDeleted += uidsToDelete.length;\n }\n }\n }\n\n if (cleanup === \"full\") {\n let uidsToDelete = await recordManager.listKeys({\n before: indexStartDt,\n limit: cleanupBatchSize,\n });\n while (uidsToDelete.length > 0) {\n await vectorStore.delete({ ids: uidsToDelete });\n await recordManager.deleteKeys(uidsToDelete);\n numDeleted += uidsToDelete.length;\n uidsToDelete = await recordManager.listKeys({\n before: indexStartDt,\n limit: cleanupBatchSize,\n });\n }\n }\n\n return {\n numAdded,\n numDeleted,\n numUpdated,\n numSkipped,\n };\n}\n"],"mappings":";;;;;;;;;;;;;AAwCA,IAAa,kBAAb,MAAgE;CAC9D;CAEA;CAEA;CAEA;CAEA;CAEA;CAEA,AAAQ,aAA6BA;CAErC,YAAYC,QAA4B;EACtC,KAAK,MAAM,OAAO;EAClB,KAAK,cAAc,OAAO;EAC1B,KAAK,WAAW,OAAO;CACxB;CAED,sBAAsBC,cAAoC;EACxD,KAAK,aAAa;CACnB;CAED,kBAAwB;EACtB,MAAM,gBAAgB;GAAC;GAAS;GAAgB;EAAgB;AAEhE,OAAK,MAAM,OAAO,cAChB,KAAI,OAAO,KAAK,SACd,OAAM,IAAI,MACR,CAAC,4BAA4B,EAAE,IAAI,uDAAuD,EAAE,cAAc,KACxG,KACD,CAAC,CAAC,CAAC;EAKV,MAAM,cAAc,KAAK,kBAAkB,KAAK,YAAY;AAE5D,MAAI;GACF,MAAM,eAAe,KAAK,sBAAsB,KAAK,SAAS;GAC9D,KAAK,cAAc;GACnB,KAAK,eAAe;EACrB,SAAQ,GAAG;AACV,SAAM,IAAI,MACR,CAAC,yBAAyB,EAAE,EAAE,sDAAsD,CAAC;EAExF;EAED,KAAK,QAAQ,KAAK,kBAAkB,KAAK,cAAc,KAAK,aAAa;AAEzE,MAAI,CAAC,KAAK,KACR,KAAK,MAAM,KAAK;CAEnB;CAED,aAAgC;AAC9B,SAAO,IAAIC,0BAAS;GAClB,aAAa,KAAK;GAClB,UAAU,KAAK;EAChB;CACF;CAED,OAAO,aACLC,UACAC,KACiB;EACjB,MAAM,MAAM,IAAI,KAAK;GACnB,aAAa,SAAS;GACtB,UAAU,SAAS;GACnB,KAAK,OAAQ,SAAiD;EAC/D;EACD,IAAI,iBAAiB;AACrB,SAAO;CACR;CAED,AAAQ,kBAAkBC,aAA6B;EACrD,MAAM,aAAa,KAAK,WAAW,YAAY;AAC/C,sBAAc,YAAYC,wCAAiB;CAC5C;CAED,AAAQ,sBAAsBC,MAAuC;EACnE,MAAM,kBAAkB,KAAK,UAAU,MAAM,OAAO,KAAK,KAAK,CAAC,MAAM,CAAC;EACtE,MAAM,aAAa,KAAK,WAAW,gBAAgB;AACnD,sBAAc,YAAYD,wCAAiB;CAC5C;AACF;AAyCD,SAAgB,OAAUE,MAAcC,UAAsB;CAC5D,MAAMC,UAAiB,CAAE;CACzB,IAAIC,eAAoB,CAAE;CAE1B,SAAS,QAAQ,CAAC,SAAS;EACzB,aAAa,KAAK,KAAK;AAEvB,MAAI,aAAa,UAAU,MAAM;GAC/B,QAAQ,KAAK,aAAa;GAC1B,eAAe,CAAE;EAClB;CACF,EAAC;AAEF,KAAI,aAAa,SAAS,GACxB,QAAQ,KAAK,aAAa;AAG5B,QAAO;AACR;AAED,SAAgB,oBACdC,iBAC2B;CAC3B,MAAM,uBAAO,IAAI;CACjB,MAAMC,eAA0C,CAAE;AAElD,MAAK,MAAM,aAAa,iBAAiB;AACvC,MAAI,CAAC,UAAU,MACb,OAAM,IAAI,MAAM;AAGlB,MAAI,CAAC,KAAK,IAAI,UAAU,MAAM,EAAE;GAC9B,KAAK,IAAI,UAAU,MAAM;GACzB,aAAa,KAAK,UAAU;EAC7B;CACF;AACD,QAAO;AACR;AAED,SAAgB,qBACdC,aAC2C;AAC3C,KAAI,gBAAgB,KAClB,QAAO,CAACC,SAA4B;UAC3B,OAAO,gBAAgB,SAChC,QAAO,CAACC,QAA2B,IAAI,SAAS;UACvC,OAAO,gBAAgB,WAChC,QAAO;KAEP,OAAM,IAAI,MACR,CAAC,wDAAwD,EAAE,OAAO,aAAa;AAGpF;AAGD,MAAa,wBAAwB,CAACC,QAAwC;AAC5E,KACE,UAAU,OACV,OAAO,IAAI,SAAS,cACpB,kBAAkB,OAClB,OAAO,IAAI,iBAAiB,WAE5B,QAAO;AAET,QAAO;AACR;;;;;;;;;;;;;;;;;;;;AA4BD,eAAsB,MAAMC,MAA0C;CACpE,MAAM,EAAE,YAAY,eAAe,aAAa,SAAS,GAAG;CAC5D,MAAM,EACJ,YAAY,KACZ,SACA,aACA,mBAAmB,KACnB,cAAc,OACf,GAAG,WAAW,CAAE;AAEjB,KAAI,YAAY,iBAAiB,CAAC,YAChC,OAAM,IAAI,MACR;CAIJ,MAAM,OAAO,sBAAsB,WAAW,GAC1C,MAAM,WAAW,MAAM,GACvB;CAEJ,MAAM,mBAAmB,qBAAqB,eAAe,KAAK;CAElE,MAAM,eAAe,MAAM,cAAc,SAAS;CAClD,IAAI,WAAW;CACf,IAAI,aAAa;CACjB,IAAI,aAAa;CACjB,IAAI,aAAa;CAEjB,MAAM,UAAU,OAA0B,aAAa,KAAK,KAAK;AAEjE,MAAK,MAAM,SAAS,SAAS;EAC3B,MAAM,aAAa,oBACjB,MAAM,IAAI,CAAC,QAAQ,gBAAgB,aAAa,IAAI,CAAC,CACtD;EAED,MAAM,YAAY,WAAW,IAAI,CAAC,QAAQ,iBAAiB,IAAI,CAAC;AAEhE,MAAI,YAAY,eACd,WAAW,QAAQ,CAAC,YAAYC,YAAU;GACxC,MAAM,SAAS,UAAUA;AACzB,OAAI,WAAW,KACb,OAAM,IAAI,MACR;EAGL,EAAC;EAGJ,MAAM,cAAc,MAAM,cAAc,OACtC,WAAW,IAAI,CAAC,QAAQ,IAAI,IAAI,CACjC;EAED,MAAMC,OAAiB,CAAE;EACzB,MAAMC,cAAmC,CAAE;EAC3C,MAAMC,eAAyB,CAAE;EACjC,MAAM,2BAAW,IAAI;EACrB,WAAW,QAAQ,CAAC,WAAW,MAAM;GACnC,MAAM,YAAY,YAAY;AAC9B,OAAI,UACF,KAAI,aACF,SAAS,IAAI,UAAU,IAAI;QACtB;IACL,aAAa,KAAK,UAAU,IAAI;AAChC;GACD;GAEH,KAAK,KAAK,UAAU,IAAI;GACxB,YAAY,KAAK,UAAU,YAAY,CAAC;EACzC,EAAC;AAEF,MAAI,aAAa,SAAS,GAAG;GAC3B,MAAM,cAAc,OAAO,cAAc,EAAE,aAAa,aAAc,EAAC;GACvE,cAAc,aAAa;EAC5B;AAED,MAAI,YAAY,SAAS,GAAG;GAC1B,MAAM,YAAY,aAAa,aAAa,EAAE,KAAK,KAAM,EAAC;GAC1D,YAAY,YAAY,SAAS,SAAS;GAC1C,cAAc,SAAS;EACxB;EAED,MAAM,cAAc,OAClB,WAAW,IAAI,CAAC,QAAQ,IAAI,IAAI,EAChC;GAAE,aAAa;GAAc,UAAU;EAAW,EACnD;AAED,MAAI,YAAY,eAAe;GAC7B,UAAU,QAAQ,CAAC,aAAa;AAC9B,QAAI,CAAC,SAAU,OAAM,IAAI,MAAM;GAChC,EAAC;GACF,MAAM,eAAe,MAAM,cAAc,SAAS;IAChD,QAAQ;IACR,UAAU;GACX,EAAC;AAEF,OAAI,aAAa,SAAS,GAAG;IAC3B,MAAM,YAAY,OAAO,EAAE,KAAK,aAAc,EAAC;IAC/C,MAAM,cAAc,WAAW,aAAa;IAC5C,cAAc,aAAa;GAC5B;EACF;CACF;AAED,KAAI,YAAY,QAAQ;EACtB,IAAI,eAAe,MAAM,cAAc,SAAS;GAC9C,QAAQ;GACR,OAAO;EACR,EAAC;AACF,SAAO,aAAa,SAAS,GAAG;GAC9B,MAAM,YAAY,OAAO,EAAE,KAAK,aAAc,EAAC;GAC/C,MAAM,cAAc,WAAW,aAAa;GAC5C,cAAc,aAAa;GAC3B,eAAe,MAAM,cAAc,SAAS;IAC1C,QAAQ;IACR,OAAO;GACR,EAAC;EACH;CACF;AAED,QAAO;EACL;EACA;EACA;EACA;CACD;AACF"}