UNPKG

@langchain/core

Version:
1 lines 15.5 kB
{"version":3,"file":"base.cjs","names":["sha256","Document","UUIDV5_NAMESPACE"],"sources":["../../src/indexing/base.ts"],"sourcesContent":["import { v5 as uuidv5 } from \"uuid\";\nimport { VectorStore } from \"../vectorstores.js\";\nimport { RecordManagerInterface, UUIDV5_NAMESPACE } from \"./record_manager.js\";\nimport { sha256, type HashKeyEncoder } from \"../utils/hash.js\";\nimport { DocumentInterface, Document } from \"../documents/document.js\";\nimport { BaseDocumentLoader } from \"../document_loaders/base.js\";\n\ntype Metadata = Record<string, unknown>;\n\ntype IndexingResult = {\n numAdded: number;\n numDeleted: number;\n numUpdated: number;\n numSkipped: number;\n};\n\ntype StringOrDocFunc = string | ((doc: DocumentInterface) => string);\n\nexport interface HashedDocumentInterface extends DocumentInterface {\n uid: string;\n hash_?: string;\n contentHash?: string;\n metadataHash?: string;\n pageContent: string;\n metadata: Metadata;\n calculateHashes(): void;\n toDocument(): DocumentInterface;\n}\n\ninterface HashedDocumentArgs {\n pageContent: string;\n metadata: Metadata;\n uid: string;\n}\n\n/**\n * HashedDocument is a Document with hashes calculated.\n * Hashes are calculated based on page content and metadata.\n * It is used for indexing.\n */\nexport class _HashedDocument implements HashedDocumentInterface {\n uid: string;\n\n hash_?: string;\n\n contentHash?: string;\n\n metadataHash?: string;\n\n pageContent: string;\n\n metadata: Metadata;\n\n private keyEncoder: HashKeyEncoder = sha256;\n\n constructor(fields: HashedDocumentArgs) {\n this.uid = fields.uid;\n this.pageContent = fields.pageContent;\n this.metadata = fields.metadata;\n }\n\n makeDefaultKeyEncoder(keyEncoderFn: HashKeyEncoder): void {\n this.keyEncoder = keyEncoderFn;\n }\n\n calculateHashes(): void {\n const forbiddenKeys = [\"hash_\", \"content_hash\", \"metadata_hash\"];\n\n for (const key of forbiddenKeys) {\n if (key in this.metadata) {\n throw new Error(\n `Metadata cannot contain key ${key} as it is reserved for internal use. Restricted keys: [${forbiddenKeys.join(\n \", \"\n )}]`\n );\n }\n }\n\n const contentHash = this._hashStringToUUID(this.pageContent);\n\n try {\n const metadataHash = this._hashNestedDictToUUID(this.metadata);\n this.contentHash = contentHash;\n this.metadataHash = metadataHash;\n } catch (e) {\n throw new Error(\n `Failed to hash metadata: ${e}. Please use a dict that can be serialized using json.`\n );\n }\n\n this.hash_ = this._hashStringToUUID(this.contentHash + this.metadataHash);\n\n if (!this.uid) {\n this.uid = this.hash_;\n }\n }\n\n toDocument(): DocumentInterface {\n return new Document({\n pageContent: this.pageContent,\n metadata: this.metadata,\n });\n }\n\n static fromDocument(\n document: DocumentInterface,\n uid?: string\n ): _HashedDocument {\n const doc = new this({\n pageContent: document.pageContent,\n metadata: document.metadata,\n uid: uid || (document as DocumentInterface & { uid: string }).uid,\n });\n doc.calculateHashes();\n return doc;\n }\n\n private _hashStringToUUID(inputString: string): string {\n const hash_value = this.keyEncoder(inputString);\n return uuidv5(hash_value, UUIDV5_NAMESPACE);\n }\n\n private _hashNestedDictToUUID(data: Record<string, unknown>): string {\n const serialized_data = JSON.stringify(data, Object.keys(data).sort());\n const hash_value = this.keyEncoder(serialized_data);\n return uuidv5(hash_value, UUIDV5_NAMESPACE);\n }\n}\n\nexport type CleanupMode = \"full\" | \"incremental\";\n\nexport type IndexOptions = {\n /**\n * The number of documents to index in one batch.\n */\n batchSize?: number;\n /**\n * The cleanup mode to use. Can be \"full\", \"incremental\" or undefined.\n * - **Incremental**: Cleans up all documents that haven't been updated AND\n * that are associated with source ids that were seen\n * during indexing.\n * Clean up is done continuously during indexing helping\n * to minimize the probability of users seeing duplicated\n * content.\n * - **Full**: Delete all documents that haven to been returned by the loader.\n * Clean up runs after all documents have been indexed.\n * This means that users may see duplicated content during indexing.\n * - **undefined**: Do not delete any documents.\n */\n cleanup?: CleanupMode;\n /**\n * Optional key that helps identify the original source of the document.\n * Must either be a string representing the key of the source in the metadata\n * or a function that takes a document and returns a string representing the source.\n * **Required when cleanup is incremental**.\n */\n sourceIdKey?: StringOrDocFunc;\n /**\n * Batch size to use when cleaning up documents.\n */\n cleanupBatchSize?: number;\n /**\n * Force update documents even if they are present in the\n * record manager. Useful if you are re-indexing with updated embeddings.\n */\n forceUpdate?: boolean;\n};\n\nexport function _batch<T>(size: number, iterable: T[]): T[][] {\n const batches: T[][] = [];\n let currentBatch: T[] = [];\n\n iterable.forEach((item) => {\n currentBatch.push(item);\n\n if (currentBatch.length >= size) {\n batches.push(currentBatch);\n currentBatch = [];\n }\n });\n\n if (currentBatch.length > 0) {\n batches.push(currentBatch);\n }\n\n return batches;\n}\n\nexport function _deduplicateInOrder(\n hashedDocuments: HashedDocumentInterface[]\n): HashedDocumentInterface[] {\n const seen = new Set<string>();\n const deduplicated: HashedDocumentInterface[] = [];\n\n for (const hashedDoc of hashedDocuments) {\n if (!hashedDoc.hash_) {\n throw new Error(\"Hashed document does not have a hash\");\n }\n\n if (!seen.has(hashedDoc.hash_)) {\n seen.add(hashedDoc.hash_);\n deduplicated.push(hashedDoc);\n }\n }\n return deduplicated;\n}\n\nexport function _getSourceIdAssigner(\n sourceIdKey: StringOrDocFunc | null\n): (doc: DocumentInterface) => string | null {\n if (sourceIdKey === null) {\n return (_doc: DocumentInterface) => null;\n } else if (typeof sourceIdKey === \"string\") {\n return (doc: DocumentInterface) => doc.metadata[sourceIdKey];\n } else if (typeof sourceIdKey === \"function\") {\n return sourceIdKey;\n } else {\n throw new Error(\n `sourceIdKey should be null, a string or a function, got ${typeof sourceIdKey}`\n );\n }\n}\n\n// eslint-disable-next-line @typescript-eslint/no-explicit-any\nexport const _isBaseDocumentLoader = (arg: any): arg is BaseDocumentLoader => {\n if (\n \"load\" in arg &&\n typeof arg.load === \"function\" &&\n \"loadAndSplit\" in arg &&\n typeof arg.loadAndSplit === \"function\"\n ) {\n return true;\n }\n return false;\n};\n\ninterface IndexArgs {\n docsSource: BaseDocumentLoader | DocumentInterface[];\n recordManager: RecordManagerInterface;\n vectorStore: VectorStore;\n options?: IndexOptions;\n}\n\n/**\n * Index data from the doc source into the vector store.\n *\n * Indexing functionality uses a manager to keep track of which documents\n * are in the vector store.\n *\n * This allows us to keep track of which documents were updated, and which\n * documents were deleted, which documents should be skipped.\n *\n * For the time being, documents are indexed using their hashes, and users\n * are not able to specify the uid of the document.\n *\n * @param {IndexArgs} args\n * @param {BaseDocumentLoader | DocumentInterface[]} args.docsSource The source of documents to index. Can be a DocumentLoader or a list of Documents.\n * @param {RecordManagerInterface} args.recordManager The record manager to use for keeping track of indexed documents.\n * @param {VectorStore} args.vectorStore The vector store to use for storing the documents.\n * @param {IndexOptions | undefined} args.options Options for indexing.\n * @returns {Promise<IndexingResult>}\n */\nexport async function index(args: IndexArgs): Promise<IndexingResult> {\n const { docsSource, recordManager, vectorStore, options } = args;\n const {\n batchSize = 100,\n cleanup,\n sourceIdKey,\n cleanupBatchSize = 1000,\n forceUpdate = false,\n } = options ?? {};\n\n if (cleanup === \"incremental\" && !sourceIdKey) {\n throw new Error(\n \"sourceIdKey is required when cleanup mode is incremental. Please provide through 'options.sourceIdKey'.\"\n );\n }\n\n const docs = _isBaseDocumentLoader(docsSource)\n ? await docsSource.load()\n : docsSource;\n\n const sourceIdAssigner = _getSourceIdAssigner(sourceIdKey ?? null);\n\n const indexStartDt = await recordManager.getTime();\n let numAdded = 0;\n let numDeleted = 0;\n let numUpdated = 0;\n let numSkipped = 0;\n\n const batches = _batch<DocumentInterface>(batchSize ?? 100, docs);\n\n for (const batch of batches) {\n const hashedDocs = _deduplicateInOrder(\n batch.map((doc) => _HashedDocument.fromDocument(doc))\n );\n\n const sourceIds = hashedDocs.map((doc) => sourceIdAssigner(doc));\n\n if (cleanup === \"incremental\") {\n hashedDocs.forEach((_hashedDoc, index) => {\n const source = sourceIds[index];\n if (source === null) {\n throw new Error(\n \"sourceIdKey must be provided when cleanup is incremental\"\n );\n }\n });\n }\n\n const batchExists = await recordManager.exists(\n hashedDocs.map((doc) => doc.uid)\n );\n\n const uids: string[] = [];\n const docsToIndex: DocumentInterface[] = [];\n const docsToUpdate: string[] = [];\n const seenDocs = new Set<string>();\n hashedDocs.forEach((hashedDoc, i) => {\n const docExists = batchExists[i];\n if (docExists) {\n if (forceUpdate) {\n seenDocs.add(hashedDoc.uid);\n } else {\n docsToUpdate.push(hashedDoc.uid);\n return;\n }\n }\n uids.push(hashedDoc.uid);\n docsToIndex.push(hashedDoc.toDocument());\n });\n\n if (docsToUpdate.length > 0) {\n await recordManager.update(docsToUpdate, { timeAtLeast: indexStartDt });\n numSkipped += docsToUpdate.length;\n }\n\n if (docsToIndex.length > 0) {\n await vectorStore.addDocuments(docsToIndex, { ids: uids });\n numAdded += docsToIndex.length - seenDocs.size;\n numUpdated += seenDocs.size;\n }\n\n await recordManager.update(\n hashedDocs.map((doc) => doc.uid),\n { timeAtLeast: indexStartDt, groupIds: sourceIds }\n );\n\n if (cleanup === \"incremental\") {\n sourceIds.forEach((sourceId) => {\n if (!sourceId) throw new Error(\"Source id cannot be null\");\n });\n const uidsToDelete = await recordManager.listKeys({\n before: indexStartDt,\n groupIds: sourceIds,\n });\n\n if (uidsToDelete.length > 0) {\n await vectorStore.delete({ ids: uidsToDelete });\n await recordManager.deleteKeys(uidsToDelete);\n numDeleted += uidsToDelete.length;\n }\n }\n }\n\n if (cleanup === \"full\") {\n let uidsToDelete = await recordManager.listKeys({\n before: indexStartDt,\n limit: cleanupBatchSize,\n });\n while (uidsToDelete.length > 0) {\n await vectorStore.delete({ ids: uidsToDelete });\n await recordManager.deleteKeys(uidsToDelete);\n numDeleted += uidsToDelete.length;\n uidsToDelete = await recordManager.listKeys({\n before: indexStartDt,\n limit: cleanupBatchSize,\n });\n }\n }\n\n return {\n numAdded,\n numDeleted,\n numUpdated,\n numSkipped,\n };\n}\n"],"mappings":";;;;;;;;;;;;;AAwCA,IAAa,kBAAb,MAAgE;CAC9D;CAEA;CAEA;CAEA;CAEA;CAEA;CAEA,AAAQ,aAA6BA;CAErC,YAAY,QAA4B;AACtC,OAAK,MAAM,OAAO;AAClB,OAAK,cAAc,OAAO;AAC1B,OAAK,WAAW,OAAO;;CAGzB,sBAAsB,cAAoC;AACxD,OAAK,aAAa;;CAGpB,kBAAwB;EACtB,MAAM,gBAAgB;GAAC;GAAS;GAAgB;GAAgB;AAEhE,OAAK,MAAM,OAAO,cAChB,KAAI,OAAO,KAAK,SACd,OAAM,IAAI,MACR,+BAA+B,IAAI,yDAAyD,cAAc,KACxG,KACD,CAAC,GACH;EAIL,MAAM,cAAc,KAAK,kBAAkB,KAAK,YAAY;AAE5D,MAAI;GACF,MAAM,eAAe,KAAK,sBAAsB,KAAK,SAAS;AAC9D,QAAK,cAAc;AACnB,QAAK,eAAe;WACb,GAAG;AACV,SAAM,IAAI,MACR,4BAA4B,EAAE,wDAC/B;;AAGH,OAAK,QAAQ,KAAK,kBAAkB,KAAK,cAAc,KAAK,aAAa;AAEzE,MAAI,CAAC,KAAK,IACR,MAAK,MAAM,KAAK;;CAIpB,aAAgC;AAC9B,SAAO,IAAIC,0BAAS;GAClB,aAAa,KAAK;GAClB,UAAU,KAAK;GAChB,CAAC;;CAGJ,OAAO,aACL,UACA,KACiB;EACjB,MAAM,MAAM,IAAI,KAAK;GACnB,aAAa,SAAS;GACtB,UAAU,SAAS;GACnB,KAAK,OAAQ,SAAiD;GAC/D,CAAC;AACF,MAAI,iBAAiB;AACrB,SAAO;;CAGT,AAAQ,kBAAkB,aAA6B;AAErD,sBADmB,KAAK,WAAW,YAAY,EACrBC,wCAAiB;;CAG7C,AAAQ,sBAAsB,MAAuC;EACnE,MAAM,kBAAkB,KAAK,UAAU,MAAM,OAAO,KAAK,KAAK,CAAC,MAAM,CAAC;AAEtE,sBADmB,KAAK,WAAW,gBAAgB,EACzBA,wCAAiB;;;AA2C/C,SAAgB,OAAU,MAAc,UAAsB;CAC5D,MAAM,UAAiB,EAAE;CACzB,IAAI,eAAoB,EAAE;AAE1B,UAAS,SAAS,SAAS;AACzB,eAAa,KAAK,KAAK;AAEvB,MAAI,aAAa,UAAU,MAAM;AAC/B,WAAQ,KAAK,aAAa;AAC1B,kBAAe,EAAE;;GAEnB;AAEF,KAAI,aAAa,SAAS,EACxB,SAAQ,KAAK,aAAa;AAG5B,QAAO;;AAGT,SAAgB,oBACd,iBAC2B;CAC3B,MAAM,uBAAO,IAAI,KAAa;CAC9B,MAAM,eAA0C,EAAE;AAElD,MAAK,MAAM,aAAa,iBAAiB;AACvC,MAAI,CAAC,UAAU,MACb,OAAM,IAAI,MAAM,uCAAuC;AAGzD,MAAI,CAAC,KAAK,IAAI,UAAU,MAAM,EAAE;AAC9B,QAAK,IAAI,UAAU,MAAM;AACzB,gBAAa,KAAK,UAAU;;;AAGhC,QAAO;;AAGT,SAAgB,qBACd,aAC2C;AAC3C,KAAI,gBAAgB,KAClB,SAAQ,SAA4B;UAC3B,OAAO,gBAAgB,SAChC,SAAQ,QAA2B,IAAI,SAAS;UACvC,OAAO,gBAAgB,WAChC,QAAO;KAEP,OAAM,IAAI,MACR,2DAA2D,OAAO,cACnE;;AAKL,MAAa,yBAAyB,QAAwC;AAC5E,KACE,UAAU,OACV,OAAO,IAAI,SAAS,cACpB,kBAAkB,OAClB,OAAO,IAAI,iBAAiB,WAE5B,QAAO;AAET,QAAO;;;;;;;;;;;;;;;;;;;;;AA6BT,eAAsB,MAAM,MAA0C;CACpE,MAAM,EAAE,YAAY,eAAe,aAAa,YAAY;CAC5D,MAAM,EACJ,YAAY,KACZ,SACA,aACA,mBAAmB,KACnB,cAAc,UACZ,WAAW,EAAE;AAEjB,KAAI,YAAY,iBAAiB,CAAC,YAChC,OAAM,IAAI,MACR,0GACD;CAGH,MAAM,OAAO,sBAAsB,WAAW,GAC1C,MAAM,WAAW,MAAM,GACvB;CAEJ,MAAM,mBAAmB,qBAAqB,eAAe,KAAK;CAElE,MAAM,eAAe,MAAM,cAAc,SAAS;CAClD,IAAI,WAAW;CACf,IAAI,aAAa;CACjB,IAAI,aAAa;CACjB,IAAI,aAAa;CAEjB,MAAM,UAAU,OAA0B,aAAa,KAAK,KAAK;AAEjE,MAAK,MAAM,SAAS,SAAS;EAC3B,MAAM,aAAa,oBACjB,MAAM,KAAK,QAAQ,gBAAgB,aAAa,IAAI,CAAC,CACtD;EAED,MAAM,YAAY,WAAW,KAAK,QAAQ,iBAAiB,IAAI,CAAC;AAEhE,MAAI,YAAY,cACd,YAAW,SAAS,YAAY,UAAU;AAExC,OADe,UAAU,WACV,KACb,OAAM,IAAI,MACR,2DACD;IAEH;EAGJ,MAAM,cAAc,MAAM,cAAc,OACtC,WAAW,KAAK,QAAQ,IAAI,IAAI,CACjC;EAED,MAAM,OAAiB,EAAE;EACzB,MAAM,cAAmC,EAAE;EAC3C,MAAM,eAAyB,EAAE;EACjC,MAAM,2BAAW,IAAI,KAAa;AAClC,aAAW,SAAS,WAAW,MAAM;AAEnC,OADkB,YAAY,GAE5B,KAAI,YACF,UAAS,IAAI,UAAU,IAAI;QACtB;AACL,iBAAa,KAAK,UAAU,IAAI;AAChC;;AAGJ,QAAK,KAAK,UAAU,IAAI;AACxB,eAAY,KAAK,UAAU,YAAY,CAAC;IACxC;AAEF,MAAI,aAAa,SAAS,GAAG;AAC3B,SAAM,cAAc,OAAO,cAAc,EAAE,aAAa,cAAc,CAAC;AACvE,iBAAc,aAAa;;AAG7B,MAAI,YAAY,SAAS,GAAG;AAC1B,SAAM,YAAY,aAAa,aAAa,EAAE,KAAK,MAAM,CAAC;AAC1D,eAAY,YAAY,SAAS,SAAS;AAC1C,iBAAc,SAAS;;AAGzB,QAAM,cAAc,OAClB,WAAW,KAAK,QAAQ,IAAI,IAAI,EAChC;GAAE,aAAa;GAAc,UAAU;GAAW,CACnD;AAED,MAAI,YAAY,eAAe;AAC7B,aAAU,SAAS,aAAa;AAC9B,QAAI,CAAC,SAAU,OAAM,IAAI,MAAM,2BAA2B;KAC1D;GACF,MAAM,eAAe,MAAM,cAAc,SAAS;IAChD,QAAQ;IACR,UAAU;IACX,CAAC;AAEF,OAAI,aAAa,SAAS,GAAG;AAC3B,UAAM,YAAY,OAAO,EAAE,KAAK,cAAc,CAAC;AAC/C,UAAM,cAAc,WAAW,aAAa;AAC5C,kBAAc,aAAa;;;;AAKjC,KAAI,YAAY,QAAQ;EACtB,IAAI,eAAe,MAAM,cAAc,SAAS;GAC9C,QAAQ;GACR,OAAO;GACR,CAAC;AACF,SAAO,aAAa,SAAS,GAAG;AAC9B,SAAM,YAAY,OAAO,EAAE,KAAK,cAAc,CAAC;AAC/C,SAAM,cAAc,WAAW,aAAa;AAC5C,iBAAc,aAAa;AAC3B,kBAAe,MAAM,cAAc,SAAS;IAC1C,QAAQ;IACR,OAAO;IACR,CAAC;;;AAIN,QAAO;EACL;EACA;EACA;EACA;EACD"}