// @langchain/core - indexing/base.cjs (compiled output)
const require_rolldown_runtime = require('../_virtual/rolldown_runtime.cjs');
const require_hash = require('../utils/js-sha256/hash.cjs');
require('../utils/hash.cjs');
const require_document = require('../documents/document.cjs');
const require_record_manager = require('./record_manager.cjs');
const uuid = require_rolldown_runtime.__toESM(require("uuid"));

//#region src/indexing/base.ts
/**
 * HashedDocument is a Document with hashes calculated.
 * Hashes are calculated based on page content and metadata.
 * It is used for indexing.
 */
var _HashedDocument = class {
  uid;
  hash_;
  contentHash;
  metadataHash;
  pageContent;
  metadata;
  keyEncoder = require_hash.sha256;
  constructor(fields) {
    this.uid = fields.uid;
    this.pageContent = fields.pageContent;
    this.metadata = fields.metadata;
  }
  makeDefaultKeyEncoder(keyEncoderFn) {
    this.keyEncoder = keyEncoderFn;
  }
  calculateHashes() {
    const forbiddenKeys = ["hash_", "content_hash", "metadata_hash"];
    for (const key of forbiddenKeys)
      if (key in this.metadata)
        throw new Error(`Metadata cannot contain key ${key} as it is reserved for internal use. Restricted keys: [${forbiddenKeys.join(", ")}]`);
    const contentHash = this._hashStringToUUID(this.pageContent);
    try {
      const metadataHash = this._hashNestedDictToUUID(this.metadata);
      this.contentHash = contentHash;
      this.metadataHash = metadataHash;
    } catch (e) {
      throw new Error(`Failed to hash metadata: ${e}. Please use a dict that can be serialized using json.`);
    }
    this.hash_ = this._hashStringToUUID(this.contentHash + this.metadataHash);
    if (!this.uid) this.uid = this.hash_;
  }
  toDocument() {
    return new require_document.Document({
      pageContent: this.pageContent,
      metadata: this.metadata
    });
  }
  static fromDocument(document, uid) {
    const doc = new this({
      pageContent: document.pageContent,
      metadata: document.metadata,
      uid: uid || document.uid
    });
    doc.calculateHashes();
    return doc;
  }
  _hashStringToUUID(inputString) {
    const hash_value = this.keyEncoder(inputString);
    return (0, uuid.v5)(hash_value, require_record_manager.UUIDV5_NAMESPACE);
  }
  _hashNestedDictToUUID(data) {
    const serialized_data = JSON.stringify(data, Object.keys(data).sort());
    const hash_value = this.keyEncoder(serialized_data);
    return (0, uuid.v5)(hash_value, require_record_manager.UUIDV5_NAMESPACE);
  }
};

// Split an iterable of documents into arrays of at most `size` items.
function _batch(size, iterable) {
  const batches = [];
  let currentBatch = [];
  iterable.forEach((item) => {
    currentBatch.push(item);
    if (currentBatch.length >= size) {
      batches.push(currentBatch);
      currentBatch = [];
    }
  });
  if (currentBatch.length > 0) batches.push(currentBatch);
  return batches;
}

// Drop documents whose hash has already been seen, preserving first-seen order.
function _deduplicateInOrder(hashedDocuments) {
  const seen = /* @__PURE__ */ new Set();
  const deduplicated = [];
  for (const hashedDoc of hashedDocuments) {
    if (!hashedDoc.hash_) throw new Error("Hashed document does not have a hash");
    if (!seen.has(hashedDoc.hash_)) {
      seen.add(hashedDoc.hash_);
      deduplicated.push(hashedDoc);
    }
  }
  return deduplicated;
}

// Build a function that resolves a document's source id: from a metadata key
// (string), from a user-supplied function, or always null.
function _getSourceIdAssigner(sourceIdKey) {
  if (sourceIdKey === null) return (_doc) => null;
  else if (typeof sourceIdKey === "string") return (doc) => doc.metadata[sourceIdKey];
  else if (typeof sourceIdKey === "function") return sourceIdKey;
  else throw new Error(`sourceIdKey should be null, a string or a function, got ${typeof sourceIdKey}`);
}

// Duck-type check: treat anything with load() and loadAndSplit() methods as a BaseDocumentLoader.
const _isBaseDocumentLoader = (arg) => {
  if ("load" in arg && typeof arg.load === "function" && "loadAndSplit" in arg && typeof arg.loadAndSplit === "function") return true;
  return false;
};
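/*
 * Illustrative sketch only (not part of the compiled module): how the hashing
 * helpers above behave. It assumes the package's "@langchain/core/documents"
 * and "@langchain/core/indexing" entrypoints re-export these symbols; adjust
 * the require paths to your setup.
 *
 *   const { Document } = require("@langchain/core/documents");
 *   const { _HashedDocument, _deduplicateInOrder } = require("@langchain/core/indexing");
 *
 *   const a = _HashedDocument.fromDocument(
 *     new Document({ pageContent: "hello", metadata: { source: "a.txt" } })
 *   );
 *   const b = _HashedDocument.fromDocument(
 *     new Document({ pageContent: "hello", metadata: { source: "a.txt" } })
 *   );
 *
 *   // Identical pageContent + metadata hash to the same UUIDv5, so both copies
 *   // share a uid and _deduplicateInOrder keeps only the first one.
 *   console.log(a.uid === b.uid); // true
 *   console.log(_deduplicateInOrder([a, b]).length); // 1
 */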
/**
 * Index data from the doc source into the vector store.
 *
 * Indexing functionality uses a manager to keep track of which documents
 * are in the vector store.
 *
 * This allows us to keep track of which documents were updated, which
 * documents were deleted, and which documents should be skipped.
 *
 * For the time being, documents are indexed using their hashes, and users
 * are not able to specify the uid of the document.
 *
 * @param {IndexArgs} args
 * @param {BaseDocumentLoader | DocumentInterface[]} args.docsSource The source of documents to index. Can be a DocumentLoader or a list of Documents.
 * @param {RecordManagerInterface} args.recordManager The record manager to use for keeping track of indexed documents.
 * @param {VectorStore} args.vectorStore The vector store to use for storing the documents.
 * @param {IndexOptions | undefined} args.options Options for indexing.
 * @returns {Promise<IndexingResult>}
 */
async function index(args) {
  const { docsSource, recordManager, vectorStore, options } = args;
  const { batchSize = 100, cleanup, sourceIdKey, cleanupBatchSize = 1e3, forceUpdate = false } = options ?? {};
  if (cleanup === "incremental" && !sourceIdKey)
    throw new Error("sourceIdKey is required when cleanup mode is incremental. Please provide through 'options.sourceIdKey'.");
  const docs = _isBaseDocumentLoader(docsSource) ? await docsSource.load() : docsSource;
  const sourceIdAssigner = _getSourceIdAssigner(sourceIdKey ?? null);
  const indexStartDt = await recordManager.getTime();
  let numAdded = 0;
  let numDeleted = 0;
  let numUpdated = 0;
  let numSkipped = 0;
  const batches = _batch(batchSize ?? 100, docs);
  for (const batch of batches) {
    // Hash and deduplicate the batch, then resolve each document's source id.
    const hashedDocs = _deduplicateInOrder(batch.map((doc) => _HashedDocument.fromDocument(doc)));
    const sourceIds = hashedDocs.map((doc) => sourceIdAssigner(doc));
    if (cleanup === "incremental")
      hashedDocs.forEach((_hashedDoc, index$1) => {
        const source = sourceIds[index$1];
        if (source === null) throw new Error("sourceIdKey must be provided when cleanup is incremental");
      });
    // Ask the record manager which uids in this batch have already been indexed.
    const batchExists = await recordManager.exists(hashedDocs.map((doc) => doc.uid));
    const uids = [];
    const docsToIndex = [];
    const docsToUpdate = [];
    const seenDocs = /* @__PURE__ */ new Set();
    hashedDocs.forEach((hashedDoc, i) => {
      const docExists = batchExists[i];
      if (docExists) {
        if (forceUpdate) {
          seenDocs.add(hashedDoc.uid);
        } else {
          // Already indexed and unchanged: only refresh its record timestamp below.
          docsToUpdate.push(hashedDoc.uid);
          return;
        }
      }
      uids.push(hashedDoc.uid);
      docsToIndex.push(hashedDoc.toDocument());
    });
    if (docsToUpdate.length > 0) {
      await recordManager.update(docsToUpdate, { timeAtLeast: indexStartDt });
      numSkipped += docsToUpdate.length;
    }
    if (docsToIndex.length > 0) {
      await vectorStore.addDocuments(docsToIndex, { ids: uids });
      numAdded += docsToIndex.length - seenDocs.size;
      numUpdated += seenDocs.size;
    }
    // Record (or refresh) every uid in this batch, grouped by source id.
    await recordManager.update(hashedDocs.map((doc) => doc.uid), {
      timeAtLeast: indexStartDt,
      groupIds: sourceIds
    });
    if (cleanup === "incremental") {
      sourceIds.forEach((sourceId) => {
        if (!sourceId) throw new Error("Source id cannot be null");
      });
      // Delete records from the same sources that were not touched in this run.
      const uidsToDelete = await recordManager.listKeys({
        before: indexStartDt,
        groupIds: sourceIds
      });
      if (uidsToDelete.length > 0) {
        await vectorStore.delete({ ids: uidsToDelete });
        await recordManager.deleteKeys(uidsToDelete);
        numDeleted += uidsToDelete.length;
      }
    }
  }
  if (cleanup === "full") {
    // Delete everything not touched in this run, in batches of cleanupBatchSize.
    let uidsToDelete = await recordManager.listKeys({ before: indexStartDt, limit: cleanupBatchSize });
    while (uidsToDelete.length > 0) {
      await vectorStore.delete({ ids: uidsToDelete });
      await recordManager.deleteKeys(uidsToDelete);
      numDeleted += uidsToDelete.length;
      uidsToDelete = await recordManager.listKeys({ before: indexStartDt, limit: cleanupBatchSize });
    }
  }
  return { numAdded, numDeleted, numUpdated, numSkipped };
}

//#endregion
exports._HashedDocument = _HashedDocument;
exports._batch = _batch;
exports._deduplicateInOrder = _deduplicateInOrder;
exports._getSourceIdAssigner = _getSourceIdAssigner;
exports._isBaseDocumentLoader = _isBaseDocumentLoader;
exports.index = index;
//# sourceMappingURL=base.cjs.map
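/*
 * Minimal usage sketch, appended for illustration only (not part of the
 * compiled module). It assumes "@langchain/core/indexing" re-exports index(),
 * and that `recordManager` and `vectorStore` are whichever concrete
 * RecordManagerInterface / VectorStore implementations you already use; only
 * the shape of the call mirrors the function above.
 *
 *   const { index } = require("@langchain/core/indexing");
 *
 *   // inside an async function:
 *   const result = await index({
 *     docsSource: docs,          // DocumentInterface[] or a BaseDocumentLoader
 *     recordManager,             // tracks which uids have been indexed
 *     vectorStore,               // receives the deduplicated documents
 *     options: {
 *       cleanup: "incremental",  // or "full", or omit for no cleanup
 *       sourceIdKey: "source",   // required when cleanup is "incremental"
 *       batchSize: 100,
 *     },
 *   });
 *   console.log(result); // -> { numAdded, numDeleted, numUpdated, numSkipped }
 */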