UNPKG

mongodb-rag-core

Version:

Common elements used by MongoDB Chatbot Framework components.

103 lines 4.75 kB
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.updateEmbeddedContentForPage = exports.updateEmbeddedContent = void 0;
const crypto_1 = require("crypto");
const promise_pool_1 = require("@supercharge/promise-pool");
const chunk_1 = require("../chunk");
const logger_1 = require("../logger");
/**
  (Re-)embeds the pages in the page store that have changed since the given
  date and stores the resulting embedded content in the embedded content store.

  Pages whose action is "deleted" have their embedded content deleted; pages
  whose action is "created" or "updated" are chunked and embedded via
  `updateEmbeddedContentForPage`. Pages with any other action are ignored.

  A failure on one page does not stop the remaining pages from being
  processed. Each failure is logged as a warning once the pool has drained.

  @param options.since - Passed to `pageStore.loadPages` as `updated`.
  @param options.embeddedContentStore - Store the embedded content is written to / deleted from.
  @param options.pageStore - Store the changed pages are loaded from.
  @param options.sourceNames - Optional list of sources to restrict the page query to.
  @param options.embedder - Embedder handed through to `updateEmbeddedContentForPage`.
  @param options.chunkOptions - Chunking options handed through to `updateEmbeddedContentForPage`.
  @param options.concurrencyOptions - Optional; `processPages` sets how many pages are processed at once (default 1).
 */
const updateEmbeddedContent = async ({ since, embeddedContentStore, pageStore, sourceNames, embedder, chunkOptions, concurrencyOptions, }) => {
    const changedPages = await pageStore.loadPages({
        updated: since,
        sources: sourceNames,
    });
    logger_1.logger.info(`Found ${changedPages.length} changed pages since ${since}${sourceNames ? ` in sources: ${sourceNames.join(", ")}` : ""}`);
    // PromisePool does not reject when a task throws: it collects the failures
    // in `errors` and keeps going. Capture them so they are not lost.
    const { errors } = await promise_pool_1.PromisePool.withConcurrency(concurrencyOptions?.processPages ?? 1)
        .for(changedPages)
        .process(async (page) => {
        switch (page.action) {
            case "deleted":
                logger_1.logger.info(`Deleting embedded content for ${page.sourceName}: ${page.url}`);
                await embeddedContentStore.deleteEmbeddedContent({
                    page,
                });
                break;
            case "created": // fallthrough
            case "updated":
                await (0, exports.updateEmbeddedContentForPage)({
                    store: embeddedContentStore,
                    page,
                    chunkOptions,
                    embedder,
                    concurrencyOptions,
                });
        }
    });
    // Surface per-page failures instead of silently discarding them. The
    // function still resolves so one bad page does not abort the whole run.
    for (const error of errors) {
        const failedPage = error.item;
        logger_1.logger.warn(`Failed to update embedded content for ${failedPage?.sourceName}: ${failedPage?.url} - ${error.message}`);
    }
};
exports.updateEmbeddedContent = updateEmbeddedContent;
// Cache of previously computed digests, keyed by the exact string that was hashed.
const chunkAlgoHashes = new Map();
/**
  Returns a hex SHA-256 digest of the JSON-serialized options concatenated
  with the function's source text (`f.toString()`). The digest changes when
  either the options or the function source change. Results are memoized in
  `chunkAlgoHashes`.

  @param f - Function whose source text is part of the hashed data.
  @param o - Options object; `undefined`/`null` is hashed as `{}`.
 */
const getHashForFunc = (f, o) => {
    const data = JSON.stringify(o ?? {}) + f.toString();
    const existingHash = chunkAlgoHashes.get(data);
    if (existingHash) {
        return existingHash;
    }
    const hash = (0, crypto_1.createHash)("sha256");
    hash.update(data);
    const digest = hash.digest("hex");
    chunkAlgoHashes.set(data, digest);
    return digest;
};
/**
  Chunks a single page, embeds every chunk, and replaces the page's embedded
  content in the store.

  - If chunking yields no chunks, any existing embedded content for the page
    is deleted and the function returns.
  - If the store already holds embedded content that is newer than the page,
    has the same number of chunks, and carries the same chunkAlgoHash, the
    page is skipped.
  - Otherwise every chunk is embedded and the result is written with
    `store.updateEmbeddedContent`.

  @param options.page - Page to embed (reads `sourceName`, `url`, `updated`, `action`).
  @param options.store - Embedded content store; `store.metadata.embeddingName` is the key the embedding is stored under.
  @param options.embedder - Object whose `embed({ text })` resolves to `{ embedding }`.
  @param options.chunkOptions - Options passed to `chunkPage`; also part of the chunkAlgoHash.
  @param options.concurrencyOptions - Optional; `createChunks` sets how many chunks are embedded at once (default 1).
  @throws Error if embedding fails for any chunk. Nothing is written to the
    store in that case, so existing content is never replaced by a partial set.
 */
const updateEmbeddedContentForPage = async ({ page, store, embedder, chunkOptions, concurrencyOptions, }) => {
    const contentChunks = await (0, chunk_1.chunkPage)(page, chunkOptions);
    if (contentChunks.length === 0) {
        // This could happen if source returned a page with no content
        logger_1.logger.warn(`No content for page ${page.sourceName}:${page.url} - deleting any existing content and continuing`);
        await store.deleteEmbeddedContent({ page });
        return;
    }
    // In order to resume where we left off (in case of script restart), compare
    // the date of any existing chunks with the page updated date. If the chunks
    // have been updated since the page was updated (and we have the expected
    // number of chunks) and chunkAlgoHash has not changed from what's in the
    // database, assume the embedded content for that page is complete and
    // up-to-date. To force an update, you can delete the chunks from the
    // collection.
    const existingContent = await store.loadEmbeddedContent({
        page,
    });
    const chunkAlgoHash = getHashForFunc(chunk_1.chunkPage, chunkOptions);
    if (existingContent.length &&
        existingContent[0].updated > page.updated &&
        contentChunks.length === existingContent.length &&
        existingContent[0].chunkAlgoHash === chunkAlgoHash) {
        logger_1.logger.info(`Embedded content for ${page.sourceName}:${page.url} already updated (${existingContent[0].updated}) since page update date (${page.updated}). Skipping embedding.`);
        return;
    }
    logger_1.logger.info(`${page.action === "created" ? "Creating" : "Updating"} embedded content for ${page.sourceName}:${page.url}`);
    const { results: embeddedContent, errors } = await promise_pool_1.PromisePool.withConcurrency(concurrencyOptions?.createChunks ?? 1)
        .for(contentChunks)
        .process(async (chunk, index) => {
        logger_1.logger.info(`Vectorizing chunk ${index + 1}/${contentChunks.length} for ${page.sourceName}: ${page.url}`);
        const { embedding } = await embedder.embed({
            text: chunk.text,
        });
        return {
            ...chunk,
            embeddings: {
                [store.metadata.embeddingName]: embedding,
            },
            updated: new Date(),
            chunkAlgoHash,
        };
    });
    // PromisePool collects task failures in `errors` and omits the failed
    // items from `results`. Writing `results` as-is would store only a subset
    // of the page's chunks, so fail loudly before touching the store.
    if (errors.length > 0) {
        throw new Error(`Failed to embed ${errors.length}/${contentChunks.length} chunks for ${page.sourceName}:${page.url}: ${errors[0].message}`, { cause: errors[0] });
    }
    await store.updateEmbeddedContent({
        page,
        embeddedContent,
    });
};
exports.updateEmbeddedContentForPage = updateEmbeddedContentForPage;
//# sourceMappingURL=updateEmbeddedContent.js.map