mongodb-rag-core
Version:
Common elements used by MongoDB Chatbot Framework components.
103 lines • 4.75 kB
JavaScript
;
Object.defineProperty(exports, "__esModule", { value: true });
exports.updateEmbeddedContentForPage = exports.updateEmbeddedContent = void 0;
const crypto_1 = require("crypto");
const promise_pool_1 = require("@supercharge/promise-pool");
const chunk_1 = require("../chunk");
const logger_1 = require("../logger");
/**
  (Re-)embeds the pages in the page store that have changed since the given
  date and stores the resulting embedded content in the embedded content store.

  Pages whose action is "deleted" have their embedded content removed. Pages
  whose action is "created" or "updated" are handed to
  `updateEmbeddedContentForPage`. Pages with any other action are ignored.

  Pages are processed through a PromisePool, which collects per-page failures
  instead of rejecting. Each collected failure is logged as a warning so that
  it is not lost; the remaining pages are still processed and the returned
  promise still resolves.

  @param since - Passed to `pageStore.loadPages` as the `updated` filter.
  @param embeddedContentStore - Store whose embedded content is deleted/updated.
  @param pageStore - Store the changed pages are loaded from.
  @param sourceNames - Optional list of source names passed to
    `pageStore.loadPages` as the `sources` filter.
  @param embedder - Forwarded to `updateEmbeddedContentForPage`.
  @param chunkOptions - Forwarded to `updateEmbeddedContentForPage`.
  @param concurrencyOptions - `processPages` sets how many pages are processed
    concurrently (defaults to 1); the whole object is also forwarded to
    `updateEmbeddedContentForPage`.
 */
const updateEmbeddedContent = async ({ since, embeddedContentStore, pageStore, sourceNames, embedder, chunkOptions, concurrencyOptions, }) => {
    const changedPages = await pageStore.loadPages({
        updated: since,
        sources: sourceNames,
    });
    logger_1.logger.info(`Found ${changedPages.length} changed pages since ${since}${sourceNames ? ` in sources: ${sourceNames.join(", ")}` : ""}`);
    const { errors } = await promise_pool_1.PromisePool.withConcurrency(concurrencyOptions?.processPages ?? 1)
        .for(changedPages)
        .process(async (page) => {
        switch (page.action) {
            case "deleted":
                logger_1.logger.info(`Deleting embedded content for ${page.sourceName}: ${page.url}`);
                await embeddedContentStore.deleteEmbeddedContent({
                    page,
                });
                break;
            case "created": // fallthrough
            case "updated":
                await (0, exports.updateEmbeddedContentForPage)({
                    store: embeddedContentStore,
                    page,
                    chunkOptions,
                    embedder,
                    concurrencyOptions,
                });
        }
    });
    // The pool does not throw on task failure; surface what it collected.
    for (const error of errors) {
        logger_1.logger.warn(`Failed to update embedded content for ${error.item.sourceName}: ${error.item.url} - ${error.message}`);
    }
};
exports.updateEmbeddedContent = updateEmbeddedContent;
// Memoized digests, keyed by the exact string that was hashed.
const chunkAlgoHashes = new Map();
/**
  Returns a hex SHA-256 digest of the JSON-serialized options `o` (an empty
  object when `o` is null or undefined) concatenated with the source text of
  the function `f`. Digests are cached per input string, so repeated calls
  with the same function and options do not re-hash.
 */
const getHashForFunc = (f, o) => {
    const serializedOptions = JSON.stringify(o ?? {});
    const cacheKey = serializedOptions + f.toString();
    if (!chunkAlgoHashes.has(cacheKey)) {
        const hasher = (0, crypto_1.createHash)("sha256");
        hasher.update(cacheKey);
        chunkAlgoHashes.set(cacheKey, hasher.digest("hex"));
    }
    return chunkAlgoHashes.get(cacheKey);
};
/**
  Chunks a single page, embeds every chunk, and writes the result to the store.

  - If chunking yields no chunks, any existing embedded content for the page is
    deleted and nothing is embedded.
  - If the store already holds content for the page that is newer than the
    page, has the same number of chunks, and carries the same chunkAlgoHash,
    embedding is skipped.
  - Otherwise each chunk is embedded and the chunks are passed to
    `store.updateEmbeddedContent`.

  Chunks are embedded through a PromisePool, which collects per-chunk failures
  instead of rejecting. Each collected failure is logged as a warning; the
  chunks that did embed are still stored, so the stored set can be incomplete
  when a warning was logged.

  @param page - The page to embed.
  @param store - Embedded content store; `store.metadata.embeddingName` is used
    as the key under which each chunk's embedding is saved.
  @param embedder - Its `embed({ text })` result supplies `embedding`.
  @param chunkOptions - Passed to `chunkPage` and included in the chunkAlgoHash.
  @param concurrencyOptions - `createChunks` sets how many chunks are embedded
    concurrently (defaults to 1).
 */
const updateEmbeddedContentForPage = async ({ page, store, embedder, chunkOptions, concurrencyOptions, }) => {
    const contentChunks = await (0, chunk_1.chunkPage)(page, chunkOptions);
    if (contentChunks.length === 0) {
        // This could happen if source returned a page with no content
        logger_1.logger.warn(`No content for page ${page.sourceName}:${page.url} - deleting any existing content and continuing`);
        await store.deleteEmbeddedContent({ page });
        return;
    }
    // In order to resume where we left off (in case of script restart), compare
    // the date of any existing chunks with the page updated date. If the chunks
    // have been updated since the page was updated (and we have the expected
    // number of chunks) and chunkAlgoHash has not changed from what's in the
    // database, assume the embedded content for that page is complete and
    // up-to-date. To force an update, you can delete the chunks from the
    // collection.
    const existingContent = await store.loadEmbeddedContent({
        page,
    });
    const chunkAlgoHash = getHashForFunc(chunk_1.chunkPage, chunkOptions);
    if (existingContent.length &&
        existingContent[0].updated > page.updated &&
        contentChunks.length === existingContent.length &&
        existingContent[0].chunkAlgoHash === chunkAlgoHash) {
        logger_1.logger.info(`Embedded content for ${page.sourceName}:${page.url} already updated (${existingContent[0].updated}) since page update date (${page.updated}). Skipping embedding.`);
        return;
    }
    logger_1.logger.info(`${page.action === "created" ? "Creating" : "Updating"} embedded content for ${page.sourceName}:${page.url}`);
    const { results: embeddedContent, errors } = await promise_pool_1.PromisePool.withConcurrency(concurrencyOptions?.createChunks ?? 1)
        .for(contentChunks)
        .process(async (chunk, index) => {
        logger_1.logger.info(`Vectorizing chunk ${index + 1}/${contentChunks.length} for ${page.sourceName}: ${page.url}`);
        const { embedding } = await embedder.embed({
            text: chunk.text,
        });
        return {
            ...chunk,
            embeddings: {
                [store.metadata.embeddingName]: embedding,
            },
            updated: new Date(),
            chunkAlgoHash,
        };
    });
    // The pool does not throw on task failure; surface what it collected so an
    // incomplete chunk set is never stored without a trace in the logs.
    for (const error of errors) {
        logger_1.logger.warn(`Failed to vectorize a chunk for ${page.sourceName}: ${page.url} - ${error.message}`);
    }
    await store.updateEmbeddedContent({
        page,
        embeddedContent,
    });
};
exports.updateEmbeddedContentForPage = updateEmbeddedContentForPage;
//# sourceMappingURL=updateEmbeddedContent.js.map