UNPKG

mongodb-rag-core

Version:

Common elements used by MongoDB Chatbot Framework components.

86 lines 3.39 kB
"use strict"; var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { if (k2 === undefined) k2 = k; var desc = Object.getOwnPropertyDescriptor(m, k); if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { desc = { enumerable: true, get: function() { return m[k]; } }; } Object.defineProperty(o, k2, desc); }) : (function(o, m, k, k2) { if (k2 === undefined) k2 = k; o[k2] = m[k]; })); var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { Object.defineProperty(o, "default", { enumerable: true, value: v }); }) : function(o, v) { o["default"] = v; }); var __importStar = (this && this.__importStar) || function (mod) { if (mod && mod.__esModule) return mod; var result = {}; if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k); __setModuleDefault(result, mod); return result; }; var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; Object.defineProperty(exports, "__esModule", { value: true }); exports.extractHtmlH1 = exports.handleHtmlDocument = void 0; const logger_1 = require("../logger"); const turndown_1 = __importDefault(require("turndown")); const turndownPluginGfm = __importStar(require("turndown-plugin-gfm")); const jsdom_1 = require("jsdom"); async function handleHtmlDocument(path, content, options) { const { extractTitle = extractHtmlH1, extractMetadata, removeElements, metadata, pathToPageUrl, postProcessMarkdown, } = options; const turndownService = new turndown_1.default({ codeBlockStyle: "fenced", headingStyle: "atx", bulletListMarker: "-", }); turndownService.use(turndownPluginGfm.gfm); // Remove links from Markdown turndownService.addRule("keepLinkText", { filter: ["a"], replacement: (content) => { return content; // Return the inner text of the link }, }); // Remove images from Markdown turndownService.addRule("removeImages", { filter: ["img"], replacement: () => { return ""; // Return an empty string to remove the image }, }); logger_1.logger.info(`Processing ${path}`); const dom = new jsdom_1.JSDOM(content); const { document: domDocument } = dom.window; const title = extractTitle(domDocument); let extractedMetadata = {}; if (extractMetadata) { extractedMetadata = extractMetadata(domDocument); } const elementsToRemove = removeElements(domDocument); elementsToRemove.forEach((el) => el.parentNode?.removeChild(el)); let body = turndownService.turndown(domDocument.body); body = postProcessMarkdown ? await postProcessMarkdown(body) : body; const page = { format: "md", title, body, url: pathToPageUrl(path), metadata: { ...metadata, ...extractedMetadata, }, }; return page; } exports.handleHtmlDocument = handleHtmlDocument; function extractHtmlH1(domDoc) { const h1 = domDoc.querySelector("h1"); return h1?.textContent ?? undefined; } exports.extractHtmlH1 = extractHtmlH1; //# sourceMappingURL=handleHtmlDocument.js.map