mongodb-rag-core
Version:
Common elements used by MongoDB Chatbot Framework components.
86 lines • 3.39 kB
JavaScript
;
/**
 * Compiler-emitted interop helper (matches the usual TypeScript `__createBinding` shape).
 *
 * Exposes property `key` of `source` on `target` under `alias` (or `key` when no alias is given).
 * An existing `this.__createBinding` is reused when one is present.
 *
 * When `Object.create` exists, the source property's own descriptor is reused only if it is an
 * accessor on an object flagged `__esModule`, or a data property that is neither writable nor
 * configurable. In every other case an enumerable getter is installed that reads `source[key]`
 * on each access. Without `Object.create`, the value is copied with a plain assignment.
 */
var __createBinding = (this && this.__createBinding) || (Object.create
    ? function (target, source, key, alias) {
        const exportedName = alias === undefined ? key : alias;
        let descriptor = Object.getOwnPropertyDescriptor(source, key);
        const needsLiveGetter = !descriptor
            || ("get" in descriptor ? !source.__esModule : descriptor.writable || descriptor.configurable);
        if (needsLiveGetter) {
            // Read through to the source on every access so later changes stay visible.
            descriptor = {
                enumerable: true,
                get: function () {
                    return source[key];
                },
            };
        }
        Object.defineProperty(target, exportedName, descriptor);
    }
    : function (target, source, key, alias) {
        const exportedName = alias === undefined ? key : alias;
        target[exportedName] = source[key];
    });
/**
 * Compiler-emitted interop helper: stores `moduleValue` as the `default` property of `namespace`.
 *
 * An existing `this.__setModuleDefault` is reused when one is present. When `Object.create`
 * exists the property is defined as enumerable (and, by `defineProperty` defaults, read-only);
 * otherwise it is set with a plain assignment.
 */
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create
    ? function (namespace, moduleValue) {
        const defaultDescriptor = { enumerable: true, value: moduleValue };
        Object.defineProperty(namespace, "default", defaultDescriptor);
    }
    : function (namespace, moduleValue) {
        namespace["default"] = moduleValue;
    });
/**
 * Compiler-emitted interop helper used for namespace-style imports of `require`d modules.
 *
 * A truthy module flagged `__esModule` is returned untouched. Otherwise a fresh object is built:
 * every own enumerable key of `mod` except "default" is bound onto it via `__createBinding`,
 * and `mod` itself is attached as its `default` via `__setModuleDefault`.
 * An existing `this.__importStar` is reused when one is present.
 */
var __importStar = (this && this.__importStar) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    const namespace = {};
    if (mod != null) {
        for (const key in mod) {
            if (key === "default") {
                continue;
            }
            // for...in also walks inherited keys; only own properties are re-exported.
            if (!Object.prototype.hasOwnProperty.call(mod, key)) {
                continue;
            }
            __createBinding(namespace, mod, key);
        }
    }
    __setModuleDefault(namespace, mod);
    return namespace;
};
/**
 * Compiler-emitted interop helper used for default imports of `require`d modules.
 *
 * A truthy module flagged `__esModule` is returned as-is; any other value is wrapped as
 * `{ default: mod }`. An existing `this.__importDefault` is reused when one is present.
 */
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.extractHtmlH1 = exports.handleHtmlDocument = void 0;
const logger_1 = require("../logger");
const turndown_1 = __importDefault(require("turndown"));
const turndownPluginGfm = __importStar(require("turndown-plugin-gfm"));
const jsdom_1 = require("jsdom");
/**
 * Converts one HTML document into a Markdown page object.
 *
 * The HTML is parsed with JSDOM, the title and optional metadata are read from the parsed
 * document, the elements selected by `removeElements` are detached, and the remaining `<body>`
 * is converted to Markdown. The JSDOM window is always closed once the DOM work is done
 * (including when a callback throws) so that each processed document releases its window.
 *
 * @param {string} path - Identifier of the document; logged and passed to `options.pathToPageUrl`.
 * @param {string} content - Raw HTML handed to the JSDOM constructor.
 * @param {object} options
 * @param {Function} [options.extractTitle] - Receives the parsed document and returns the page
 *   title. Defaults to `extractHtmlH1`.
 * @param {Function} [options.extractMetadata] - Receives the parsed document and returns an
 *   object merged over `options.metadata`.
 * @param {Function} options.removeElements - Receives the parsed document and returns the
 *   elements to detach before conversion. Called unconditionally.
 * @param {object} [options.metadata] - Base metadata; keys from `extractMetadata` win on conflict.
 * @param {Function} options.pathToPageUrl - Maps `path` to the page `url`. Called unconditionally.
 * @param {Function} [options.postProcessMarkdown] - Receives the Markdown string; its (awaited)
 *   result becomes the page body.
 * @returns {Promise<{format: string, title: *, body: *, url: *, metadata: object}>} The page.
 */
async function handleHtmlDocument(path, content, options) {
    const { extractTitle = extractHtmlH1, extractMetadata, removeElements, metadata, pathToPageUrl, postProcessMarkdown, } = options;
    const turndownService = createHtmlToMarkdownConverter();
    logger_1.logger.info(`Processing ${path}`);
    const dom = new jsdom_1.JSDOM(content);
    let title;
    let extractedMetadata = {};
    let body;
    try {
        const { document: domDocument } = dom.window;
        title = extractTitle(domDocument);
        if (extractMetadata) {
            extractedMetadata = extractMetadata(domDocument);
        }
        const elementsToRemove = removeElements(domDocument);
        elementsToRemove.forEach((el) => el.parentNode?.removeChild(el));
        body = turndownService.turndown(domDocument.body);
    }
    finally {
        // Everything needed from the DOM has been extracted as plain values by now;
        // close the window so the parsed document does not outlive this call.
        dom.window.close();
    }
    body = postProcessMarkdown ? await postProcessMarkdown(body) : body;
    const page = {
        format: "md",
        title,
        body,
        url: pathToPageUrl(path),
        metadata: {
            ...metadata,
            ...extractedMetadata,
        },
    };
    return page;
}
/**
 * Builds the HTML-to-Markdown converter used by `handleHtmlDocument`: fenced code blocks,
 * ATX headings, "-" bullets, the GFM plugin, links reduced to their text, and images dropped.
 */
function createHtmlToMarkdownConverter() {
    const turndownService = new turndown_1.default({
        codeBlockStyle: "fenced",
        headingStyle: "atx",
        bulletListMarker: "-",
    });
    turndownService.use(turndownPluginGfm.gfm);
    // Keep only the inner content of links; the link target is discarded.
    turndownService.addRule("keepLinkText", {
        filter: ["a"],
        replacement: (content) => content,
    });
    // Images produce no Markdown output at all.
    turndownService.addRule("removeImages", {
        filter: ["img"],
        replacement: () => "",
    });
    return turndownService;
}
exports.handleHtmlDocument = handleHtmlDocument;
/**
 * Returns the text content of the first `<h1>` matched in `domDoc`.
 *
 * @param {{querySelector: Function}} domDoc - Document (or any object exposing `querySelector`).
 * @returns {string|undefined} The heading text, or `undefined` when there is no `<h1>` or its
 *   text content is null/undefined.
 */
function extractHtmlH1(domDoc) {
    const firstHeading = domDoc.querySelector("h1");
    if (firstHeading == null) {
        return undefined;
    }
    const { textContent } = firstHeading;
    if (textContent == null) {
        return undefined;
    }
    return textContent;
}
exports.extractHtmlH1 = extractHtmlH1;
//# sourceMappingURL=handleHtmlDocument.js.map