UNPKG

@kareemaly/researcher

Version:
54 lines (53 loc) 2.02 kB
"use strict";
// Compiler-emitted interop helper: wraps a non-ES module so it can be read via `.default`.
// Kept as `var` so a redeclaration in a shared scope stays legal.
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.TurndownContentProcessor = void 0;
const turndown_1 = __importDefault(require("turndown"));
const markdown_to_txt_1 = require("markdown-to-txt");
const html_metadata_1 = require("@jcottam/html-metadata");
const httpClients_1 = require("../utils/httpClients");
const logger_1 = require("../utils/logger");
const log = (0, logger_1.createLogger)("content-processor");

/**
 * Split a comma-separated keywords value into trimmed, non-empty entries.
 *
 * Blank entries (from an empty value, trailing commas, or doubled commas) are
 * dropped so callers never receive `""` as a keyword.
 *
 * @param {string | null | undefined} keywords - Raw keywords value from the page metadata.
 * @returns {string[] | undefined} Cleaned keywords, or `undefined` when the value is absent.
 */
function parseKeywords(keywords) {
    return keywords
        ?.split(",")
        .map((keyword) => keyword.trim())
        .filter((keyword) => keyword.length > 0);
}

/**
 * Content processor that downloads a page and derives Markdown, plain text,
 * and a small metadata record from its HTML.
 */
class TurndownContentProcessor {
    /**
     * Creates the Turndown converter with ATX headings, fenced code blocks,
     * and `_` as the emphasis delimiter.
     */
    constructor() {
        this.turndown = new turndown_1.default({
            headingStyle: "atx",
            codeBlockStyle: "fenced",
            emDelimiter: "_",
        });
    }

    /**
     * Fetch `url` and convert its HTML into Markdown and plain text.
     *
     * @param {string} url - Address passed to `contentClient.get`.
     * @returns {Promise<object>} Object with `url`, `title`, `text`, `html`,
     *   `markdown`, and a `metadata` record (`title`, `description`,
     *   `keywords`, `author`, `published`).
     * @throws Rethrows any error raised while fetching or converting, after logging it.
     */
    async process(url) {
        try {
            // Fetch HTML content
            const response = await httpClients_1.contentClient.get(url);
            const html = response.data;

            // Extract metadata (awaited, so it works whether the extractor is sync or async)
            const metadata = await (0, html_metadata_1.extractFromHTML)(html);

            // Convert HTML to Markdown
            const markdown = this.turndown.turndown(html);

            // Convert Markdown to plain text
            const text = (0, markdown_to_txt_1.markdownToTxt)(markdown);

            return {
                url,
                title: metadata.title || "",
                text,
                html,
                markdown,
                metadata: {
                    title: metadata.title,
                    description: metadata.description,
                    keywords: parseKeywords(metadata.keywords),
                    author: metadata.author,
                    published: metadata.published,
                },
            };
        }
        catch (error) {
            // Log for diagnostics, then rethrow the original error so callers still see the failure.
            log.error("Failed to process content: %O", error);
            throw error;
        }
    }
}
exports.TurndownContentProcessor = TurndownContentProcessor;