@kareemaly/researcher
Version:
CLI tool for web research
54 lines (53 loc) • 2.02 kB
JavaScript
;
// Module-interop helper: reuse an `__importDefault` already present on `this`
// if there is one; otherwise wrap values that are not flagged `__esModule`
// so callers can always read the export through `.default`.
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        // Already an ES-module-shaped object: hand it back untouched.
        return mod;
    }
    return { "default": mod };
};
// Flag this CommonJS module's exports object with `__esModule: true`.
Object.defineProperty(exports, "__esModule", { value: true });
// Pre-declare the export slot; the real class is assigned at the end of the file.
exports.TurndownContentProcessor = void 0;
// HTML -> Markdown converter (default export, hence the interop wrapper).
const turndown_1 = __importDefault(require("turndown"));
// Markdown -> plain-text converter.
const markdown_to_txt_1 = require("markdown-to-txt");
// Metadata extraction from an HTML document.
const html_metadata_1 = require("@jcottam/html-metadata");
// Project-local HTTP clients; `contentClient` is used below to fetch pages.
const httpClients_1 = require("../utils/httpClients");
const logger_1 = require("../utils/logger");
// Logger instance created with the "content-processor" name.
const log = (0, logger_1.createLogger)("content-processor");
/**
 * Split a comma-separated keywords value into a clean list.
 *
 * Entries are trimmed and empty entries (from an empty string, a trailing
 * comma or doubled commas) are dropped.
 *
 * @param {unknown} rawKeywords - Keywords value taken from the extracted metadata.
 * @returns {string[] | undefined} The cleaned keywords, or `undefined` when
 *   the value is not a string (e.g. the page declares no keywords).
 */
function parseKeywords(rawKeywords) {
    if (typeof rawKeywords !== "string") {
        // Keywords are optional metadata: a missing or unexpected value must
        // not make the whole page fail to process.
        return undefined;
    }
    return rawKeywords
        .split(",")
        .map((keyword) => keyword.trim())
        .filter((keyword) => keyword.length > 0);
}

/**
 * Content processor that fetches a page and exposes it as HTML, Markdown
 * and plain text, together with metadata extracted from the HTML.
 */
class TurndownContentProcessor {
    /**
     * Create the processor with a Turndown converter configured for
     * ATX headings, fenced code blocks and `_` as the emphasis delimiter.
     */
    constructor() {
        this.turndown = new turndown_1.default({
            headingStyle: "atx",
            codeBlockStyle: "fenced",
            emDelimiter: "_",
        });
    }

    /**
     * Fetch `url` and convert its content.
     *
     * @param {string} url - Address passed to `contentClient.get`.
     * @returns {Promise<{
     *   url: string,
     *   title: string,
     *   text: string,
     *   html: *,
     *   markdown: string,
     *   metadata: {
     *     title: *,
     *     description: *,
     *     keywords: (string[] | undefined),
     *     author: *,
     *     published: *
     *   }
     * }>} The fetched body (`html`), its Markdown and plain-text renderings,
     *   and the extracted metadata. `title` falls back to `""` when the
     *   metadata has no title.
     * @throws Rethrows, after logging, any error raised while fetching,
     *   extracting metadata or converting.
     */
    async process(url) {
        try {
            // Fetch HTML content
            const response = await httpClients_1.contentClient.get(url);
            const html = response.data;
            // Extract metadata
            const metadata = await (0, html_metadata_1.extractFromHTML)(html);
            // Convert HTML to Markdown
            const markdown = this.turndown.turndown(html);
            // Convert Markdown to plain text
            const text = (0, markdown_to_txt_1.markdownToTxt)(markdown);
            return {
                url,
                title: metadata.title || "",
                text,
                html,
                markdown,
                metadata: {
                    title: metadata.title,
                    description: metadata.description,
                    keywords: parseKeywords(metadata.keywords),
                    author: metadata.author,
                    published: metadata.published
                }
            };
        }
        catch (error) {
            // Log for diagnostics, then rethrow the original error so callers
            // still see the real failure.
            log.error("Failed to process content: %O", error);
            throw error;
        }
    }
}
// Publish the class as this module's named export.
exports.TurndownContentProcessor = TurndownContentProcessor;