UNPKG

html-content-processor

Version:

A professional library for processing, cleaning, filtering, and converting HTML content to Markdown. Features advanced customization options, presets, plugin support, fluent API, and TypeScript integration for reliable content extraction.

233 lines (232 loc) 9.68 kB
"use strict"; var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } return new (P || (P = Promise))(function (resolve, reject) { function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } step((generator = generator.apply(thisArg, _arguments || [])).next()); }); }; Object.defineProperty(exports, "__esModule", { value: true }); exports.extractContentAuto = exports.cleanHtmlAuto = exports.htmlToMarkdownAuto = exports.createProcessor = exports.gentleCleanHtml = exports.strictCleanHtml = exports.htmlToNewsMarkdown = exports.htmlToBlogMarkdown = exports.htmlToArticleMarkdown = exports.extractContent = exports.cleanHtml = exports.htmlToText = exports.htmlToMarkdownWithCitations = exports.htmlToMarkdown = void 0; const html_processor_1 = require("./html-processor"); /** * Convert HTML to Markdown with optional configuration * @param html HTML content to convert * @param options Conversion options * @returns Markdown string */ function htmlToMarkdown(html, options) { return __awaiter(this, void 0, void 0, function* () { const processor = html_processor_1.HtmlProcessor.from(html, (options === null || options === void 0 ? void 0 : options.baseUrl) ? 
{ baseUrl: options.baseUrl } : {}); // Apply filtering if filter options are provided if (options && (options.threshold !== undefined || options.strategy !== undefined || options.ratio !== undefined)) { yield processor.filter(options); } else { yield processor.filter(); } const result = yield processor.toMarkdown(options); return result.content; }); } exports.htmlToMarkdown = htmlToMarkdown; /** * Convert HTML to Markdown with citations * @param html HTML content to convert * @param baseUrl Base URL for resolving relative links * @param options Conversion options * @returns Markdown string with citations */ function htmlToMarkdownWithCitations(html, baseUrl, options) { return __awaiter(this, void 0, void 0, function* () { const processor = html_processor_1.HtmlProcessor.from(html, { baseUrl }); if (options && (options.threshold !== undefined || options.strategy !== undefined || options.ratio !== undefined)) { yield processor.filter(options); } else { yield processor.filter(); } const result = yield processor.toMarkdown(Object.assign(Object.assign({}, options), { citations: true })); return result.contentWithCitations + (result.references ? 
'\n\n' + result.references : ''); }); } exports.htmlToMarkdownWithCitations = htmlToMarkdownWithCitations; /** * Convert HTML to plain text * @param html HTML content to convert * @param options Filter options * @returns Plain text string */ function htmlToText(html, options) { return __awaiter(this, void 0, void 0, function* () { const processor = html_processor_1.HtmlProcessor.from(html, { filter: options, converter: { ignoreLinks: true, ignoreImages: true } }); yield processor.filter(options); return yield processor.toText(); }); } exports.htmlToText = htmlToText; /** * Clean HTML by removing unwanted elements and content * @param html HTML content to clean * @param options Filter options * @returns Cleaned HTML string */ function cleanHtml(html, options) { return __awaiter(this, void 0, void 0, function* () { const processor = html_processor_1.HtmlProcessor.from(html, { filter: options }); yield processor.filter(options); return processor.toString(); }); } exports.cleanHtml = cleanHtml; /** * Extract main content from HTML as array of fragments * @param html HTML content to process * @param options Filter options * @returns Array of HTML content fragments */ function extractContent(html, options) { return __awaiter(this, void 0, void 0, function* () { const processor = html_processor_1.HtmlProcessor.from(html, { filter: options }); yield processor.filter(options); return yield processor.toArray(); }); } exports.extractContent = extractContent; /** * Convert HTML to Markdown using article preset (optimized for long-form content) * @param html HTML content to convert * @param baseUrl Base URL for resolving relative links * @returns Markdown string */ function htmlToArticleMarkdown(html, baseUrl) { return __awaiter(this, void 0, void 0, function* () { const processor = html_processor_1.HtmlProcessor.from(html, { preset: 'article', baseUrl }); yield processor.filter(); const result = yield processor.toMarkdown(); return result.content; }); } 
exports.htmlToArticleMarkdown = htmlToArticleMarkdown;
/**
 * Private helper: build a processor with the given preset and base URL,
 * run the filter step without arguments, and convert to Markdown.
 * @param html HTML content to convert
 * @param preset Preset name handed to the processor
 * @param baseUrl Base URL for resolving relative links
 * @returns Promise resolving to the `content` field of the Markdown result
 */
function presetToMarkdown(html, preset, baseUrl) {
    return __awaiter(this, void 0, void 0, function* () {
        const processor = html_processor_1.HtmlProcessor.from(html, { preset: preset, baseUrl: baseUrl });
        yield processor.filter();
        const conversion = yield processor.toMarkdown();
        return conversion.content;
    });
}
/**
 * Private helper: build a processor with the given preset, run the filter
 * step without arguments, and return the processor's string form.
 * @param html HTML content to clean
 * @param preset Preset name handed to the processor
 * @returns Promise resolving to the cleaned HTML string
 */
function presetToCleanHtml(html, preset) {
    return __awaiter(this, void 0, void 0, function* () {
        const processor = html_processor_1.HtmlProcessor.from(html, { preset: preset });
        yield processor.filter();
        return processor.toString();
    });
}
/**
 * Turn an HTML string into Markdown using the 'blog' preset.
 * @param html HTML content to convert
 * @param baseUrl Base URL for resolving relative links
 * @returns Promise resolving to a Markdown string
 */
function htmlToBlogMarkdown(html, baseUrl) {
    return presetToMarkdown(html, 'blog', baseUrl);
}
exports.htmlToBlogMarkdown = htmlToBlogMarkdown;
/**
 * Turn an HTML string into Markdown using the 'news' preset.
 * @param html HTML content to convert
 * @param baseUrl Base URL for resolving relative links
 * @returns Promise resolving to a Markdown string
 */
function htmlToNewsMarkdown(html, baseUrl) {
    return presetToMarkdown(html, 'news', baseUrl);
}
exports.htmlToNewsMarkdown = htmlToNewsMarkdown;
/**
 * Clean an HTML string using the 'strict' preset.
 * @param html HTML content to clean
 * @returns Promise resolving to the cleaned HTML string
 */
function strictCleanHtml(html) {
    return presetToCleanHtml(html, 'strict');
}
exports.strictCleanHtml = strictCleanHtml;
/**
 * Clean an HTML string using the 'loose' preset.
 * @param html HTML content to clean
 * @returns Promise resolving to the cleaned HTML string
 */
function gentleCleanHtml(html) {
    return presetToCleanHtml(html, 'loose');
}
exports.gentleCleanHtml = gentleCleanHtml;
/**
 * Construct a processor directly from a configuration object.
 * @param options Processor configuration options
 * @returns A new HtmlProcessor instance built from `options`
 */
function createProcessor(options) {
    const processor = new html_processor_1.HtmlProcessor(options);
    return processor;
}
exports.createProcessor = createProcessor;
/**
 * Convert HTML to Markdown after running the processor's auto-detection step.
 * @param html HTML content
 * @param url Optional URL forwarded to `withAutoDetection`
 * @param options Additional processing options (defaults to an empty object)
 * @returns Promise resolving to the full result of the processor's `toMarkdown()`
 */
function htmlToMarkdownAuto(html, url, options = {}) {
    return __awaiter(this, void 0, void 0, function* () {
        const pending = html_processor_1.HtmlProcessor.from(html, options).withAutoDetection(url);
        const processor = yield pending;
        yield processor.filter();
        const conversion = yield processor.toMarkdown();
        return conversion;
    });
}
exports.htmlToMarkdownAuto = htmlToMarkdownAuto;
/**
 * Clean HTML after running the processor's auto-detection step.
 * @param html HTML content
 * @param url Optional URL forwarded to `withAutoDetection`
 * @param options Additional processing options (defaults to an empty object)
 * @returns Promise resolving to the cleaned HTML string
 */
function cleanHtmlAuto(html, url, options = {}) {
    return __awaiter(this, void 0, void 0, function* () {
        const pending = html_processor_1.HtmlProcessor.from(html, options).withAutoDetection(url);
        const processor = yield pending;
        yield processor.filter();
        return processor.toString();
    });
}
exports.cleanHtmlAuto = cleanHtmlAuto;
/**
 * Run auto-detection, filtering and Markdown conversion, and return all outputs together.
 * @param html HTML content
 * @param url Optional URL forwarded to `withAutoDetection`
 * @param options Additional processing options (defaults to an empty object)
 * @returns Promise resolving to `{ markdown, pageType, cleanHtml }`
 */
function extractContentAuto(html, url, options = {}) {
    return __awaiter(this, void 0, void 0, function* () {
        const pending = html_processor_1.HtmlProcessor.from(html, options).withAutoDetection(url);
        const processor = yield pending;
        yield processor.filter();
        // Markdown conversion runs first; page type and HTML are read afterwards.
        const markdown = yield processor.toMarkdown();
        const pageType = processor.getPageTypeResult();
        const cleanedHtml = processor.toString();
        return {
            markdown: markdown,
            pageType: pageType,
            cleanHtml: cleanedHtml
        };
    });
}
exports.extractContentAuto = extractContentAuto;