html-content-processor

Version:

A professional library for processing, cleaning, filtering, and converting HTML content to Markdown. Features advanced customization options, presets, plugin support, fluent API, and TypeScript integration for reliable content extraction.

github.com/kamjin3086/html-content-processor

kamjin3086/html-content-processor

107 lines (106 loc) • 4.45 kB

TypeScript

import { HtmlProcessor } from './html-processor';
import { ConvertOptions, ProcessorOptions, FilterOptions } from './types';
import { MarkdownResult } from './types';

/**
 * Converts an HTML string to Markdown.
 *
 * @param html - HTML content to convert.
 * @param options - Optional conversion options.
 * @returns A promise resolving to the Markdown string.
 */
export declare function htmlToMarkdown(html: string, options?: ConvertOptions): Promise<string>;

/**
 * Converts an HTML string to Markdown with citations.
 *
 * @param html - HTML content to convert.
 * @param baseUrl - Optional base URL used for resolving relative links.
 * @param options - Optional conversion options.
 * @returns A promise resolving to the Markdown string with citations.
 */
export declare function htmlToMarkdownWithCitations(html: string, baseUrl?: string, options?: ConvertOptions): Promise<string>;

/**
 * Converts an HTML string to plain text.
 *
 * @param html - HTML content to convert.
 * @param options - Optional filter options.
 * @returns A promise resolving to the plain-text string.
 */
export declare function htmlToText(html: string, options?: FilterOptions): Promise<string>;

/**
 * Cleans HTML by removing unwanted elements and content.
 *
 * @param html - HTML content to clean.
 * @param options - Optional filter options.
 * @returns A promise resolving to the cleaned HTML string.
 */
export declare function cleanHtml(html: string, options?: FilterOptions): Promise<string>;

/**
 * Extracts the main content of an HTML document as a list of fragments.
 *
 * @param html - HTML content to process.
 * @param options - Optional filter options.
 * @returns A promise resolving to an array of HTML content fragments.
 */
export declare function extractContent(html: string, options?: FilterOptions): Promise<string[]>;

/**
 * Converts HTML to Markdown using the article preset
 * (optimized for long-form content).
 *
 * @param html - HTML content to convert.
 * @param baseUrl - Optional base URL used for resolving relative links.
 * @returns A promise resolving to the Markdown string.
 */
export declare function htmlToArticleMarkdown(html: string, baseUrl?: string): Promise<string>;

/**
 * Converts HTML to Markdown using the blog preset
 * (optimized for blog posts).
 *
 * @param html - HTML content to convert.
 * @param baseUrl - Optional base URL used for resolving relative links.
 * @returns A promise resolving to the Markdown string.
 */
export declare function htmlToBlogMarkdown(html: string, baseUrl?: string): Promise<string>;

/**
 * Converts HTML to Markdown using the news preset
 * (optimized for news articles).
 *
 * @param html - HTML content to convert.
 * @param baseUrl - Optional base URL used for resolving relative links.
 * @returns A promise resolving to the Markdown string.
 */
export declare function htmlToNewsMarkdown(html: string, baseUrl?: string): Promise<string>;

/**
 * Quick, aggressive HTML cleaning using the strict preset.
 *
 * @param html - HTML content to clean.
 * @returns A promise resolving to the cleaned HTML string.
 */
export declare function strictCleanHtml(html: string): Promise<string>;

/**
 * Gentle HTML cleaning using the loose preset.
 *
 * @param html - HTML content to clean.
 * @returns A promise resolving to the cleaned HTML string.
 */
export declare function gentleCleanHtml(html: string): Promise<string>;

/**
 * Creates a processor instance with custom configuration.
 *
 * Unlike the other helpers declared here, this one is synchronous: it
 * returns the processor directly rather than a promise.
 *
 * @param options - Optional processor configuration options.
 * @returns The configured {@link HtmlProcessor} instance.
 */
export declare function createProcessor(options?: ProcessorOptions): HtmlProcessor;

/**
 * Converts HTML to Markdown with automatic page type detection.
 *
 * @param html - HTML content.
 * @param url - Optional URL, supplied for better detection accuracy.
 * @param options - Optional additional processing options; any subset of
 *   {@link ProcessorOptions} may be given.
 * @returns A promise resolving to the Markdown result.
 */
export declare function htmlToMarkdownAuto(html: string, url?: string, options?: Partial<ProcessorOptions>): Promise<MarkdownResult>;

/**
 * Cleans HTML with automatic page type detection.
 *
 * @param html - HTML content.
 * @param url - Optional URL, supplied for better detection accuracy.
 * @param options - Optional additional processing options; any subset of
 *   {@link ProcessorOptions} may be given.
 * @returns A promise resolving to the clean HTML string.
 */
export declare function cleanHtmlAuto(html: string, url?: string, options?: Partial<ProcessorOptions>): Promise<string>;

/**
 * Extracts content with automatic page type detection and returns a
 * detailed result.
 *
 * @param html - HTML content.
 * @param url - Optional URL, supplied for better detection accuracy.
 * @param options - Optional additional processing options; any subset of
 *   {@link ProcessorOptions} may be given.
 * @returns A promise resolving to the detailed extraction result: the
 *   Markdown result (`markdown`), the page type information (`pageType`,
 *   which may be `null`), and the clean HTML string (`cleanHtml`).
 */
export declare function extractContentAuto(html: string, url?: string, options?: Partial<ProcessorOptions>): Promise<{
    markdown: MarkdownResult;
    pageType: import('./page-type-detector').PageTypeResult | null;
    cleanHtml: string;
}>;