html-content-processor
Version:
A professional library for processing, cleaning, filtering, and converting HTML content to Markdown. Features advanced customization options, presets, plugin support, fluent API, and TypeScript integration for reliable content extraction.
107 lines (106 loc) • 4.45 kB
TypeScript
import { HtmlProcessor } from './html-processor';
import { ConvertOptions, ProcessorOptions, FilterOptions } from './types';
import { MarkdownResult } from './types';
/**
* Convert HTML to Markdown with optional configuration.
*
* The call is asynchronous: the Markdown is delivered through a Promise,
* so callers must `await` the result (or chain `.then`).
*
* @param html HTML content to convert
* @param options Conversion options; may be omitted
* @returns Promise that resolves to the Markdown string
*/
export declare function htmlToMarkdown(html: string, options?: ConvertOptions): Promise<string>;
/**
* Convert HTML to Markdown with citations.
*
* The call is asynchronous: the Markdown is delivered through a Promise.
*
* @param html HTML content to convert
* @param baseUrl Base URL for resolving relative links; may be omitted
* @param options Conversion options; may be omitted
* @returns Promise that resolves to the Markdown string with citations
*/
export declare function htmlToMarkdownWithCitations(html: string, baseUrl?: string, options?: ConvertOptions): Promise<string>;
/**
* Convert HTML to plain text.
*
* The call is asynchronous: the text is delivered through a Promise.
*
* @param html HTML content to convert
* @param options Filter options; may be omitted
* @returns Promise that resolves to the plain text string
*/
export declare function htmlToText(html: string, options?: FilterOptions): Promise<string>;
/**
* Clean HTML by removing unwanted elements and content.
*
* The call is asynchronous: the cleaned markup is delivered through a Promise.
*
* @param html HTML content to clean
* @param options Filter options; may be omitted
* @returns Promise that resolves to the cleaned HTML string
*/
export declare function cleanHtml(html: string, options?: FilterOptions): Promise<string>;
/**
* Extract main content from HTML as an array of fragments.
*
* The call is asynchronous: the fragments are delivered through a Promise.
*
* @param html HTML content to process
* @param options Filter options; may be omitted
* @returns Promise that resolves to an array of HTML content fragments (strings)
*/
export declare function extractContent(html: string, options?: FilterOptions): Promise<string[]>;
/**
* Convert HTML to Markdown using article preset (optimized for long-form content).
*
* This helper accepts no options argument; only the HTML and an optional
* base URL can be supplied. The call is asynchronous.
*
* @param html HTML content to convert
* @param baseUrl Base URL for resolving relative links; may be omitted
* @returns Promise that resolves to the Markdown string
*/
export declare function htmlToArticleMarkdown(html: string, baseUrl?: string): Promise<string>;
/**
* Convert HTML to Markdown using blog preset (optimized for blog posts).
*
* This helper accepts no options argument; only the HTML and an optional
* base URL can be supplied. The call is asynchronous.
*
* @param html HTML content to convert
* @param baseUrl Base URL for resolving relative links; may be omitted
* @returns Promise that resolves to the Markdown string
*/
export declare function htmlToBlogMarkdown(html: string, baseUrl?: string): Promise<string>;
/**
* Convert HTML to Markdown using news preset (optimized for news articles).
*
* This helper accepts no options argument; only the HTML and an optional
* base URL can be supplied. The call is asynchronous.
*
* @param html HTML content to convert
* @param baseUrl Base URL for resolving relative links; may be omitted
* @returns Promise that resolves to the Markdown string
*/
export declare function htmlToNewsMarkdown(html: string, baseUrl?: string): Promise<string>;
/**
* Quick and aggressive HTML cleaning using strict preset.
*
* This helper accepts no options argument. The call is asynchronous.
*
* @param html HTML content to clean
* @returns Promise that resolves to the cleaned HTML string
*/
export declare function strictCleanHtml(html: string): Promise<string>;
/**
* Gentle HTML cleaning using loose preset.
*
* This helper accepts no options argument. The call is asynchronous.
*
* @param html HTML content to clean
* @returns Promise that resolves to the cleaned HTML string
*/
export declare function gentleCleanHtml(html: string): Promise<string>;
/**
* Create a processor instance with custom configuration.
*
* Unlike the conversion helpers declared in this file, this factory is
* synchronous: it returns the HtmlProcessor directly, not a Promise.
*
* @param options Processor configuration options; may be omitted
* @returns Configured HtmlProcessor instance
*/
export declare function createProcessor(options?: ProcessorOptions): HtmlProcessor;
/**
* Convert HTML to Markdown with automatic page type detection.
*
* The call is asynchronous and resolves to a `MarkdownResult` (imported from
* './types') rather than a bare string.
*
* @param html HTML content
* @param url Optional URL for better detection accuracy
* @param options Additional processing options; every ProcessorOptions field is optional here (`Partial`)
* @returns Promise that resolves to the Markdown result
*/
export declare function htmlToMarkdownAuto(html: string, url?: string, options?: Partial<ProcessorOptions>): Promise<MarkdownResult>;
/**
* Clean HTML with automatic page type detection.
*
* The call is asynchronous: the cleaned markup is delivered through a Promise.
*
* @param html HTML content
* @param url Optional URL for better detection accuracy
* @param options Additional processing options; every ProcessorOptions field is optional here (`Partial`)
* @returns Promise that resolves to the clean HTML string
*/
export declare function cleanHtmlAuto(html: string, url?: string, options?: Partial<ProcessorOptions>): Promise<string>;
/**
* Extract content with automatic page type detection and return detailed result.
*
* The call is asynchronous and resolves to an object with three fields:
* `markdown`, `pageType` and `cleanHtml`.
*
* @param html HTML content
* @param url Optional URL for better detection accuracy
* @param options Additional processing options; every ProcessorOptions field is optional here (`Partial`)
* @returns Promise that resolves to the detailed extraction result with page type information
*/
export declare function extractContentAuto(html: string, url?: string, options?: Partial<ProcessorOptions>): Promise<{
/** Markdown conversion result. */
markdown: MarkdownResult;
/**
* Page type detection result; callers must handle `null`.
* NOTE(review): presumably `null` means no page type was determined — confirm against the implementation.
*/
pageType: import('./page-type-detector').PageTypeResult | null;
/** Cleaned HTML string. */
cleanHtml: string;
}>;