html-content-processor
Version:
A professional library for processing, cleaning, filtering, and converting HTML content to Markdown. Features advanced customization options, presets, plugin support, fluent API, and TypeScript integration for reliable content extraction.
161 lines (160 loc) • 4.84 kB
TypeScript
import { ProcessorOptions, FilterOptions, ConverterOptions, FilterResult, MarkdownResult } from './types';
import { PageTypeResult } from './page-type-detector';
/**
 * Fluent-API entry point for HTML processing.
 *
 * Chainable `with*` methods configure the processor, `filter` applies HTML
 * filtering, and the `to*` methods produce the output (Markdown, plain text,
 * HTML fragments, or filtered HTML).
 */
export declare class HtmlProcessor {
    // Internal state. Private members are declared without types here.
    private htmlFilter;
    private markdownGenerator;
    private options;
    private currentHtml;
    private baseUrl;
    private processed;
    private dom;
    private filteredDom;
    private filterStats;
    private pageTypeResult;
    private autoDetectEnabled;

    /**
     * Construct a processor.
     *
     * @param options Processing options (optional)
     */
    constructor(options?: ProcessorOptions);

    /**
     * Static factory: build a processor from HTML content.
     *
     * @param html HTML content to process
     * @param options Processing options (optional)
     * @returns A new HtmlProcessor instance
     */
    static from(html: string, options?: ProcessorOptions): HtmlProcessor;

    /**
     * Set the base URL used for resolving relative links.
     *
     * @param url Base URL
     * @returns This processor instance, for chaining
     */
    withBaseUrl(url: string): HtmlProcessor;

    /**
     * Update the processor options.
     *
     * @param options New options to merge
     * @returns This processor instance, for chaining
     */
    withOptions(options: Partial<ProcessorOptions>): HtmlProcessor;

    /**
     * Apply HTML filtering.
     *
     * @param options Filter options (optional)
     * @returns Promise of this processor instance, for chaining
     */
    filter(options?: FilterOptions): Promise<HtmlProcessor>;

    /**
     * Convert to Markdown, optionally with custom converter options.
     *
     * @param options Converter options (optional)
     * @returns Promise of the Markdown result
     */
    toMarkdown(options?: ConverterOptions): Promise<MarkdownResult>;

    /**
     * Convert to plain text.
     *
     * @returns Promise of the plain text content
     */
    toText(): Promise<string>;

    /**
     * Convert to an array of HTML fragments.
     *
     * @returns Promise of the array of HTML fragments
     */
    toArray(): Promise<string[]>;

    /**
     * Get the filtered HTML as a string. Unlike the other `to*` conversions
     * declared above, this one is synchronous.
     *
     * @returns Filtered HTML string
     */
    toString(): string;

    /**
     * Get clean HTML (alias for `toString`).
     *
     * @returns Clean HTML string
     */
    toClean(): string;

    /**
     * Get the detailed filter result, including metadata.
     *
     * @returns Promise of the filter result with metadata
     */
    getFilterResult(): Promise<FilterResult>;

    /**
     * Get the current processing options.
     *
     * @returns Current options
     */
    getOptions(): ProcessorOptions;

    /**
     * Check whether the content has been processed.
     *
     * @returns True if content has been filtered
     */
    isProcessed(): boolean;

    /**
     * Get the current HTML content.
     *
     * @returns Current HTML content
     */
    getHtml(): string;

    /**
     * Get the current base URL.
     *
     * @returns Current base URL
     */
    getBaseUrl(): string;

    /**
     * Resolve processing options with presets.
     *
     * @param options Input options
     * @returns Resolved options
     */
    private resolveOptions;

    /**
     * Create an HTML filter instance based on the current options.
     *
     * @returns HtmlFilter instance
     */
    private createHtmlFilter;

    /**
     * Create a markdown generator instance based on the current options.
     *
     * @returns DefaultMarkdownGenerator instance
     */
    private createMarkdownGenerator;

    /**
     * Count the words in a text.
     *
     * @param text Text to count
     * @returns Word count
     */
    private countWords;

    /**
     * Count the matches of a regular expression in a text.
     *
     * @param text Text to search
     * @param regex Regular expression
     * @returns Match count
     */
    private countMatches;

    /**
     * Count the HTML elements in some content.
     *
     * @param html HTML content
     * @returns Element count
     */
    private countElements;

    /**
     * Enable page type auto-detection, with an optional URL hint.
     *
     * @param url Optional URL for better detection accuracy
     * @returns Promise of this processor instance, for chaining
     */
    withAutoDetection(url?: string): Promise<HtmlProcessor>;

    /**
     * Get the page type detection result.
     *
     * @returns Page type detection result, or null if not detected
     */
    getPageTypeResult(): PageTypeResult | null;

    /**
     * Check whether auto-detection is enabled.
     *
     * @returns True if auto-detection is enabled
     */
    isAutoDetectionEnabled(): boolean;

    /**
     * Manually set the page type (disables auto-detection).
     *
     * @param pageType Page type to set
     * @returns Promise of this processor instance, for chaining
     */
    withPageType(pageType: string): Promise<HtmlProcessor>;

    /**
     * Internal method to detect the page type.
     */
    private detectPageType;
}