html-content-processor
Version:
A professional library for processing, cleaning, filtering, and converting HTML content to Markdown. Features advanced customization options, presets, plugin support, fluent API, and TypeScript integration for reliable content extraction.
72 lines (71 loc) • 2.28 kB
TypeScript
/**
* HTML filter for cleaning and filtering HTML content.
* Based on the Python version of PruningContentFilter.
*
* This is an ambient declaration (`export declare class`): it lists the public
* surface and the names of the private members only. No implementation, no
* private field types, and no private method signatures are visible here.
*
* Public entry points:
* - `filterContent` resolves to an array of HTML blocks.
* - `filterContentAsString` resolves to a single concatenated HTML string.
*/
export declare class HtmlFilter {
/**
* Private configuration/state (this field and the ones that follow).
* Their types are not shown in this declaration, so the per-field notes are
* inferred from names only. TODO confirm each against the implementation.
*/
/** Presumably the set of tags considered for inclusion - confirm. */
private includedTags;
/** Presumably the set of tags removed outright - confirm. */
private excludedTags;
/** Presumably the heading tag names (h1..h6 or similar) - confirm. */
private headerTags;
/**
* Presumably the pattern matched against class names / IDs to penalize
* elements such as 'comment' or 'nav' (see `computeClassIdWeight`) - confirm.
*/
private negativePattern;
/** Presumably set from the constructor's `minWordThreshold` - confirm. */
private minWordCount;
/** Presumably set from the constructor's `threshold` - confirm. */
private threshold;
/** Presumably set from the constructor's `thresholdType` - confirm. */
private thresholdType;
/** Presumably per-tag importance values used when scoring - confirm. */
private tagImportance;
/** Presumably selects which metrics take part in scoring - confirm. */
private metricConfig;
/** Presumably the weights applied to each metric in the composite score - confirm. */
private metricWeights;
/**
* Creates a filter instance. All three arguments are optional; their default
* values are not visible in this declaration.
* @param minWordThreshold Optional number. Presumably the minimum word count
* required for a block to be kept - confirm against the implementation.
* @param thresholdType Optional; either 'fixed' or 'dynamic'. How each mode
* uses `threshold` is not visible here.
* @param threshold Optional number. Presumably the score cutoff used while
* pruning - confirm against the implementation.
*/
constructor(minWordThreshold?: number, thresholdType?: 'fixed' | 'dynamic', threshold?: number);
/**
* Filters HTML content and returns an array of HTML blocks.
* The result is delivered through a Promise, so callers must await it.
* @param html HTML string
* @returns Promise resolving to the array of HTML blocks after filtering
*/
filterContent(html: string): Promise<string[]>;
/**
* Filters HTML content and returns a concatenated HTML string.
* The result is delivered through a Promise, so callers must await it.
* The separator used when concatenating is not visible in this declaration.
* @param html HTML string
* @returns Promise resolving to the concatenated HTML string after filtering
*/
filterContentAsString(html: string): Promise<string>;
/**
* Removes HTML comments from the document.
* Private member: its signature is omitted from this declaration.
* @param doc DOM document
*/
private removeComments;
/**
* Removes unwanted tags from the document.
* Private member: its signature is omitted from this declaration.
* @param doc DOM document
*/
private removeUnwantedTags;
/**
* Checks whether an element is a main content container that should be preserved.
* Private member: its signature is omitted from this declaration.
*/
private isMainContentContainer;
/**
* Prunes the tree structure.
* Private member: its signature is omitted from this declaration.
* @param node Current node
*/
private pruneTree;
/**
* NOTE(review): undocumented in the original. Judging by the name, this
* presumably reports whether a tag must be kept regardless of its score;
* signature and logic are not visible here - confirm against the implementation.
*/
private isEssentialTag;
/**
* Computes a composite score for a node based on various metrics.
* Private member: its signature is omitted from this declaration.
* @param metrics Filter metrics for the node
* @param textLen Length of text content
* @param tagLen Length of HTML content
* @param linkTextLen Length of text within links
* @returns Composite score
*/
private computeCompositeScore;
/**
* Computes a weight based on class names and ID.
* Negative patterns (like 'comment', 'nav') decrease the score.
* Private member: its signature is omitted from this declaration.
* @param node HTML element
* @returns Weight based on class/ID
*/
private computeClassIdWeight;
/**
* Counts words in a string.
* Private member: its signature is omitted from this declaration; how words
* are delimited is not visible here.
* @param text Input string
* @returns Number of words
*/
private countWords;
}