UNPKG

html-content-processor

Version:

A professional library for processing, cleaning, filtering, and converting HTML content to Markdown. Features advanced customization options, presets, plugin support, fluent API, and TypeScript integration for reliable content extraction.

117 lines (116 loc) 3.76 kB
/**
 * Intelligent Content Quality Filter
 *
 * Type declarations for a universal content filtering system that identifies
 * and preserves valuable text content while removing noise, regardless of the
 * website or page type.
 *
 * This file contains declarations only; default values, thresholds, and the
 * scoring formulas live in the implementation and are not visible here.
 */

/**
 * Tunable settings accepted by {@link IntelligentContentFilter}.
 */
export interface ContentQualityOptions {
  /** Filtering intensity level. */
  intensity: 'minimal' | 'moderate' | 'aggressive' | 'maximum';

  /** Minimum text density ratio for content blocks. */
  minTextDensity: number;

  /** Minimum meaningful text length. */
  minTextLength: number;

  /** Maximum allowed style/script ratio. */
  maxNoiseRatio: number;

  /** Whether to preserve navigation elements. */
  preserveNavigation: boolean;

  /** Whether to preserve structured data. */
  preserveStructuredData: boolean;

  /**
   * Custom content quality scoring weights.
   * The four keys match the per-block metrics recorded on {@link ContentBlock}.
   */
  qualityWeights: {
    textDensity: number;
    semanticValue: number;
    structuralImportance: number;
    userEngagement: number;
  };
}

/**
 * Complete option set exported by the library as its defaults.
 * The concrete values are defined in the implementation, not in this file.
 */
export declare const DEFAULT_CONTENT_QUALITY_OPTIONS: ContentQualityOptions;

/**
 * Analysis record for a single element, as reported in
 * {@link FilterResult.contentBlocks}.
 */
export interface ContentBlock {
  /** The DOM element this record describes. */
  element: Element;
  index: number;
  textContent: string;
  textLength: number;
  // The next four metrics share their names with the keys of
  // ContentQualityOptions.qualityWeights.
  textDensity: number;
  semanticValue: number;
  structuralImportance: number;
  userEngagement: number;
  qualityScore: number;
  shouldPreserve: boolean;
}

/**
 * Value returned by {@link IntelligentContentFilter.filter} and
 * {@link IntelligentContentFilter.filterHtmlString}.
 */
export interface FilterResult {
  removedElements: number;
  preservedElements: number;
  // NOTE(review): the time unit is not stated in this declaration — confirm
  // against the implementation before displaying or comparing it.
  processingTime: number;
  qualityScore: number;
  contentBlocks: ContentBlock[];
}

/**
 * Filter that scores content and strips noise from HTML.
 *
 * Only the constructor, {@link filterHtmlString}, and {@link filter} are
 * public; every other member is private and listed here by name only.
 */
export declare class IntelligentContentFilter {
  private options;

  /**
   * @param options - Optional subset of {@link ContentQualityOptions}.
   */
  constructor(options?: Partial<ContentQualityOptions>);

  /**
   * Apply intelligent content filtering to an HTML string.
   *
   * @param html - The HTML markup to filter.
   * @returns A {@link FilterResult} describing the filtering run.
   */
  filterHtmlString(html: string): FilterResult;

  /**
   * Apply intelligent content filtering to a document.
   *
   * @param document - The DOM document to filter.
   * @returns A {@link FilterResult} describing the filtering run.
   */
  filter(document: Document): FilterResult;

  /** Remove elements that are obviously noise (styles, scripts, ads, etc.). */
  private removeNoiseElements;

  /** Analyze content blocks and assign quality scores. */
  private analyzeContentBlocks;

  /** Analyze an individual element and calculate its quality score. */
  private analyzeElement;

  /** Calculate text density (ratio of text to HTML). */
  private calculateTextDensity;

  /** Calculate semantic value based on content characteristics. */
  private calculateSemanticValue;

  /** Calculate structural importance based on element position and semantics. */
  private calculateStructuralImportance;

  /** Calculate user engagement value (links, interactive elements). */
  private calculateUserEngagementValue;

  /** Apply quality-based filtering to content blocks. */
  private applyQualityFiltering;

  /** Get the quality threshold based on the intensity setting. */
  private getQualityThreshold;

  /** Clean up empty containers after filtering. */
  private cleanupEmptyContainers;

  /** Preserve elements that are definitely valuable. */
  private preserveHighValueContent;

  // Remaining private helpers. The declaration file exposes neither their
  // signatures nor their documentation, so none is asserted here.
  private isProtectedElement;
  private isNavigationElement;
  private hasLinks;
  private hasStructuredContent;
  private isReadableText;
  private containsCodeOrStyles;
  private getElementPosition;
  private isEmptyContainer;
  private simplifyElement;
  private calculateOverallQuality;
}

/**
 * Convenience function to create preset configurations.
 *
 * The preset names are a separate vocabulary from the `intensity` values of
 * {@link ContentQualityOptions}; how each preset maps onto the options is
 * decided by the implementation.
 *
 * @param preset - Name of the preset to build.
 * @returns A partial option set, the same shape the
 *   {@link IntelligentContentFilter} constructor accepts.
 */
export declare function createContentQualityPreset(
  preset: 'clean' | 'balanced' | 'aggressive' | 'maximum'
): Partial<ContentQualityOptions>;