html-content-processor
Version:
A professional library for processing, cleaning, filtering, and converting HTML content to Markdown. Features advanced customization options, presets, plugin support, fluent API, and TypeScript integration for reliable content extraction.
80 lines (79 loc) • 2.58 kB
TypeScript
/**
* Page Type Detector - Automatically detects page types and suggests optimal filtering parameters
* Supports search engines, blogs, news sites, documentation, e-commerce, and more
*/
import { FilterOptions } from './types';
/**
 * Outcome of a page-type detection, i.e. the value that
 * `PageTypeDetector.detectPageType` resolves to.
 */
export interface PageTypeResult {
  /** The page type that was detected. */
  type: PageType;

  /** Confidence score for the detection, in the range 0 to 1. */
  confidence: number;

  /** Filter options recommended for the detected page. */
  filterOptions: FilterOptions;

  /** Reasons that led to the detection. */
  reasons: string[];

  /** Characteristics observed on the page. */
  characteristics: PageCharacteristics;
}
/**
 * Labels that `PageTypeResult.type` may hold: one string literal per
 * recognised page category, plus `'unknown'`.
 */
export type PageType =
  | 'search-engine'
  | 'blog'
  | 'news'
  | 'documentation'
  | 'e-commerce'
  | 'social-media'
  | 'forum'
  | 'landing-page'
  | 'article'
  | 'unknown';
/**
 * Characteristics of a page: a set of presence flags plus a few
 * numeric measurements.
 */
export interface PageCharacteristics {
  /** Whether the page has search functionality. */
  hasSearch: boolean;

  /** Whether the page has a navigation menu. */
  hasNavigation: boolean;

  /** Whether the page has article content. */
  hasArticleContent: boolean;

  /** Whether the page has product listings. */
  hasProductListings: boolean;

  /** Whether the page has social features. */
  hasSocialFeatures: boolean;

  /** Whether the page has a comments section. */
  hasComments: boolean;

  /** Whether the page has code blocks. */
  hasCodeBlocks: boolean;

  /** Link density ratio. */
  linkDensity: number;

  /** Ratio of text to HTML. */
  textDensity: number;

  /** How many forms the page contains. */
  formCount: number;

  /** How many images the page contains. */
  imageCount: number;
}
/**
 * Detector that classifies a page and reports a `PageTypeResult` for it.
 *
 * `detectPageType` is the only public member. All other members are private
 * and are listed in this declaration without parameter or return types, so
 * their exact signatures cannot be read from here.
 */
export declare class PageTypeDetector {
  /** Private detection rules (type not exposed in this declaration). */
  private detectionRules;

  /**
   * Detects the page type from HTML content and, optionally, the page URL.
   *
   * @param html - HTML content of the page to classify.
   * @param url - Optional URL of the page.
   * @returns A promise that resolves to the detection result.
   */
  detectPageType(html: string, url?: string): Promise<PageTypeResult>;

  /** Analyzes the characteristics of a page. */
  private analyzePageCharacteristics;

  /** Calculates a score for each page type. */
  private calculateTypeScores;

  /** Applies URL-based hints to the page type detection. */
  private applyUrlHints;

  /** Gets the optimal filter options for a detected page type. */
  private getFilterOptionsForType;

  // NOTE(review): the members below are undocumented and untyped in this
  // declaration. Judging by their names they look like per-feature element
  // checks; confirm against the implementation.
  private hasSearchElements;
  private hasNavigationElements;
  private hasArticleElements;
  private hasProductElements;
  private hasSocialElements;
  private hasCommentElements;
  private hasCodeElements;

  // NOTE(review): presumably per-page-type indicator checks; confirm against
  // the implementation.
  private hasSearchEngineIndicators;
  private hasBlogIndicators;
  private hasNewsIndicators;
  private hasDocIndicators;
  private hasForumIndicators;

  // NOTE(review): presumably builds the result for the 'unknown' page type;
  // confirm against the implementation.
  private createUnknownResult;
}
/**
 * Module-level `PageTypeDetector` value exported alongside the class.
 * NOTE(review): presumably a shared, ready-to-use instance; confirm against
 * the implementation.
 */
export declare const pageTypeDetector: PageTypeDetector;