UNPKG

shrink-dom

Version:

适用于网页分析、内容提取、AI训练数据准备和网页爬虫等场景,帮助开发者更高效地处理和优化DOM结构。

80 lines (79 loc) 1.8 kB
/**
 * DOM node type constants.
 *
 * The numeric values match the standard DOM `Node.nodeType` codes for element,
 * text and document nodes. This is an ambient (`declare`) enum, so no
 * JavaScript is emitted from this declaration itself.
 */
export declare enum NodeTypeEnum {
  ELEMENT_NODE = 1,
  TEXT_NODE = 3,
  DOCUMENT_NODE = 9
}

/**
 * JSON representation of a node.
 *
 * Only `type` is required. Which of the optional fields accompany each `type`
 * value is not enforced by this declaration.
 * NOTE(review): presumably `tag`/`attrs`/`children` describe elements, `text`
 * describes text nodes, and `templateHash`/`templateId`/`params` describe
 * template references — confirm against the implementation.
 */
export interface JsonNode {
  /** Discriminator for the kind of node. */
  type:
    | 'element'
    | 'text'
    | 'template';
  path?: string;
  tag?: string;
  /** Attribute name to attribute value. */
  attrs?: Record<string, string>;
  /** Nested nodes, using the same representation recursively. */
  children?: JsonNode[];
  text?: string;
  templateHash?: string;
  templateId?: string;
  params?: number[];
}

/**
 * A potential template.
 *
 * Bundles a hash, one representative `structure` node, and the list of nodes
 * recorded as `occurrences`, together with a `depth` and an optional `path`.
 */
export interface PossibleTemplate {
  hash: string;
  structure: JsonNode;
  occurrences: JsonNode[];
  depth: number;
  path?: string;
}

/**
 * An optimized template.
 *
 * Carries the same fields as {@link PossibleTemplate} plus `inlineValues`,
 * a string-keyed map whose values are either a string or `null`.
 */
export interface OptimizedTemplate {
  structure: JsonNode;
  /** String-keyed values; an entry may be explicitly `null`. */
  inlineValues: Record<string, string | null>;
  occurrences: JsonNode[];
  depth: number;
  hash: string;
  path?: string;
}

/**
 * A chosen template: an {@link OptimizedTemplate} that has additionally been
 * given an `id` and a `replacements` map.
 */
export interface ChosenTemplate extends OptimizedTemplate {
  id: string;
  /**
   * Maps a node to a list of numbers.
   * NOTE(review): the meaning of the numbers is not visible in this file —
   * confirm against the implementation.
   */
  replacements: Map<JsonNode, number[]>;
}

/**
 * Configuration options for the DOM shrinker.
 *
 * Every option is optional. Defaults are not declared in this file.
 * NOTE(review): the option descriptions below are inferred from names only —
 * verify against the implementation before relying on them.
 */
export interface DOMShrinkerOptions {
  /** Presumably the minimum depth for a structure to qualify as a template. */
  minTemplateDepth?: number;
  /** Presumably the minimum number of occurrences for a template. */
  minTemplateOccurrences?: number;
  /** Presumably the prefix used when generating template ids. */
  templateIdPrefix?: string;
  semanticAttributes?: string[];
  useHeuristicRules?: boolean;
  /** Per-pattern boolean switches plus a string-keyed map of custom entries. */
  uiPatterns?: {
    forms?: boolean;
    navigation?: boolean;
    cards?: boolean;
    tables?: boolean;
    /** Custom pattern name to a list of strings. */
    custom?: Record<string, string[]>;
  };
  semanticPreservationLevel?:
    | 'low'
    | 'medium'
    | 'high';
  preserveDataAttributes?: boolean;
  preserveAriaAttributes?: boolean;
  preserveRoles?: boolean;
  criticalAttributes?: string[];
}

/**
 * Configuration options for ContentExtractor.
 *
 * Every option is optional. Defaults are not declared in this file.
 */
export interface ExtractorOptions {
  skipKeywords?: string[];
  preservedAttributes?: string[];
  formElementTags?: string[];
  emphasisTags?: string[];
  /** NOTE(review): unit (characters, words, …) is not visible here — confirm. */
  briefTextThreshold?: number;
}