shrink-dom
Version:
适用于网页分析、内容提取、AI训练数据准备和网页爬虫等场景,帮助开发者更高效地处理和优化DOM结构。
80 lines (79 loc) • 1.8 kB
TypeScript
/**
* DOM node type constants.
*
* The three numeric values match the standard DOM `Node.nodeType` codes of the
* same names (element = 1, text = 3, document = 9).
*
* This is an ambient (`declare`) enum: it only describes a value that must be
* provided at runtime by the compiled implementation, so the member values
* here have to stay in sync with that implementation.
*/
export declare enum NodeTypeEnum {
// Element node.
ELEMENT_NODE = 1,
// Text node.
TEXT_NODE = 3,
// Document node.
DOCUMENT_NODE = 9
}
/**
 * JSON representation of a node.
 *
 * Only `type` is required; every other field is optional.
 *
 * NOTE(review): this declaration does not tie the optional fields to a
 * particular `type`, so which fields accompany which variant is a convention
 * of the producing code and should be confirmed there. The groupings below
 * are inferred from the field names only.
 */
export interface JsonNode {
  /** Node variant: `'element'`, `'text'` or `'template'`. */
  type: 'element' | 'text' | 'template';

  /** Optional path string; presumably locates the node in its tree (confirm against producer). */
  path?: string;

  // --- fields that look element-related ---

  /** Optional tag name. */
  tag?: string;
  /** Optional attribute map from string keys to string values. */
  attrs?: Record<string, string>;
  /** Optional list of child nodes, each itself a `JsonNode`. */
  children?: JsonNode[];

  // --- field that looks text-related ---

  /** Optional text content. */
  text?: string;

  // --- fields that look template-related ---

  /** Optional template identifier string. */
  templateId?: string;
  /** Optional template hash string. */
  templateHash?: string;
  /** Optional list of numbers; their meaning is not visible in this file (TODO confirm). */
  params?: number[];
}
/**
 * Potential template.
 *
 * All fields except `path` are required.
 *
 * NOTE(review): the per-field descriptions restate the declared types; the
 * exact semantics (how `hash` is computed, what `depth` is measured from) are
 * not visible in this file and should be confirmed against the implementation.
 */
export interface PossibleTemplate {
  /** Hash string for this template. */
  hash: string;
  /** Optional path string. */
  path?: string;
  /** Numeric depth value. */
  depth: number;
  /** The template's structure, expressed as a `JsonNode`. */
  structure: JsonNode;
  /** The `JsonNode`s recorded as occurrences of this template. */
  occurrences: JsonNode[];
}
/**
 * Optimized template.
 *
 * Declares the same `hash` / `path` / `depth` / `structure` / `occurrences`
 * fields as `PossibleTemplate`, plus an `inlineValues` map.
 *
 * NOTE(review): the per-field descriptions restate the declared types; what
 * the `inlineValues` keys refer to and when a value is `null` are not visible
 * in this file and should be confirmed against the implementation.
 */
export interface OptimizedTemplate {
  /** Hash string for this template. */
  hash: string;
  /** Optional path string. */
  path?: string;
  /** Numeric depth value. */
  depth: number;
  /** The template's structure, expressed as a `JsonNode`. */
  structure: JsonNode;
  /** The `JsonNode`s recorded as occurrences of this template. */
  occurrences: JsonNode[];
  /** Map from string keys to a string value or `null`. */
  inlineValues: Record<string, string | null>;
}
/**
 * Chosen template.
 *
 * Extends `OptimizedTemplate` with an identifier and a per-node replacement
 * table.
 */
export interface ChosenTemplate extends OptimizedTemplate {
  /**
   * Map keyed by `JsonNode` object (identity-keyed, since it is a `Map`)
   * whose values are number arrays.
   * NOTE(review): presumably the values correspond to `JsonNode.params` —
   * confirm against the implementation.
   */
  replacements: Map<JsonNode, number[]>;
  /** Identifier string for this template. */
  id: string;
}
/**
 * Options bag named for a `DOMShrinker` (not visible in this section of the
 * file). Every field is optional.
 *
 * NOTE(review): defaults and exact semantics are not declared here. The
 * groupings and descriptions below are inferred from the field names and
 * types and should be confirmed against the implementation.
 */
export interface DOMShrinkerOptions {
  // --- template detection ---

  /** Numeric threshold; presumably the minimum depth for a template. */
  minTemplateDepth?: number;
  /** Numeric threshold; presumably the minimum number of occurrences for a template. */
  minTemplateOccurrences?: number;
  /** String; presumably prepended to generated template ids. */
  templateIdPrefix?: string;

  // --- heuristics / UI patterns ---

  /** Boolean switch for heuristic rules. */
  useHeuristicRules?: boolean;
  /** Per-pattern boolean switches, plus a free-form `custom` table. */
  uiPatterns?: {
    forms?: boolean;
    navigation?: boolean;
    cards?: boolean;
    tables?: boolean;
    /** Map from a string key to a list of strings; the entry format is not visible here. */
    custom?: Record<string, string[]>;
  };

  // --- semantic / attribute preservation ---

  /** One of `'low'`, `'medium'` or `'high'`. */
  semanticPreservationLevel?: 'low' | 'medium' | 'high';
  /** List of attribute names treated as semantic (presumably). */
  semanticAttributes?: string[];
  /** List of attribute names treated as critical (presumably). */
  criticalAttributes?: string[];
  /** Boolean switch; presumably keeps `data-*` attributes. */
  preserveDataAttributes?: boolean;
  /** Boolean switch; presumably keeps `aria-*` attributes. */
  preserveAriaAttributes?: boolean;
  /** Boolean switch; presumably keeps `role` attributes. */
  preserveRoles?: boolean;
}
/**
 * Configuration options for `ContentExtractor`. Every field is optional.
 *
 * NOTE(review): defaults and exact semantics are not declared here. The
 * descriptions below are inferred from the field names and types and should
 * be confirmed against the implementation.
 */
export interface ExtractorOptions {
  // --- tag lists ---

  /** List of tag names treated as form elements (presumably). */
  formElementTags?: string[];
  /** List of tag names treated as emphasis (presumably). */
  emphasisTags?: string[];

  // --- filtering ---

  /** List of keyword strings; presumably matching content is skipped. */
  skipKeywords?: string[];
  /** List of attribute names; presumably these are kept in the output. */
  preservedAttributes?: string[];

  // --- thresholds ---

  /** Numeric threshold for "brief" text; the unit is not visible here (TODO confirm). */
  briefTextThreshold?: number;
}