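/**
 * shrink-dom
 * Version: (not specified)
 *
 * 适用于网页分析、内容提取、AI训练数据准备和网页爬虫等场景,帮助开发者更高效地处理和优化DOM结构。
 * (Suited to web-page analysis, content extraction, AI training-data preparation
 * and web crawling; helps developers process and optimize DOM structures more efficiently.)
 *
 * Bundled JavaScript build: 832 lines (827 loc), 30.5 kB.
 */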
import { JSDOM } from "jsdom";
//#region src/dom-domain-analyzer.ts
/**
 * Produces an annotated copy of a DOM subtree.
 *
 * Every element in the copy receives a sequential index attribute plus
 * `data-visible`, `data-interactive` and `data-editable` flags; the original
 * elements are recorded on the instance in the same order as the indices.
 * Optionally flashes a CSS filter on the interactive originals.
 */
var DOMDomainAnalyzer = class DOMDomainAnalyzer {
  /**
   * @param {object} [options]
   * @param {string} [options.annotationAttribute] Attribute that receives the element index.
   * @param {number} [options.minInteractiveSize] Minimum computed width/height (px) for highlighting.
   * @param {string} [options.highlightStyle] CSS `filter` value applied while highlighting.
   * @param {number} [options.highlightDuration] Milliseconds before the filter is restored.
   * @param {boolean} [options.enableHighlight] Whether highlighting runs at all.
   */
  constructor(options = {}) {
    const defaults = DOMDomainAnalyzer.DEFAULT_CONFIG;
    this.annotatedElements = [];
    this.interactiveElements = [];
    this.annotationAttribute = options.annotationAttribute || defaults.ANNOTATION_ATTRIBUTE;
    // `??` rather than `||`: an explicit 0 is a legitimate size / duration.
    this.minInteractiveSize = options.minInteractiveSize ?? defaults.MIN_INTERACTIVE_SIZE;
    this.highlightStyle = options.highlightStyle || defaults.HIGHLIGHT_STYLE;
    this.highlightDuration = options.highlightDuration ?? defaults.HIGHLIGHT_DURATION;
    this.enableHighlight = options.enableHighlight !== void 0 ? options.enableHighlight : defaults.ENABLE_HIGHLIGHT;
  }

  /**
   * @returns {Element[]} A shallow copy of the elements annotated by the last run,
   *   positioned by their annotation index.
   */
  getElements() {
    return [...this.annotatedElements];
  }

  /**
   * Decides whether an element counts as visible.
   * @param {Element} element
   * @param {CSSStyleDeclaration} computedStyle Computed style of `element`.
   * @returns {boolean} `false` when the style or an opt-out attribute hides the element.
   */
  isElementVisible(element, computedStyle) {
    const hiddenByStyle = computedStyle.display === "none" || computedStyle.visibility === "hidden" || computedStyle.opacity === "0";
    if (hiddenByStyle) return false;
    const hiddenByAttribute = element.getAttribute("data-visible") === "false"
      || element.getAttribute("data-hidden") === "true"
      || element.getAttribute("data-ignore") === "true"
      // `hidden` is a boolean attribute: its mere presence hides the element
      // (its value is usually the empty string, never the literal "true").
      || element.hasAttribute("hidden");
    return !hiddenByAttribute;
  }

  /**
   * Decides whether an element looks clickable / activatable.
   * @param {Element} element
   * @param {CSSStyleDeclaration} computedStyle Computed style of `element`.
   * @returns {boolean}
   */
  isElementInteractive(element, computedStyle) {
    const tagName = element.tagName;
    const isInteractiveTag = DOMDomainAnalyzer.INTERACTIVE_TAGS.includes(tagName)
      || tagName === "INPUT" && DOMDomainAnalyzer.INTERACTIVE_INPUT_TYPES.includes(element.type)
      || tagName === "LABEL" && element.hasAttribute("for");
    // Handlers are checked both as properties and as inline attributes.
    const hasHandler = ["onclick", "onmousedown", "onmouseup"].some((name) => Boolean(element[name]))
      || ["onclick", "onmousedown", "onmouseup"].some((name) => element.hasAttribute(name));
    const hasInteractiveRole = element.hasAttribute("role") && DOMDomainAnalyzer.INTERACTIVE_ROLES.includes(element.getAttribute("role") || "");
    const hasInteractiveAttr = hasHandler || hasInteractiveRole || element.hasAttribute("tabindex");
    return isInteractiveTag || hasInteractiveAttr || computedStyle.cursor === "pointer";
  }

  /**
   * Decides whether the user can type into / change the element.
   * @param {Element} element
   * @returns {boolean}
   */
  isElementEditable(element) {
    const tagName = element.tagName;
    const isFormControl = tagName === "INPUT" && !DOMDomainAnalyzer.NON_EDITABLE_INPUT_TYPES.includes(element.type)
      || tagName === "SELECT"
      || tagName === "TEXTAREA";
    const isLocked = element.hasAttribute("readonly") || element.hasAttribute("disabled");
    const isContentEditable = element.hasAttribute("contenteditable") && element.getAttribute("contenteditable") !== "false";
    return isFormControl && !isLocked || isContentEditable;
  }

  /**
   * Recursively clones `node`, annotating every element on the way down.
   * @param {Node} node
   * @returns {Node} The annotated clone of `node` including all descendants.
   */
  traverseNode(node) {
    const annotatedNode = node.cloneNode(false);
    if (node.nodeType === DOMDomainAnalyzer.ELEMENT_NODE) this.annotateElement(node, annotatedNode);
    for (const child of node.childNodes) annotatedNode.appendChild(this.traverseNode(child));
    return annotatedNode;
  }

  /**
   * Records `element` and writes the analysis attributes onto its clone.
   * @param {Element} element Original element (read only).
   * @param {Element} clone Shallow clone that receives the annotations.
   */
  annotateElement(element, clone) {
    const computedStyle = window.getComputedStyle(element);
    // The index is the element's position in `annotatedElements`.
    const elementIndex = this.annotatedElements.push(element) - 1;
    clone.setAttribute(this.annotationAttribute, elementIndex.toString());
    clone.setAttribute("data-visible", String(this.isElementVisible(element, computedStyle)));
    const isInteractive = this.isElementInteractive(element, computedStyle);
    clone.setAttribute("data-interactive", String(isInteractive));
    const isEditable = this.isElementEditable(element);
    clone.setAttribute("data-editable", String(isEditable));
    if (isEditable || isInteractive) this.interactiveElements.push(element);
    if (element.hasAttribute("id")) clone.setAttribute("data-original-id", element.id);
    if (element.hasAttribute("class")) clone.setAttribute("data-original-class", element.className);
    if (element.hasAttribute("name")) clone.setAttribute("data-original-name", element.getAttribute("name") || "");
    const isFormControl = element.tagName === "INPUT" || element.tagName === "TEXTAREA" || element.tagName === "SELECT";
    if (!isFormControl) return;
    // Live form state is a property, not an attribute, so copy it explicitly.
    if ("value" in element) clone.setAttribute("data-value", element.value);
    if ("checked" in element && typeof element.checked === "boolean") clone.setAttribute("data-checked", element.checked.toString());
    if ("disabled" in element && typeof element.disabled === "boolean") clone.setAttribute("data-disabled", element.disabled.toString());
  }

  /**
   * Temporarily applies `highlightStyle` as a CSS filter to the recorded
   * interactive elements that meet the minimum size, restoring the previous
   * filter after `highlightDuration` milliseconds. No-op when highlighting is
   * disabled; warns and returns when no DOM globals are available.
   */
  highlightInteractiveElements() {
    if (!this.enableHighlight) return;
    // `typeof` first: touching an undeclared global would throw a ReferenceError
    // instead of reaching the warning below.
    const hasDom = typeof document !== "undefined" && typeof window !== "undefined"
      && Boolean(document) && Boolean(window)
      && typeof document.createElement === "function";
    if (!hasDom) {
      console.warn("当前环境不支持 DOM 高亮功能");
      return;
    }
    const elementsToHighlight = this.interactiveElements.filter((el) => {
      try {
        const computedStyle = window.getComputedStyle(el);
        const width = Number.parseInt(computedStyle.width, 10);
        const height = Number.parseInt(computedStyle.height, 10);
        // Non-numeric sizes (e.g. "auto") parse to NaN and fail both comparisons.
        return width >= this.minInteractiveSize && height >= this.minInteractiveSize;
      } catch (e) {
        console.warn("获取元素样式时出错", e);
        return false;
      }
    });
    const transitionStyle = DOMDomainAnalyzer.DEFAULT_CONFIG.TRANSITION_STYLE;
    for (const el of elementsToHighlight) {
      try {
        const originalFilter = el.style.filter;
        el.style.filter = this.highlightStyle;
        el.style.transition = transitionStyle;
        // Stash the previous filter on the element so the timer can restore it.
        el.setAttribute("data-original-filter", originalFilter);
      } catch (e) {
        console.warn("应用高亮样式时出错", e);
      }
    }
    setTimeout(() => {
      for (const el of elementsToHighlight) {
        try {
          el.style.filter = el.getAttribute("data-original-filter") || "";
          el.removeAttribute("data-original-filter");
        } catch (e) {
          console.warn("恢复原始样式时出错", e);
        }
      }
    }, this.highlightDuration);
  }

  /**
   * Resets the recorded elements, annotates the subtree and triggers highlighting.
   * @param {Element} [element] Root to analyze; defaults to `document.documentElement`.
   * @returns {string} `outerHTML` of the annotated clone.
   */
  getAnalyzedHTML(element) {
    this.annotatedElements = [];
    this.interactiveElements = [];
    const analyzedDocument = this.traverseNode(element ?? document.documentElement);
    this.highlightInteractiveElements();
    return analyzedDocument.outerHTML;
  }

  /**
   * Convenience wrapper: analyzes the whole current document with a fresh instance.
   * @param {object} [options] Same options as the constructor.
   * @returns {string} Annotated HTML of `document.documentElement`.
   */
  static analyze(options) {
    return new DOMDomainAnalyzer(options).getAnalyzedHTML();
  }
};
// Fallback values for every constructor option, plus the highlight transition.
DOMDomainAnalyzer.DEFAULT_CONFIG = {
  ANNOTATION_ATTRIBUTE: "data-element-id",
  MIN_INTERACTIVE_SIZE: 20,
  HIGHLIGHT_STYLE: "brightness(1.2) contrast(1.1)",
  HIGHLIGHT_DURATION: 500,
  ENABLE_HIGHLIGHT: true,
  TRANSITION_STYLE: "filter 0.3s ease"
};
// Tag names (upper-case, as reported by `tagName`) that are always interactive.
DOMDomainAnalyzer.INTERACTIVE_TAGS = ["A", "BUTTON"];
// <input> types treated as interactive controls.
DOMDomainAnalyzer.INTERACTIVE_INPUT_TYPES = [
  "button",
  "submit",
  "reset",
  "checkbox",
  "radio",
  "file"
];
// `role` values treated as interactive.
DOMDomainAnalyzer.INTERACTIVE_ROLES = [
  "button",
  "link",
  "checkbox",
  "radio",
  "menuitem",
  "tab"
];
// <input> types that are never reported as editable.
DOMDomainAnalyzer.NON_EDITABLE_INPUT_TYPES = [
  "button",
  "submit",
  "reset",
  "hidden",
  "file"
];
// Numeric nodeType of an element node.
DOMDomainAnalyzer.ELEMENT_NODE = 1;
//#endregion
//#region src/types.ts
// Two-way lookup between DOM node-type names and their numeric codes
// (name -> code and code -> name), limited to the three kinds this module uses.
var NodeTypeEnum = NodeTypeEnum || {};
for (const [name, code] of [
  ["ELEMENT_NODE", 1],
  ["TEXT_NODE", 3],
  ["DOCUMENT_NODE", 9]
]) {
  NodeTypeEnum[name] = code;
  NodeTypeEnum[code] = name;
}
//#endregion
//#region src/shrink-dom.ts
/**
 * Compresses a DOM tree into a textual form in which repeated sub-structures
 * are declared once as templates and referenced from the body.
 *
 * Pipeline (see `compressHTML`): DOM -> plain JSON tree -> template detection
 * by structural hash -> per-template analysis -> template selection ->
 * template-substituted tree -> string.
 */
var DOMShrinker = class DOMShrinker {
  /**
   * Creates a new DOM shrinker.
   * @param {object} [options] Configuration; every missing field falls back to a default.
   */
  constructor(options = {}) {
    const patterns = options.uiPatterns;
    this.options = {
      minTemplateDepth: options.minTemplateDepth ?? DOMShrinker.DEFAULT_MIN_TEMPLATE_DEPTH,
      minTemplateOccurrences: options.minTemplateOccurrences ?? DOMShrinker.DEFAULT_MIN_TEMPLATE_OCCURRENCES,
      templateIdPrefix: options.templateIdPrefix ?? DOMShrinker.DEFAULT_TEMPLATE_ID_PREFIX,
      semanticAttributes: options.semanticAttributes ?? DOMShrinker.DEFAULT_SEMANTIC_ATTRIBUTES,
      useHeuristicRules: options.useHeuristicRules ?? true,
      uiPatterns: {
        forms: patterns?.forms ?? true,
        navigation: patterns?.navigation ?? true,
        cards: patterns?.cards ?? true,
        tables: patterns?.tables ?? true,
        custom: patterns?.custom ?? {}
      },
      semanticPreservationLevel: options.semanticPreservationLevel ?? "medium",
      preserveDataAttributes: options.preserveDataAttributes ?? true,
      preserveAriaAttributes: options.preserveAriaAttributes ?? true,
      preserveRoles: options.preserveRoles ?? true,
      criticalAttributes: options.criticalAttributes ?? [
        "id",
        "name",
        "action",
        "method"
      ]
    };
  }

  /**
   * Converts a DOM node into its plain-object representation.
   * Text is trimmed; empty text children are dropped; node kinds other than
   * text and element become an empty text node.
   * @param {Node} node
   * @returns {{type: string, text?: string, tag?: string, attrs?: object, children?: object[]}}
   */
  nodeToJson(node) {
    if (node.nodeType === NodeTypeEnum.TEXT_NODE) {
      return { type: "text", text: node.textContent?.trim() || "" };
    }
    if (node.nodeType !== NodeTypeEnum.ELEMENT_NODE) return { type: "text", text: "" };
    const attrs = {};
    for (const attr of Array.from(node.attributes)) {
      if (attr) attrs[attr.name] = attr.value || "";
    }
    const children = Array.from(node.childNodes)
      .map((child) => this.nodeToJson(child))
      .filter((childJson) => childJson.type === "element" || childJson.type === "text" && childJson.text);
    return {
      type: "element",
      tag: node.tagName.toLowerCase(),
      attrs: Object.keys(attrs).length > 0 ? attrs : void 0,
      children: children.length > 0 ? children : void 0
    };
  }

  /**
   * Tells whether an attribute's value (not just its presence) is part of a node's identity.
   * @param {string} attrName
   * @returns {boolean}
   */
  isSemanticAttribute(attrName) {
    const opts = this.options;
    if (opts.semanticAttributes.includes(attrName)) return true;
    if (opts.semanticPreservationLevel !== "low") {
      if (opts.preserveDataAttributes && attrName.startsWith("data-")) return true;
      if (opts.preserveAriaAttributes && attrName.startsWith("aria-")) return true;
      if (opts.preserveRoles && attrName === "role") return true;
    }
    return opts.criticalAttributes.includes(attrName);
  }

  /**
   * Computes a structural hash for a JSON node.
   * Text nodes hash by length only; elements hash by tag, detected UI pattern,
   * semantic attributes (name and value), other attributes (name only) and the
   * hashes of their children.
   * @param {object} node
   * @returns {string}
   */
  computeNodeHash(node) {
    if (node.type === "text") return `text:${node.text?.length || 0}`;
    const regularAttrs = [];
    const semanticAttrs = [];
    for (const [key, value] of Object.entries(node.attrs ?? {})) {
      if (this.isSemanticAttribute(key)) semanticAttrs.push(`${key}=${value}`);
      else regularAttrs.push(key);
    }
    const patternSignature = this.options.useHeuristicRules && node.type === "element" ? this.detectUIPattern(node) : "";
    const childrenHashes = node.children ? node.children.map((child) => this.computeNodeHash(child)).join(",") : "";
    return `${node.tag}[pattern:${patternSignature}][semantic:${semanticAttrs.sort().join(",")}][attr:${regularAttrs.sort().join(",")}](${childrenHashes})`;
  }

  /**
   * Classifies an element as one of the enabled UI patterns.
   * @param {object} node
   * @returns {string} "form", "navigation", "card", "table", a custom pattern name, or "".
   */
  detectUIPattern(node) {
    if (node.type !== "element") return "";
    const enabled = this.options.uiPatterns;
    const selectors = DOMShrinker.UI_PATTERN_SELECTORS;
    const tag = node.tag?.toLowerCase() || "";
    const classAttr = node.attrs?.class || "";
    const classes = classAttr.split(/\s+/).filter(Boolean);
    const role = node.attrs?.role || "";
    const hasClassSelector = (selector) => classes.some((cls) => selector === `.${cls}`);
    // NOTE(review): entries such as "ul.menu" or ".navigation" can never equal a
    // bare tag name, so in the `includes(tag)` checks below they have no effect.
    if (enabled.forms && (selectors.forms.includes(tag) || role === "form" || classAttr.includes("form"))) return "form";
    if (enabled.navigation && (selectors.navigation.includes(tag) || role === "navigation" || classAttr.includes("nav"))) return "navigation";
    if (enabled.cards && selectors.cards.some((selector) => tag === selector || hasClassSelector(selector))) return "card";
    if (enabled.tables && (selectors.tables.includes(tag) || role === "table" || role === "grid")) return "table";
    for (const [patternName, customSelectors] of Object.entries(enabled.custom || {})) {
      // Custom selectors may be a tag name, ".class" or "[attribute]".
      const matches = customSelectors.some((selector) => tag === selector
        || hasClassSelector(selector)
        || selector.startsWith("[") && selector.endsWith("]") && node.attrs?.[selector.slice(1, -1)]);
      if (matches) return patternName;
    }
    return "";
  }

  /**
   * Groups elements at or below the minimum depth by contextual hash and keeps
   * the groups that occur often enough. Also stamps `path` and `templateHash`
   * onto every visited node.
   * @param {object} root JSON tree root.
   * @returns {object[]} Template candidates `{hash, structure, occurrences, depth, path}`.
   */
  detectTemplates(root) {
    const { minTemplateDepth, minTemplateOccurrences } = this.options;
    const templates = {};
    const visit = (node, depth, path) => {
      node.path = path;
      const basicHash = this.computeNodeHash(node);
      node.templateHash = basicHash;
      if (depth >= minTemplateDepth && node.type === "element") {
        const contextualHash = this.createContextualHash(node, basicHash, path);
        if (!templates[contextualHash]) {
          templates[contextualHash] = {
            hash: contextualHash,
            // Snapshot of the first occurrence, detached from the live tree.
            structure: JSON.parse(JSON.stringify(node)),
            occurrences: [],
            depth,
            path
          };
        }
        templates[contextualHash].occurrences.push(node);
      }
      node.children?.forEach((child, index) => visit(child, depth + 1, `${path}/${node.tag}[${index}]`));
    };
    visit(root, 0, "");
    return Object.values(templates).filter((candidate) => candidate.occurrences.length >= minTemplateOccurrences);
  }

  /**
   * Refines a structural hash with positional / semantic context, depending on
   * the semantic preservation level.
   * @param {object} node
   * @param {string} basicHash Result of `computeNodeHash(node)`.
   * @param {string} path Slash-separated ancestor path of `node`.
   * @returns {string}
   */
  createContextualHash(node, basicHash, path) {
    const level = this.options.semanticPreservationLevel;
    // Only the last three path segments take part in the context.
    const relevantPath = path.split("/").filter(Boolean).slice(-3).join("/");
    const semanticId = this.extractSemanticIdentifier(node);
    if (level === "high" && semanticId) return `${basicHash}#${relevantPath}#${semanticId}`;
    if (level === "medium") {
      const pattern = this.detectUIPattern(node);
      if (pattern) return `${basicHash}#${pattern}#${relevantPath}`;
    }
    return basicHash;
  }

  /**
   * Picks the most identifying attribute of an element.
   * @param {object} node
   * @returns {string} `"<attr>:<value>"`, `"role:<value>"`, `"class:<name>"` or "".
   */
  extractSemanticIdentifier(node) {
    if (node.type !== "element" || !node.attrs) return "";
    const identifying = ["id", "data-testid", "data-purpose", "name"].find((attr) => node.attrs[attr]);
    if (identifying) return `${identifying}:${node.attrs[identifying]}`;
    if (node.attrs.role) return `role:${node.attrs.role}`;
    const semanticClass = (node.attrs.class || "")
      .split(/\s+/)
      .find((cls) => /-(form|container|section|card|wrapper|box|panel)$/.test(cls));
    return semanticClass ? `class:${semanticClass}` : "";
  }

  /**
   * Compares all occurrences of a template position by position.
   * `inlineValues[path]` holds the shared value when every occurrence agrees,
   * or `null` when the position varies between occurrences.
   * @param {object} template Candidate from `detectTemplates`.
   * @returns {object} The template fields plus `inlineValues`.
   */
  analyzeTemplate(template) {
    const inlineValues = {};
    const pinSemanticValues = this.options.semanticPreservationLevel === "high";
    const analyzeGroup = (nodes, path) => {
      if (nodes.length === 0) return;
      const first = nodes[0];
      if (first?.type === "text") {
        const firstText = first.text;
        const allSame = nodes.every((n) => n.text === firstText);
        inlineValues[path] = allSame ? (firstText || null) : null;
        return;
      }
      if (first?.attrs) {
        for (const key of Object.keys(first.attrs)) {
          const firstValue = first.attrs[key] || "";
          // At "high" level semantic attributes are always inlined from the
          // first occurrence; everything else is inlined only when unanimous.
          const isShared = pinSemanticValues && this.isSemanticAttribute(key)
            || nodes.every((n) => n.attrs && n.attrs[key] === firstValue);
          inlineValues[`${path}.attrs.${key}`] = isShared ? firstValue : null;
        }
      }
      first?.children?.forEach((_, index) => {
        const childGroup = nodes.map((n) => n.children?.[index]).filter((child) => Boolean(child));
        analyzeGroup(childGroup, `${path}.children.${index}`);
      });
    };
    analyzeGroup(template.occurrences, "");
    return {
      structure: template.structure,
      inlineValues,
      occurrences: template.occurrences,
      depth: template.depth,
      hash: template.hash,
      path: template.path
    };
  }

  /**
   * Greedily chooses templates by `occurrences * depth`, never assigning the
   * same node to two templates, and numbers the chosen ones.
   * @param {object[]} templates Analyzed templates.
   * @returns {object[]} Chosen templates with `id`, filtered `occurrences` and `replacements`.
   */
  selectBestTemplates(templates) {
    const score = (template) => template.occurrences.length * template.depth;
    const ranked = [...templates].sort((a, b) => score(b) - score(a));
    const chosen = [];
    const usedNodes = new Set();
    for (const template of ranked) {
      const availableOccurrences = template.occurrences.filter((node) => !usedNodes.has(node));
      if (availableOccurrences.length < 2) continue;
      // One parameter slot per position whose value varies between occurrences.
      const variableSlotCount = Object.values(template.inlineValues).filter((value) => value === null).length;
      const replacements = new Map();
      for (const node of availableOccurrences) {
        // NOTE(review): every occurrence receives the same slot indices
        // (0..k-1), not its own values — confirm this encoding is intended.
        replacements.set(node, Array.from({ length: variableSlotCount }, (_, index) => index));
        usedNodes.add(node);
      }
      chosen.push({
        ...template,
        id: `${this.options.templateIdPrefix}${chosen.length + 1}`,
        occurrences: availableOccurrences,
        replacements
      });
    }
    return chosen;
  }

  /**
   * Copies the tree, replacing every node owned by a template with a template reference.
   * @param {object} root
   * @param {object[]} templates Chosen templates.
   * @returns {object} New tree containing `{type: "template", templateId, params}` nodes.
   */
  buildTemplateTree(root, templates) {
    const cloneWithTemplates = (node) => {
      for (const template of templates) {
        const params = template.replacements.get(node);
        if (params) return { type: "template", templateId: template.id, params };
      }
      const clone = { ...node };
      if (node.children) clone.children = node.children.map(cloneWithTemplates);
      return clone;
    };
    return cloneWithTemplates(root);
  }

  /**
   * Serializes the result: one `id: markup` line per template, a blank line, then the tree.
   * @param {object} root Template-substituted tree.
   * @param {object[]} templates Chosen templates.
   * @returns {string}
   */
  stringifyTree(root, templates) {
    const definitions = templates.map((template) => `${template.id}: ${this.stringifyTemplate(template)}`);
    return [...definitions, "", this.stringifyNode(root)].join("\n");
  }

  /**
   * Serializes a template's structure snapshot as markup.
   * @param {object} template
   * @returns {string}
   */
  stringifyTemplate(template) {
    const serializeStructure = (node) => node.type === "text"
      ? this.escapeHtml(node.text || "")
      : this.stringifyElement(node, serializeStructure);
    return serializeStructure(template.structure);
  }

  /**
   * Serializes one node: template references as `{id(params)}`, text escaped, elements as markup.
   * @param {object} node
   * @returns {string}
   */
  stringifyNode(node) {
    if (node.type === "template") return `{${node.templateId}(${node.params?.join(",") || ""})}`;
    if (node.type === "text") return this.escapeHtml(node.text || "");
    return this.stringifyElement(node, (child) => this.stringifyNode(child));
  }

  /**
   * Shared element serializer used by `stringifyNode` and `stringifyTemplate`.
   * Attribute values are escaped so a quote or ampersand in a value cannot
   * break out of the attribute.
   * @param {object} node Element node.
   * @param {(child: object) => string} stringifyChild Serializer for each child.
   * @returns {string}
   */
  stringifyElement(node, stringifyChild) {
    let markup = `<${node.tag}`;
    for (const [key, value] of Object.entries(node.attrs ?? {})) {
      markup += ` ${key}="${this.escapeHtml(String(value))}"`;
    }
    if (!node.children || node.children.length === 0) {
      return DOMShrinker.SELF_CLOSING_TAGS.has(node.tag || "") ? `${markup} />` : `${markup}></${node.tag}>`;
    }
    const inner = node.children.map((child) => stringifyChild(child)).join("");
    return `${markup}>${inner}</${node.tag}>`;
  }

  /**
   * Escapes the five HTML-significant characters in a single pass.
   * @param {string} text
   * @returns {string}
   */
  escapeHtml(text) {
    // The reference is assembled from the entity name so that this source
    // stays correct even if a tool entity-decodes the file.
    return text.replace(/[&<>"']/g, (char) => `&${DOMShrinker.HTML_ENTITY_NAMES[char]};`);
  }

  /**
   * Counts a node, its attributes and, recursively, its descendants.
   * @param {object} node
   * @returns {number}
   */
  calculateNodeSize(node) {
    const attrCount = node.attrs ? Object.keys(node.attrs).length : 0;
    const childTotal = node.children ? node.children.reduce((sum, child) => sum + this.calculateNodeSize(child), 0) : 0;
    return 1 + attrCount + childTotal;
  }

  /**
   * Compresses a DOM tree.
   * @param {Document|Element} document$1 A document (its `body` is used) or an element.
   * @returns {string} Template definitions followed by the templated markup.
   */
  compressHTML(document$1) {
    const rootElement = document$1.nodeType === NodeTypeEnum.DOCUMENT_NODE ? document$1.body : document$1;
    const jsonTree = this.nodeToJson(rootElement);
    const analyzed = this.detectTemplates(jsonTree).map((template) => this.analyzeTemplate(template));
    const chosenTemplates = this.selectBestTemplates(analyzed);
    return this.stringifyTree(this.buildTemplateTree(jsonTree, chosenTemplates), chosenTemplates);
  }

  /**
   * Parses an HTML string with JSDOM and compresses its body.
   * @param {string} html
   * @param {object} [options] Same options as the constructor.
   * @returns {string}
   */
  static compressHTMLString(html, options) {
    const dom = new JSDOM(html);
    return new DOMShrinker(options).compressHTML(dom.window.document.body);
  }
};
// Void elements: serialized as `<tag />` when they have no children.
DOMShrinker.SELF_CLOSING_TAGS = new Set([
  "area",
  "base",
  "br",
  "col",
  "embed",
  "hr",
  "img",
  "input",
  "link",
  "meta",
  "param",
  "source",
  "track",
  "wbr"
]);
// Character -> entity name (without the surrounding "&" and ";").
DOMShrinker.HTML_ENTITY_NAMES = {
  "&": "amp",
  "<": "lt",
  ">": "gt",
  "\"": "quot",
  "'": "#39"
};
DOMShrinker.DEFAULT_MIN_TEMPLATE_DEPTH = 2;
DOMShrinker.DEFAULT_MIN_TEMPLATE_OCCURRENCES = 2;
DOMShrinker.DEFAULT_TEMPLATE_ID_PREFIX = "T";
// Attributes whose values take part in the structural hash by default.
DOMShrinker.DEFAULT_SEMANTIC_ATTRIBUTES = [
  "id",
  "role",
  "data-testid",
  "data-purpose",
  "data-section",
  "aria-label",
  "aria-labelledby",
  "aria-describedby",
  "name",
  "type",
  "for"
];
// Built-in selectors consulted by `detectUIPattern`.
DOMShrinker.UI_PATTERN_SELECTORS = {
  forms: [
    "form",
    "input",
    "button",
    "select",
    "textarea",
    "label"
  ],
  navigation: [
    "nav",
    "a",
    "ul.menu",
    "ul.nav",
    ".navigation"
  ],
  cards: [
    ".card",
    "article",
    ".item",
    ".product"
  ],
  tables: [
    "table",
    "tr",
    "td",
    "th",
    "thead",
    "tbody"
  ]
};
//#endregion
//#region src/dom-content-extractor.ts
/**
 * Reduces an annotated DOM tree (one carrying `data-visible`,
 * `data-interactive` and `data-editable` attributes) to its meaningful
 * content: invisible nodes, images, advertisements and boilerplate text are
 * dropped, wrappers with a single child are unwrapped, and only a whitelist of
 * attributes is kept. Output nodes are created in a private JSDOM document.
 */
var DOMContentExtractor = class DOMContentExtractor {
  /**
   * Creates a content extractor.
   * @param {object} [options]
   * @param {string[]} [options.formElementTags] Upper-case tags kept even when empty.
   * @param {string[]} [options.preservedAttributes] Attributes copied to the output.
   * @param {string[]} [options.emphasisTags] Upper-case tags flattened to plain text.
   * @param {string[]} [options.skipKeywords] Keywords whose presence in an element's direct text drops it.
   */
  constructor(options = {}) {
    this.jsdomInstance = new JSDOM("<!DOCTYPE html><html><body></body></html>");
    this.documentNode = this.jsdomInstance.window.document;
    this.nodeType = this.jsdomInstance.window.Node;
    this.formElementTags = new Set(options.formElementTags || DOMContentExtractor.FORM_ELEMENT_TAGS);
    this.preservedAttributes = options.preservedAttributes || DOMContentExtractor.PRESERVED_ATTRIBUTES;
    this.preservedAttributesSet = new Set(this.preservedAttributes);
    this.emphasisTags = new Set(options.emphasisTags || DOMContentExtractor.EMPHASIS_TAGS);
    this.skipKeywords = options.skipKeywords || DOMContentExtractor.SKIP_KEYWORDS;
  }

  /**
   * Extracts the simplified DOM content.
   * @param {Node} element Node to process.
   * @returns {Node|null} The extracted node, or `null` when nothing is left.
   */
  extract(element) {
    return this.transformNode(element);
  }

  /**
   * Checks whether extracted content is worth keeping.
   * @param {Node|null} element Extracted node.
   * @returns {boolean} `false` for `null`, for trees with fewer than 10 nodes,
   *   for text-only trees, and for element-only trees without any text.
   */
  isContentMeaningful(element) {
    if (!element) return false;
    const allNodes = [];
    const collect = (node) => {
      allNodes.push(node);
      if (!this.isElementNode(node)) return;
      for (const child of Array.from(node.childNodes)) collect(child);
    };
    collect(element);
    if (allNodes.length < 10) return false;
    const textCount = allNodes.filter((node) => this.isTextNode(node)).length;
    if (textCount === allNodes.length) return false;
    const elementCount = allNodes.filter((node) => this.isElementNode(node)).length;
    const hasAnyText = allNodes.some((node) => node.textContent?.trim());
    return !(elementCount === allNodes.length && !hasAnyText);
  }

  /**
   * Transforms one node (and, recursively, its children).
   * @param {Node} element Node to process.
   * @returns {Node|null} Replacement node, or `null` when the node is dropped.
   * @private
   */
  transformNode(element) {
    if (this.isTextNode(element)) {
      const text = element.textContent?.trim();
      // A trailing space keeps adjacent text runs from fusing together.
      return text ? this.documentNode.createTextNode(`${text} `) : null;
    }
    // Only HTML elements are processed; comments, SVG etc. are dropped.
    if (!/HTML.*Element/.test(Object.prototype.toString.call(element))) return null;
    const tagName = element.tagName;
    if (this.shouldSkipBasedOnTag(element, tagName)) return null;
    if (this.emphasisTags.has(tagName)) {
      const emphasized = element.textContent?.trim();
      return emphasized ? this.documentNode.createTextNode(`${emphasized} `) : null;
    }
    if (this.shouldSkipElement(element)) return null;
    const processedChildren = this.processChildNodes(element, tagName);
    if (processedChildren.length === 0 && !this.formElementTags.has(tagName)) return null;
    // A wrapper that carries no meaning of its own collapses into its only child.
    if (!this.shouldKeepElement(element) && processedChildren.length === 1) return processedChildren[0] ?? null;
    return this.createContainerWithAttributes(element, tagName, processedChildren);
  }

  /**
   * @param {Node} node
   * @returns {boolean} Whether `node` is a text node.
   */
  isTextNode(node) {
    return node.nodeType === this.nodeType.TEXT_NODE;
  }

  /**
   * @param {Node} node
   * @returns {boolean} Whether `node` is an element node.
   */
  isElementNode(node) {
    return node.nodeType === this.nodeType.ELEMENT_NODE;
  }

  /**
   * Cheap rejections: images, elements not marked `data-visible="true"`,
   * advertisements, and childless non-form elements.
   * @param {Element} element
   * @param {string} tagName Upper-case tag name of `element`.
   * @returns {boolean} `true` when the element must be dropped.
   */
  shouldSkipBasedOnTag(element, tagName) {
    if (tagName === "IMG") return true;
    if (element.getAttribute("data-visible") !== "true") return true;
    if (this.isAdvertisement(element)) return true;
    return element.childNodes.length === 0 && !this.formElementTags.has(tagName);
  }

  /**
   * Detects advertisement containers by class, id or ad-related attributes.
   * @param {Element} element
   * @returns {boolean}
   */
  isAdvertisement(element) {
    const adToken = /\b(ad|ads|advert|advertisement|banner|sponsor|sponsored|promotion)\b/;
    const className = element.className.toLowerCase();
    if (className && adToken.test(className)) return true;
    const id = element.id.toLowerCase();
    if (id && adToken.test(id)) return true;
    return Boolean(element.hasAttribute("data-ad")
      || element.hasAttribute("data-sponsored")
      || element.getAttribute("data-ad-client")
      || element.getAttribute("aria-label")?.toLowerCase().includes("广告"));
  }

  /**
   * Transforms all children and collects the survivors.
   * @param {Element} element
   * @param {string} tagName Upper-case tag name of `element`.
   * @returns {Node[]} Transformed children; bare text is removed directly under BODY.
   */
  processChildNodes(element, tagName) {
    const processedChildren = [];
    for (const childNode of Array.from(element.childNodes)) {
      if (!childNode) continue;
      const result = this.transformNode(childNode);
      if (result) processedChildren.push(result);
    }
    if (tagName !== "BODY") return processedChildren;
    return processedChildren.filter((child) => !(child && this.isTextNode(child)));
  }

  /**
   * Tells whether an element must survive as its own node even with a single child.
   * @param {Element} element
   * @returns {boolean}
   */
  shouldKeepElement(element) {
    const isInteractive = element.getAttribute("data-interactive") === "true" || element.hasAttribute("role");
    const isEditable = element.getAttribute("data-editable") === "true";
    const hasLabel = element.hasAttribute("aria-label") || element.hasAttribute("name");
    return isInteractive || hasLabel || isEditable;
  }

  /**
   * Builds the output element: whitelisted attributes, an `id` taken from the
   * annotation for interactive/editable elements, then the processed children.
   * @param {Element} htmlElement Source element.
   * @param {string} tagName Tag of the element to create.
   * @param {Node[]} processedChildren Already transformed children.
   * @returns {Element}
   */
  createContainerWithAttributes(htmlElement, tagName, processedChildren) {
    const container = this.documentNode.createElement(tagName);
    const elementText = htmlElement.textContent?.trim() || "";
    const titleValue = htmlElement.getAttribute("title");
    const ariaLabelValue = htmlElement.getAttribute("aria-label");
    const titleEqualsAriaLabel = titleValue && ariaLabelValue && titleValue === ariaLabelValue;
    for (const attr of htmlElement.getAttributeNames()) {
      if (!attr || !this.preservedAttributesSet.has(attr)) continue;
      const attrValue = htmlElement.getAttribute(attr);
      // A label that merely repeats the visible text adds nothing.
      if ((attr === "title" || attr === "aria-label") && attrValue === elementText) continue;
      // When title and aria-label agree, keep only aria-label.
      if (titleEqualsAriaLabel && attr === "title") continue;
      container.setAttribute(attr, attrValue);
    }
    const isInteractive = htmlElement.getAttribute("data-interactive") === "true" || htmlElement.hasAttribute("role");
    const isEditable = htmlElement.getAttribute("data-editable") === "true";
    if (isInteractive || isEditable) {
      const dataId = htmlElement.getAttribute("data-id") || htmlElement.getAttribute("data-element-id");
      if (dataId) container.setAttribute("id", dataId);
      if (isEditable) container.setAttribute("editable", "true");
    }
    for (const child of processedChildren) {
      if (child) container.appendChild(child);
    }
    return container;
  }

  /**
   * Checks whether an element should be skipped because its own text is boilerplate.
   * @param {Element} element HTML element.
   * @returns {boolean} Whether the element should be skipped.
   * @private
   */
  shouldSkipElement(element) {
    const directText = this.getDirectTextContent(element).toLowerCase();
    return Boolean(directText) && this.containsAnyKeyword(directText, this.skipKeywords);
  }

  /**
   * Returns the element's direct text (text of child elements excluded).
   * @param {Element} element HTML element.
   * @returns {string} Concatenated, trimmed text of the direct text-node children.
   * @private
   */
  getDirectTextContent(element) {
    let directText = "";
    for (const node of Array.from(element.childNodes)) {
      if (this.isTextNode(node)) directText += node.textContent || "";
    }
    return directText.trim();
  }

  /**
   * Checks whether the text contains any keyword from the list.
   *
   * Keywords made only of ASCII letters, digits and single spaces are matched
   * as whole words (with an optional plural "s"/"es"), so short keywords such
   * as "ad" no longer match inside "read" or "download". Any other keyword
   * (CJK text, symbols such as "©") is matched as a plain substring.
   * @param {string} text Text to check; expected to be lower-cased already.
   * @param {string[]} keywords Keyword list (compared case-insensitively).
   * @returns {boolean} Whether any keyword matches.
   * @private
   */
  containsAnyKeyword(text, keywords) {
    return keywords.some((keyword) => Boolean(keyword) && this.getKeywordMatcher(keyword)(text));
  }

  /**
   * Returns (and memoizes per instance) the predicate used to look for one keyword.
   * @param {string} keyword
   * @returns {(haystack: string) => boolean}
   * @private
   */
  getKeywordMatcher(keyword) {
    if (!this.keywordMatchers) this.keywordMatchers = new Map();
    let matcher = this.keywordMatchers.get(keyword);
    if (!matcher) {
      const needle = keyword.toLowerCase();
      if (/^[a-z0-9]+(?: [a-z0-9]+)*$/.test(needle)) {
        // `needle` contains only [a-z0-9 ], so it needs no regex escaping.
        // The regex is not global, hence `test` keeps no state between calls.
        const wholeWord = new RegExp(`(?<![a-z0-9])${needle}(?:e?s)?(?![a-z0-9])`);
        matcher = (haystack) => wholeWord.test(haystack);
      } else {
        matcher = (haystack) => haystack.includes(needle);
      }
      this.keywordMatchers.set(keyword, matcher);
    }
    return matcher;
  }
};
/**
 * Default values.
 */
// Tags kept even when they end up with no children.
DOMContentExtractor.FORM_ELEMENT_TAGS = [
  "INPUT",
  "TEXTAREA",
  "SELECT",
  "BUTTON",
  "OPTION",
  "LABEL"
];
// Attributes copied from the source element to the output element.
DOMContentExtractor.PRESERVED_ATTRIBUTES = [
  "aria-label",
  "data-name",
  "name",
  "type",
  "placeholder",
  "value",
  "role",
  "title"
];
// Inline tags that are flattened into plain text.
DOMContentExtractor.EMPHASIS_TAGS = [
  "EM",
  "STRONG",
  "B",
  "I",
  "MARK",
  "SMALL",
  "DEL",
  "INS",
  "SUB",
  "SUP"
];
DOMContentExtractor.BRIEF_TEXT_THRESHOLD = 200;
// Boilerplate / advertising keywords checked against an element's direct text.
DOMContentExtractor.SKIP_KEYWORDS = [
  "copyright",
  "©",
  "all rights reserved",
  "版权所有",
  "隐私政策",
  "privacy policy",
  "使用条款",
  "terms of service",
  "terms of use",
  "terms and conditions",
  "cookie",
  "cookies政策",
  "广告",
  "advertisement",
  "sponsored",
  "赞助",
  "ad",
  "ads",
  "promotion"
];
//#endregion
export { DOMContentExtractor, DOMDomainAnalyzer, DOMShrinker };