UNPKG

shrink-dom

Version:

适用于网页分析、内容提取、AI训练数据准备和网页爬虫等场景,帮助开发者更高效地处理和优化DOM结构。

858 lines (852 loc) 31.5 kB
"use strict";
//#region rolldown:runtime
// Bundler-generated interop helpers (kept verbatim): they wrap a CommonJS
// module so it can be consumed through an ES-module-like namespace object.
var __create = Object.create;
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __getProtoOf = Object.getPrototypeOf;
var __hasOwnProp = Object.prototype.hasOwnProperty;
var __copyProps = (to, from, except, desc) => {
  if (from && typeof from === "object" || typeof from === "function") for (var keys = __getOwnPropNames(from), i = 0, n = keys.length, key; i < n; i++) {
    key = keys[i];
    if (!__hasOwnProp.call(to, key) && key !== except) __defProp(to, key, {
      get: ((k) => from[k]).bind(null, key),
      enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable
    });
  }
  return to;
};
var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", {
  value: mod,
  enumerable: true
}) : target, mod));

//#endregion
const jsdom = __toESM(require("jsdom"));

//#region src/dom-domain-analyzer.ts
/**
 * Clones a DOM tree while annotating every element with an index and with
 * `data-visible` / `data-interactive` / `data-editable` flags derived from the
 * element and its computed style. Relies on the global `window` / `document`.
 */
var DOMDomainAnalyzer = class DOMDomainAnalyzer {
  /**
   * @param {object} [options]
   * @param {string} [options.annotationAttribute] Attribute that receives the element index.
   * @param {number} [options.minInteractiveSize] Minimum width/height (px) for an element to be highlighted.
   * @param {string} [options.highlightStyle] CSS `filter` value applied while highlighting.
   * @param {number} [options.highlightDuration] Highlight duration in milliseconds.
   * @param {boolean} [options.enableHighlight] Whether interactive elements are highlighted.
   */
  constructor(options = {}) {
    const defaults = DOMDomainAnalyzer.DEFAULT_CONFIG;
    this.annotatedElements = [];
    this.interactiveElements = [];
    // Empty strings are not usable here, so `||` is intentional for the string options.
    this.annotationAttribute = options.annotationAttribute || defaults.ANNOTATION_ATTRIBUTE;
    this.highlightStyle = options.highlightStyle || defaults.HIGHLIGHT_STYLE;
    // `??` (not `||`) so that an explicit 0 is honoured for the numeric options.
    this.minInteractiveSize = options.minInteractiveSize ?? defaults.MIN_INTERACTIVE_SIZE;
    this.highlightDuration = options.highlightDuration ?? defaults.HIGHLIGHT_DURATION;
    this.enableHighlight = options.enableHighlight !== void 0 ? options.enableHighlight : defaults.ENABLE_HIGHLIGHT;
  }

  /**
   * @returns {Element[]} A shallow copy of the elements annotated by the last analysis;
   * an element's position in the array is the index written to the annotation attribute.
   */
  getElements() {
    return [...this.annotatedElements];
  }

  /**
   * An element counts as hidden when its computed style hides it or when one
   * of the explicit marker attributes says so.
   * @param {Element} element
   * @param {CSSStyleDeclaration} computedStyle
   * @returns {boolean}
   */
  isElementVisible(element, computedStyle) {
    const hiddenByStyle = computedStyle.display === "none"
      || computedStyle.visibility === "hidden"
      || computedStyle.opacity === "0";
    if (hiddenByStyle) return false;
    const hiddenByAttribute = element.getAttribute("data-visible") === "false"
      || element.getAttribute("data-hidden") === "true"
      || element.getAttribute("data-ignore") === "true"
      || element.getAttribute("hidden") === "true";
    return !hiddenByAttribute;
  }

  /**
   * An element is interactive when its tag, an event handler, its ARIA role,
   * a `tabindex` attribute or a pointer cursor says so.
   * @param {Element} element
   * @param {CSSStyleDeclaration} computedStyle
   * @returns {boolean}
   */
  isElementInteractive(element, computedStyle) {
    const tagName = element.tagName;
    const isInteractiveTag = DOMDomainAnalyzer.INTERACTIVE_TAGS.includes(tagName)
      || (tagName === "INPUT" && DOMDomainAnalyzer.INTERACTIVE_INPUT_TYPES.includes(element.type))
      || (tagName === "LABEL" && element.hasAttribute("for"));
    const hasHandler = Boolean(element.onclick)
      || Boolean(element.onmousedown)
      || Boolean(element.onmouseup)
      || element.hasAttribute("onclick")
      || element.hasAttribute("onmousedown")
      || element.hasAttribute("onmouseup");
    const hasInteractiveRole = element.hasAttribute("role")
      && DOMDomainAnalyzer.INTERACTIVE_ROLES.includes(element.getAttribute("role") || "");
    const hasInteractiveAttr = hasHandler || hasInteractiveRole || element.hasAttribute("tabindex");
    return isInteractiveTag || hasInteractiveAttr || computedStyle.cursor === "pointer";
  }

  /**
   * An element is editable when it is an enabled, writable form control or
   * carries `contenteditable` with a value other than "false".
   * @param {Element} element
   * @returns {boolean}
   */
  isElementEditable(element) {
    const tagName = element.tagName;
    const isEditableTag = (tagName === "INPUT" && !DOMDomainAnalyzer.NON_EDITABLE_INPUT_TYPES.includes(element.type))
      || tagName === "SELECT"
      || tagName === "TEXTAREA";
    const isLocked = element.hasAttribute("readonly") || element.hasAttribute("disabled");
    const isFormElement = isEditableTag && !isLocked;
    const isContentEditable = element.hasAttribute("contenteditable")
      && element.getAttribute("contenteditable") !== "false";
    return isFormElement || isContentEditable;
  }

  /**
   * Recursively clones `node`, annotating each element clone and recording the
   * original elements in `annotatedElements` / `interactiveElements`.
   * @param {Node} node
   * @returns {Node} The annotated clone.
   */
  traverseNode(node) {
    const annotatedNode = node.cloneNode(false);
    if (node.nodeType === DOMDomainAnalyzer.ELEMENT_NODE) {
      const htmlElement = node;
      const computedStyle = window.getComputedStyle(htmlElement);
      this.annotatedElements.push(htmlElement);
      const elementIndex = this.annotatedElements.length - 1;
      annotatedNode.setAttribute(this.annotationAttribute, elementIndex.toString());

      const isVisible = this.isElementVisible(htmlElement, computedStyle);
      const isInteractive = this.isElementInteractive(htmlElement, computedStyle);
      const isEditable = this.isElementEditable(htmlElement);
      annotatedNode.setAttribute("data-visible", String(isVisible));
      annotatedNode.setAttribute("data-interactive", String(isInteractive));
      annotatedNode.setAttribute("data-editable", String(isEditable));
      if (isEditable || isInteractive) this.interactiveElements.push(htmlElement);

      // Mirror identifying attributes so they remain available after later rewriting.
      if (htmlElement.hasAttribute("id")) annotatedNode.setAttribute("data-original-id", htmlElement.id);
      if (htmlElement.hasAttribute("class")) annotatedNode.setAttribute("data-original-class", htmlElement.className);
      if (htmlElement.hasAttribute("name")) annotatedNode.setAttribute("data-original-name", htmlElement.getAttribute("name") || "");

      const isFormControl = htmlElement.tagName === "INPUT"
        || htmlElement.tagName === "TEXTAREA"
        || htmlElement.tagName === "SELECT";
      if (isFormControl) {
        // Live property values (not the markup attributes) are captured here.
        // NOTE(review): this also copies the value of password inputs — confirm that is intended.
        if ("value" in htmlElement) annotatedNode.setAttribute("data-value", htmlElement.value);
        if ("checked" in htmlElement && typeof htmlElement.checked === "boolean") annotatedNode.setAttribute("data-checked", htmlElement.checked.toString());
        if ("disabled" in htmlElement && typeof htmlElement.disabled === "boolean") annotatedNode.setAttribute("data-disabled", htmlElement.disabled.toString());
      }
    }
    for (const child of node.childNodes) {
      annotatedNode.appendChild(this.traverseNode(child));
    }
    return annotatedNode;
  }

  /**
   * Temporarily applies `highlightStyle` as a CSS filter to every recorded
   * interactive element that is at least `minInteractiveSize` wide and high,
   * then restores the previous filter after `highlightDuration` ms.
   */
  highlightInteractiveElements() {
    if (!this.enableHighlight) return;
    // `typeof` guards: referencing an undeclared global would throw before the warning.
    const hasDom = typeof document !== "undefined"
      && typeof window !== "undefined"
      && Boolean(document)
      && Boolean(window)
      && typeof document.createElement === "function";
    if (!hasDom) {
      console.warn("当前环境不支持 DOM 高亮功能");
      return;
    }
    const elementsToHighlight = this.interactiveElements.filter((el) => {
      try {
        const computedStyle = window.getComputedStyle(el);
        const width = Number.parseInt(computedStyle.width, 10);
        const height = Number.parseInt(computedStyle.height, 10);
        return width >= this.minInteractiveSize && height >= this.minInteractiveSize;
      } catch (e) {
        console.warn("获取元素样式时出错", e);
        return false;
      }
    });
    const transitionStyle = DOMDomainAnalyzer.DEFAULT_CONFIG.TRANSITION_STYLE;
    for (const el of elementsToHighlight) {
      try {
        const originalFilter = el.style.filter;
        el.style.filter = this.highlightStyle;
        el.style.transition = transitionStyle;
        // The previous filter is parked on the element so the timer can restore it.
        el.setAttribute("data-original-filter", originalFilter);
      } catch (e) {
        console.warn("应用高亮样式时出错", e);
      }
    }
    setTimeout(() => {
      for (const el of elementsToHighlight) {
        try {
          el.style.filter = el.getAttribute("data-original-filter") || "";
          el.removeAttribute("data-original-filter");
        } catch (e) {
          console.warn("恢复原始样式时出错", e);
        }
      }
    }, this.highlightDuration);
  }

  /**
   * Analyzes `element` (default: `document.documentElement`), triggers the
   * optional highlight, and returns the annotated clone's outer HTML.
   * @param {Element} [element]
   * @returns {string}
   */
  getAnalyzedHTML(element) {
    this.annotatedElements = [];
    this.interactiveElements = [];
    const analyzedDocument = this.traverseNode(element ?? document.documentElement);
    this.highlightInteractiveElements();
    return analyzedDocument.outerHTML;
  }

  /**
   * Convenience wrapper: analyzes the current document with a fresh analyzer.
   * @param {object} [options] Same options as the constructor.
   * @returns {string}
   */
  static analyze(options) {
    return new DOMDomainAnalyzer(options).getAnalyzedHTML();
  }
};
DOMDomainAnalyzer.DEFAULT_CONFIG = {
  ANNOTATION_ATTRIBUTE: "data-element-id",
  MIN_INTERACTIVE_SIZE: 20,
  HIGHLIGHT_STYLE: "brightness(1.2) contrast(1.1)",
  HIGHLIGHT_DURATION: 500,
  ENABLE_HIGHLIGHT: true,
  TRANSITION_STYLE: "filter 0.3s ease"
};
DOMDomainAnalyzer.INTERACTIVE_TAGS = ["A", "BUTTON"];
DOMDomainAnalyzer.INTERACTIVE_INPUT_TYPES = [
  "button",
  "submit",
  "reset",
  "checkbox",
  "radio",
  "file"
];
DOMDomainAnalyzer.INTERACTIVE_ROLES = [
  "button",
  "link",
  "checkbox",
  "radio",
  "menuitem",
  "tab"
];
DOMDomainAnalyzer.NON_EDITABLE_INPUT_TYPES = [
  "button",
  "submit",
  "reset",
  "hidden",
  "file"
];
DOMDomainAnalyzer.ELEMENT_NODE = 1;

//#endregion
//#region src/types.ts
var NodeTypeEnum;
(function(NodeTypeEnum$1) {
  NodeTypeEnum$1[NodeTypeEnum$1["ELEMENT_NODE"] = 1] = "ELEMENT_NODE";
  NodeTypeEnum$1[NodeTypeEnum$1["TEXT_NODE"] = 3] = "TEXT_NODE";
  NodeTypeEnum$1[NodeTypeEnum$1["DOCUMENT_NODE"] = 9] = "DOCUMENT_NODE";
})(NodeTypeEnum || (NodeTypeEnum = {}));

//#endregion
//#region src/shrink-dom.ts
/**
 * Converts a DOM tree to a JSON tree, detects structurally repeated subtrees
 * and serializes the result with each repeated subtree replaced by a
 * `{templateId(params)}` reference, preceded by the template definitions.
 */
var DOMShrinker = class DOMShrinker {
  /**
   * Creates a new DOM shrinker.
   * @param {object} [options] Configuration; every field falls back to a default.
   */
  constructor(options = {}) {
    const uiPatterns = options.uiPatterns;
    this.options = {
      minTemplateDepth: options.minTemplateDepth ?? DOMShrinker.DEFAULT_MIN_TEMPLATE_DEPTH,
      minTemplateOccurrences: options.minTemplateOccurrences ?? DOMShrinker.DEFAULT_MIN_TEMPLATE_OCCURRENCES,
      templateIdPrefix: options.templateIdPrefix ?? DOMShrinker.DEFAULT_TEMPLATE_ID_PREFIX,
      semanticAttributes: options.semanticAttributes ?? DOMShrinker.DEFAULT_SEMANTIC_ATTRIBUTES,
      useHeuristicRules: options.useHeuristicRules ?? true,
      uiPatterns: {
        forms: uiPatterns?.forms ?? true,
        navigation: uiPatterns?.navigation ?? true,
        cards: uiPatterns?.cards ?? true,
        tables: uiPatterns?.tables ?? true,
        custom: uiPatterns?.custom ?? {}
      },
      semanticPreservationLevel: options.semanticPreservationLevel ?? "medium",
      preserveDataAttributes: options.preserveDataAttributes ?? true,
      preserveAriaAttributes: options.preserveAriaAttributes ?? true,
      preserveRoles: options.preserveRoles ?? true,
      criticalAttributes: options.criticalAttributes ?? [
        "id",
        "name",
        "action",
        "method"
      ]
    };
  }

  /**
   * Converts a DOM node into its JSON representation. Text is trimmed, empty
   * text children are dropped, and node types other than element/text become
   * empty text nodes.
   * @param {Node} node
   * @returns {object} `{ type: "text", text }` or `{ type: "element", tag, attrs, children }`.
   */
  nodeToJson(node) {
    if (node.nodeType === NodeTypeEnum.TEXT_NODE) {
      return {
        type: "text",
        text: node.textContent?.trim() || ""
      };
    }
    if (node.nodeType !== NodeTypeEnum.ELEMENT_NODE) {
      return {
        type: "text",
        text: ""
      };
    }
    const attrs = {};
    for (const attr of Array.from(node.attributes)) {
      if (attr) attrs[attr.name] = attr.value || "";
    }
    const children = Array.from(node.childNodes)
      .map((child) => this.nodeToJson(child))
      .filter((childJson) => childJson.type === "text" && childJson.text || childJson.type === "element");
    return {
      type: "element",
      tag: node.tagName.toLowerCase(),
      attrs: Object.keys(attrs).length > 0 ? attrs : void 0,
      children: children.length > 0 ? children : void 0
    };
  }

  /**
   * Tells whether an attribute is semantic, i.e. whether its value (not just
   * its presence) takes part in template hashing.
   * @param {string} attrName
   * @returns {boolean}
   */
  isSemanticAttribute(attrName) {
    const { options } = this;
    if (options.semanticAttributes.includes(attrName)) return true;
    if (options.semanticPreservationLevel !== "low") {
      if (options.preserveDataAttributes && attrName.startsWith("data-")) return true;
      if (options.preserveAriaAttributes && attrName.startsWith("aria-")) return true;
      if (options.preserveRoles && attrName === "role") return true;
    }
    return options.criticalAttributes.includes(attrName);
  }

  /**
   * Computes a structural hash for a JSON node. Text nodes hash by length;
   * elements hash by tag, detected UI pattern, semantic attributes (name and
   * value), regular attributes (name only) and their children's hashes.
   * @param {object} node
   * @returns {string}
   */
  computeNodeHash(node) {
    if (node.type === "text") return `text:${node.text?.length || 0}`;
    const regularAttrs = [];
    const semanticAttrs = [];
    for (const [key, value] of Object.entries(node.attrs ?? {})) {
      if (this.isSemanticAttribute(key)) semanticAttrs.push(`${key}=${value}`);
      else regularAttrs.push(key);
    }
    let patternSignature = "";
    if (this.options.useHeuristicRules && node.type === "element") patternSignature = this.detectUIPattern(node);
    const childrenHashes = node.children ? node.children.map((child) => this.computeNodeHash(child)).join(",") : "";
    return `${node.tag}[pattern:${patternSignature}][semantic:${semanticAttrs.sort().join(",")}][attr:${regularAttrs.sort().join(",")}](${childrenHashes})`;
  }

  /**
   * Detects the UI pattern of a node from its tag, role and class names.
   * @param {object} node
   * @returns {string} "form", "navigation", "card", "table", a custom pattern name, or "".
   */
  detectUIPattern(node) {
    if (node.type !== "element") return "";
    const { uiPatterns } = this.options;
    const selectorsByPattern = DOMShrinker.UI_PATTERN_SELECTORS;
    const tag = node.tag?.toLowerCase() || "";
    const classAttr = node.attrs?.class || "";
    const classes = classAttr.split(/\s+/).filter(Boolean);
    const role = node.attrs?.role || "";
    const matchesTagOrClass = (selector) => tag === selector || classes.some((cls) => selector === `.${cls}`);

    if (uiPatterns.forms && (selectorsByPattern.forms.includes(tag) || role === "form" || classAttr.includes("form"))) return "form";
    if (uiPatterns.navigation && (selectorsByPattern.navigation.includes(tag) || role === "navigation" || classAttr.includes("nav"))) return "navigation";
    if (uiPatterns.cards && selectorsByPattern.cards.some(matchesTagOrClass)) return "card";
    if (uiPatterns.tables && (selectorsByPattern.tables.includes(tag) || role === "table" || role === "grid")) return "table";

    // Custom selectors may be a tag name, ".class" or "[attribute]".
    for (const [patternName, selectors] of Object.entries(uiPatterns.custom || {})) {
      const matched = selectors.some((selector) => matchesTagOrClass(selector)
        || selector.startsWith("[") && selector.endsWith("]") && node.attrs?.[selector.slice(1, -1)]);
      if (matched) return patternName;
    }
    return "";
  }

  /**
   * Walks the JSON tree, stamps every node with `path` and `templateHash`, and
   * groups element nodes at or below `minTemplateDepth` by contextual hash.
   * @param {object} root
   * @returns {object[]} Groups with at least `minTemplateOccurrences` members.
   */
  detectTemplates(root) {
    const { minTemplateDepth, minTemplateOccurrences } = this.options;
    const templatesByHash = new Map();
    const traverse = (node, depth, path = "") => {
      node.path = path;
      const basicHash = this.computeNodeHash(node);
      node.templateHash = basicHash;
      if (depth >= minTemplateDepth && node.type === "element") {
        const contextualHash = this.createContextualHash(node, basicHash, path);
        let template = templatesByHash.get(contextualHash);
        if (!template) {
          template = {
            hash: contextualHash,
            // Snapshot of the first occurrence; it serves as the template definition.
            structure: JSON.parse(JSON.stringify(node)),
            occurrences: [],
            depth,
            path
          };
          templatesByHash.set(contextualHash, template);
        }
        template.occurrences.push(node);
      }
      if (node.children) {
        node.children.forEach((child, i) => traverse(child, depth + 1, `${path}/${node.tag}[${i}]`));
      }
    };
    traverse(root, 0, "");
    return [...templatesByHash.values()].filter((t) => t.occurrences.length >= minTemplateOccurrences);
  }

  /**
   * Extends the basic hash with context so that equal structures in different
   * places can be kept apart, depending on `semanticPreservationLevel`.
   * @param {object} node
   * @param {string} basicHash
   * @param {string} path Slash-separated path of the node's ancestors.
   * @returns {string}
   */
  createContextualHash(node, basicHash, path) {
    const level = this.options.semanticPreservationLevel;
    const pathSegments = path.split("/").filter(Boolean);
    // Only the closest (at most three) ancestors take part in the context.
    const contextDepth = Math.min(3, pathSegments.length);
    const relevantPath = pathSegments.slice(-contextDepth).join("/");
    if (level === "high") {
      const semanticId = this.extractSemanticIdentifier(node);
      if (semanticId) return `${basicHash}#${relevantPath}#${semanticId}`;
    }
    if (level === "medium") {
      const pattern = this.detectUIPattern(node);
      if (pattern) return `${basicHash}#${pattern}#${relevantPath}`;
    }
    return basicHash;
  }

  /**
   * Extracts a semantic identifier from the first identifying attribute, the
   * role, or a class name ending in a structural suffix.
   * @param {object} node
   * @returns {string} e.g. "id:main", "role:list", "class:login-form", or "".
   */
  extractSemanticIdentifier(node) {
    if (node.type !== "element" || !node.attrs) return "";
    const { attrs } = node;
    const identifyingAttr = ["id", "data-testid", "data-purpose", "name"].find((attr) => attrs[attr]);
    if (identifyingAttr) return `${identifyingAttr}:${attrs[identifyingAttr]}`;
    if (attrs.role) return `role:${attrs.role}`;
    const semanticClass = (attrs.class || "")
      .split(/\s+/)
      .find((cls) => /-(form|container|section|card|wrapper|box|panel)$/.test(cls));
    return semanticClass ? `class:${semanticClass}` : "";
  }

  /**
   * Compares all occurrences of a template slot by slot. `inlineValues` maps a
   * slot path to the shared value, or to `null` when the slot varies.
   * @param {object} template A group produced by `detectTemplates`.
   * @returns {object} The template data plus `inlineValues`.
   */
  analyzeTemplate(template) {
    const inlineValues = {};
    const preserveSemanticValues = this.options.semanticPreservationLevel === "high";
    const analyzeNode = (nodes, path) => {
      if (nodes.length === 0) return;
      const first = nodes[0];
      if (first?.type === "text") {
        const firstText = first.text;
        const allSame = nodes.every((n) => n.text === firstText);
        inlineValues[path] = allSame ? firstText || null : null;
        return;
      }
      if (first?.attrs) {
        for (const key of Object.keys(first.attrs)) {
          const attrPath = `${path}.attrs.${key}`;
          const firstValue = first.attrs[key] || "";
          if (preserveSemanticValues && this.isSemanticAttribute(key)) {
            // At the "high" level semantic attributes are always inlined.
            inlineValues[attrPath] = firstValue;
            continue;
          }
          const allSame = nodes.every((n) => n.attrs && n.attrs[key] === firstValue);
          inlineValues[attrPath] = allSame ? firstValue : null;
        }
      }
      if (first?.children) {
        first.children.forEach((_, i) => {
          const childNodes = nodes.map((n) => n.children?.[i]).filter((node) => Boolean(node));
          analyzeNode(childNodes, `${path}.children.${i}`);
        });
      }
    };
    analyzeNode(template.occurrences, "");
    return {
      structure: template.structure,
      inlineValues,
      occurrences: template.occurrences,
      depth: template.depth,
      hash: template.hash,
      path: template.path
    };
  }

  /**
   * Picks templates greedily by score (occurrences x depth); a node is claimed
   * by at most one template. A template is kept only if it still has enough
   * unclaimed occurrences.
   * @param {object[]} templates Results of `analyzeTemplate`.
   * @returns {object[]} Chosen templates with `id`, `occurrences` and a `replacements` Map.
   */
  selectBestTemplates(templates) {
    const score = (t) => t.occurrences.length * t.depth;
    const sortedTemplates = [...templates].sort((a, b) => score(b) - score(a));
    // Honour the configured threshold, but never template a lone occurrence.
    const minOccurrences = Math.max(2, this.options.minTemplateOccurrences);
    const chosen = [];
    const usedNodes = new Set();
    for (const template of sortedTemplates) {
      const availableOccurrences = template.occurrences.filter((node) => !usedNodes.has(node));
      if (availableOccurrences.length < minOccurrences) continue;
      // One parameter index per varying slot; identical for every occurrence.
      // NOTE(review): params carry slot indexes only, not per-occurrence values — confirm intended.
      const variableSlotCount = Object.values(template.inlineValues).filter((value) => value === null).length;
      const replacements = new Map();
      for (const node of availableOccurrences) {
        replacements.set(node, Array.from({ length: variableSlotCount }, (_, index) => index));
        usedNodes.add(node);
      }
      chosen.push({
        ...template,
        id: `${this.options.templateIdPrefix}${chosen.length + 1}`,
        occurrences: availableOccurrences,
        replacements
      });
    }
    return chosen;
  }

  /**
   * Builds a copy of the tree in which every node claimed by a template is
   * replaced by a `{ type: "template", templateId, params }` node.
   * @param {object} root
   * @param {object[]} templates Results of `selectBestTemplates`.
   * @returns {object}
   */
  buildTemplateTree(root, templates) {
    // Single lookup table instead of scanning every template for every node.
    const replacementByNode = new Map();
    for (const template of templates) {
      for (const [node, params] of template.replacements) {
        if (params && !replacementByNode.has(node)) replacementByNode.set(node, {
          templateId: template.id,
          params
        });
      }
    }
    const cloneWithTemplates = (node) => {
      const replacement = replacementByNode.get(node);
      if (replacement) return {
        type: "template",
        templateId: replacement.templateId,
        params: replacement.params
      };
      const clone = { ...node };
      if (node.children) clone.children = node.children.map(cloneWithTemplates);
      return clone;
    };
    return cloneWithTemplates(root);
  }

  /**
   * Serializes the result: one `id: definition` line per template, an empty
   * line, then the templated tree.
   * @param {object} root
   * @param {object[]} templates
   * @returns {string}
   */
  stringifyTree(root, templates) {
    const definitions = templates.map((template) => `${template.id}: ${this.stringifyTemplate(template)}`);
    return [...definitions, "", this.stringifyNode(root)].join("\n");
  }

  /**
   * Serializes a template's structure (the snapshot of its first occurrence).
   * @param {object} template
   * @returns {string}
   */
  stringifyTemplate(template) {
    return this.stringifyNode(template.structure);
  }

  /**
   * Serializes a single node (template reference, text or element) to markup.
   * @param {object} node
   * @returns {string}
   */
  stringifyNode(node) {
    if (node.type === "template") return `{${node.templateId}(${node.params?.join(",") || ""})}`;
    if (node.type === "text") return this.escapeHtml(node.text || "");
    const attrText = Object.entries(node.attrs ?? {})
      .map(([key, value]) => ` ${key}="${this.escapeAttribute(value)}"`)
      .join("");
    const openTag = `<${node.tag}${attrText}`;
    if (!node.children || node.children.length === 0) {
      if (DOMShrinker.SELF_CLOSING_TAGS.has(node.tag || "")) return `${openTag} />`;
      return `${openTag}></${node.tag}>`;
    }
    const childText = node.children.map((child) => this.stringifyNode(child)).join("");
    return `${openTag}>${childText}</${node.tag}>`;
  }

  /**
   * Escapes text content for inclusion in markup.
   * @param {string} text
   * @returns {string}
   */
  escapeHtml(text) {
    return text.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;").replace(/'/g, "&#039;");
  }

  /**
   * Escapes a value for use inside a double-quoted attribute, so that a `"` or
   * `&` in the value cannot break or alter the serialized markup.
   * @param {string} value
   * @returns {string}
   */
  escapeAttribute(value) {
    return String(value).replace(/&/g, "&amp;").replace(/"/g, "&quot;");
  }

  /**
   * Counts a node's size: 1 per node plus 1 per attribute, recursively.
   * @param {object} node
   * @returns {number}
   */
  calculateNodeSize(node) {
    const attrCount = node.attrs ? Object.keys(node.attrs).length : 0;
    const childrenSize = node.children ? node.children.reduce((sum, child) => sum + this.calculateNodeSize(child), 0) : 0;
    return 1 + attrCount + childrenSize;
  }

  /**
   * Compresses a DOM tree.
   * @param {Document|Element} document$1 A document (its body is used) or an element.
   * @returns {string} Template definitions followed by the templated markup.
   */
  compressHTML(document$1) {
    const rootElement = document$1.nodeType === NodeTypeEnum.DOCUMENT_NODE ? document$1.body : document$1;
    const jsonTree = this.nodeToJson(rootElement);
    const analyzedTemplates = this.detectTemplates(jsonTree).map((template) => this.analyzeTemplate(template));
    const chosenTemplates = this.selectBestTemplates(analyzedTemplates);
    const templateTree = this.buildTemplateTree(jsonTree, chosenTemplates);
    return this.stringifyTree(templateTree, chosenTemplates);
  }

  /**
   * Parses an HTML string with jsdom and compresses its body.
   * @param {string} html
   * @param {object} [options] Same options as the constructor.
   * @returns {string}
   */
  static compressHTMLString(html, options) {
    const dom = new jsdom.JSDOM(html);
    return new DOMShrinker(options).compressHTML(dom.window.document.body);
  }
};
DOMShrinker.SELF_CLOSING_TAGS = new Set([
  "area",
  "base",
  "br",
  "col",
  "embed",
  "hr",
  "img",
  "input",
  "link",
  "meta",
  "param",
  "source",
  "track",
  "wbr"
]);
DOMShrinker.DEFAULT_MIN_TEMPLATE_DEPTH = 2;
DOMShrinker.DEFAULT_MIN_TEMPLATE_OCCURRENCES = 2;
DOMShrinker.DEFAULT_TEMPLATE_ID_PREFIX = "T";
DOMShrinker.DEFAULT_SEMANTIC_ATTRIBUTES = [
  "id",
  "role",
  "data-testid",
  "data-purpose",
  "data-section",
  "aria-label",
  "aria-labelledby",
  "aria-describedby",
  "name",
  "type",
  "for"
];
DOMShrinker.UI_PATTERN_SELECTORS = {
  forms: [
    "form",
    "input",
    "button",
    "select",
    "textarea",
    "label"
  ],
  navigation: [
    "nav",
    "a",
    "ul.menu",
    "ul.nav",
    ".navigation"
  ],
  cards: [
    ".card",
    "article",
    ".item",
    ".product"
  ],
  tables: [
    "table",
    "tr",
    "td",
    "th",
    "thead",
    "tbody"
  ]
};

//#endregion
//#region src/dom-content-extractor.ts
/**
 * Produces a simplified copy of a DOM tree that was annotated beforehand
 * (`data-visible`, `data-interactive`, `data-editable`): hidden elements,
 * images, advertisements and boilerplate text are dropped, wrappers with a
 * single child are collapsed, and only selected attributes are kept.
 */
var DOMContentExtractor = class DOMContentExtractor {
  /**
   * Creates a content extractor.
   * @param {object} [options]
   * @param {string[]} [options.formElementTags] Upper-case tags kept even when empty.
   * @param {string[]} [options.preservedAttributes] Attributes copied to the output.
   * @param {string[]} [options.emphasisTags] Upper-case tags flattened to plain text.
   * @param {string[]} [options.skipKeywords] Keywords whose presence in an element's direct text drops it.
   */
  constructor(options = {}) {
    // A private empty jsdom document is used as the factory for output nodes.
    this.jsdomInstance = new jsdom.JSDOM("<!DOCTYPE html><html><body></body></html>");
    this.documentNode = this.jsdomInstance.window.document;
    this.nodeType = this.jsdomInstance.window.Node;
    this.formElementTags = new Set(options.formElementTags || DOMContentExtractor.FORM_ELEMENT_TAGS);
    this.preservedAttributes = options.preservedAttributes || DOMContentExtractor.PRESERVED_ATTRIBUTES;
    this.preservedAttributesSet = new Set(this.preservedAttributes);
    this.emphasisTags = new Set(options.emphasisTags || DOMContentExtractor.EMPHASIS_TAGS);
    this.skipKeywords = options.skipKeywords || DOMContentExtractor.SKIP_KEYWORDS;
  }

  /**
   * Extracts the simplified DOM content.
   * @param {Node} element The node to process.
   * @returns {Node|null} The extracted node, or null when nothing is kept.
   */
  extract(element) {
    return this.transformNode(element);
  }

  /**
   * Checks whether extracted content is meaningful: at least 10 nodes, not
   * text nodes only, and not elements only without any text.
   * @param {Node|null} element The extracted node.
   * @returns {boolean}
   */
  isContentMeaningful(element) {
    if (!element) return false;
    const allNodes = [];
    const collectNodes = (node) => {
      allNodes.push(node);
      if (this.isElementNode(node)) {
        for (const child of Array.from(node.childNodes)) collectNodes(child);
      }
    };
    collectNodes(element);
    if (allNodes.length < 10) return false;
    if (allNodes.every((node) => this.isTextNode(node))) return false;
    const onlyElements = allNodes.every((node) => this.isElementNode(node));
    if (onlyElements && !allNodes.some((node) => node.textContent?.trim())) return false;
    return true;
  }

  /**
   * Transforms one node: text is trimmed, skipped elements yield null,
   * emphasis tags become plain text, wrappers holding a single child are
   * replaced by that child, everything else becomes a fresh element.
   * @param {Node} element The node to process.
   * @returns {Node|null}
   * @private
   */
  transformNode(element) {
    if (this.isTextNode(element)) {
      const text = element.textContent?.trim();
      return text ? this.documentNode.createTextNode(`${text} `) : null;
    }
    // Only HTML elements are processed; other node kinds are dropped.
    const type = Object.prototype.toString.call(element);
    if (!/HTML.*Element/.test(type)) return null;
    const tagName = element.tagName;
    if (this.shouldSkipBasedOnTag(element, tagName)) return null;
    if (this.emphasisTags.has(tagName)) {
      const textContent = element.textContent?.trim();
      return textContent ? this.documentNode.createTextNode(`${textContent} `) : null;
    }
    if (this.shouldSkipElement(element)) return null;
    const processedChildren = this.processChildNodes(element, tagName);
    if (processedChildren.length === 0 && !this.formElementTags.has(tagName)) return null;
    if (!this.shouldKeepElement(element) && processedChildren.length === 1) return processedChildren[0] ?? null;
    return this.createContainerWithAttributes(element, tagName, processedChildren);
  }

  /**
   * @param {Node} node
   * @returns {boolean} Whether the node is a text node.
   */
  isTextNode(node) {
    return node.nodeType === this.nodeType.TEXT_NODE;
  }

  /**
   * @param {Node} node
   * @returns {boolean} Whether the node is an element node.
   */
  isElementNode(node) {
    return node.nodeType === this.nodeType.ELEMENT_NODE;
  }

  /**
   * Drops images, elements not annotated `data-visible="true"`,
   * advertisements, and childless elements that are not form controls.
   * @param {Element} element
   * @param {string} tagName
   * @returns {boolean}
   */
  shouldSkipBasedOnTag(element, tagName) {
    if (tagName === "IMG") return true;
    if (element.getAttribute("data-visible") !== "true") return true;
    if (this.isAdvertisement(element)) return true;
    return element.childNodes.length === 0 && !this.formElementTags.has(tagName);
  }

  /**
   * Detects advertisements from class name, id, ad-related data attributes or
   * an aria-label containing "广告".
   * @param {Element} element
   * @returns {boolean}
   */
  isAdvertisement(element) {
    const adWordPattern = /\b(ad|ads|advert|advertisement|banner|sponsor|sponsored|promotion)\b/;
    const className = element.className.toLowerCase();
    if (className && adWordPattern.test(className)) return true;
    const id = element.id.toLowerCase();
    if (id && adWordPattern.test(id)) return true;
    return Boolean(element.hasAttribute("data-ad")
      || element.hasAttribute("data-sponsored")
      || element.getAttribute("data-ad-client")
      || element.getAttribute("aria-label")?.toLowerCase().includes("广告"));
  }

  /**
   * Transforms all child nodes, keeping the non-null results. Bare text nodes
   * directly under BODY are removed.
   * @param {Element} element
   * @param {string} tagName
   * @returns {Node[]}
   */
  processChildNodes(element, tagName) {
    const processedChildren = [];
    for (const childNode of Array.from(element.childNodes)) {
      if (!childNode) continue;
      const result = this.transformNode(childNode);
      if (result) processedChildren.push(result);
    }
    if (tagName !== "BODY") return processedChildren;
    return processedChildren.filter((child) => !(child && this.isTextNode(child)));
  }

  /**
   * An element is kept as its own container when it is interactive, editable,
   * or labelled (`aria-label` / `name`).
   * @param {Element} element
   * @returns {boolean}
   */
  shouldKeepElement(element) {
    const isInteractive = element.getAttribute("data-interactive") === "true" || element.hasAttribute("role");
    const isEditable = element.getAttribute("data-editable") === "true";
    const hasLabel = element.hasAttribute("aria-label") || element.hasAttribute("name");
    return isInteractive || hasLabel || isEditable;
  }

  /**
   * Creates the output element: copies preserved attributes (dropping
   * title/aria-label values that merely repeat the text, and a title equal to
   * the aria-label), exposes the annotation id as `id` on interactive or
   * editable elements, and appends the processed children.
   * @param {Element} htmlElement
   * @param {string} tagName
   * @param {Node[]} processedChildren
   * @returns {Element}
   */
  createContainerWithAttributes(htmlElement, tagName, processedChildren) {
    const container = this.documentNode.createElement(tagName);
    const elementText = htmlElement.textContent?.trim() || "";
    const titleValue = htmlElement.getAttribute("title");
    const ariaLabelValue = htmlElement.getAttribute("aria-label");
    const titleEqualsAriaLabel = titleValue && ariaLabelValue && titleValue === ariaLabelValue;
    for (const attr of htmlElement.getAttributeNames()) {
      if (!attr || !this.preservedAttributesSet.has(attr)) continue;
      const attrValue = htmlElement.getAttribute(attr);
      if ((attr === "title" || attr === "aria-label") && attrValue === elementText) continue;
      if (titleEqualsAriaLabel && attr === "title") continue;
      container.setAttribute(attr, attrValue);
    }
    const isInteractive = htmlElement.getAttribute("data-interactive") === "true" || htmlElement.hasAttribute("role");
    const isEditable = htmlElement.getAttribute("data-editable") === "true";
    if (isInteractive || isEditable) {
      const dataId = htmlElement.getAttribute("data-id") || htmlElement.getAttribute("data-element-id");
      if (dataId) container.setAttribute("id", dataId);
      if (isEditable) container.setAttribute("editable", "true");
    }
    for (const child of processedChildren) {
      if (child) container.appendChild(child);
    }
    return container;
  }

  /**
   * Checks whether an element should be skipped because its direct text
   * contains one of the skip keywords.
   * @param {Element} element
   * @returns {boolean}
   * @private
   */
  shouldSkipElement(element) {
    const directText = this.getDirectTextContent(element).toLowerCase();
    return Boolean(directText) && this.containsAnyKeyword(directText, this.skipKeywords);
  }

  /**
   * Returns the element's direct text content (text of child elements is not
   * included), trimmed.
   * @param {Element} element
   * @returns {string}
   * @private
   */
  getDirectTextContent(element) {
    let directText = "";
    for (const node of Array.from(element.childNodes)) {
      if (this.isTextNode(node)) directText += node.textContent || "";
    }
    return directText.trim();
  }

  /**
   * Checks whether the text contains any of the keywords. Keywords are
   * matched as whole words (with an optional plural "s") on every side where
   * the keyword starts/ends with an ASCII letter or digit, so that "ad" no
   * longer matches "read" or "download". Sides that start/end with any other
   * character (e.g. CJK keywords, "©") are matched as plain substrings.
   * @param {string} text Text to inspect; expected to be lower-cased by the caller.
   * @param {string[]} keywords Keyword list; compared lower-cased.
   * @returns {boolean}
   * @private
   */
  containsAnyKeyword(text, keywords) {
    return keywords.some((keyword) => Boolean(keyword)
      && DOMContentExtractor.getKeywordPattern(keyword.toLowerCase()).test(text));
  }

  /**
   * Builds (and caches) the matcher for one lower-cased keyword.
   * @param {string} keyword
   * @returns {RegExp} A stateless (non-global) pattern.
   * @private
   */
  static getKeywordPattern(keyword) {
    const cache = DOMContentExtractor.KEYWORD_PATTERN_CACHE;
    let pattern = cache.get(keyword);
    if (!pattern) {
      const escaped = keyword.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
      const prefix = /^[a-z0-9]/.test(keyword) ? "(?:^|[^a-z0-9])" : "";
      const suffix = /[a-z0-9]$/.test(keyword) ? "s?(?![a-z0-9])" : "";
      pattern = new RegExp(`${prefix}${escaped}${suffix}`);
      cache.set(keyword, pattern);
    }
    return pattern;
  }
};
/**
 * Default value constants.
 */
DOMContentExtractor.FORM_ELEMENT_TAGS = [
  "INPUT",
  "TEXTAREA",
  "SELECT",
  "BUTTON",
  "OPTION",
  "LABEL"
];
DOMContentExtractor.PRESERVED_ATTRIBUTES = [
  "aria-label",
  "data-name",
  "name",
  "type",
  "placeholder",
  "value",
  "role",
  "title"
];
DOMContentExtractor.EMPHASIS_TAGS = [
  "EM",
  "STRONG",
  "B",
  "I",
  "MARK",
  "SMALL",
  "DEL",
  "INS",
  "SUB",
  "SUP"
];
DOMContentExtractor.BRIEF_TEXT_THRESHOLD = 200;
DOMContentExtractor.SKIP_KEYWORDS = [
  "copyright",
  "©",
  "all rights reserved",
  "版权所有",
  "隐私政策",
  "privacy policy",
  "使用条款",
  "terms of service",
  "terms of use",
  "terms and conditions",
  "cookie",
  "cookies政策",
  "广告",
  "advertisement",
  "sponsored",
  "赞助",
  "ad",
  "ads",
  "promotion"
];
// Compiled keyword matchers, keyed by lower-cased keyword.
DOMContentExtractor.KEYWORD_PATTERN_CACHE = new Map();

//#endregion
exports.DOMContentExtractor = DOMContentExtractor;
exports.DOMDomainAnalyzer = DOMDomainAnalyzer;
exports.DOMShrinker = DOMShrinker;