UNPKG

html-content-processor

Version:

A professional library for processing, cleaning, filtering, and converting HTML content to Markdown. Features advanced customization options, presets, plugin support, fluent API, and TypeScript integration for reliable content extraction.

341 lines (340 loc) 15.6 kB
"use strict"; /** * HTML Filter - for cleaning and filtering HTML content * Based on the Python version of PruningContentFilter */ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } return new (P || (P = Promise))(function (resolve, reject) { function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } step((generator = generator.apply(thisArg, _arguments || [])).next()); }); }; Object.defineProperty(exports, "__esModule", { value: true }); exports.HtmlFilter = void 0; const dom_adapter_1 = require("./dom-adapter"); class HtmlFilter { constructor(minWordThreshold, thresholdType = 'dynamic', threshold = 0.48) { this.includedTags = new Set([ 'p', 'div', 'article', 'section', 'main', 'content', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'blockquote', 'pre', 'code', 'ul', 'ol', 'li', 'table', 'thead', 'tbody', 'tr', 'td', 'th', 'figure', 'figcaption', 'img', 'video', 'audio', 'embed', 'iframe', 'object', 'strong', 'em', 'b', 'i', 'u', 'mark', 'small', 'del', 'ins', 'sub', 'sup', 'a', 'span', 'time', 'address', 'cite', 'q', 'dfn', 'abbr', 'data', 'var', 'samp', 'kbd', 'br', 'hr', 'wbr' ]); this.excludedTags = new Set([ 'nav', 'header', 'footer', 'aside', 'menu', 'menuitem', 'script', 'style', 'meta', 'link', 'title', 'head', 'noscript', 'template', 'slot', 'form', 'input', 'textarea', 'button', 'select', 'option', 'optgroup', 'label', 'fieldset', 'legend', 'canvas', 'svg', 'math' ]); this.headerTags = new Set(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']); this.negativePattern = /comment|meta|footer|footnote|sidebar|nav|advertisement|banner|social|share|related|recommended|trending|popular|ads?|popup|modal|overlay|cookie|consent|notification|breadcrumb|pagination|search-suggest|autocomplete/i; this.minWordCount = minWordThreshold || 2; this.threshold = threshold; this.thresholdType = thresholdType; this.tagImportance = { 'article': 1.5, 'main': 1.5, 'section': 1.2, 'div': 0.6, 'p': 1.1, 'h1': 1.4, 'h2': 1.3, 'h3': 1.2, 'h4': 1.15, 'h5': 1.1, 'h6': 1.05, 'blockquote': 1.1, 'ul': 0.9, 'ol': 0.9, 'li': 0.85, 'table': 0.9, 'tr': 0.8, 'td': 0.8, 'th': 0.85, 'figure': 1.0, 'figcaption': 0.9, 'code': 0.9, 'pre': 0.9, 'strong': 0.95, 'em': 0.95, 'b': 0.9, 'i': 0.9, 'a': 0.8, 'span': 0.6 }; this.metricConfig = { textDensity: true, linkDensity: true, tagWeight: true, classIdWeight: true, textLength: true }; this.metricWeights = { textDensity: 0.35, linkDensity: 0.15, tagWeight: 0.25, classIdWeight: 0.15, textLength: 0.1 }; } /** * Filters HTML content and returns an array of HTML blocks. * @param html HTML string * @returns Array of HTML blocks after filtering */ filterContent(html) { return __awaiter(this, void 0, void 0, function* () { if (!html) { return []; } let doc = yield (0, dom_adapter_1.parseHTML)(html); // If no body, add one if (!doc.body) { doc = yield (0, dom_adapter_1.parseHTML)(`<body>${html}</body>`); } yield this.removeComments(doc); this.removeUnwantedTags(doc); const body = doc.body; this.pruneTree(body); const contentBlocks = []; Array.from(body.children).forEach(element => { if (element.textContent && element.textContent.trim().length > 0) { contentBlocks.push(element.outerHTML); } }); return contentBlocks; }); } /** * Filters HTML content and returns a concatenated HTML string. * @param html HTML string * @returns Concatenated HTML string after filtering */ filterContentAsString(html) { return __awaiter(this, void 0, void 0, function* () { const blocks = yield this.filterContent(html); return blocks.join(''); }); } /** * Removes HTML comments from the document. * @param doc DOM document */ removeComments(doc) { return __awaiter(this, void 0, void 0, function* () { const nodeFilter = yield (0, dom_adapter_1.getNodeFilter)(); const document = yield (0, dom_adapter_1.getDocument)(); const nodeIterator = document.createNodeIterator(doc, nodeFilter.SHOW_COMMENT, null); let node; const nodesToRemove = []; // First, collect all comment nodes while ((node = nodeIterator.nextNode())) { if (node) { nodesToRemove.push(node); } } // Then, remove them nodesToRemove.forEach(commentNode => { var _a; (_a = commentNode.parentNode) === null || _a === void 0 ? void 0 : _a.removeChild(commentNode); }); }); } /** * Removes unwanted tags from the document. * @param doc DOM document */ removeUnwantedTags(doc) { // Remove standard excluded tags this.excludedTags.forEach(tag => { const elements = doc.getElementsByTagName(tag); // Convert to array as the collection changes during iteration Array.from(elements).forEach(element => { var _a; (_a = element.parentNode) === null || _a === void 0 ? void 0 : _a.removeChild(element); }); }); // Remove elements with noise-indicating attributes const noiseSelectors = [ // Hidden elements '[style*="display:none"]', '[style*="visibility:hidden"]', '[hidden]', // Ad-related '[class*="ad"]', '[id*="ad"]', '[class*="advertisement"]', '[id*="advertisement"]', '[class*="banner"]', '[id*="banner"]', // Search engine specific noise '[class*="suggest"]', '[id*="suggest"]', '[class*="autocomplete"]', '[id*="autocomplete"]', '[class*="dropdown"]', '[id*="dropdown"]', '[class*="popup"]', '[id*="popup"]', '[class*="modal"]', '[id*="modal"]', '[class*="overlay"]', '[id*="overlay"]', // Navigation and UI noise '[class*="breadcrumb"]', '[id*="breadcrumb"]', '[class*="pagination"]', '[id*="pagination"]', '[class*="toolbar"]', '[id*="toolbar"]', '[class*="sidebar"]', '[id*="sidebar"]', // Cookie and notification banners '[class*="cookie"]', '[id*="cookie"]', '[class*="consent"]', '[id*="consent"]', '[class*="notification"]', '[id*="notification"]' ]; noiseSelectors.forEach(selector => { try { const elements = doc.querySelectorAll(selector); Array.from(elements).forEach(element => { var _a; // Only remove if it's not a main content container if (!this.isMainContentContainer(element)) { (_a = element.parentNode) === null || _a === void 0 ? void 0 : _a.removeChild(element); } }); } catch (error) { // Selector might not be supported, continue } }); } /** * Check if element is a main content container that should be preserved */ isMainContentContainer(element) { const tag = element.tagName.toLowerCase(); const className = element.className.toLowerCase(); const id = element.id.toLowerCase(); // Preserve semantic content elements if (['main', 'article', 'section'].includes(tag)) { return true; } // Preserve elements with content-indicating names const contentIndicators = ['content', 'main', 'article', 'post', 'entry', 'text']; return contentIndicators.some(indicator => className.includes(indicator) || id.includes(indicator)); } /** * Prunes the tree structure. * @param node Current node */ pruneTree(node) { if (!node || !node.tagName) { return; } const tagName = node.tagName.toLowerCase(); const textLen = node.textContent ? node.textContent.trim().length : 0; const tagLen = node.innerHTML.length; const linkTextLen = Array.from(node.querySelectorAll('a')) .reduce((sum, a) => sum + (a.textContent ? a.textContent.trim().length : 0), 0); const metrics = { node: node, tagName: tagName, textLen: textLen, tagLen: tagLen, linkTextLen: linkTextLen }; const score = this.computeCompositeScore(metrics, textLen, tagLen, linkTextLen); let shouldRemove = false; if (this.thresholdType === 'fixed') { shouldRemove = score < this.threshold; } else { // dynamic const tagImportanceValue = this.tagImportance[metrics.tagName] || 0.7; const textRatio = tagLen > 0 ? textLen / tagLen : 0; // const linkRatio = textLen > 0 ? linkTextLen / textLen : 1; // linkRatio seems unused let currentThreshold = this.threshold; // Base threshold if (tagImportanceValue > 1) { currentThreshold *= 0.8; // Lower threshold for important tags } if (textRatio > 0.4) { // Lower threshold for high text density currentThreshold *= 0.9; } // Consider additional adjustments for linkRatio if it's relevant shouldRemove = score < currentThreshold; } if (shouldRemove && node.parentNode && node.parentNode !== node.ownerDocument) { node.parentNode.removeChild(node); return; } // Recursively prune children // Convert HTMLCollection to array for safe iteration while modifying the DOM const children = Array.from(node.children); for (let i = 0; i < children.length; i++) { this.pruneTree(children[i]); } // After processing children, re-evaluate the current node // This handles cases where children removal might make the parent insignificant if (node.children.length === 0 && (node.textContent || '').trim().length === 0 && !this.isEssentialTag(tagName)) { if (this.countWords(node.textContent || '') < this.minWordCount && node.parentNode && node.parentNode !== node.ownerDocument) { const parent = node.parentNode; parent.removeChild(node); // If parent becomes empty after child removal, it might also need pruning in a subsequent pass or by adjusting logic } } } isEssentialTag(tagName) { // Define tags that should not be removed even if empty, e.g., <br>, <img> // For now, let's assume no such tags or handle them based on existing includedTags return this.includedTags.has(tagName); // Placeholder logic } /** * Computes a composite score for a node based on various metrics. * @param metrics Filter metrics for the node * @param textLen Length of text content * @param tagLen Length of HTML content * @param linkTextLen Length of text within links * @returns Composite score */ computeCompositeScore(metrics, textLen, tagLen, linkTextLen) { let score = 0; let totalWeight = 0; if (this.metricConfig.textDensity) { const density = tagLen > 0 ? textLen / tagLen : 0; score += density * this.metricWeights.textDensity; totalWeight += this.metricWeights.textDensity; } if (this.metricConfig.linkDensity) { const linkDensity = textLen > 0 ? linkTextLen / textLen : 0; // Penalize high link density (often indicates navigation or ads) score -= linkDensity * this.metricWeights.linkDensity; totalWeight += this.metricWeights.linkDensity; // Weight is added, but value is subtracted } if (this.metricConfig.tagWeight) { const tagWeight = this.tagImportance[metrics.tagName] || 0.5; // Default weight for unknown tags score += tagWeight * this.metricWeights.tagWeight; totalWeight += this.metricWeights.tagWeight; } if (this.metricConfig.classIdWeight) { const classIdWeight = this.computeClassIdWeight(metrics.node); // Negative patterns reduce the score score += classIdWeight * this.metricWeights.classIdWeight; totalWeight += this.metricWeights.classIdWeight; } if (this.metricConfig.textLength) { // Normalize text length score (e.g., based on an expected average or max length) // Simple approach: penalize very short text, reward longer text up to a point const lengthScore = Math.min(1, textLen / 100); // Example normalization score += lengthScore * this.metricWeights.textLength; totalWeight += this.metricWeights.textLength; } // Normalize score by total weight if weights don't sum to 1 // This ensures the score is roughly within a predictable range (e.g., 0-1 if individual scores are normalized) return totalWeight > 0 ? score / totalWeight : 0; } /** * Computes a weight based on class names and ID. * Negative patterns (like 'comment', 'nav') decrease the score. * @param node HTML element * @returns Weight based on class/ID */ computeClassIdWeight(node) { let weight = 0; const classAndId = `${node.className} ${node.id}`.toLowerCase(); if (this.negativePattern.test(classAndId)) { weight -= 0.5; // Significant penalty for negative patterns } // Add more sophisticated class/ID analysis if needed // e.g., positive patterns, specific class weights return weight; } /** * Counts words in a string. * @param text Input string * @returns Number of words */ countWords(text) { if (!text) return 0; return text.trim().split(/\s+/).length; } } exports.HtmlFilter = HtmlFilter;