UNPKG

html-content-processor

Version:

A professional library for processing, cleaning, filtering, and converting HTML content to Markdown. Features advanced customization options, presets, plugin support, fluent API, and TypeScript integration for reliable content extraction.

423 lines (422 loc) 16.2 kB
"use strict"; /** * Intelligent Content Quality Filter * A universal content filtering system that identifies and preserves valuable text content * while removing noise, regardless of the website or page type. */ Object.defineProperty(exports, "__esModule", { value: true }); exports.createContentQualityPreset = exports.IntelligentContentFilter = exports.DEFAULT_CONTENT_QUALITY_OPTIONS = void 0; const dom_adapter_1 = require("../dom-adapter"); exports.DEFAULT_CONTENT_QUALITY_OPTIONS = { intensity: 'moderate', minTextDensity: 0.3, minTextLength: 10, maxNoiseRatio: 0.7, preserveNavigation: true, preserveStructuredData: true, qualityWeights: { textDensity: 0.3, semanticValue: 0.3, structuralImportance: 0.2, userEngagement: 0.2 } }; class IntelligentContentFilter { constructor(options = {}) { this.options = Object.assign(Object.assign({}, exports.DEFAULT_CONTENT_QUALITY_OPTIONS), options); } /** * Apply intelligent content filtering to HTML string */ filterHtmlString(html) { const document = (0, dom_adapter_1.parseHTML)(html); return this.filter(document); } /** * Apply intelligent content filtering to the document */ filter(document) { const startTime = Date.now(); let removedElements = 0; let preservedElements = 0; // Phase 1: Remove obvious noise elements removedElements += this.removeNoiseElements(document); // Phase 2: Analyze and score content blocks const contentBlocks = this.analyzeContentBlocks(document); // Phase 3: Apply quality-based filtering const filterResults = this.applyQualityFiltering(contentBlocks); // Phase 4: Clean up empty containers removedElements += this.cleanupEmptyContainers(document); // Phase 5: Preserve high-value elements preservedElements += this.preserveHighValueContent(document); return { removedElements, preservedElements, processingTime: Date.now() - startTime, qualityScore: this.calculateOverallQuality(document), contentBlocks: filterResults }; } /** * Remove elements that are obviously noise (styles, scripts, ads, etc.) */ removeNoiseElements(document) { let removed = 0; // Remove style and script elements const noiseSelectors = [ 'style', 'script', 'noscript', 'meta[http-equiv]', 'link[rel="stylesheet"]', 'link[rel="dns-prefetch"]', 'link[rel="preconnect"]' ]; // Add conditional noise selectors based on intensity if (this.options.intensity !== 'minimal') { noiseSelectors.push('[style*="display:none"]', '[style*="visibility:hidden"]', '[class*="ad"]', '[id*="ad"]', '[class*="advertisement"]'); } if (this.options.intensity === 'aggressive' || this.options.intensity === 'maximum') { noiseSelectors.push('[class*="popup"]', '[class*="modal"]', '[class*="overlay"]', '[class*="sidebar"]', '[class*="footer"]'); } noiseSelectors.forEach(selector => { const elements = document.querySelectorAll(selector); elements.forEach(element => { if (!this.isProtectedElement(element)) { element.remove(); removed++; } }); }); return removed; } /** * Analyze content blocks and assign quality scores */ analyzeContentBlocks(document) { const blocks = []; const candidates = document.querySelectorAll('div, article, section, main, p, li'); candidates.forEach((element, index) => { const block = this.analyzeElement(element, index); if (block.qualityScore > 0) { blocks.push(block); } }); return blocks.sort((a, b) => b.qualityScore - a.qualityScore); } /** * Analyze individual element and calculate quality score */ analyzeElement(element, index) { var _a; const textContent = ((_a = element.textContent) === null || _a === void 0 ? void 0 : _a.trim()) || ''; const htmlContent = element.innerHTML || ''; // Calculate various quality metrics const textDensity = this.calculateTextDensity(textContent, htmlContent); const semanticValue = this.calculateSemanticValue(element, textContent); const structuralImportance = this.calculateStructuralImportance(element); const userEngagement = this.calculateUserEngagementValue(element); // Calculate weighted quality score const qualityScore = textDensity * this.options.qualityWeights.textDensity + semanticValue * this.options.qualityWeights.semanticValue + structuralImportance * this.options.qualityWeights.structuralImportance + userEngagement * this.options.qualityWeights.userEngagement; return { element, index, textContent, textLength: textContent.length, textDensity, semanticValue, structuralImportance, userEngagement, qualityScore, shouldPreserve: qualityScore >= this.getQualityThreshold() }; } /** * Calculate text density (ratio of text to HTML) */ calculateTextDensity(textContent, htmlContent) { if (htmlContent.length === 0) return 0; // Remove common noise patterns from HTML const cleanHtml = htmlContent .replace(/<style[^>]*>.*?<\/style>/gi, '') .replace(/<script[^>]*>.*?<\/script>/gi, '') .replace(/style\s*=\s*"[^"]*"/gi, ''); const textLength = textContent.length; const htmlLength = cleanHtml.length; if (htmlLength === 0) return 0; return Math.min(textLength / htmlLength, 1); } /** * Calculate semantic value based on content characteristics */ calculateSemanticValue(element, textContent) { let score = 0; // Length scoring (sweet spot around 50-500 characters) if (textContent.length >= this.options.minTextLength) { if (textContent.length >= 50 && textContent.length <= 500) { score += 0.4; } else if (textContent.length > 500 && textContent.length <= 2000) { score += 0.3; } else if (textContent.length > 20) { score += 0.2; } } // Content quality indicators if (this.hasLinks(element)) score += 0.1; if (this.hasStructuredContent(textContent)) score += 0.2; if (this.isReadableText(textContent)) score += 0.2; if (!this.containsCodeOrStyles(textContent)) score += 0.1; return Math.min(score, 1); } /** * Calculate structural importance based on element position and semantics */ calculateStructuralImportance(element) { let score = 0; // Semantic HTML elements const tagName = element.tagName.toLowerCase(); const semanticTags = { 'main': 0.4, 'article': 0.3, 'section': 0.2, 'nav': 0.3, 'header': 0.2, 'h1': 0.4, 'h2': 0.3, 'h3': 0.2, 'p': 0.2, 'li': 0.1 }; score += semanticTags[tagName] || 0; // Navigation preservation if (this.options.preserveNavigation && this.isNavigationElement(element)) { score += 0.3; } // Position scoring (higher for content that appears earlier) const position = this.getElementPosition(element); if (position < 0.3) score += 0.2; else if (position < 0.6) score += 0.1; return Math.min(score, 1); } /** * Calculate user engagement value (links, interactive elements) */ calculateUserEngagementValue(element) { let score = 0; const links = element.querySelectorAll('a[href]'); const buttons = element.querySelectorAll('button, input[type="button"]'); const forms = element.querySelectorAll('form'); // Scoring based on interactive elements if (links.length > 0) { score += Math.min(links.length * 0.1, 0.3); // Bonus for external links const externalLinks = Array.from(links).filter(link => { const href = link.getAttribute('href'); return href && (href.startsWith('http') || href.startsWith('//')); }); score += Math.min(externalLinks.length * 0.05, 0.2); } if (buttons.length > 0) score += Math.min(buttons.length * 0.05, 0.1); if (forms.length > 0) score += 0.1; return Math.min(score, 1); } /** * Apply quality-based filtering to content blocks */ applyQualityFiltering(blocks) { const threshold = this.getQualityThreshold(); blocks.forEach(block => { if (!block.shouldPreserve && block.qualityScore < threshold) { // Low quality content - remove or simplify if (this.options.intensity === 'maximum') { block.element.remove(); } else if (this.options.intensity === 'aggressive') { this.simplifyElement(block.element); } } }); return blocks.filter(block => block.shouldPreserve); } /** * Get quality threshold based on intensity setting */ getQualityThreshold() { const thresholds = { 'minimal': 0.2, 'moderate': 0.4, 'aggressive': 0.6, 'maximum': 0.8 }; return thresholds[this.options.intensity]; } /** * Clean up empty containers after filtering */ cleanupEmptyContainers(document) { let removed = 0; const containers = ['div', 'span', 'section', 'article']; containers.forEach(tag => { const elements = document.querySelectorAll(tag); elements.forEach(element => { if (this.isEmptyContainer(element) && !this.isProtectedElement(element)) { element.remove(); removed++; } }); }); return removed; } /** * Preserve elements that are definitely valuable */ preserveHighValueContent(document) { let preserved = 0; // Always preserve main content areas const highValueSelectors = [ 'main', '[role="main"]', '.main-content', '.content', 'article', '[itemprop="articleBody"]' ]; if (this.options.preserveNavigation) { highValueSelectors.push('nav', '[role="navigation"]'); } highValueSelectors.forEach(selector => { const elements = document.querySelectorAll(selector); elements.forEach(element => { element.setAttribute('data-preserve', 'true'); preserved++; }); }); return preserved; } // Helper methods isProtectedElement(element) { return element.hasAttribute('data-preserve') || element.closest('[data-preserve]') !== null; } isNavigationElement(element) { const navIndicators = ['nav', 'menu', 'navigation', 'breadcrumb']; const className = element.className.toLowerCase(); const id = element.id.toLowerCase(); const tagName = element.tagName.toLowerCase(); return tagName === 'nav' || navIndicators.some(indicator => className.includes(indicator) || id.includes(indicator)); } hasLinks(element) { return element.querySelectorAll('a[href]').length > 0; } hasStructuredContent(text) { // Check for structured patterns return /[.!?]\s+[A-Z]/.test(text) || // Sentences /\d+\.\s/.test(text) || // Numbered lists /[•\-]\s/.test(text); // Bullet points } isReadableText(text) { // Check if text appears to be readable content vs code/styles const codePatterns = [ /\{[^}]*\}/, /[\w-]+:\s*[\w-]+;/, /function\s*\(/, /var\s+\w+\s*=/, /<[^>]+>/ // HTML tags ]; return !codePatterns.some(pattern => pattern.test(text)); } containsCodeOrStyles(text) { return /(\{[^}]*\}|[\w-]+:\s*[\w-]+;|function\s*\(|var\s+\w+\s*=)/.test(text); } getElementPosition(element) { // Calculate relative position in document (0 = top, 1 = bottom) const allElements = document.querySelectorAll('*'); const elementIndex = Array.from(allElements).indexOf(element); return elementIndex / allElements.length; } isEmptyContainer(element) { var _a; const textContent = ((_a = element.textContent) === null || _a === void 0 ? void 0 : _a.trim()) || ''; const hasChildren = element.children.length > 0; const hasPreserveAttribute = element.hasAttribute('data-preserve'); return !textContent && !hasChildren && !hasPreserveAttribute; } simplifyElement(element) { // Remove style attributes and simplify complex elements element.removeAttribute('style'); element.removeAttribute('class'); // Remove empty child elements const children = Array.from(element.children); children.forEach(child => { if (this.isEmptyContainer(child)) { child.remove(); } }); } calculateOverallQuality(document) { var _a, _b, _c, _d; const textLength = ((_b = (_a = document.body) === null || _a === void 0 ? void 0 : _a.textContent) === null || _b === void 0 ? void 0 : _b.length) || 0; const htmlLength = ((_d = (_c = document.body) === null || _c === void 0 ? void 0 : _c.innerHTML) === null || _d === void 0 ? void 0 : _d.length) || 0; if (htmlLength === 0) return 0; return Math.min(textLength / htmlLength, 1); } } exports.IntelligentContentFilter = IntelligentContentFilter; /** * Convenience function to create preset configurations */ function createContentQualityPreset(preset) { const presets = { clean: { intensity: 'minimal', minTextDensity: 0.2, minTextLength: 5, maxNoiseRatio: 0.8, preserveNavigation: true, preserveStructuredData: true }, balanced: { intensity: 'moderate', minTextDensity: 0.3, minTextLength: 10, maxNoiseRatio: 0.7, preserveNavigation: true, preserveStructuredData: true }, aggressive: { intensity: 'aggressive', minTextDensity: 0.5, minTextLength: 20, maxNoiseRatio: 0.5, preserveNavigation: false, preserveStructuredData: false }, maximum: { intensity: 'maximum', minTextDensity: 0.7, minTextLength: 30, maxNoiseRatio: 0.3, preserveNavigation: false, preserveStructuredData: false } }; return presets[preset]; } exports.createContentQualityPreset = createContentQualityPreset;