/*
 * html-content-processor
 * Version:
 * A professional library for processing, cleaning, filtering, and converting HTML content to Markdown. Features advanced customization options, presets, plugin support, a fluent API, and TypeScript integration for reliable content extraction.
 * 423 lines (422 loc) • 16.2 kB
 * JavaScript
 */
;
/**
* Intelligent Content Quality Filter
* A universal content filtering system that identifies and preserves valuable text content
* while removing noise, regardless of the website or page type.
*/
Object.defineProperty(exports, "__esModule", { value: true });
exports.createContentQualityPreset = exports.IntelligentContentFilter = exports.DEFAULT_CONTENT_QUALITY_OPTIONS = void 0;
const dom_adapter_1 = require("../dom-adapter");
exports.DEFAULT_CONTENT_QUALITY_OPTIONS = {
intensity: 'moderate',
minTextDensity: 0.3,
minTextLength: 10,
maxNoiseRatio: 0.7,
preserveNavigation: true,
preserveStructuredData: true,
qualityWeights: {
textDensity: 0.3,
semanticValue: 0.3,
structuralImportance: 0.2,
userEngagement: 0.2
}
};
/**
 * Recognises an advertising token ("ad", "ads", "advert...", "adsense",
 * "adsbygoogle") in a lower-cased class/id value. The token must not be glued
 * to a preceding letter or digit, and "ad"/"ads" must not be followed by a
 * letter, so words such as "header", "heading", "breadcrumb" or "loading"
 * do not match.
 */
const AD_TOKEN_PATTERN = /(?:^|[^a-z0-9])(?:advert[a-z]*|adsense|adsbygoogle|ads?(?![a-z]))/;
/** Structural score contributed by an element's tag name. */
const STRUCTURAL_TAG_SCORES = Object.freeze({
    'main': 0.4,
    'article': 0.3,
    'section': 0.2,
    'nav': 0.3,
    'header': 0.2,
    'h1': 0.4,
    'h2': 0.3,
    'h3': 0.2,
    'p': 0.2,
    'li': 0.1
});
/** Minimum quality score a block needs to be preserved, per intensity. */
const QUALITY_THRESHOLDS = Object.freeze({
    'minimal': 0.2,
    'moderate': 0.4,
    'aggressive': 0.6,
    'maximum': 0.8
});
/**
 * Scores the content blocks of a DOM document and strips noise from it.
 *
 * The filter mutates the document it is given. Of the configured options only
 * `intensity`, `minTextLength`, `preserveNavigation` and `qualityWeights` are
 * read by this class; `minTextDensity`, `maxNoiseRatio` and
 * `preserveStructuredData` are accepted but not consulted by any method here.
 */
class IntelligentContentFilter {
    /**
     * @param {object} [options] Overrides layered on top of
     *   `DEFAULT_CONTENT_QUALITY_OPTIONS`. A partial `qualityWeights` object is
     *   merged key by key with the default weights.
     */
    constructor(options = {}) {
        const defaults = exports.DEFAULT_CONTENT_QUALITY_OPTIONS;
        const overrides = options || {};
        this.options = Object.assign({}, defaults, overrides);
        // Object.assign is shallow: without this second merge a partial
        // `qualityWeights` override would leave the other weights undefined
        // and turn every quality score into NaN.
        this.options.qualityWeights = Object.assign({}, defaults.qualityWeights, overrides.qualityWeights);
        // Element -> relative position cache; only populated while
        // analyzeContentBlocks() is running.
        this.positionLookup = null;
    }
    /**
     * Apply intelligent content filtering to an HTML string.
     *
     * @param {string} html Markup handed to the DOM adapter's `parseHTML`.
     * @returns {object} The same result object as `filter()`.
     */
    filterHtmlString(html) {
        const document = (0, dom_adapter_1.parseHTML)(html);
        return this.filter(document);
    }
    /**
     * Apply intelligent content filtering to the document (mutating it).
     *
     * @param {Document} document Document to clean.
     * @returns {{removedElements: number, preservedElements: number, processingTime: number, qualityScore: number, contentBlocks: object[]}}
     *   Counters, elapsed milliseconds, the document-wide text/HTML ratio and
     *   the analysed blocks that met the quality threshold.
     */
    filter(document) {
        const startTime = Date.now();
        let removedElements = 0;
        let preservedElements = 0;
        // Phase 1: Remove obvious noise elements
        removedElements += this.removeNoiseElements(document);
        // Phase 2: Analyze and score content blocks
        const contentBlocks = this.analyzeContentBlocks(document);
        // Phase 3: Apply quality-based filtering
        const filterResults = this.applyQualityFiltering(contentBlocks);
        // Phase 4: Clean up empty containers
        removedElements += this.cleanupEmptyContainers(document);
        // Phase 5: Mark high-value elements with data-preserve
        preservedElements += this.preserveHighValueContent(document);
        return {
            removedElements,
            preservedElements,
            processingTime: Date.now() - startTime,
            qualityScore: this.calculateOverallQuality(document),
            contentBlocks: filterResults
        };
    }
    /**
     * Remove elements that are obviously noise (styles, scripts, ads, etc.).
     * Elements that are protected (see `isProtectedElement`) are left alone.
     *
     * @param {Document} document Document to strip.
     * @returns {number} Number of `remove()` calls performed.
     */
    removeNoiseElements(document) {
        const intensity = this.options.intensity;
        const beyondMinimal = intensity !== 'minimal';
        const noiseSelectors = [
            'style',
            'script',
            'noscript',
            'meta[http-equiv]',
            'link[rel="stylesheet"]',
            'link[rel="dns-prefetch"]',
            'link[rel="preconnect"]'
        ];
        if (beyondMinimal) {
            // Inline styles are written both with and without a space after the colon.
            noiseSelectors.push('[style*="display:none"]', '[style*="display: none"]', '[style*="visibility:hidden"]', '[style*="visibility: hidden"]');
        }
        if (intensity === 'aggressive' || intensity === 'maximum') {
            noiseSelectors.push('[class*="popup"]', '[class*="modal"]', '[class*="overlay"]', '[class*="sidebar"]', '[class*="footer"]');
        }
        let removed = 0;
        const removeUnlessProtected = (element) => {
            if (this.isProtectedElement(element)) {
                return;
            }
            element.remove();
            removed++;
        };
        noiseSelectors.forEach(selector => {
            document.querySelectorAll(selector).forEach(element => removeUnlessProtected(element));
        });
        if (beyondMinimal) {
            // A plain substring selector such as [class*="ad"] also hits
            // "header", "heading", "breadcrumb", ... so ad containers are
            // detected with a token-aware check instead.
            document.querySelectorAll('[class], [id]').forEach(element => {
                if (this.isAdvertisementElement(element)) {
                    removeUnlessProtected(element);
                }
            });
        }
        return removed;
    }
    /**
     * Analyze content blocks and assign quality scores.
     *
     * @param {Document} document Document whose div/article/section/main/p/li
     *   elements are scored.
     * @returns {object[]} Blocks with a positive score, best first.
     */
    analyzeContentBlocks(document) {
        const blocks = [];
        const candidates = document.querySelectorAll('div, article, section, main, p, li');
        // The DOM is not mutated during analysis, so element positions can be
        // computed once here instead of rescanning the document per candidate.
        this.positionLookup = this.buildPositionLookup(document);
        try {
            candidates.forEach((element, index) => {
                const block = this.analyzeElement(element, index);
                if (block.qualityScore > 0) {
                    blocks.push(block);
                }
            });
        }
        finally {
            this.positionLookup = null;
        }
        return blocks.sort((a, b) => b.qualityScore - a.qualityScore);
    }
    /**
     * Analyze an individual element and calculate its weighted quality score.
     *
     * @param {Element} element Element to score.
     * @param {number} index Position of the element in the candidate list.
     * @returns {object} The element, its metrics, `qualityScore` and
     *   `shouldPreserve` (score at or above the current threshold).
     */
    analyzeElement(element, index) {
        const rawText = element.textContent;
        const textContent = rawText ? rawText.trim() : '';
        const htmlContent = element.innerHTML || '';
        const weights = this.options.qualityWeights;
        // Calculate various quality metrics
        const textDensity = this.calculateTextDensity(textContent, htmlContent);
        const semanticValue = this.calculateSemanticValue(element, textContent);
        const structuralImportance = this.calculateStructuralImportance(element);
        const userEngagement = this.calculateUserEngagementValue(element);
        // Calculate weighted quality score
        const qualityScore = textDensity * weights.textDensity +
            semanticValue * weights.semanticValue +
            structuralImportance * weights.structuralImportance +
            userEngagement * weights.userEngagement;
        return {
            element,
            index,
            textContent,
            textLength: textContent.length,
            textDensity,
            semanticValue,
            structuralImportance,
            userEngagement,
            qualityScore,
            shouldPreserve: qualityScore >= this.getQualityThreshold()
        };
    }
    /**
     * Calculate text density (ratio of text length to cleaned HTML length).
     *
     * @param {string} textContent Visible text of the element.
     * @param {string} htmlContent Inner HTML of the element.
     * @returns {number} Ratio clamped to [0, 1]; 0 when there is no HTML.
     */
    calculateTextDensity(textContent, htmlContent) {
        if (htmlContent.length === 0) {
            return 0;
        }
        // [\s\S] is used instead of "." so that <style>/<script> bodies that
        // span several lines are stripped too; inline style attributes are
        // dropped in both quoting styles.
        const cleanHtml = htmlContent
            .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
            .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
            .replace(/style\s*=\s*(?:"[^"]*"|'[^']*')/gi, '');
        const htmlLength = cleanHtml.length;
        if (htmlLength === 0) {
            return 0;
        }
        return Math.min(textContent.length / htmlLength, 1);
    }
    /**
     * Calculate semantic value based on content characteristics.
     *
     * @param {Element} element Element being scored.
     * @param {string} textContent Trimmed text of the element.
     * @returns {number} Score clamped to at most 1.
     */
    calculateSemanticValue(element, textContent) {
        let score = 0;
        const length = textContent.length;
        // Length scoring (sweet spot around 50-500 characters)
        if (length >= this.options.minTextLength) {
            if (length >= 50 && length <= 500) {
                score += 0.4;
            }
            else if (length > 500 && length <= 2000) {
                score += 0.3;
            }
            else if (length > 20) {
                score += 0.2;
            }
        }
        // Content quality indicators
        if (this.hasLinks(element)) {
            score += 0.1;
        }
        if (this.hasStructuredContent(textContent)) {
            score += 0.2;
        }
        if (this.isReadableText(textContent)) {
            score += 0.2;
        }
        if (!this.containsCodeOrStyles(textContent)) {
            score += 0.1;
        }
        return Math.min(score, 1);
    }
    /**
     * Calculate structural importance from tag name, navigation role and
     * position in the document.
     *
     * @param {Element} element Element being scored.
     * @returns {number} Score clamped to at most 1.
     */
    calculateStructuralImportance(element) {
        let score = 0;
        const tagName = element.tagName.toLowerCase();
        // Own-property check so unusual tag names never resolve to inherited
        // Object.prototype members.
        if (Object.prototype.hasOwnProperty.call(STRUCTURAL_TAG_SCORES, tagName)) {
            score += STRUCTURAL_TAG_SCORES[tagName];
        }
        // Navigation preservation
        if (this.options.preserveNavigation && this.isNavigationElement(element)) {
            score += 0.3;
        }
        // Position scoring (higher for content that appears earlier)
        const position = this.getElementPosition(element);
        if (position < 0.3) {
            score += 0.2;
        }
        else if (position < 0.6) {
            score += 0.1;
        }
        return Math.min(score, 1);
    }
    /**
     * Calculate user engagement value (links, buttons, forms).
     *
     * @param {Element} element Element being scored.
     * @returns {number} Score clamped to at most 1.
     */
    calculateUserEngagementValue(element) {
        let score = 0;
        const links = element.querySelectorAll('a[href]');
        const buttons = element.querySelectorAll('button, input[type="button"]');
        const forms = element.querySelectorAll('form');
        if (links.length > 0) {
            score += Math.min(links.length * 0.1, 0.3);
            // Bonus for links whose href is absolute or protocol-relative
            const externalLinks = Array.from(links).filter(link => {
                const href = link.getAttribute('href');
                return href && (href.startsWith('http') || href.startsWith('//'));
            });
            score += Math.min(externalLinks.length * 0.05, 0.2);
        }
        if (buttons.length > 0) {
            score += Math.min(buttons.length * 0.05, 0.1);
        }
        if (forms.length > 0) {
            score += 0.1;
        }
        return Math.min(score, 1);
    }
    /**
     * Apply quality-based filtering to content blocks. Low-quality blocks are
     * removed at 'maximum' intensity (unless protected) and simplified at
     * 'aggressive' intensity; other intensities leave the DOM untouched.
     *
     * @param {object[]} blocks Blocks produced by `analyzeElement`.
     * @returns {object[]} The blocks flagged `shouldPreserve`.
     */
    applyQualityFiltering(blocks) {
        const threshold = this.getQualityThreshold();
        const intensity = this.options.intensity;
        blocks.forEach(block => {
            const isLowQuality = !block.shouldPreserve && block.qualityScore < threshold;
            if (!isLowQuality) {
                return;
            }
            if (intensity === 'maximum') {
                // Honour data-preserve here as the other removal passes do.
                if (!this.isProtectedElement(block.element)) {
                    block.element.remove();
                }
            }
            else if (intensity === 'aggressive') {
                this.simplifyElement(block.element);
            }
        });
        return blocks.filter(block => block.shouldPreserve);
    }
    /**
     * Get the quality threshold for the configured intensity.
     *
     * @returns {number} Threshold; an unrecognised intensity falls back to the
     *   'moderate' threshold, matching how `removeNoiseElements` treats any
     *   non-'minimal' value.
     */
    getQualityThreshold() {
        const intensity = this.options.intensity;
        if (Object.prototype.hasOwnProperty.call(QUALITY_THRESHOLDS, intensity)) {
            return QUALITY_THRESHOLDS[intensity];
        }
        return QUALITY_THRESHOLDS.moderate;
    }
    /**
     * Clean up empty div/span/section/article containers after filtering.
     *
     * @param {Document} document Document to clean.
     * @returns {number} Number of containers removed.
     */
    cleanupEmptyContainers(document) {
        let removed = 0;
        const containers = ['div', 'span', 'section', 'article'];
        // Walk in reverse document order so descendants are visited before
        // their ancestors: a wrapper that becomes empty once its empty child
        // is removed is then removed as well.
        const elements = Array.from(document.querySelectorAll(containers.join(', '))).reverse();
        elements.forEach(element => {
            if (this.isEmptyContainer(element) && !this.isProtectedElement(element)) {
                element.remove();
                removed++;
            }
        });
        return removed;
    }
    /**
     * Mark elements that are definitely valuable with `data-preserve="true"`.
     *
     * @param {Document} document Document to annotate.
     * @returns {number} Number of distinct elements marked.
     */
    preserveHighValueContent(document) {
        const highValueSelectors = [
            'main',
            '[role="main"]',
            '.main-content',
            '.content',
            'article',
            '[itemprop="articleBody"]'
        ];
        if (this.options.preserveNavigation) {
            highValueSelectors.push('nav', '[role="navigation"]');
        }
        let preserved = 0;
        // One selector list yields each element once, so an element matching
        // several selectors (e.g. <main class="content">) is counted once.
        document.querySelectorAll(highValueSelectors.join(', ')).forEach(element => {
            element.setAttribute('data-preserve', 'true');
            preserved++;
        });
        return preserved;
    }
    // Helper methods
    /**
     * @param {Element} element
     * @returns {boolean} True when the element or one of its ancestors carries
     *   a `data-preserve` attribute.
     */
    isProtectedElement(element) {
        return element.hasAttribute('data-preserve') ||
            element.closest('[data-preserve]') !== null;
    }
    /**
     * @param {Element} element
     * @returns {boolean} True when the class or id attribute contains an
     *   advertising token (see AD_TOKEN_PATTERN).
     */
    isAdvertisementElement(element) {
        return [element.getAttribute('class'), element.getAttribute('id')].some(value => {
            if (typeof value !== 'string' || value.length === 0) {
                return false;
            }
            // Split camelCase ("sidebarAd" -> "sidebar Ad") before lower-casing
            // so the token boundaries survive.
            const normalized = value.replace(/([a-z0-9])([A-Z])/g, '$1 $2').toLowerCase();
            return AD_TOKEN_PATTERN.test(normalized);
        });
    }
    /**
     * @param {Element} element
     * @returns {boolean} True for <nav> or when class/id contains a
     *   navigation-like keyword.
     */
    isNavigationElement(element) {
        if (element.tagName.toLowerCase() === 'nav') {
            return true;
        }
        const navIndicators = ['nav', 'menu', 'navigation', 'breadcrumb'];
        // Read the attributes rather than the className/id properties so a
        // non-string className cannot break toLowerCase().
        const className = String(element.getAttribute('class') || '').toLowerCase();
        const id = String(element.getAttribute('id') || '').toLowerCase();
        return navIndicators.some(indicator => className.includes(indicator) || id.includes(indicator));
    }
    /**
     * @param {Element} element
     * @returns {boolean} True when the element contains at least one `a[href]`.
     */
    hasLinks(element) {
        return element.querySelectorAll('a[href]').length > 0;
    }
    /**
     * @param {string} text
     * @returns {boolean} True when the text shows sentence breaks, numbered
     *   items or bullet/hyphen markers.
     */
    hasStructuredContent(text) {
        return /[.!?]\s+[A-Z]/.test(text) || // Sentences
            /\d+\.\s/.test(text) || // Numbered lists
            /[•\-]\s/.test(text); // Bullet points
    }
    /**
     * @param {string} text
     * @returns {boolean} True when none of the code/style/markup patterns match.
     */
    isReadableText(text) {
        const codePatterns = [
            /\{[^}]*\}/,
            /[\w-]+:\s*[\w-]+;/,
            /function\s*\(/,
            /var\s+\w+\s*=/,
            /<[^>]+>/ // HTML tags
        ];
        return !codePatterns.some(pattern => pattern.test(text));
    }
    /**
     * @param {string} text
     * @returns {boolean} True when the text contains brace blocks,
     *   `prop: value;` pairs, `function(` or `var x =`.
     */
    containsCodeOrStyles(text) {
        return /(\{[^}]*\}|[\w-]+:\s*[\w-]+;|function\s*\(|var\s+\w+\s*=)/.test(text);
    }
    /**
     * Build an element -> relative position map for every element of the
     * document, in a single pass.
     *
     * @param {Document} document
     * @returns {Map<Element, number>} Index of each element divided by the
     *   total element count.
     */
    buildPositionLookup(document) {
        const lookup = new Map();
        const allElements = Array.from(document.querySelectorAll('*'));
        const total = allElements.length;
        allElements.forEach((node, index) => {
            lookup.set(node, index / total);
        });
        return lookup;
    }
    /**
     * Relative position of the element in its own document
     * (0 = first element, approaching 1 = last).
     *
     * @param {Element} element
     * @returns {number} Position, or 1 when the element is not attached to a
     *   document (so it earns no "appears early" bonus).
     */
    getElementPosition(element) {
        if (this.positionLookup && this.positionLookup.has(element)) {
            return this.positionLookup.get(element);
        }
        // Use the element's own document: a global `document` does not exist
        // under Node and would be the wrong tree for parsed HTML in a browser.
        const ownerDocument = element.ownerDocument;
        if (!ownerDocument) {
            return 1;
        }
        const allElements = Array.from(ownerDocument.querySelectorAll('*'));
        const elementIndex = allElements.indexOf(element);
        if (elementIndex < 0) {
            return 1;
        }
        return elementIndex / allElements.length;
    }
    /**
     * @param {Element} element
     * @returns {boolean} True when the element has no text, no child elements
     *   and no `data-preserve` attribute.
     */
    isEmptyContainer(element) {
        const rawText = element.textContent;
        const textContent = rawText ? rawText.trim() : '';
        const hasChildren = element.children.length > 0;
        const hasPreserveAttribute = element.hasAttribute('data-preserve');
        return !textContent && !hasChildren && !hasPreserveAttribute;
    }
    /**
     * Strip presentation attributes from the element and drop its empty
     * direct children.
     *
     * @param {Element} element
     */
    simplifyElement(element) {
        element.removeAttribute('style');
        element.removeAttribute('class');
        Array.from(element.children).forEach(child => {
            if (this.isEmptyContainer(child)) {
                child.remove();
            }
        });
    }
    /**
     * Ratio of body text length to body HTML length.
     *
     * @param {Document} document
     * @returns {number} Ratio clamped to [0, 1]; 0 when the body is missing or empty.
     */
    calculateOverallQuality(document) {
        const body = document.body;
        const textLength = body && body.textContent ? body.textContent.length : 0;
        const htmlLength = body && body.innerHTML ? body.innerHTML.length : 0;
        if (htmlLength === 0) {
            return 0;
        }
        return Math.min(textLength / htmlLength, 1);
    }
}
exports.IntelligentContentFilter = IntelligentContentFilter;
/**
 * Named option bundles. `qualityWeights` is intentionally absent from every
 * bundle, so the filter's defaults apply when a preset is passed to it.
 */
const CONTENT_QUALITY_PRESETS = Object.freeze({
    clean: Object.freeze({
        intensity: 'minimal',
        minTextDensity: 0.2,
        minTextLength: 5,
        maxNoiseRatio: 0.8,
        preserveNavigation: true,
        preserveStructuredData: true
    }),
    balanced: Object.freeze({
        intensity: 'moderate',
        minTextDensity: 0.3,
        minTextLength: 10,
        maxNoiseRatio: 0.7,
        preserveNavigation: true,
        preserveStructuredData: true
    }),
    aggressive: Object.freeze({
        intensity: 'aggressive',
        minTextDensity: 0.5,
        minTextLength: 20,
        maxNoiseRatio: 0.5,
        preserveNavigation: false,
        preserveStructuredData: false
    }),
    maximum: Object.freeze({
        intensity: 'maximum',
        minTextDensity: 0.7,
        minTextLength: 30,
        maxNoiseRatio: 0.3,
        preserveNavigation: false,
        preserveStructuredData: false
    })
});
/**
 * Convenience function to create preset configurations.
 *
 * @param {string} preset One of 'clean', 'balanced', 'aggressive' or 'maximum'.
 * @returns {object|undefined} A fresh, mutable copy of the preset's options,
 *   or `undefined` when the name is not a known preset.
 */
function createContentQualityPreset(preset) {
    // Own-key check: a bare property lookup would hand back inherited members
    // such as Object.prototype.constructor for names like 'constructor'.
    if (!Object.prototype.hasOwnProperty.call(CONTENT_QUALITY_PRESETS, preset)) {
        return undefined;
    }
    // Copy so callers can tweak the result without affecting later calls.
    return Object.assign({}, CONTENT_QUALITY_PRESETS[preset]);
}
exports.createContentQualityPreset = createContentQualityPreset;