UNPKG

defuddle

Version:

Extract article content and metadata from web pages.

312 lines 11 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.ContentScorer = void 0; const constants_1 = require("./constants"); const contentIndicators = [ 'admonition', 'article', 'content', 'entry', 'image', 'img', 'font', 'figure', 'figcaption', 'pre', 'main', 'post', 'story', 'table' ]; // Text content to test against const navigationIndicators = [ 'advertisement', 'all rights reserved', 'banner', 'cookie', 'comments', 'copyright', 'follow me', 'follow us', 'footer', 'header', 'homepage', 'login', 'menu', 'more articles', 'more like this', 'most read', 'nav', 'navigation', 'newsletter', 'newsletter', 'popular', 'privacy', 'recommended', 'register', 'related', 'responses', 'share', 'sidebar', 'sign in', 'sign up', 'signup', 'social', 'sponsored', 'subscribe', 'subscribe', 'terms', 'trending' ]; // Classes that indicate non-content these are elements are // not removed, but lower the score const nonContentPatterns = [ 'ad', 'banner', 'cookie', 'copyright', 'footer', 'header', 'homepage', 'menu', 'nav', 'newsletter', 'popular', 'privacy', 'recommended', 'related', 'rights', 'share', 'sidebar', 'social', 'sponsored', 'subscribe', 'terms', 'trending', 'widget' ]; class ContentScorer { constructor(doc, debug = false) { this.doc = doc; this.debug = debug; } static scoreElement(element) { let score = 0; // Text density const text = element.textContent || ''; const words = text.split(/\s+/).length; score += words; // Paragraph ratio const paragraphs = element.getElementsByTagName('p').length; score += paragraphs * 10; // Link density (penalize high link density) const links = element.getElementsByTagName('a').length; const linkDensity = links / (words || 1); score -= linkDensity * 5; // Image ratio (penalize high image density) const images = element.getElementsByTagName('img').length; const imageDensity = images / (words || 1); score -= imageDensity * 3; // Position bonus (center/right elements) try { const style = element.getAttribute('style') || ''; const align = element.getAttribute('align') || ''; const isRightSide = style.includes('float: right') || style.includes('text-align: right') || align === 'right'; if (isRightSide) score += 5; } catch (e) { // Ignore position if we can't get style } // Content indicators const hasDate = /\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,2},?\s+\d{4}\b/i.test(text); if (hasDate) score += 10; const hasAuthor = /\b(?:by|written by|author:)\s+[A-Za-z\s]+\b/i.test(text); if (hasAuthor) score += 10; // Check for common content classes/attributes const className = element.className.toLowerCase(); if (className.includes('content') || className.includes('article') || className.includes('post')) { score += 15; } // Check for footnotes/references const hasFootnotes = element.querySelector(constants_1.FOOTNOTE_INLINE_REFERENCES); if (hasFootnotes) score += 10; const hasFootnotesList = element.querySelector(constants_1.FOOTNOTE_LIST_SELECTORS); if (hasFootnotesList) score += 10; // Check for nested tables (penalize) const nestedTables = element.getElementsByTagName('table').length; score -= nestedTables * 5; // Additional scoring for table cells if (element.tagName.toLowerCase() === 'td') { // Table cells get a bonus for being in the main content area const parentTable = element.closest('table'); if (parentTable) { // Only favor cells in tables that look like old-style content layouts const tableWidth = parseInt(parentTable.getAttribute('width') || '0'); const tableAlign = parentTable.getAttribute('align') || ''; const tableClass = parentTable.className.toLowerCase(); const isTableLayout = tableWidth > 400 || // Common width for main content tables tableAlign === 'center' || tableClass.includes('content') || tableClass.includes('article'); if (isTableLayout) { // Additional checks to ensure this is likely the main content cell const allCells = Array.from(parentTable.getElementsByTagName('td')); const cellIndex = allCells.indexOf(element); const isCenterCell = cellIndex > 0 && cellIndex < allCells.length - 1; if (isCenterCell) { score += 10; } } } } return score; } static findBestElement(elements, minScore = 50) { let bestElement = null; let bestScore = 0; elements.forEach(element => { const score = this.scoreElement(element); if (score > bestScore) { bestScore = score; bestElement = element; } }); return bestScore > minScore ? bestElement : null; } /** * Scores blocks based on their content and structure * and removes those that are likely not content */ static scoreAndRemove(doc, debug = false) { const startTime = Date.now(); let removedCount = 0; // Track all elements to be removed const elementsToRemove = new Set(); // Get all block elements const blockElements = Array.from(doc.querySelectorAll(constants_1.BLOCK_ELEMENTS.join(','))); // Process each block element blockElements.forEach(element => { // Skip elements that are already marked for removal if (elementsToRemove.has(element)) { return; } // Skip elements that are likely to be content if (ContentScorer.isLikelyContent(element)) { return; } // Score the element based on various criteria const score = ContentScorer.scoreNonContentBlock(element); // If the score is below the threshold, mark for removal if (score < 0) { elementsToRemove.add(element); removedCount++; } }); // Remove all collected elements in a single pass elementsToRemove.forEach(el => el.remove()); const endTime = Date.now(); if (debug) { console.log('Defuddle', 'Removed non-content blocks:', { count: removedCount, processingTime: `${(endTime - startTime).toFixed(2)}ms` }); } } /** * Determines if an element is likely to be content based on its structure and attributes. */ static isLikelyContent(element) { // Check if the element has a role that indicates content const role = element.getAttribute('role'); if (role && ['article', 'main', 'contentinfo'].includes(role)) { return true; } // Check if the element has a class or id that indicates content const className = element.className.toLowerCase(); const id = element.id.toLowerCase(); for (const indicator of contentIndicators) { if (className.includes(indicator) || id.includes(indicator)) { return true; } } // Check if the element has a high text density const text = element.textContent || ''; const words = text.split(/\s+/).length; const paragraphs = element.getElementsByTagName('p').length; // If the element has a significant amount of text and paragraphs, it's likely content if (words > 50 && paragraphs > 1) { return true; } // Check for elements with significant text content, even if they don't have many paragraphs if (words > 100) { return true; } // Check for elements with text content and some paragraphs if (words > 30 && paragraphs > 0) { return true; } return false; } /** * Scores a block element based on various criteria to determine if it's likely not content. * Returns a negative score if the element is likely not content, a positive score if it is. */ static scoreNonContentBlock(element) { // Skip footnote list elements if (element.querySelector(constants_1.FOOTNOTE_LIST_SELECTORS)) { return 0; } let score = 0; // Get text content const text = element.textContent || ''; const words = text.split(/\s+/).length; // Skip very small elements if (words < 3) { return 0; } for (const indicator of navigationIndicators) { if (text.toLowerCase().includes(indicator)) { score -= 10; } } // Check for high link density (navigation) const links = element.getElementsByTagName('a').length; const linkDensity = links / (words || 1); if (linkDensity > 0.5) { score -= 15; } // Check for list structure (navigation) const lists = element.getElementsByTagName('ul').length + element.getElementsByTagName('ol').length; if (lists > 0 && links > lists * 3) { score -= 10; } // Check for specific class patterns that indicate non-content const className = element.className.toLowerCase(); const id = element.id.toLowerCase(); for (const pattern of nonContentPatterns) { if (className.includes(pattern) || id.includes(pattern)) { score -= 8; } } // Check for elements with many child elements but little text (typical for navigation) // const childElements = element.children.length; // if (childElements > 5 && words < childElements * 3) { // score -= 12; // } // Check for elements with many divs but little text (typical for layout elements) // const divs = element.getElementsByTagName('div').length; // if (divs > 3 && words < divs * 2) { // score -= 10; // } return score; } } exports.ContentScorer = ContentScorer; //# sourceMappingURL=scoring.js.map