// defuddle
// Version:
// Extract article content and metadata from web pages.
// 467 lines • 18.6 kB
// JavaScript
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.ContentScorer = void 0;
const constants_1 = require("./constants");
const utils_1 = require("./utils");
// Substrings that mark an element as content when they appear in its
// lower-cased class name or id (see ContentScorer.isLikelyContent).
// Matching is by substring, so e.g. 'post' also matches 'blog-post'.
const contentIndicators = [
'admonition',
'article',
'content',
'entry',
'image',
'img',
'font',
'figure',
'figcaption',
'pre',
'main',
'post',
'story',
'table'
];
// Lower-case words and phrases that suggest navigation or boilerplate when
// they appear in an element's text. They are compiled into
// navigationIndicatorRegexes (whole-word match against block text) and
// navigationHeadingPattern (substring match against heading text).
// Whitespace inside a phrase is turned into `\s+` when the regexes are built.
const navigationIndicators = [
'advertisement',
'all rights reserved',
'banner',
'cookie',
'comments',
'copyright',
'follow me',
'follow us',
'footer',
'header',
'homepage',
'login',
'menu',
'more articles',
'more like this',
'most read',
'nav',
'navigation',
'newsletter',
'popular',
'privacy',
'recommended',
'register',
'related',
'responses',
'share',
'sidebar',
'sign in',
'sign up',
'signup',
'social',
'sponsored',
'subscribe',
'terms',
'trending'
];
// Social media profile URL pattern — used to detect author bios and social
// widgets. The negative lookaheads skip twitter.com/x.com paths starting with
// "intent" and facebook.com paths starting with "share", so those links are
// not treated as profile links.
const socialProfilePattern = /\b(linkedin\.com\/(in|company)\/|twitter\.com\/(?!intent\b)\w|x\.com\/(?!intent\b)\w|facebook\.com\/(?!share\b)\w|instagram\.com\/\w|threads\.net\/\w|mastodon\.\w)/i;
// Date pattern for detecting standalone bylines: an English month name or
// abbreviation followed by a 1-2 digit day. There is no leading \b because
// textContent can concatenate adjacent elements without whitespace.
const datePattern = /(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,2}/i;
// Author attribution pattern — case-sensitive "By" followed by whitespace and
// a capital letter (the start of a capitalized name).
const bylinePattern = /\bBy\s+[A-Z]/;
// Pre-compiled navigation indicator regexes for scoreNonContentBlock.
// Each indicator is matched as a whole word/phrase (\b on both sides). The
// regexes have no `i` flag, so the text under test must already be lower-cased.
const navigationIndicatorRegexes = navigationIndicators.map(indicator => new RegExp(`\\b${indicator.replace(/\s+/g, '\\s+')}\\b`));
// Single combined, case-insensitive regex for heading text matching in
// isLikelyContent. Unlike the regexes above it has no word boundaries, so an
// indicator also matches inside a longer word (e.g. 'nav' inside 'navigate').
const navigationHeadingPattern = new RegExp(navigationIndicators.map(i => i.replace(/\s+/g, '\\s+')).join('|'), 'i');
// Class-name / id substrings that indicate non-content. Elements matching
// these are not removed outright; each matching pattern lowers the element's
// score in scoreNonContentBlock. Matching is by substring on the lower-cased
// class name and id.
const nonContentPatterns = [
'advert',
'ad-',
'ads',
'banner',
'cookie',
'copyright',
'footer',
'header',
'homepage',
'menu',
'nav',
'newsletter',
'popular',
'privacy',
'recommended',
'related',
'rights',
'share',
'sidebar',
'social',
'sponsored',
'subscribe',
'terms',
'trending',
'widget'
];
/**
 * Approximates the number of words in `text` by splitting on whitespace runs.
 *
 * This is intentionally the raw `split` length: an empty string counts as 1
 * and leading/trailing whitespace adds an empty token each. Every threshold in
 * ContentScorer is compared against this exact count, so it must not be
 * "corrected" without revisiting those thresholds.
 *
 * @param {string} text
 * @returns {number}
 */
function approximateWordCount(text) {
    return text.split(/\s+/).length;
}
/**
 * Counts the commas in `text`.
 *
 * @param {string} text
 * @returns {number}
 */
function countCommas(text) {
    return text.split(/,/).length - 1;
}
/**
 * Returns the element's class list as a lower-cased string.
 *
 * `className` is not a string on every element type (SVG elements expose an
 * SVGAnimatedString), so fall back to the raw `class` attribute instead of
 * calling `.toLowerCase()` on a non-string.
 *
 * @param {Element} element
 * @returns {string}
 */
function lowerCasedClassName(element) {
    const raw = typeof element.className === 'string'
        ? element.className
        : (element.getAttribute('class') || '');
    return raw.toLowerCase();
}
/**
 * Returns the element's id lower-cased, or '' when it has no string id.
 *
 * @param {Element} element
 * @returns {string}
 */
function lowerCasedId(element) {
    return typeof element.id === 'string' ? element.id.toLowerCase() : '';
}
/**
 * Sums the text length of every element in an array-like collection of links.
 *
 * @param {ArrayLike<Element>} links
 * @returns {number}
 */
function totalLinkTextLength(links) {
    let total = 0;
    for (let i = 0; i < links.length; i++) {
        total += (links[i].textContent || '').length;
    }
    return total;
}
/**
 * Reports whether any link in the collection points at a social media profile
 * (as defined by `socialProfilePattern`).
 *
 * @param {ArrayLike<Element>} links
 * @returns {boolean}
 */
function hasSocialProfileLink(links) {
    for (let i = 0; i < links.length; i++) {
        const href = (links[i].getAttribute('href') || '').toLowerCase();
        if (socialProfilePattern.test(href)) {
            return true;
        }
    }
    return false;
}
/**
 * Heuristic scoring of DOM elements to separate article content from
 * navigation and boilerplate. All scoring entry points are static.
 */
class ContentScorer {
    /**
     * @param {Document} doc - Stored on the instance; not read by the static methods.
     * @param {boolean} [debug=false] - Stored on the instance; not read by the static methods.
     */
    constructor(doc, debug = false) {
        this.doc = doc;
        this.debug = debug;
    }
    /**
     * Scores how likely an element is to be the main content container.
     * Higher is better. The raw score is finally scaled down by link density.
     *
     * @param {Element} element
     * @returns {number}
     */
    static scoreElement(element) {
        let score = 0;
        // Text density
        const text = element.textContent || '';
        const words = approximateWordCount(text);
        score += words;
        // Paragraph count
        const paragraphs = element.getElementsByTagName('p').length;
        score += paragraphs * 10;
        // Comma counting — prose text has commas, navigation doesn't
        score += countCommas(text);
        // Image ratio (penalize high image density)
        const images = element.getElementsByTagName('img').length;
        const imageDensity = images / (words || 1);
        score -= imageDensity * 3;
        // Position bonus for right-aligned / right-floated elements
        try {
            const style = element.getAttribute('style') || '';
            const align = element.getAttribute('align') || '';
            const isRightSide = style.includes('float: right') ||
                style.includes('text-align: right') ||
                align === 'right';
            if (isRightSide) {
                score += 5;
            }
        }
        catch (e) {
            // Best effort: ignore position if the attributes cannot be read
        }
        // Content indicators: a full "Month D, YYYY" date and an author attribution
        const hasDate = /\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,2},?\s+\d{4}\b/i.test(text);
        if (hasDate) {
            score += 10;
        }
        const hasAuthor = /\b(?:by|written by|author:)\s+[A-Za-z\s]+\b/i.test(text);
        if (hasAuthor) {
            score += 10;
        }
        // Check for common content classes
        const className = lowerCasedClassName(element);
        if (className.includes('content') || className.includes('article') || className.includes('post')) {
            score += 15;
        }
        // Check for footnotes/references
        if (element.querySelector(constants_1.FOOTNOTE_INLINE_REFERENCES)) {
            score += 10;
        }
        if (element.querySelector(constants_1.FOOTNOTE_LIST_SELECTORS)) {
            score += 10;
        }
        // Penalize nested tables
        const nestedTables = element.getElementsByTagName('table').length;
        score -= nestedTables * 5;
        // Additional scoring for table cells in old-style layout tables
        if (element.tagName.toLowerCase() === 'td') {
            const parentTable = element.closest('table');
            if (parentTable) {
                // Only favor cells in tables that look like content layouts
                const tableWidth = Number.parseInt(parentTable.getAttribute('width') || '0', 10);
                const tableAlign = parentTable.getAttribute('align') || '';
                const tableClass = lowerCasedClassName(parentTable);
                const isTableLayout = tableWidth > 400 ||
                    tableAlign === 'center' ||
                    tableClass.includes('content') ||
                    tableClass.includes('article');
                if (isTableLayout) {
                    // A cell that is neither the first nor the last in the table
                    // is treated as the likely main content cell
                    const allCells = Array.from(parentTable.getElementsByTagName('td'));
                    const cellIndex = allCells.indexOf(element);
                    const isCenterCell = cellIndex > 0 && cellIndex < allCells.length - 1;
                    if (isCenterCell) {
                        score += 10;
                    }
                }
            }
        }
        // Link density as a multiplier — scales the score down proportionally
        // rather than applying a fixed penalty. Capped at 0.5 reduction to
        // avoid over-penalizing link-heavy content like blog index pages.
        const linkTextLength = totalLinkTextLength(element.getElementsByTagName('a'));
        const textLength = text.length || 1;
        const linkDensity = Math.min(linkTextLength / textLength, 0.5);
        score *= (1 - linkDensity);
        return score;
    }
    /**
     * Returns the highest-scoring element, or null when no element scores
     * strictly above `minScore`.
     *
     * @param {{forEach: Function}} elements - Array or NodeList of elements.
     * @param {number} [minScore=50]
     * @returns {Element|null}
     */
    static findBestElement(elements, minScore = 50) {
        let bestElement = null;
        let bestScore = 0;
        elements.forEach(element => {
            // Reference the class explicitly (like the other static methods)
            // so this also works when the method is called detached.
            const score = ContentScorer.scoreElement(element);
            if (score > bestScore) {
                bestScore = score;
                bestElement = element;
            }
        });
        return bestScore > minScore ? bestElement : null;
    }
    /**
     * Scores blocks based on their content and structure
     * and removes those that are likely not content.
     *
     * @param {Document|Element} doc - Root that is queried for block elements.
     * @param {boolean} [debug=false] - Log a summary and record removals.
     * @param {Array<{step: string, reason: string, text: string}>} [debugRemovals]
     *        Receives one entry per removed element when `debug` is true.
     * @param {Element} [mainContent] - Element whose ancestors are never removed.
     * @returns {void}
     */
    static scoreAndRemove(doc, debug = false, debugRemovals, mainContent) {
        const startTime = Date.now();
        // Track all elements to be removed, with the score that condemned them
        const elementsToRemove = new Map();
        const blockElements = Array.from(doc.querySelectorAll(constants_1.BLOCK_ELEMENTS_SELECTOR));
        blockElements.forEach(element => {
            // Skip elements that are already marked for removal
            if (elementsToRemove.has(element)) {
                return;
            }
            // Skip ancestors of mainContent to avoid disconnecting it
            if (mainContent && element.contains(mainContent)) {
                return;
            }
            // Skip elements inside code blocks — they are code structure, not page navigation
            if (element.closest('pre')) {
                return;
            }
            // Skip elements that are likely to be content
            if (ContentScorer.isLikelyContent(element)) {
                return;
            }
            // A negative score marks the block for removal
            const score = ContentScorer.scoreNonContentBlock(element);
            if (score < 0) {
                elementsToRemove.set(element, score);
            }
        });
        // Remove all collected elements in a single pass
        elementsToRemove.forEach((score, el) => {
            if (debug && debugRemovals) {
                debugRemovals.push({
                    step: 'scoreAndRemove',
                    reason: `score: ${score}`,
                    text: (0, utils_1.textPreview)(el)
                });
            }
            el.remove();
        });
        const endTime = Date.now();
        if (debug) {
            console.log('Defuddle', 'Removed non-content blocks:', {
                count: elementsToRemove.size,
                processingTime: `${(endTime - startTime).toFixed(2)}ms`
            });
        }
    }
    /**
     * Determines if an element is likely to be content based on its structure and attributes.
     * Checks run in order and the first decisive one wins.
     *
     * @param {Element} element
     * @returns {boolean}
     */
    static isLikelyContent(element) {
        // Roles that are treated as content
        const role = element.getAttribute('role');
        if (role && ['article', 'main', 'contentinfo'].includes(role)) {
            return true;
        }
        // Class or id containing a content indicator
        const className = lowerCasedClassName(element);
        const id = lowerCasedId(element);
        if (contentIndicators.some(indicator => className.includes(indicator) || id.includes(indicator))) {
            return true;
        }
        // Elements containing code blocks or tables are likely content
        if (element.querySelector('pre, table')) {
            return true;
        }
        const text = element.textContent || '';
        const words = approximateWordCount(text);
        // Check for headings that signal non-content sections (e.g. "Related articles")
        // even if the element has enough text/paragraphs to otherwise look like content.
        // Skip very large elements (1000+ words) as they are likely page-level wrappers.
        if (words < 1000) {
            const headings = element.querySelectorAll('h1, h2, h3, h4, h5, h6');
            let hasNavigationHeading = false;
            for (let i = 0; i < headings.length; i++) {
                const headingText = (headings[i].textContent || '').toLowerCase().trim();
                if (navigationHeadingPattern.test(headingText)) {
                    hasNavigationHeading = true;
                    break;
                }
            }
            if (hasNavigationHeading) {
                if (words < 200) {
                    return false;
                }
                // Larger sections (e.g. card grids) are also non-content
                // if they have high link density
                const linkCount = element.getElementsByTagName('a').length;
                const linkDensity = linkCount / (words || 1);
                if (linkDensity > 0.2) {
                    return false;
                }
            }
        }
        // Article card listing detection: blocks with many headings and images
        // but very little prose per heading are likely article card grids
        // (e.g. "related articles", "more stories"), not single-article content.
        // scoreNonContentBlock applies the same check as a score penalty.
        if (ContentScorer.isCardGrid(element, words)) {
            return false;
        }
        // Small elements containing social media profile links are likely
        // author bios or social widgets, not article content.
        if (words < 80 && hasSocialProfileLink(element.getElementsByTagName('a'))) {
            return false;
        }
        const paragraphs = element.getElementsByTagName('p').length;
        const listItems = element.getElementsByTagName('li').length;
        const contentBlocks = paragraphs + listItems;
        // Significant text plus several paragraphs/list items
        if (words > 50 && contentBlocks > 1) {
            return true;
        }
        // Significant text even without many paragraphs
        if (words > 100) {
            return true;
        }
        // Some text and at least one paragraph/list item
        if (words > 30 && contentBlocks > 0) {
            return true;
        }
        // Prose text with sentence-ending punctuation and low link density is
        // likely content even without <p> tags (e.g. transcript segments using divs/spans)
        if (words >= 10 && /[.?!]/.test(text)) {
            const linkCount = element.getElementsByTagName('a').length;
            const linkDensity = linkCount / words;
            if (linkDensity < 0.1) {
                return true;
            }
        }
        return false;
    }
    /**
     * Scores a block element based on various criteria to determine if it's likely not content.
     * Returns a negative score if the element is likely not content, a positive score if it is.
     * Footnote lists (and anything containing or inside one) and blocks of
     * fewer than 3 words always score 0.
     *
     * @param {Element} element
     * @returns {number}
     */
    static scoreNonContentBlock(element) {
        // Skip footnote list elements, their ancestors and their descendants
        try {
            if (element.matches(constants_1.FOOTNOTE_LIST_SELECTORS) ||
                element.querySelector(constants_1.FOOTNOTE_LIST_SELECTORS) ||
                element.closest(constants_1.FOOTNOTE_LIST_SELECTORS)) {
                return 0;
            }
        }
        catch (e) {
            // Deliberate best effort: if the footnote check itself fails
            // (presumably a selector the DOM implementation rejects), fall
            // through and score the element normally.
        }
        let score = 0;
        const text = element.textContent || '';
        const words = approximateWordCount(text);
        // Skip very small elements
        if (words < 3) {
            return 0;
        }
        // Comma counting — prose has commas, navigation/boilerplate doesn't.
        // This counterbalances negative signals from navigation indicators.
        score += countCommas(text);
        // The indicator regexes are case-sensitive, so test lower-cased text
        const textLower = text.toLowerCase();
        let indicatorMatches = 0;
        for (const regex of navigationIndicatorRegexes) {
            if (regex.test(textLower)) {
                indicatorMatches++;
            }
        }
        score -= indicatorMatches * 10;
        // Check for high link density (navigation)
        const linkElements = element.getElementsByTagName('a');
        const links = linkElements.length;
        const linkDensity = links / (words || 1);
        if (linkDensity > 0.5) {
            score -= 15;
        }
        // Check for high link text ratio (e.g. card groups, nav sections).
        // Requires multiple links to avoid penalizing content paragraphs
        // that happen to be wrapped in a single link.
        if (links > 1 && words < 80) {
            const totalTextLength = text.length;
            if (totalTextLength > 0 && totalLinkTextLength(linkElements) / totalTextLength > 0.8) {
                score -= 15;
            }
        }
        // Check for list structure (navigation)
        const lists = element.getElementsByTagName('ul').length + element.getElementsByTagName('ol').length;
        if (lists > 0 && links > lists * 3) {
            score -= 10;
        }
        // Check for social media profile links (author bios, social widgets)
        if (words < 80 && hasSocialProfileLink(linkElements)) {
            score -= 15;
        }
        // Penalize very small blocks that look like standalone author bylines with dates
        // e.g. "By Author Name · March 4, 2026". Requires both an author attribution
        // and a date to avoid false positives.
        if (words < 15 && bylinePattern.test(text) && datePattern.test(text)) {
            score -= 10;
        }
        // Penalize blocks that look like article card grids
        if (ContentScorer.isCardGrid(element, words)) {
            score -= 15;
        }
        // Each non-content pattern found in the class name or id costs 8 points
        const className = lowerCasedClassName(element);
        const id = lowerCasedId(element);
        for (const pattern of nonContentPatterns) {
            if (className.includes(pattern) || id.includes(pattern)) {
                score -= 8;
            }
        }
        return score;
    }
    /**
     * Detects article card grids: blocks with 3+ headings (h2-h4) and 2+ images
     * but fewer than 20 non-heading words per heading.
     *
     * @param {Element} element
     * @param {number} words - Word count of the element's text, as computed by the caller.
     * @returns {boolean} Always false for blocks under 3 or at least 500 words.
     */
    static isCardGrid(element, words) {
        if (words < 3 || words >= 500) {
            return false;
        }
        const headings = element.querySelectorAll('h2, h3, h4');
        if (headings.length < 3) {
            return false;
        }
        const images = element.querySelectorAll('img');
        if (images.length < 2) {
            return false;
        }
        let headingWordCount = 0;
        for (let i = 0; i < headings.length; i++) {
            headingWordCount += approximateWordCount(headings[i].textContent || '');
        }
        const prosePerHeading = (words - headingWordCount) / headings.length;
        return prosePerHeading < 20;
    }
}
exports.ContentScorer = ContentScorer;
//# sourceMappingURL=scoring.js.map