defuddle
Version:
Extract article content and metadata from web pages.
636 lines • 27.8 kB
JavaScript
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.Defuddle = void 0;
const metadata_1 = require("./metadata");
const extractor_registry_1 = require("./extractor-registry");
const constants_1 = require("./constants");
const standardize_1 = require("./standardize");
const scoring_1 = require("./scoring");
const utils_1 = require("./utils");
class Defuddle {
/**
* Create a new Defuddle instance
* @param doc - The document to parse
* @param options - Options for parsing
*/
constructor(doc, options = {}) {
this.doc = doc;
this.options = options;
this.debug = options.debug || false;
}
/**
* Parse the document and extract its main content
*/
parse() {
// Try first with default settings
const result = this.parseInternal();
// If result has very little content, try again without clutter removal
if (result.wordCount < 200) {
console.log('Initial parse returned very little content, trying again');
const retryResult = this.parseInternal({
removePartialSelectors: false
});
// Return the result with more content
if (retryResult.wordCount > result.wordCount) {
this._log('Retry produced more content');
return retryResult;
}
}
return result;
}
/**
* Internal parse method that does the actual work
*/
parseInternal(overrideOptions = {}) {
const startTime = Date.now();
const options = {
removeExactSelectors: true,
removePartialSelectors: true,
...this.options,
...overrideOptions
};
// Extract schema.org data
const schemaOrgData = this._extractSchemaOrgData(this.doc);
// Collect meta tags
const pageMetaTags = [];
this.doc.querySelectorAll('meta').forEach(meta => {
const name = meta.getAttribute('name');
const property = meta.getAttribute('property');
let content = meta.getAttribute('content');
if (content) { // Only include tags that have content
pageMetaTags.push({ name, property, content: this._decodeHTMLEntities(content) });
}
});
// Extract metadata
const metadata = metadata_1.MetadataExtractor.extract(this.doc, schemaOrgData, pageMetaTags);
try {
// Use site-specific extractor first, if there is one
const url = options.url || this.doc.URL;
const extractor = extractor_registry_1.ExtractorRegistry.findExtractor(this.doc, url, schemaOrgData);
if (extractor && extractor.canExtract()) {
const extracted = extractor.extract();
const endTime = Date.now();
// console.log('Using extractor:', extractor.constructor.name.replace('Extractor', ''));
return {
content: extracted.contentHtml,
title: extracted.variables?.title || metadata.title,
description: metadata.description,
domain: metadata.domain,
favicon: metadata.favicon,
image: metadata.image,
published: extracted.variables?.published || metadata.published,
author: extracted.variables?.author || metadata.author,
site: metadata.site,
schemaOrgData: metadata.schemaOrgData,
wordCount: this.countWords(extracted.contentHtml),
parseTime: Math.round(endTime - startTime),
extractorType: extractor.constructor.name.replace('Extractor', '').toLowerCase(),
metaTags: pageMetaTags
};
}
// Continue if there is no extractor...
// Evaluate mobile styles and sizes on original document
const mobileStyles = this._evaluateMediaQueries(this.doc);
// Find small images in original document, excluding lazy-loaded ones
const smallImages = this.findSmallImages(this.doc);
// Clone document
const clone = this.doc.cloneNode(true);
// Apply mobile styles to clone
this.applyMobileStyles(clone, mobileStyles);
// Find main content
const mainContent = this.findMainContent(clone);
if (!mainContent) {
const endTime = Date.now();
return {
content: this.doc.body.innerHTML,
...metadata,
wordCount: this.countWords(this.doc.body.innerHTML),
parseTime: Math.round(endTime - startTime),
metaTags: pageMetaTags
};
}
// Remove small images
this.removeSmallImages(clone, smallImages);
// Remove hidden elements using computed styles
this.removeHiddenElements(clone);
// Remove non-content blocks by scoring
// Tries to find lists, navigation based on text content and link density
scoring_1.ContentScorer.scoreAndRemove(clone, this.debug);
// Remove clutter using selectors
if (options.removeExactSelectors || options.removePartialSelectors) {
this.removeBySelector(clone, options.removeExactSelectors, options.removePartialSelectors);
}
// Normalize the main content
(0, standardize_1.standardizeContent)(mainContent, metadata, this.doc, this.debug);
const content = mainContent.outerHTML;
const endTime = Date.now();
return {
content,
...metadata,
wordCount: this.countWords(content),
parseTime: Math.round(endTime - startTime),
metaTags: pageMetaTags
};
}
catch (error) {
console.error('Defuddle', 'Error processing document:', error);
const endTime = Date.now();
return {
content: this.doc.body.innerHTML,
...metadata,
wordCount: this.countWords(this.doc.body.innerHTML),
parseTime: Math.round(endTime - startTime),
metaTags: pageMetaTags
};
}
}
countWords(content) {
// Create a temporary div to parse HTML content
const tempDiv = this.doc.createElement('div');
tempDiv.innerHTML = content;
// Get text content, removing extra whitespace
const text = tempDiv.textContent || '';
const words = text
.trim()
.replace(/\s+/g, ' ') // Replace multiple spaces with single space
.split(' ')
.filter(word => word.length > 0); // Filter out empty strings
return words.length;
}
// Make all other methods private by removing the static keyword and using private
_log(...args) {
if (this.debug) {
console.log('Defuddle:', ...args);
}
}
_evaluateMediaQueries(doc) {
const mobileStyles = [];
const maxWidthRegex = /max-width[^:]*:\s*(\d+)/;
try {
// Get all styles, including inline styles
const sheets = Array.from(doc.styleSheets).filter(sheet => {
try {
// Access rules once to check validity
sheet.cssRules;
return true;
}
catch (e) {
// Expected error for cross-origin stylesheets or Node.js environment
if (e instanceof DOMException && e.name === 'SecurityError') {
return false;
}
return false;
}
});
// Process all sheets in a single pass
const mediaRules = sheets.flatMap(sheet => {
try {
// Check if we're in a browser environment where CSSMediaRule is available
if (typeof CSSMediaRule === 'undefined') {
return [];
}
return Array.from(sheet.cssRules)
.filter((rule) => rule instanceof CSSMediaRule &&
rule.conditionText.includes('max-width'));
}
catch (e) {
if (this.debug) {
console.warn('Defuddle: Failed to process stylesheet:', e);
}
return [];
}
});
// Process all media rules in a single pass
mediaRules.forEach(rule => {
const match = rule.conditionText.match(maxWidthRegex);
if (match) {
const maxWidth = parseInt(match[1]);
if (constants_1.MOBILE_WIDTH <= maxWidth) {
// Batch process all style rules
const styleRules = Array.from(rule.cssRules)
.filter((r) => r instanceof CSSStyleRule);
styleRules.forEach(cssRule => {
try {
mobileStyles.push({
selector: cssRule.selectorText,
styles: cssRule.style.cssText
});
}
catch (e) {
if (this.debug) {
console.warn('Defuddle: Failed to process CSS rule:', e);
}
}
});
}
}
});
}
catch (e) {
console.error('Defuddle: Error evaluating media queries:', e);
}
return mobileStyles;
}
applyMobileStyles(doc, mobileStyles) {
let appliedCount = 0;
mobileStyles.forEach(({ selector, styles }) => {
try {
const elements = doc.querySelectorAll(selector);
elements.forEach(element => {
element.setAttribute('style', (element.getAttribute('style') || '') + styles);
appliedCount++;
});
}
catch (e) {
console.error('Defuddle', 'Error applying styles for selector:', selector, e);
}
});
}
removeHiddenElements(doc) {
let count = 0;
const elementsToRemove = new Set();
// Get all elements and check their styles
const allElements = Array.from(doc.getElementsByTagName('*'));
// Process styles in batches to minimize layout thrashing
const BATCH_SIZE = 100;
for (let i = 0; i < allElements.length; i += BATCH_SIZE) {
const batch = allElements.slice(i, i + BATCH_SIZE);
// Read phase - gather all computedStyles
const styles = batch.map(element => {
try {
return element.ownerDocument.defaultView?.getComputedStyle(element);
}
catch (e) {
// If we can't get computed style, check inline styles
const style = element.getAttribute('style');
if (!style)
return null;
// Create a temporary style element to parse inline styles
const tempStyle = doc.createElement('style');
tempStyle.textContent = `* { ${style} }`;
doc.head.appendChild(tempStyle);
const computedStyle = element.ownerDocument.defaultView?.getComputedStyle(element);
doc.head.removeChild(tempStyle);
return computedStyle;
}
});
// Write phase - mark elements for removal
batch.forEach((element, index) => {
const computedStyle = styles[index];
if (computedStyle && (computedStyle.display === 'none' ||
computedStyle.visibility === 'hidden' ||
computedStyle.opacity === '0')) {
elementsToRemove.add(element);
count++;
}
});
}
// Batch remove all hidden elements
this._log('Removed hidden elements:', count);
}
removeBySelector(doc, removeExact = true, removePartial = true) {
const startTime = Date.now();
let exactSelectorCount = 0;
let partialSelectorCount = 0;
// Track all elements to be removed
const elementsToRemove = new Set();
// First collect elements matching exact selectors
if (removeExact) {
const exactElements = doc.querySelectorAll(constants_1.EXACT_SELECTORS.join(','));
exactElements.forEach(el => {
if (el?.parentNode) {
elementsToRemove.add(el);
exactSelectorCount++;
}
});
}
if (removePartial) {
// Pre-compile regexes and combine into a single regex for better performance
const combinedPattern = constants_1.PARTIAL_SELECTORS.join('|');
const partialRegex = new RegExp(combinedPattern, 'i');
// Create an efficient attribute selector for elements we care about
const attributeSelector = constants_1.TEST_ATTRIBUTES.map(attr => `[${attr}]`).join(',');
const allElements = doc.querySelectorAll(attributeSelector);
// Process elements for partial matches
allElements.forEach(el => {
// Skip if already marked for removal
if (elementsToRemove.has(el)) {
return;
}
// Get all relevant attributes and combine into a single string
const attrs = constants_1.TEST_ATTRIBUTES.map(attr => {
if (attr === 'class') {
return el.className && typeof el.className === 'string' ? el.className : '';
}
if (attr === 'id') {
return el.id || '';
}
return el.getAttribute(attr) || '';
}).join(' ').toLowerCase();
// Skip if no attributes to check
if (!attrs.trim()) {
return;
}
// Check for partial match using single regex test
if (partialRegex.test(attrs)) {
elementsToRemove.add(el);
partialSelectorCount++;
}
});
}
// Remove all collected elements in a single pass
elementsToRemove.forEach(el => el.remove());
const endTime = Date.now();
this._log('Removed clutter elements:', {
exactSelectors: exactSelectorCount,
partialSelectors: partialSelectorCount,
total: elementsToRemove.size,
processingTime: `${(endTime - startTime).toFixed(2)}ms`
});
}
// Find small IMG and SVG elements
findSmallImages(doc) {
const MIN_DIMENSION = 33;
const smallImages = new Set();
const transformRegex = /scale\(([\d.]+)\)/;
const startTime = Date.now();
let processedCount = 0;
// 1. Read phase - Gather all elements in a single pass
const elements = [
...Array.from(doc.getElementsByTagName('img')),
...Array.from(doc.getElementsByTagName('svg'))
];
if (elements.length === 0) {
return smallImages;
}
// 2. Batch process - Collect all measurements in one go
const measurements = elements.map(element => ({
element,
// Static attributes (no reflow)
naturalWidth: element.tagName.toLowerCase() === 'img' ?
parseInt(element.getAttribute('width') || '0') || 0 : 0,
naturalHeight: element.tagName.toLowerCase() === 'img' ?
parseInt(element.getAttribute('height') || '0') || 0 : 0,
attrWidth: parseInt(element.getAttribute('width') || '0'),
attrHeight: parseInt(element.getAttribute('height') || '0')
}));
// 3. Batch compute styles - Process in chunks to avoid long tasks
const BATCH_SIZE = 50;
for (let i = 0; i < measurements.length; i += BATCH_SIZE) {
const batch = measurements.slice(i, i + BATCH_SIZE);
try {
// Read phase - compute all styles at once
const styles = batch.map(({ element }) => {
try {
return element.ownerDocument.defaultView?.getComputedStyle(element);
}
catch (e) {
return null;
}
});
// Get bounding rectangles if available
const rects = batch.map(({ element }) => {
try {
return element.getBoundingClientRect();
}
catch (e) {
return null;
}
});
// Process phase - no DOM operations
batch.forEach((measurement, index) => {
try {
const style = styles[index];
const rect = rects[index];
if (!style)
return;
// Get transform scale in the same batch
const transform = style.transform;
const scale = transform ?
parseFloat(transform.match(transformRegex)?.[1] || '1') : 1;
// Calculate effective dimensions
const widths = [
measurement.naturalWidth,
measurement.attrWidth,
parseInt(style.width) || 0,
rect ? rect.width * scale : 0
].filter(dim => typeof dim === 'number' && dim > 0);
const heights = [
measurement.naturalHeight,
measurement.attrHeight,
parseInt(style.height) || 0,
rect ? rect.height * scale : 0
].filter(dim => typeof dim === 'number' && dim > 0);
// Decision phase - no DOM operations
if (widths.length > 0 && heights.length > 0) {
const effectiveWidth = Math.min(...widths);
const effectiveHeight = Math.min(...heights);
if (effectiveWidth < MIN_DIMENSION || effectiveHeight < MIN_DIMENSION) {
const identifier = this.getElementIdentifier(measurement.element);
if (identifier) {
smallImages.add(identifier);
processedCount++;
}
}
}
}
catch (e) {
if (this.debug) {
console.warn('Defuddle: Failed to process element dimensions:', e);
}
}
});
}
catch (e) {
if (this.debug) {
console.warn('Defuddle: Failed to process batch:', e);
}
}
}
const endTime = Date.now();
this._log('Found small elements:', {
count: processedCount,
processingTime: `${(endTime - startTime).toFixed(2)}ms`
});
return smallImages;
}
removeSmallImages(doc, smallImages) {
let removedCount = 0;
['img', 'svg'].forEach(tag => {
const elements = doc.getElementsByTagName(tag);
Array.from(elements).forEach(element => {
const identifier = this.getElementIdentifier(element);
if (identifier && smallImages.has(identifier)) {
element.remove();
removedCount++;
}
});
});
this._log('Removed small elements:', removedCount);
}
getElementIdentifier(element) {
// Try to create a unique identifier using various attributes
if (element.tagName.toLowerCase() === 'img') {
// For lazy-loaded images, use data-src as identifier if available
const dataSrc = element.getAttribute('data-src');
if (dataSrc)
return `src:${dataSrc}`;
const src = element.getAttribute('src') || '';
const srcset = element.getAttribute('srcset') || '';
const dataSrcset = element.getAttribute('data-srcset');
if (src)
return `src:${src}`;
if (srcset)
return `srcset:${srcset}`;
if (dataSrcset)
return `srcset:${dataSrcset}`;
}
const id = element.id || '';
const className = element.className || '';
const viewBox = element.tagName.toLowerCase() === 'svg' ? element.getAttribute('viewBox') || '' : '';
if (id)
return `id:${id}`;
if (viewBox)
return `viewBox:${viewBox}`;
if (className)
return `class:${className}`;
return null;
}
findMainContent(doc) {
// Find all potential content containers
const candidates = [];
constants_1.ENTRY_POINT_ELEMENTS.forEach((selector, index) => {
const elements = doc.querySelectorAll(selector);
elements.forEach(element => {
// Base score from selector priority (earlier = higher)
let score = (constants_1.ENTRY_POINT_ELEMENTS.length - index) * 40;
// Add score based on content analysis
score += scoring_1.ContentScorer.scoreElement(element);
candidates.push({ element, score });
});
});
if (candidates.length === 0) {
// Fall back to scoring block elements
return this.findContentByScoring(doc);
}
// Sort by score descending
candidates.sort((a, b) => b.score - a.score);
if (this.debug) {
this._log('Content candidates:', candidates.map(c => ({
element: c.element.tagName,
selector: this.getElementSelector(c.element),
score: c.score
})));
}
// If we only matched body, try table-based detection
if (candidates.length === 1 && candidates[0].element.tagName.toLowerCase() === 'body') {
const tableContent = this.findTableBasedContent(doc);
if (tableContent) {
return tableContent;
}
}
return candidates[0].element;
}
findTableBasedContent(doc) {
// First check if this looks like an old-style table-based layout
const tables = Array.from(doc.getElementsByTagName('table'));
const hasTableLayout = tables.some(table => {
const width = parseInt(table.getAttribute('width') || '0');
const style = this.getComputedStyle(table);
return width > 400 ||
(style?.width.includes('px') && parseInt(style.width) > 400) ||
table.getAttribute('align') === 'center' ||
table.className.toLowerCase().includes('content') ||
table.className.toLowerCase().includes('article');
});
if (!hasTableLayout) {
return null; // Don't try table-based extraction for modern layouts
}
const cells = Array.from(doc.getElementsByTagName('td'));
return scoring_1.ContentScorer.findBestElement(cells);
}
findContentByScoring(doc) {
const candidates = [];
constants_1.BLOCK_ELEMENTS.forEach((tag) => {
Array.from(doc.getElementsByTagName(tag)).forEach((element) => {
const score = scoring_1.ContentScorer.scoreElement(element);
if (score > 0) {
candidates.push({ score, element });
}
});
});
return candidates.length > 0 ? candidates.sort((a, b) => b.score - a.score)[0].element : null;
}
getElementSelector(element) {
const parts = [];
let current = element;
while (current && current !== this.doc.documentElement) {
let selector = current.tagName.toLowerCase();
if (current.id) {
selector += '#' + current.id;
}
else if (current.className && typeof current.className === 'string') {
selector += '.' + current.className.trim().split(/\s+/).join('.');
}
parts.unshift(selector);
current = current.parentElement;
}
return parts.join(' > ');
}
getComputedStyle(element) {
return (0, utils_1.getComputedStyle)(element);
}
_extractSchemaOrgData(doc) {
const schemaScripts = doc.querySelectorAll('script[type="application/ld+json"]');
const rawSchemaItems = [];
schemaScripts.forEach(script => {
let jsonContent = script.textContent || '';
try {
jsonContent = jsonContent
.replace(/\/\*[\s\S]*?\*\/|^\s*\/\/.*$/gm, '')
.replace(/^\s*<!\[CDATA\[([\s\S]*?)\]\]>\s*$/, '$1')
.replace(/^\s*(\*\/|\/\*)\s*|\s*(\*\/|\/\*)\s*$/g, '')
.trim();
const jsonData = JSON.parse(jsonContent);
if (jsonData['@graph'] && Array.isArray(jsonData['@graph'])) {
rawSchemaItems.push(...jsonData['@graph']);
}
else {
rawSchemaItems.push(jsonData);
}
}
catch (error) {
console.error('Defuddle: Error parsing schema.org data:', error);
if (this.debug) {
console.error('Defuddle: Problematic JSON content:', jsonContent);
}
}
});
const decodeStringsInObject = (item) => {
if (typeof item === 'string') {
return this._decodeHTMLEntities(item);
}
else if (Array.isArray(item)) {
return item.map(decodeStringsInObject);
}
else if (typeof item === 'object' && item !== null) {
const newItem = {};
for (const key in item) {
if (Object.prototype.hasOwnProperty.call(item, key)) {
newItem[key] = decodeStringsInObject(item[key]);
}
}
return newItem;
}
return item;
};
return rawSchemaItems.map(decodeStringsInObject);
}
_decodeHTMLEntities(text) {
const textarea = this.doc.createElement('textarea');
textarea.innerHTML = text;
return textarea.value;
}
}
exports.Defuddle = Defuddle;
//# sourceMappingURL=defuddle.js.map