UNPKG

defuddle

Version:

Extract article content and metadata from web pages.

github.com/kepano/defuddle

kepano/defuddle

1,118 lines • 66.8 kB

JavaScript

"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.Defuddle = void 0; const metadata_1 = require("./metadata"); const extractor_registry_1 = require("./extractor-registry"); const constants_1 = require("./constants"); const standardize_1 = require("./standardize"); const footnotes_1 = require("./elements/footnotes"); const scoring_1 = require("./scoring"); const utils_1 = require("./utils"); const dom_1 = require("./utils/dom"); /** Keys from extractor variables that map to top-level DefuddleResponse fields */ const STANDARD_VARIABLE_KEYS = new Set(['title', 'author', 'published', 'site', 'description', 'image', 'language']); // Content pattern detection constants const CONTENT_DATE_PATTERN = /(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,2}/i; const CONTENT_READ_TIME_PATTERN = /\d+\s*min(?:ute)?s?\s+read\b/i; const BOILERPLATE_PATTERNS = [ /^This (?:article|story|piece) (?:appeared|was published|originally appeared) in\b/i, /^A version of this (?:article|story) (?:appeared|was published) in\b/i, /^Originally (?:published|appeared) (?:in|on|at)\b/i, ]; const METADATA_STRIP_PATTERNS = [ /\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\b/gi, /\b\d+(?:st|nd|rd|th)?\b/g, /\bmin(?:ute)?s?\b/gi, /\bread\b/gi, /[|·•—–\-,.\s]/g, ]; class Defuddle { /** * Create a new Defuddle instance * @param doc - The document to parse * @param options - Options for parsing */ constructor(doc, options = {}) { this._schemaOrgData = undefined; this._schemaOrgExtracted = false; this.doc = doc; this.options = options; this.debug = options.debug || false; } /** * Lazily extract and cache schema.org data. Must be called before * parse() strips script tags from the document. */ getSchemaOrgData() { if (!this._schemaOrgExtracted) { this._schemaOrgData = this._extractSchemaOrgData(this.doc); this._schemaOrgExtracted = true; } return this._schemaOrgData; } /** * Parse the document and extract its main content */ parse() { // Try first with default settings let result = this.parseInternal(); // If result has very little content, try again without clutter removal if (result.wordCount < 200) { this._log('Initial parse returned very little content, trying again'); const retryResult = this.parseInternal({ removePartialSelectors: false }); // Only use the retry if it produces significantly more content. // A small increase likely means partial selectors correctly removed // clutter (author blocks, related articles, etc.) from a short article. // A large increase (2x+) suggests partial selectors were too aggressive. if (retryResult.wordCount > result.wordCount * 2) { this._log('Retry produced more content'); result = retryResult; } } // If still very little content, the page may be an index/listing page // where card elements were scored as non-content or removed by partial // selectors (e.g. "post-preview"). Retry with both disabled. if (result.wordCount < 50) { this._log('Still very little content, retrying without scoring/partial selectors (possible index page)'); const indexRetry = this.parseInternal({ removeLowScoring: false, removePartialSelectors: false, removeContentPatterns: false }); if (indexRetry.wordCount > result.wordCount) { this._log('Index page retry produced more content'); result = indexRetry; } } // Strip dangerous elements from this.doc before any fallback paths // that read from it (e.g. _findContentBySchemaText). // This must happen after parseInternal, which needs script tags // for schema.org extraction, site-specific extractors, and math. this._stripUnsafeElements(); // If schema.org has a SocialMediaPosting with text content that is // longer than what we extracted, the scorer likely picked the wrong // element from a feed. Find the correct element in the DOM. const schemaText = this._getSchemaText(result.schemaOrgData); if (schemaText && this.countWords(schemaText) > result.wordCount) { const contentHtml = this._findContentBySchemaText(schemaText); if (contentHtml) { this._log('Found DOM content matching schema.org text'); result.content = contentHtml; result.wordCount = this.countWords(contentHtml); } else { this._log('Using schema.org text as content (DOM element not found)'); result.content = schemaText; result.wordCount = this.countWords(schemaText); } } return result; } /** * Extract text content from schema.org data (e.g. SocialMediaPosting, Article) */ _getSchemaText(schemaOrgData) { if (!schemaOrgData) return ''; const items = Array.isArray(schemaOrgData) ? schemaOrgData : [schemaOrgData]; for (const item of items) { if (item?.text && typeof item.text === 'string') { return item.text; } if (item?.articleBody && typeof item.articleBody === 'string') { return item.articleBody; } } return ''; } /** * Remove dangerous elements and attributes from this.doc. * Called after parseInternal so that extractors and schema extraction * can still read script tags they depend on. */ _stripUnsafeElements() { const body = this.doc.body; if (!body) return; // Remove dangerous elements. Iframes are kept — same-origin policy // isolates them, and they're widely used for legitimate media embeds. // Dangerous iframe attributes (srcdoc, javascript: src) are stripped // in the attribute pass below. Math scripts are preserved for LaTeX // content (matching the EXACT_SELECTORS approach). const dangerousElements = body.querySelectorAll('script:not([type^="math/"]), style, noscript, frame, frameset, object, embed, applet, base'); for (const el of dangerousElements) el.remove(); // Remove event handler attributes, dangerous URIs, and srcdoc const allElements = body.querySelectorAll('*'); for (const el of allElements) { for (const attr of Array.from(el.attributes)) { const name = attr.name.toLowerCase(); if (name.startsWith('on')) { el.removeAttribute(attr.name); } else if (name === 'srcdoc') { el.removeAttribute(attr.name); } else if (['href', 'src', 'action', 'formaction', 'xlink:href'].includes(name)) { if ((0, dom_1.isDangerousUrl)(attr.value)) { el.removeAttribute(attr.name); } } } } } /** * Find a DOM element whose text matches the schema.org text content. * Used when the content scorer picked the wrong element from a feed page. * Returns the element's inner HTML including sibling media (images, etc.) */ _findContentBySchemaText(schemaText) { const body = this.doc.body; if (!body) return ''; // Use the first paragraph as the search phrase. // DOM textContent concatenates <p> elements without separators, // so we can't cross paragraph boundaries when matching. const firstPara = schemaText.split(/\n\s*\n/)[0]?.trim() || ''; const searchPhrase = firstPara.substring(0, 100).trim(); if (!searchPhrase) return ''; const schemaWordCount = this.countWords(schemaText); // Find the smallest element whose text contains the search phrase // and whose word count is close to the schema text's word count let bestMatch = null; let bestSize = Infinity; const allElements = body.querySelectorAll('*'); for (const el of allElements) { const elText = (el.textContent || ''); if (!elText.includes(searchPhrase)) continue; const elWords = elText.trim().split(/\s+/).length; // Element should contain roughly the same amount of text // (allow some slack for surrounding whitespace / minor extras) if (elWords >= schemaWordCount * 0.8 && elWords < bestSize) { bestSize = elWords; bestMatch = el; } } if (!bestMatch) return ''; // Read the largest sibling image src BEFORE resolveRelativeUrls // can mangle comma-containing CDN URLs in srcset attributes let imageSrc = ''; let imageAlt = ''; const parent = bestMatch.parentElement; if (parent && parent !== body) { const images = parent.querySelectorAll('img'); let largestImg = null; let largestArea = 0; for (const img of images) { if (bestMatch.contains(img)) continue; const w = parseInt(img.getAttribute('width') || '0', 10); const h = parseInt(img.getAttribute('height') || '0', 10); const area = w * h; if (area > largestArea) { largestArea = area; largestImg = img; } } if (largestImg) { imageSrc = this._getLargestImageSrc(largestImg); imageAlt = largestImg.getAttribute('alt') || ''; try { const baseUrl = this.options.url || this.doc.URL; if (baseUrl) imageSrc = new URL(imageSrc, baseUrl).href; } catch { } } } // Now resolve URLs in the text content this.resolveRelativeUrls(bestMatch); let html = (0, dom_1.serializeHTML)(bestMatch); if (imageSrc) { const img = this.doc.createElement('img'); img.setAttribute('src', imageSrc); img.setAttribute('alt', imageAlt); html += img.outerHTML; } return html; } /** * Get the largest available src from an img element, * checking srcset for higher-resolution versions. */ _getLargestImageSrc(img) { const srcset = img.getAttribute('srcset') || ''; if (!srcset) return img.getAttribute('src') || ''; // Parse srcset entries: each ends with a width descriptor (e.g. "424w") // URLs may contain commas (e.g. Substack CDN), so split on width descriptors const entryPattern = /(.+?)\s+(\d+(?:\.\d+)?)w/g; let bestUrl = ''; let bestWidth = 0; let match; let lastIndex = 0; while ((match = entryPattern.exec(srcset)) !== null) { let url = match[1].trim(); if (lastIndex > 0) { url = url.replace(/^,\s*/, ''); } lastIndex = entryPattern.lastIndex; const width = parseFloat(match[2]); if (url && width > bestWidth) { bestWidth = width; bestUrl = url; } } let url = bestUrl || img.getAttribute('src') || ''; // Strip CDN width/crop constraints to get the full resolution image // (e.g. Cloudinary-style params: ,w_852,c_limit → removed) url = url.replace(/,w_\d+/g, '').replace(/,c_\w+/g, ''); return url; } /** * Parse the document asynchronously. Checks for extractors that prefer * async (e.g. YouTube transcripts) before sync, then falls back to async * extractors if sync parse yields no content. */ async parseAsync() { if (this.options.useAsync !== false) { const asyncResult = await this.tryAsyncExtractor(extractor_registry_1.ExtractorRegistry.findPreferredAsyncExtractor.bind(extractor_registry_1.ExtractorRegistry)); if (asyncResult) return asyncResult; } const result = this.parse(); if (result.wordCount > 0 || this.options.useAsync === false) { return result; } return (await this.tryAsyncExtractor(extractor_registry_1.ExtractorRegistry.findAsyncExtractor.bind(extractor_registry_1.ExtractorRegistry))) ?? result; } /** * Fetch only async variables (e.g. transcript) without re-parsing. * Safe to call after parse() — uses cached schema.org data since * parse() strips script tags from the document. */ async fetchAsyncVariables() { if (this.options.useAsync === false) return null; try { const url = this.options.url || this.doc.URL; const schemaOrgData = this.getSchemaOrgData(); const extractor = extractor_registry_1.ExtractorRegistry.findPreferredAsyncExtractor(this.doc, url, schemaOrgData); if (extractor) { const extracted = await extractor.extractAsync(); return this.getExtractorVariables(extracted.variables) || null; } } catch (error) { console.error('Defuddle', 'Error fetching async variables:', error); } return null; } async tryAsyncExtractor(finder) { try { const url = this.options.url || this.doc.URL; const schemaOrgData = this.getSchemaOrgData(); const extractor = finder(this.doc, url, schemaOrgData); if (extractor) { const startTime = Date.now(); const extracted = await extractor.extractAsync(); const pageMetaTags = this._collectMetaTags(); const metadata = metadata_1.MetadataExtractor.extract(this.doc, schemaOrgData, pageMetaTags); return this.buildExtractorResponse(extracted, metadata, startTime, extractor, pageMetaTags); } } catch (error) { console.error('Defuddle', 'Error in async extraction:', error); } return null; } /** * Internal parse method that does the actual work */ parseInternal(overrideOptions = {}) { const startTime = Date.now(); const options = { removeExactSelectors: true, removePartialSelectors: true, removeHiddenElements: true, removeLowScoring: true, removeSmallImages: true, removeContentPatterns: true, standardize: true, ...this.options, ...overrideOptions }; const debugRemovals = []; // Extract schema.org data (cached — must happen before _stripUnsafeElements removes scripts) const schemaOrgData = this.getSchemaOrgData(); // Cache meta tags and metadata across retries if (!this._metaTags) { this._metaTags = this._collectMetaTags(); } const pageMetaTags = this._metaTags; if (!this._metadata) { this._metadata = metadata_1.MetadataExtractor.extract(this.doc, schemaOrgData, pageMetaTags); } const metadata = this._metadata; if (options.removeImages) { this.removeImages(this.doc); } try { // Use site-specific extractor first, if there is one const url = options.url || this.doc.URL; const extractor = extractor_registry_1.ExtractorRegistry.findExtractor(this.doc, url, schemaOrgData); if (extractor && extractor.canExtract()) { const extracted = extractor.extract(); return this.buildExtractorResponse(extracted, metadata, startTime, extractor, pageMetaTags); } // Continue if there is no extractor... // Evaluate mobile styles and sizes on original document (cached across retries) if (!this._mobileStyles) { this._mobileStyles = this._evaluateMediaQueries(this.doc); } const mobileStyles = this._mobileStyles; // Find small images in original document (cached across retries) if (!this._smallImages) { this._smallImages = this.findSmallImages(this.doc); } const smallImages = this._smallImages; // Clone document const clone = this.doc.cloneNode(true); // Flatten shadow DOM content into the clone this.flattenShadowRoots(this.doc, clone); // Resolve React streaming SSR suspense boundaries this.resolveStreamedContent(clone); // Apply mobile styles to clone this.applyMobileStyles(clone, mobileStyles); // Find main content let mainContent = null; if (options.contentSelector) { mainContent = clone.querySelector(options.contentSelector); this._log('Using contentSelector:', options.contentSelector, mainContent ? 'found' : 'not found'); } if (!mainContent) { mainContent = this.findMainContent(clone); } if (!mainContent) { const fallbackContent = this.resolveContentUrls((0, dom_1.serializeHTML)(this.doc.body)); const endTime = Date.now(); return { content: fallbackContent, ...metadata, wordCount: this.countWords(fallbackContent), parseTime: Math.round(endTime - startTime), metaTags: pageMetaTags }; } // Standardize footnotes before cleanup (CSS sidenotes use display:none) if (options.standardize) { (0, footnotes_1.standardizeFootnotes)(mainContent); } // Remove small images if (options.removeSmallImages) { this.removeSmallImages(clone, smallImages); } // Remove hidden elements using computed styles if (options.removeHiddenElements) { this.removeHiddenElements(clone, debugRemovals); } // Remove non-content blocks by scoring // Tries to find lists, navigation based on text content and link density if (options.removeLowScoring) { scoring_1.ContentScorer.scoreAndRemove(clone, this.debug, debugRemovals, mainContent); } // Remove clutter using selectors if (options.removeExactSelectors || options.removePartialSelectors) { this.removeBySelector(clone, options.removeExactSelectors, options.removePartialSelectors, mainContent, debugRemovals); } // Remove elements by content patterns (read time, boilerplate, article cards) if (options.removeContentPatterns && mainContent) { this.removeByContentPattern(mainContent, this.debug ? debugRemovals : undefined); } // Normalize the main content if (options.standardize) { (0, standardize_1.standardizeContent)(mainContent, metadata, this.doc, this.debug); } // Resolve relative URLs to absolute this.resolveRelativeUrls(mainContent); const content = mainContent.outerHTML; const endTime = Date.now(); const result = { content, ...metadata, wordCount: this.countWords(content), parseTime: Math.round(endTime - startTime), metaTags: pageMetaTags }; if (this.debug) { result.debug = { contentSelector: this.getElementSelector(mainContent), removals: debugRemovals }; } return result; } catch (error) { console.error('Defuddle', 'Error processing document:', error); const errorContent = this.resolveContentUrls((0, dom_1.serializeHTML)(this.doc.body)); const endTime = Date.now(); return { content: errorContent, ...metadata, wordCount: this.countWords(errorContent), parseTime: Math.round(endTime - startTime), metaTags: pageMetaTags }; } } countWords(content) { // Strip HTML tags and decode common entities without DOM parsing const text = content .replace(/<[^>]*>/g, ' ') .replace(/ /gi, ' ') .replace(/&/gi, '&') .replace(/</gi, '<') .replace(/>/gi, '>') .replace(/"/gi, '"') .replace(/&#\d+;/g, ' ') .replace(/&\w+;/g, ' '); const trimmed = text.trim(); if (!trimmed) return 0; // Count words by splitting on whitespace let count = 0; let inWord = false; for (let i = 0; i < trimmed.length; i++) { const isSpace = trimmed.charCodeAt(i) <= 32; if (!isSpace && !inWord) { count++; inWord = true; } else if (isSpace) { inWord = false; } } return count; } // Make all other methods private by removing the static keyword and using private _log(...args) { if (this.debug) { console.log('Defuddle:', ...args); } } _evaluateMediaQueries(doc) { const mobileStyles = []; const maxWidthRegex = /max-width[^:]*:\s*(\d+)/; try { // Get all styles, including inline styles const sheets = Array.from(doc.styleSheets).filter(sheet => { try { // Access rules once to check validity sheet.cssRules; return true; } catch (e) { // Expected error for cross-origin stylesheets or Node.js environment if (e instanceof DOMException && e.name === 'SecurityError') { return false; } return false; } }); // Process all sheets in a single pass const mediaRules = sheets.flatMap(sheet => { try { // Check if we're in a browser environment where CSSMediaRule is available if (typeof CSSMediaRule === 'undefined') { return []; } return Array.from(sheet.cssRules) .filter((rule) => rule instanceof CSSMediaRule && rule.conditionText.includes('max-width')); } catch (e) { if (this.debug) { console.warn('Defuddle: Failed to process stylesheet:', e); } return []; } }); // Process all media rules in a single pass mediaRules.forEach(rule => { const match = rule.conditionText.match(maxWidthRegex); if (match) { const maxWidth = parseInt(match[1]); if (constants_1.MOBILE_WIDTH <= maxWidth) { // Batch process all style rules const styleRules = Array.from(rule.cssRules) .filter((r) => r instanceof CSSStyleRule); styleRules.forEach(cssRule => { try { mobileStyles.push({ selector: cssRule.selectorText, styles: cssRule.style.cssText }); } catch (e) { if (this.debug) { console.warn('Defuddle: Failed to process CSS rule:', e); } } }); } } }); } catch (e) { console.error('Defuddle: Error evaluating media queries:', e); } return mobileStyles; } applyMobileStyles(doc, mobileStyles) { let appliedCount = 0; mobileStyles.forEach(({ selector, styles }) => { try { const elements = doc.querySelectorAll(selector); elements.forEach(element => { element.setAttribute('style', (element.getAttribute('style') || '') + styles); appliedCount++; }); } catch (e) { console.error('Defuddle', 'Error applying styles for selector:', selector, e); } }); } removeImages(doc) { const images = doc.getElementsByTagName('img'); Array.from(images).forEach(image => { image.remove(); }); } removeHiddenElements(doc, debugRemovals) { let count = 0; const elementsToRemove = new Map(); // Check inline styles and CSS class-based hidden patterns. const hiddenStylePattern = /(?:^|;\s*)(?:display\s*:\s*none|visibility\s*:\s*hidden|opacity\s*:\s*0)(?:\s*;|\s*$)/i; // Only use getComputedStyle in browser environments where it's meaningful. // In JSDOM/linkedom without stylesheets, it's extremely slow and unreliable. const defaultView = doc.defaultView; const isBrowser = typeof window !== 'undefined' && defaultView === window; const allElements = doc.querySelectorAll('*'); for (const element of allElements) { // Skip elements that contain math — sites like Wikipedia wrap MathML // in display:none spans for accessibility (the visible version is an // image/SVG fallback). We need to preserve these for math extraction. if (element.querySelector('math, [data-mathml], .katex-mathml') || element.tagName.toLowerCase() === 'math') { continue; } // Check inline style for hidden patterns const style = element.getAttribute('style'); if (style && hiddenStylePattern.test(style)) { const reason = style.includes('display') ? 'display:none' : style.includes('visibility') ? 'visibility:hidden' : 'opacity:0'; elementsToRemove.set(element, reason); count++; continue; } // Use getComputedStyle only in real browser environments if (isBrowser) { try { const computedStyle = defaultView.getComputedStyle(element); let reason = ''; if (computedStyle.display === 'none') reason = 'display:none'; else if (computedStyle.visibility === 'hidden') reason = 'visibility:hidden'; else if (computedStyle.opacity === '0') reason = 'opacity:0'; if (reason) { elementsToRemove.set(element, reason); count++; continue; } } catch (e) { } } // Detect CSS framework hidden utilities (e.g. Tailwind's "hidden", // "sm:hidden", "not-machine:hidden") const className = element.getAttribute('class') || ''; if (className) { const tokens = className.split(/\s+/); for (const token of tokens) { if (token === 'hidden' || token.endsWith(':hidden')) { elementsToRemove.set(element, `class:${token}`); count++; break; } } } } // Batch remove all hidden elements elementsToRemove.forEach((reason, el) => { if (this.debug && debugRemovals) { debugRemovals.push({ step: 'removeHiddenElements', reason, text: (0, utils_1.textPreview)(el) }); } el.remove(); }); this._log('Removed hidden elements:', count); } removeBySelector(doc, removeExact = true, removePartial = true, mainContent, debugRemovals) { const startTime = Date.now(); let exactSelectorCount = 0; let partialSelectorCount = 0; // Track all elements to be removed, with their match type const elementsToRemove = new Map(); // First collect elements matching exact selectors if (removeExact) { const exactElements = doc.querySelectorAll(constants_1.EXACT_SELECTORS.join(',')); exactElements.forEach(el => { if (el?.parentNode) { // Skip elements inside code blocks (e.g. syntax highlighting spans) if (el.closest('pre, code')) { return; } elementsToRemove.set(el, { type: 'exact' }); exactSelectorCount++; } }); } if (removePartial) { // Pre-compile regexes and combine into a single regex for better performance const combinedPattern = constants_1.PARTIAL_SELECTORS.join('|'); const partialRegex = new RegExp(combinedPattern, 'i'); // Pre-compile individual regexes for debug pattern identification const individualRegexes = this.debug ? constants_1.PARTIAL_SELECTORS.map(p => ({ pattern: p, regex: new RegExp(p, 'i') })) : null; // Create an efficient attribute selector for elements we care about const attributeSelector = constants_1.TEST_ATTRIBUTES.map(attr => `[${attr}]`).join(','); const allElements = doc.querySelectorAll(attributeSelector); // Process elements for partial matches allElements.forEach(el => { // Skip if already marked for removal if (elementsToRemove.has(el)) { return; } // Skip code elements and elements containing code blocks // where class names indicate language/syntax, not page structure const tag = el.tagName; if (tag === 'CODE' || tag === 'PRE' || el.querySelector('pre')) { return; } // Get all relevant attributes and combine into a single string const attrs = constants_1.TEST_ATTRIBUTES.map(attr => { if (attr === 'class') { return el.className && typeof el.className === 'string' ? el.className : ''; } if (attr === 'id') { return el.id || ''; } return el.getAttribute(attr) || ''; }).join(' ').toLowerCase(); // Skip if no attributes to check if (!attrs.trim()) { return; } // Check for partial match using single regex test if (partialRegex.test(attrs)) { const matchedPattern = individualRegexes ? individualRegexes.find(r => r.regex.test(attrs))?.pattern : undefined; elementsToRemove.set(el, { type: 'partial', selector: matchedPattern }); partialSelectorCount++; } }); } // Remove all collected elements in a single pass // Skip elements that are ancestors of mainContent to avoid disconnecting it // Skip footnote list containers, their parents, and immediate children // Skip anchor links inside headings - the heading transform handles these elementsToRemove.forEach(({ type, selector }, el) => { if (mainContent && el.contains(mainContent)) { return; } if (el.tagName === 'A' && el.closest('h1, h2, h3, h4, h5, h6')) { return; } try { if (el.matches(constants_1.FOOTNOTE_LIST_SELECTORS) || el.querySelector(constants_1.FOOTNOTE_LIST_SELECTORS)) { return; } // Protect immediate children of footnote containers (e.g. wikidot div.footnote-footer) const parent = el.parentElement; if (parent && parent.matches(constants_1.FOOTNOTE_LIST_SELECTORS)) { return; } } catch (e) { } if (this.debug && debugRemovals) { debugRemovals.push({ step: 'removeBySelector', selector: type === 'exact' ? 'exact' : selector, reason: type === 'exact' ? 'exact selector match' : `partial match: ${selector}`, text: (0, utils_1.textPreview)(el) }); } el.remove(); }); const endTime = Date.now(); this._log('Removed clutter elements:', { exactSelectors: exactSelectorCount, partialSelectors: partialSelectorCount, total: elementsToRemove.size, processingTime: `${(endTime - startTime).toFixed(2)}ms` }); } // Find small IMG and SVG elements findSmallImages(doc) { const MIN_DIMENSION = 33; const smallImages = new Set(); let processedCount = 0; const elements = doc.querySelectorAll('img, svg'); const defaultView = doc.defaultView; const isBrowser = typeof window !== 'undefined' && defaultView === window; for (const element of elements) { const attrWidth = parseInt(element.getAttribute('width') || '0'); const attrHeight = parseInt(element.getAttribute('height') || '0'); // Check inline style dimensions const style = element.getAttribute('style') || ''; const styleWidth = parseInt(style.match(/width\s*:\s*(\d+)/)?.[1] || '0'); const styleHeight = parseInt(style.match(/height\s*:\s*(\d+)/)?.[1] || '0'); // Use getComputedStyle and getBoundingClientRect only in browser let computedWidth = 0, computedHeight = 0; if (isBrowser) { try { const cs = defaultView.getComputedStyle(element); computedWidth = parseInt(cs.width) || 0; computedHeight = parseInt(cs.height) || 0; } catch (e) { } try { const rect = element.getBoundingClientRect(); if (rect.width > 0) computedWidth = computedWidth || rect.width; if (rect.height > 0) computedHeight = computedHeight || rect.height; } catch (e) { } } const widths = [attrWidth, styleWidth, computedWidth].filter(d => d > 0); const heights = [attrHeight, styleHeight, computedHeight].filter(d => d > 0); if (widths.length > 0 && heights.length > 0) { const effectiveWidth = Math.min(...widths); const effectiveHeight = Math.min(...heights); if (effectiveWidth < MIN_DIMENSION || effectiveHeight < MIN_DIMENSION) { const identifier = this.getElementIdentifier(element); if (identifier) { smallImages.add(identifier); processedCount++; } } } } this._log('Found small elements:', processedCount); return smallImages; } removeSmallImages(doc, smallImages) { let removedCount = 0; ['img', 'svg'].forEach(tag => { const elements = doc.getElementsByTagName(tag); Array.from(elements).forEach(element => { const identifier = this.getElementIdentifier(element); if (identifier && smallImages.has(identifier)) { element.remove(); removedCount++; } }); }); this._log('Removed small elements:', removedCount); } getElementIdentifier(element) { // Try to create a unique identifier using various attributes if (element.tagName.toLowerCase() === 'img') { // For lazy-loaded images, use data-src as identifier if available const dataSrc = element.getAttribute('data-src'); if (dataSrc) return `src:${dataSrc}`; const src = element.getAttribute('src') || ''; const srcset = element.getAttribute('srcset') || ''; const dataSrcset = element.getAttribute('data-srcset'); if (src) return `src:${src}`; if (srcset) return `srcset:${srcset}`; if (dataSrcset) return `srcset:${dataSrcset}`; } const id = element.id || ''; const className = element.className || ''; const viewBox = element.tagName.toLowerCase() === 'svg' ? element.getAttribute('viewBox') || '' : ''; if (id) return `id:${id}`; if (viewBox) return `viewBox:${viewBox}`; if (className) return `class:${className}`; return null; } findMainContent(doc) { // Find all potential content containers const candidates = []; constants_1.ENTRY_POINT_ELEMENTS.forEach((selector, index) => { const elements = doc.querySelectorAll(selector); elements.forEach(element => { // Base score from selector priority (earlier = higher) let score = (constants_1.ENTRY_POINT_ELEMENTS.length - index) * 40; // Add score based on content analysis score += scoring_1.ContentScorer.scoreElement(element); candidates.push({ element, score, selectorIndex: index }); }); }); if (candidates.length === 0) { // Fall back to scoring block elements return this.findContentByScoring(doc); } // Sort by score descending candidates.sort((a, b) => b.score - a.score); if (this.debug) { this._log('Content candidates:', candidates.map(c => ({ element: c.element.tagName, selector: this.getElementSelector(c.element), score: c.score }))); } // If we only matched body, try table-based detection if (candidates.length === 1 && candidates[0].element.tagName.toLowerCase() === 'body') { const tableContent = this.findTableBasedContent(doc); if (tableContent) { return tableContent; } } // If the top candidate contains a child candidate that matched a // higher-priority selector, prefer the most specific (deepest) child. // This prevents e.g. <main> from winning over a contained <article> // just because sibling noise inflates the parent's content score. // Only prefer the child if it has meaningful content (>50 words), // otherwise it may be an empty card element (e.g. related article cards). // Skip this when the parent contains multiple children matching the // same selector — that indicates a listing/portfolio page where the // parent is the real content container. const top = candidates[0]; let best = top; for (let i = 1; i < candidates.length; i++) { const child = candidates[i]; const childWords = (child.element.textContent || '').split(/\s+/).length; if (child.selectorIndex < best.selectorIndex && best.element.contains(child.element) && childWords > 50) { // Count how many candidates share this selector index inside // the top element. Use top (not best) as the stable reference // so the check isn't affected by earlier iterations. let siblingsAtIndex = 0; for (const c of candidates) { if (c.selectorIndex === child.selectorIndex && top.element.contains(c.element)) { if (++siblingsAtIndex > 1) break; } } if (siblingsAtIndex > 1) { // Multiple articles/cards inside the parent — it's a listing page continue; } best = child; } } if (best !== top) { return best.element; } return top.element; } findTableBasedContent(doc) { // First check if this looks like an old-style table-based layout const tables = Array.from(doc.getElementsByTagName('table')); const hasTableLayout = tables.some(table => { const width = parseInt(table.getAttribute('width') || '0'); const style = this.getComputedStyle(table); return width > 400 || (style?.width?.includes('px') && parseInt(style.width) > 400) || table.getAttribute('align') === 'center' || (table.className || '').toLowerCase().includes('content') || (table.className || '').toLowerCase().includes('article'); }); if (!hasTableLayout) { return null; // Don't try table-based extraction for modern layouts } const cells = Array.from(doc.getElementsByTagName('td')); return scoring_1.ContentScorer.findBestElement(cells); } findContentByScoring(doc) { const candidates = []; doc.querySelectorAll(constants_1.BLOCK_ELEMENTS_SELECTOR).forEach((element) => { const score = scoring_1.ContentScorer.scoreElement(element); if (score > 0) { candidates.push({ score, element }); } }); return candidates.length > 0 ? candidates.sort((a, b) => b.score - a.score)[0].element : null; } getElementSelector(element) { const parts = []; let current = element; while (current && current !== this.doc.documentElement) { let selector = current.tagName.toLowerCase(); if (current.id) { selector += '#' + current.id; } else if (current.className && typeof current.className === 'string') { selector += '.' + current.className.trim().split(/\s+/).join('.'); } parts.unshift(selector); current = current.parentElement; } return parts.join(' > '); } getComputedStyle(element) { return (0, utils_1.getComputedStyle)(element); } /** * Resolve relative URLs to absolute within a DOM element */ resolveRelativeUrls(element) { const baseUrl = this.options.url || this.doc.URL; if (!baseUrl) return; const resolve = (url) => { try { return new URL(url, baseUrl).href; } catch { return url; } }; element.querySelectorAll('[href]').forEach(el => { const href = el.getAttribute('href'); if (href) el.setAttribute('href', resolve(href)); }); element.querySelectorAll('[src]').forEach(el => { const src = el.getAttribute('src'); if (src) el.setAttribute('src', resolve(src)); }); element.querySelectorAll('[srcset]').forEach(el => { const srcset = el.getAttribute('srcset'); if (srcset) { // Parse srcset using width/density descriptors as delimiters, // not commas — URLs may contain commas (e.g. CDN transform params) const entryPattern = /(.+?)\s+(\d+(?:\.\d+)?[wx])/g; const entries = []; let match; let lastIdx = 0; while ((match = entryPattern.exec(srcset)) !== null) { let url = match[1].trim(); if (lastIdx > 0) { url = url.replace(/^,\s*/, ''); } lastIdx = entryPattern.lastIndex; entries.push(`${resolve(url)} ${match[2]}`); } if (entries.length > 0) { el.setAttribute('srcset', entries.join(', ')); } else { // Fallback: simple comma split for srcsets without descriptors const resolved = srcset.split(',').map(entry => { const parts = entry.trim().split(/\s+/); if (parts[0]) parts[0] = resolve(parts[0]); return parts.join(' '); }).join(', '); el.setAttribute('srcset', resolved); } } }); element.querySelectorAll('[poster]').forEach(el => { const poster = el.getAttribute('poster'); if (poster) el.setAttribute('poster', resolve(poster)); }); } /** * Flatten shadow DOM content into a cloned document. * Walks both trees in parallel so positional correspondence is exact. */ flattenShadowRoots(original, clone) { const origElements = Array.from(original.body.querySelectorAll('*')); // Find the first element with a shadow root (also serves as the hasShadowRoots check) const firstShadow = origElements.find(el => el.shadowRoot); if (!firstShadow) return; const cloneElements = Array.from(clone.body.querySelectorAll('*')); // Check if we can directly read shadow DOM content (main world / Node.js). // In content script isolated worlds, shadowRoot exists but content is empty. const canReadShadow = (firstShadow.shadowRoot?.childNodes?.length ?? 0) > 0; if (canReadShadow) { // Direct traversal works (main world / Node.js) for (let i = origElements.length - 1; i >= 0; i--) { const origEl = origElements[i]; if (!origEl.shadowRoot) continue; const cloneEl = cloneElements[i]; if (!cloneEl) continue; const shadowHtml = origEl.shadowRoot.innerHTML; if (shadowHtml.length > 0) { this.replaceShadowHost(cloneEl, shadowHtml, clone); } } } else { // Content script isolated world — read data-defuddle-shadow attributes // stamped by an external main-world script. const shadowData = []; for (let i = 0; i < origElements.length; i++) { const origEl = origElements[i]; const shadowHtml = origEl.getAttribute('data-defuddle-shadow'); if (!shadowHtml) continue; const cloneEl = cloneElements[i]; if (!cloneEl) continue; shadowData.push({ cloneEl, html: shadowHtml }); // Clean up temporary attributes from both original and clone origEl.removeAttribute('data-defuddle-shadow'); cloneEl.removeAttribute('data-defuddle-shadow'); } for (const { cloneEl, html } of shadowData) { this.replaceShadowHost(cloneEl, html, clone); } } } /** * Resolve React streaming SSR suspense boundaries. * React's streaming SSR places content in hidden divs (id="S:0") and * template placeholders (id="B:0") with $RC scripts to swap them. * Since we don't execute scripts, we perform the swap manually. */ resolveStreamedContent(doc) { // Find $RC("B:X","S:X") calls in inline scripts const scripts = doc.querySelectorAll('script'); const swaps = []; const rcPattern = /\$RC\("(B:\d+)","(S:\d+)"\)/g; for (const script of scripts) { const text = script.textContent || ''; if (!text.includes('$RC(')) continue; rcPattern.lastIndex = 0; let match; while ((match = rcPattern.exec(text)) !== null) { swaps.push({ template