UNPKG

defuddle

Version:

Extract article content and metadata from web pages.

636 lines 27.8 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.Defuddle = void 0; const metadata_1 = require("./metadata"); const extractor_registry_1 = require("./extractor-registry"); const constants_1 = require("./constants"); const standardize_1 = require("./standardize"); const scoring_1 = require("./scoring"); const utils_1 = require("./utils"); class Defuddle { /** * Create a new Defuddle instance * @param doc - The document to parse * @param options - Options for parsing */ constructor(doc, options = {}) { this.doc = doc; this.options = options; this.debug = options.debug || false; } /** * Parse the document and extract its main content */ parse() { // Try first with default settings const result = this.parseInternal(); // If result has very little content, try again without clutter removal if (result.wordCount < 200) { console.log('Initial parse returned very little content, trying again'); const retryResult = this.parseInternal({ removePartialSelectors: false }); // Return the result with more content if (retryResult.wordCount > result.wordCount) { this._log('Retry produced more content'); return retryResult; } } return result; } /** * Internal parse method that does the actual work */ parseInternal(overrideOptions = {}) { const startTime = Date.now(); const options = { removeExactSelectors: true, removePartialSelectors: true, ...this.options, ...overrideOptions }; // Extract schema.org data const schemaOrgData = this._extractSchemaOrgData(this.doc); // Collect meta tags const pageMetaTags = []; this.doc.querySelectorAll('meta').forEach(meta => { const name = meta.getAttribute('name'); const property = meta.getAttribute('property'); let content = meta.getAttribute('content'); if (content) { // Only include tags that have content pageMetaTags.push({ name, property, content: this._decodeHTMLEntities(content) }); } }); // Extract metadata const metadata = metadata_1.MetadataExtractor.extract(this.doc, schemaOrgData, pageMetaTags); try { // Use site-specific extractor first, if there is one const url = options.url || this.doc.URL; const extractor = extractor_registry_1.ExtractorRegistry.findExtractor(this.doc, url, schemaOrgData); if (extractor && extractor.canExtract()) { const extracted = extractor.extract(); const endTime = Date.now(); // console.log('Using extractor:', extractor.constructor.name.replace('Extractor', '')); return { content: extracted.contentHtml, title: extracted.variables?.title || metadata.title, description: metadata.description, domain: metadata.domain, favicon: metadata.favicon, image: metadata.image, published: extracted.variables?.published || metadata.published, author: extracted.variables?.author || metadata.author, site: metadata.site, schemaOrgData: metadata.schemaOrgData, wordCount: this.countWords(extracted.contentHtml), parseTime: Math.round(endTime - startTime), extractorType: extractor.constructor.name.replace('Extractor', '').toLowerCase(), metaTags: pageMetaTags }; } // Continue if there is no extractor... // Evaluate mobile styles and sizes on original document const mobileStyles = this._evaluateMediaQueries(this.doc); // Find small images in original document, excluding lazy-loaded ones const smallImages = this.findSmallImages(this.doc); // Clone document const clone = this.doc.cloneNode(true); // Apply mobile styles to clone this.applyMobileStyles(clone, mobileStyles); // Find main content const mainContent = this.findMainContent(clone); if (!mainContent) { const endTime = Date.now(); return { content: this.doc.body.innerHTML, ...metadata, wordCount: this.countWords(this.doc.body.innerHTML), parseTime: Math.round(endTime - startTime), metaTags: pageMetaTags }; } // Remove small images this.removeSmallImages(clone, smallImages); // Remove hidden elements using computed styles this.removeHiddenElements(clone); // Remove non-content blocks by scoring // Tries to find lists, navigation based on text content and link density scoring_1.ContentScorer.scoreAndRemove(clone, this.debug); // Remove clutter using selectors if (options.removeExactSelectors || options.removePartialSelectors) { this.removeBySelector(clone, options.removeExactSelectors, options.removePartialSelectors); } // Normalize the main content (0, standardize_1.standardizeContent)(mainContent, metadata, this.doc, this.debug); const content = mainContent.outerHTML; const endTime = Date.now(); return { content, ...metadata, wordCount: this.countWords(content), parseTime: Math.round(endTime - startTime), metaTags: pageMetaTags }; } catch (error) { console.error('Defuddle', 'Error processing document:', error); const endTime = Date.now(); return { content: this.doc.body.innerHTML, ...metadata, wordCount: this.countWords(this.doc.body.innerHTML), parseTime: Math.round(endTime - startTime), metaTags: pageMetaTags }; } } countWords(content) { // Create a temporary div to parse HTML content const tempDiv = this.doc.createElement('div'); tempDiv.innerHTML = content; // Get text content, removing extra whitespace const text = tempDiv.textContent || ''; const words = text .trim() .replace(/\s+/g, ' ') // Replace multiple spaces with single space .split(' ') .filter(word => word.length > 0); // Filter out empty strings return words.length; } // Make all other methods private by removing the static keyword and using private _log(...args) { if (this.debug) { console.log('Defuddle:', ...args); } } _evaluateMediaQueries(doc) { const mobileStyles = []; const maxWidthRegex = /max-width[^:]*:\s*(\d+)/; try { // Get all styles, including inline styles const sheets = Array.from(doc.styleSheets).filter(sheet => { try { // Access rules once to check validity sheet.cssRules; return true; } catch (e) { // Expected error for cross-origin stylesheets or Node.js environment if (e instanceof DOMException && e.name === 'SecurityError') { return false; } return false; } }); // Process all sheets in a single pass const mediaRules = sheets.flatMap(sheet => { try { // Check if we're in a browser environment where CSSMediaRule is available if (typeof CSSMediaRule === 'undefined') { return []; } return Array.from(sheet.cssRules) .filter((rule) => rule instanceof CSSMediaRule && rule.conditionText.includes('max-width')); } catch (e) { if (this.debug) { console.warn('Defuddle: Failed to process stylesheet:', e); } return []; } }); // Process all media rules in a single pass mediaRules.forEach(rule => { const match = rule.conditionText.match(maxWidthRegex); if (match) { const maxWidth = parseInt(match[1]); if (constants_1.MOBILE_WIDTH <= maxWidth) { // Batch process all style rules const styleRules = Array.from(rule.cssRules) .filter((r) => r instanceof CSSStyleRule); styleRules.forEach(cssRule => { try { mobileStyles.push({ selector: cssRule.selectorText, styles: cssRule.style.cssText }); } catch (e) { if (this.debug) { console.warn('Defuddle: Failed to process CSS rule:', e); } } }); } } }); } catch (e) { console.error('Defuddle: Error evaluating media queries:', e); } return mobileStyles; } applyMobileStyles(doc, mobileStyles) { let appliedCount = 0; mobileStyles.forEach(({ selector, styles }) => { try { const elements = doc.querySelectorAll(selector); elements.forEach(element => { element.setAttribute('style', (element.getAttribute('style') || '') + styles); appliedCount++; }); } catch (e) { console.error('Defuddle', 'Error applying styles for selector:', selector, e); } }); } removeHiddenElements(doc) { let count = 0; const elementsToRemove = new Set(); // Get all elements and check their styles const allElements = Array.from(doc.getElementsByTagName('*')); // Process styles in batches to minimize layout thrashing const BATCH_SIZE = 100; for (let i = 0; i < allElements.length; i += BATCH_SIZE) { const batch = allElements.slice(i, i + BATCH_SIZE); // Read phase - gather all computedStyles const styles = batch.map(element => { try { return element.ownerDocument.defaultView?.getComputedStyle(element); } catch (e) { // If we can't get computed style, check inline styles const style = element.getAttribute('style'); if (!style) return null; // Create a temporary style element to parse inline styles const tempStyle = doc.createElement('style'); tempStyle.textContent = `* { ${style} }`; doc.head.appendChild(tempStyle); const computedStyle = element.ownerDocument.defaultView?.getComputedStyle(element); doc.head.removeChild(tempStyle); return computedStyle; } }); // Write phase - mark elements for removal batch.forEach((element, index) => { const computedStyle = styles[index]; if (computedStyle && (computedStyle.display === 'none' || computedStyle.visibility === 'hidden' || computedStyle.opacity === '0')) { elementsToRemove.add(element); count++; } }); } // Batch remove all hidden elements this._log('Removed hidden elements:', count); } removeBySelector(doc, removeExact = true, removePartial = true) { const startTime = Date.now(); let exactSelectorCount = 0; let partialSelectorCount = 0; // Track all elements to be removed const elementsToRemove = new Set(); // First collect elements matching exact selectors if (removeExact) { const exactElements = doc.querySelectorAll(constants_1.EXACT_SELECTORS.join(',')); exactElements.forEach(el => { if (el?.parentNode) { elementsToRemove.add(el); exactSelectorCount++; } }); } if (removePartial) { // Pre-compile regexes and combine into a single regex for better performance const combinedPattern = constants_1.PARTIAL_SELECTORS.join('|'); const partialRegex = new RegExp(combinedPattern, 'i'); // Create an efficient attribute selector for elements we care about const attributeSelector = constants_1.TEST_ATTRIBUTES.map(attr => `[${attr}]`).join(','); const allElements = doc.querySelectorAll(attributeSelector); // Process elements for partial matches allElements.forEach(el => { // Skip if already marked for removal if (elementsToRemove.has(el)) { return; } // Get all relevant attributes and combine into a single string const attrs = constants_1.TEST_ATTRIBUTES.map(attr => { if (attr === 'class') { return el.className && typeof el.className === 'string' ? el.className : ''; } if (attr === 'id') { return el.id || ''; } return el.getAttribute(attr) || ''; }).join(' ').toLowerCase(); // Skip if no attributes to check if (!attrs.trim()) { return; } // Check for partial match using single regex test if (partialRegex.test(attrs)) { elementsToRemove.add(el); partialSelectorCount++; } }); } // Remove all collected elements in a single pass elementsToRemove.forEach(el => el.remove()); const endTime = Date.now(); this._log('Removed clutter elements:', { exactSelectors: exactSelectorCount, partialSelectors: partialSelectorCount, total: elementsToRemove.size, processingTime: `${(endTime - startTime).toFixed(2)}ms` }); } // Find small IMG and SVG elements findSmallImages(doc) { const MIN_DIMENSION = 33; const smallImages = new Set(); const transformRegex = /scale\(([\d.]+)\)/; const startTime = Date.now(); let processedCount = 0; // 1. Read phase - Gather all elements in a single pass const elements = [ ...Array.from(doc.getElementsByTagName('img')), ...Array.from(doc.getElementsByTagName('svg')) ]; if (elements.length === 0) { return smallImages; } // 2. Batch process - Collect all measurements in one go const measurements = elements.map(element => ({ element, // Static attributes (no reflow) naturalWidth: element.tagName.toLowerCase() === 'img' ? parseInt(element.getAttribute('width') || '0') || 0 : 0, naturalHeight: element.tagName.toLowerCase() === 'img' ? parseInt(element.getAttribute('height') || '0') || 0 : 0, attrWidth: parseInt(element.getAttribute('width') || '0'), attrHeight: parseInt(element.getAttribute('height') || '0') })); // 3. Batch compute styles - Process in chunks to avoid long tasks const BATCH_SIZE = 50; for (let i = 0; i < measurements.length; i += BATCH_SIZE) { const batch = measurements.slice(i, i + BATCH_SIZE); try { // Read phase - compute all styles at once const styles = batch.map(({ element }) => { try { return element.ownerDocument.defaultView?.getComputedStyle(element); } catch (e) { return null; } }); // Get bounding rectangles if available const rects = batch.map(({ element }) => { try { return element.getBoundingClientRect(); } catch (e) { return null; } }); // Process phase - no DOM operations batch.forEach((measurement, index) => { try { const style = styles[index]; const rect = rects[index]; if (!style) return; // Get transform scale in the same batch const transform = style.transform; const scale = transform ? parseFloat(transform.match(transformRegex)?.[1] || '1') : 1; // Calculate effective dimensions const widths = [ measurement.naturalWidth, measurement.attrWidth, parseInt(style.width) || 0, rect ? rect.width * scale : 0 ].filter(dim => typeof dim === 'number' && dim > 0); const heights = [ measurement.naturalHeight, measurement.attrHeight, parseInt(style.height) || 0, rect ? rect.height * scale : 0 ].filter(dim => typeof dim === 'number' && dim > 0); // Decision phase - no DOM operations if (widths.length > 0 && heights.length > 0) { const effectiveWidth = Math.min(...widths); const effectiveHeight = Math.min(...heights); if (effectiveWidth < MIN_DIMENSION || effectiveHeight < MIN_DIMENSION) { const identifier = this.getElementIdentifier(measurement.element); if (identifier) { smallImages.add(identifier); processedCount++; } } } } catch (e) { if (this.debug) { console.warn('Defuddle: Failed to process element dimensions:', e); } } }); } catch (e) { if (this.debug) { console.warn('Defuddle: Failed to process batch:', e); } } } const endTime = Date.now(); this._log('Found small elements:', { count: processedCount, processingTime: `${(endTime - startTime).toFixed(2)}ms` }); return smallImages; } removeSmallImages(doc, smallImages) { let removedCount = 0; ['img', 'svg'].forEach(tag => { const elements = doc.getElementsByTagName(tag); Array.from(elements).forEach(element => { const identifier = this.getElementIdentifier(element); if (identifier && smallImages.has(identifier)) { element.remove(); removedCount++; } }); }); this._log('Removed small elements:', removedCount); } getElementIdentifier(element) { // Try to create a unique identifier using various attributes if (element.tagName.toLowerCase() === 'img') { // For lazy-loaded images, use data-src as identifier if available const dataSrc = element.getAttribute('data-src'); if (dataSrc) return `src:${dataSrc}`; const src = element.getAttribute('src') || ''; const srcset = element.getAttribute('srcset') || ''; const dataSrcset = element.getAttribute('data-srcset'); if (src) return `src:${src}`; if (srcset) return `srcset:${srcset}`; if (dataSrcset) return `srcset:${dataSrcset}`; } const id = element.id || ''; const className = element.className || ''; const viewBox = element.tagName.toLowerCase() === 'svg' ? element.getAttribute('viewBox') || '' : ''; if (id) return `id:${id}`; if (viewBox) return `viewBox:${viewBox}`; if (className) return `class:${className}`; return null; } findMainContent(doc) { // Find all potential content containers const candidates = []; constants_1.ENTRY_POINT_ELEMENTS.forEach((selector, index) => { const elements = doc.querySelectorAll(selector); elements.forEach(element => { // Base score from selector priority (earlier = higher) let score = (constants_1.ENTRY_POINT_ELEMENTS.length - index) * 40; // Add score based on content analysis score += scoring_1.ContentScorer.scoreElement(element); candidates.push({ element, score }); }); }); if (candidates.length === 0) { // Fall back to scoring block elements return this.findContentByScoring(doc); } // Sort by score descending candidates.sort((a, b) => b.score - a.score); if (this.debug) { this._log('Content candidates:', candidates.map(c => ({ element: c.element.tagName, selector: this.getElementSelector(c.element), score: c.score }))); } // If we only matched body, try table-based detection if (candidates.length === 1 && candidates[0].element.tagName.toLowerCase() === 'body') { const tableContent = this.findTableBasedContent(doc); if (tableContent) { return tableContent; } } return candidates[0].element; } findTableBasedContent(doc) { // First check if this looks like an old-style table-based layout const tables = Array.from(doc.getElementsByTagName('table')); const hasTableLayout = tables.some(table => { const width = parseInt(table.getAttribute('width') || '0'); const style = this.getComputedStyle(table); return width > 400 || (style?.width.includes('px') && parseInt(style.width) > 400) || table.getAttribute('align') === 'center' || table.className.toLowerCase().includes('content') || table.className.toLowerCase().includes('article'); }); if (!hasTableLayout) { return null; // Don't try table-based extraction for modern layouts } const cells = Array.from(doc.getElementsByTagName('td')); return scoring_1.ContentScorer.findBestElement(cells); } findContentByScoring(doc) { const candidates = []; constants_1.BLOCK_ELEMENTS.forEach((tag) => { Array.from(doc.getElementsByTagName(tag)).forEach((element) => { const score = scoring_1.ContentScorer.scoreElement(element); if (score > 0) { candidates.push({ score, element }); } }); }); return candidates.length > 0 ? candidates.sort((a, b) => b.score - a.score)[0].element : null; } getElementSelector(element) { const parts = []; let current = element; while (current && current !== this.doc.documentElement) { let selector = current.tagName.toLowerCase(); if (current.id) { selector += '#' + current.id; } else if (current.className && typeof current.className === 'string') { selector += '.' + current.className.trim().split(/\s+/).join('.'); } parts.unshift(selector); current = current.parentElement; } return parts.join(' > '); } getComputedStyle(element) { return (0, utils_1.getComputedStyle)(element); } _extractSchemaOrgData(doc) { const schemaScripts = doc.querySelectorAll('script[type="application/ld+json"]'); const rawSchemaItems = []; schemaScripts.forEach(script => { let jsonContent = script.textContent || ''; try { jsonContent = jsonContent .replace(/\/\*[\s\S]*?\*\/|^\s*\/\/.*$/gm, '') .replace(/^\s*<!\[CDATA\[([\s\S]*?)\]\]>\s*$/, '$1') .replace(/^\s*(\*\/|\/\*)\s*|\s*(\*\/|\/\*)\s*$/g, '') .trim(); const jsonData = JSON.parse(jsonContent); if (jsonData['@graph'] && Array.isArray(jsonData['@graph'])) { rawSchemaItems.push(...jsonData['@graph']); } else { rawSchemaItems.push(jsonData); } } catch (error) { console.error('Defuddle: Error parsing schema.org data:', error); if (this.debug) { console.error('Defuddle: Problematic JSON content:', jsonContent); } } }); const decodeStringsInObject = (item) => { if (typeof item === 'string') { return this._decodeHTMLEntities(item); } else if (Array.isArray(item)) { return item.map(decodeStringsInObject); } else if (typeof item === 'object' && item !== null) { const newItem = {}; for (const key in item) { if (Object.prototype.hasOwnProperty.call(item, key)) { newItem[key] = decodeStringsInObject(item[key]); } } return newItem; } return item; }; return rawSchemaItems.map(decodeStringsInObject); } _decodeHTMLEntities(text) { const textarea = this.doc.createElement('textarea'); textarea.innerHTML = text; return textarea.value; } } exports.Defuddle = Defuddle; //# sourceMappingURL=defuddle.js.map