// defuddle
// Version:
// Extract article content and metadata from web pages.
// 862 lines • 18.4 kB
// JavaScript
// CommonJS / ES-module interop preamble: tag this module's `exports` object
// with an `__esModule: true` marker (non-enumerable, since only `value` is given).
Object.defineProperty(exports, "__esModule", { value: true });
// Pre-declare every exported name as `undefined` (`void 0`). Each name is
// assigned its real value by a later statement in this file.
exports.ALLOWED_ATTRIBUTES_DEBUG = exports.ALLOWED_ATTRIBUTES = exports.ALLOWED_EMPTY_ELEMENTS = exports.FOOTNOTE_LIST_SELECTORS = exports.FOOTNOTE_INLINE_REFERENCES = exports.PARTIAL_SELECTORS = exports.TEST_ATTRIBUTES = exports.EXACT_SELECTORS = exports.INLINE_ELEMENTS = exports.PRESERVE_ELEMENTS = exports.BLOCK_ELEMENTS = exports.MOBILE_WIDTH = exports.ENTRY_POINT_ELEMENTS = void 0;
// Entry point elements
// These are the elements that will be used to find the main content
exports.ENTRY_POINT_ELEMENTS = [
'#post',
'.post-content',
'.article-content',
'#article-content',
'.article_post',
'.article-wrapper',
'.entry-content',
'.content-article',
'.post',
'.markdown-body',
'article',
'[role="article"]',
'main',
'[role="main"]',
'body' // ensures there is always a match
];
exports.MOBILE_WIDTH = 600;
exports.BLOCK_ELEMENTS = ['div', 'section', 'article', 'main', 'aside', 'header', 'footer', 'nav', 'content'];
// Elements that should not be unwrapped
exports.PRESERVE_ELEMENTS = new Set([
'pre', 'code', 'table', 'thead', 'tbody', 'tr', 'td', 'th',
'ul', 'ol', 'li', 'dl', 'dt', 'dd',
'figure', 'figcaption', 'picture',
'details', 'summary',
'blockquote',
'form', 'fieldset'
]);
// Inline elements that should not be unwrapped
exports.INLINE_ELEMENTS = new Set([
'a', 'span', 'strong', 'em', 'i', 'b', 'u', 'code', 'br', 'small',
'sub', 'sup', 'mark', 'date', 'del', 'ins', 'q', 'abbr', 'cite', 'relative-time', 'time',
'font'
]);
// Selectors to be removed
exports.EXACT_SELECTORS = [
// scripts, styles
'noscript',
'script:not([type^="math/"])',
'style',
'meta',
'link',
// ads
'.ad:not([class*="gradient"])',
'[class^="ad-" i]',
'[class$="-ad" i]',
'[id^="ad-" i]',
'[id$="-ad" i]',
'[role="banner" i]',
'[alt*="advert" i]',
'.promo',
'.Promo',
'#barrier-page', // ft.com
'.alert',
// comments
'[id="comments" i]',
'[id="comment" i]',
// header, nav
'header',
'.header:not(.banner)',
'#header',
'#Header',
'#banner',
'#Banner',
'nav',
'.navigation',
'#navigation',
'.hero',
'[role="navigation" i]',
'[role="dialog" i]',
'[role*="complementary" i]',
'[class*="pagination" i]',
'.menu',
'#menu',
'#siteSub',
// '.fixed', see issue #44
'.previous',
// metadata
'.author',
'.Author',
'[class$="_bio"]',
'#categories',
'.contributor',
'.date',
'#date',
'[data-date]',
'.entry-meta',
'.meta',
'.tags',
'#tags',
'.toc',
'.Toc',
'#toc',
'.headline',
'#headline',
'#title',
'#Title',
'#articleTag',
'[href*="/category"]',
'[href*="/categories"]',
'[href*="/tag/"]',
'[href*="/tags/"]',
'[href*="/topics"]',
'[href*="author"]',
'[href*="#toc"]',
'[href="#top"]',
'[href="#Top"]',
'[href="#page-header"]',
'[href="#content"]',
'[href="#site-content"]',
'[href="#main-content"]',
'[href^="#main"]',
'[src*="author"]',
// footer
'footer',
// inputs, forms, elements
'.aside',
'aside',
'button',
// '[role="button"]', Medium images
'canvas',
'date',
'dialog',
'fieldset',
'form',
'input:not([type="checkbox"])',
'label',
'option',
'select',
'textarea',
'time',
'relative-time',
// hidden
'[hidden]',
'[aria-hidden="true"]:not([class*="math"])',
'[style*="display: none"]:not([class*="math"])',
'[style*="display:none"]:not([class*="math"])',
'[style*="visibility: hidden"]',
'[style*="visibility:hidden"]',
'.hidden',
'.invisible',
// iframes
'instaread-player',
'iframe:not([src*="youtube"]):not([src*="youtu.be"]):not([src*="vimeo"]):not([src*="twitter"]):not([src*="x.com"]):not([src*="datawrapper"])',
// logos
'[class="logo" i]',
'#logo',
'#Logo',
// newsletter
'#newsletter',
'#Newsletter',
'.subscribe',
// hidden for print
'.noprint',
'[data-print-layout="hide" i]',
'[data-block="donotprint" i]',
// footnotes, citations
'[class*="clickable-icon" i]',
'li span[class*="ltx_tag" i][class*="ltx_tag_item" i]',
'a[href^="#"][class*="anchor" i]',
'a[href^="#"][class*="ref" i]',
// link lists
'[data-container*="most-viewed" i]',
// sidebar
'.sidebar',
'.Sidebar',
'#sidebar',
'#Sidebar',
'#sitesub',
// skip links
'[data-link-name*="skip" i]',
'[aria-label*="skip" i]',
'#skip-link',
// other
'.copyright',
'#copyright',
'#rss',
'#feed',
'.gutter',
'#primaryaudio', // NPR
'#NYT_ABOVE_MAIN_CONTENT_REGION',
'[data-testid="photoviewer-children-figure"] > span', // New York Times
'table.infobox',
'.pencraft:not(.pc-display-contents)', // Substack
'[data-optimizely="related-articles-section" i]', // The Economist
'[data-orientation="vertical"]'
];
// Attributes to test against for partial matches
exports.TEST_ATTRIBUTES = [
'class',
'id',
'data-test',
'data-testid',
'data-test-id',
'data-qa',
'data-cy'
];
// Removal patterns tested against attributes above
// Case insensitive, partial matches allowed
exports.PARTIAL_SELECTORS = [
'a-statement',
'access-wall',
'activitypub',
'actioncall',
'addcomment',
'advert',
// '-ad-', howtogeek.com
'adlayout',
'ad-tldr',
'ad-placement',
'ads-container',
'_ad_',
'after_content',
'after_main_article',
'afterpost',
'allterms',
'-alert-',
'alert-box',
'appendix',
'_archive',
'around-the-web',
'aroundpages',
'article-author',
'article-badges',
'article-banner',
'article-bottom-section',
'article-bottom',
'article-category',
'article-card',
'article-citation',
'article__copy',
'article_date',
'article-date',
'article-end ',
'article_header',
'article-header',
'article__header',
'article__hero',
'article__info',
'article-info',
'article-meta',
'article_meta',
'article__meta',
'articlename',
'article-subject',
'article_subject',
'article-snippet',
'article-separator',
'article--share',
'article--topics',
'articletags',
'article-tags',
'article_tags',
'articletitle',
'article-title',
'article_title',
'articletopics',
'article-topics',
// 'article-type',
'article--lede', // The Verge
'articlewell',
'associated-people',
'audio-card',
// 'author', Gwern
// '-author',
'author-bio',
'author-box',
'author-info',
'author_info',
'authorm',
'author-mini-bio',
'author-name',
'author-publish-info',
'authored-by',
'avatar',
'back-to-top',
'backlink_container',
'backlinks-section',
// 'banner',
'bio-block',
'biobox',
'blog-pager',
'bookmark-',
'-bookmark',
'bottominfo',
'bottomnav',
'bottom-of-article',
'bottom-wrapper',
'brand-bar',
'breadcrumb',
'brdcrumb',
'button-wrapper',
'buttons-container',
'btn-',
'-btn',
'byline',
'captcha',
'card-text',
'card-media',
'card-post',
// 'carousel',
'carouselcontainer',
'carousel-container',
'cat_header',
'catlinks',
'_categories',
'card-author',
'card-content',
'chapter-list', // The Economist
'collections',
'comments',
// '-comment', Syntax highlighting
'commentbox',
'comment-button',
'commentcomp',
'comment-content',
'comment-count',
'comment-form',
'comment-number',
'comment-respond',
'comment-thread',
'comment-wrap',
'complementary',
'consent',
'contact-',
'content-card', // The Verge
'content-topics',
'contentpromo',
'context-bar',
'context-widget', // Reuters
'core-collateral',
'cover-',
'created-date',
'creative-commons_',
'c-subscribe',
'_cta',
'-cta',
'cta-',
'cta_',
'current-issue', // The Nation
'custom-list-number',
'dateline',
'dateheader',
'date-header',
'date-pub',
// 'dialog',
'disclaimer',
'disclosure',
'discussion',
'discuss_',
'disqus',
'donate',
'donation',
'dropdown', // Ars Technica
'eletters',
'emailsignup',
'engagement-widget',
'enhancement',
'entry-author-info',
'entry-categories',
'entry-date',
// 'entry-meta',
'entry-title',
'entry-utility',
'-error',
'error-',
'eyebrow',
'expand-reduce',
'external-anchor',
'externallinkembedwrapper', // The New Yorker
'extra-services',
'extra-title',
'facebook',
'fancy-box',
'favorite',
'featured-content',
'feature_feed',
'feedback',
'feed-links',
'field-site-sections',
'fixheader',
'floating-vid',
// 'follow',
'follower',
'footer',
'footnote-back',
'footnoteback',
'form-group',
'for-you',
'frontmatter',
'further-reading',
'fullbleedheader',
'gated-',
'gh-feed',
'gist-meta',
// 'global',
// 'google',
'goog-',
'graph-view',
'hamburger',
'header_logo',
'header-logo',
'header-pattern', // The Verge
// 'headlines', Mercurynews
'hero-list',
// '-hidden',
'hide-for-print',
'hide-print',
'hide-when-no-script',
'hidden-print',
'hidden-sidenote',
'hidden-accessibility',
'infoline',
'instacartIntegration',
'interlude',
'interaction',
'itemendrow',
'invisible',
'jumplink',
'jump-to-',
'keepreading',
'keep-reading',
'keep_reading',
// 'keyword', // used in syntax highlighting
'keyword_wrap',
'kicker',
'labstab', // Arxiv
'-labels',
'language-name',
'lastupdated',
'latest-content',
'-ledes-', // The Verge
'-license',
'license-',
'lightbox-popup',
'like-button',
'link-box',
'links-grid', // BBC
'links-title', // BBC
'listing-dynamic-terms', // Boston Review
'list-tags',
'listinks',
'loading',
'loa-info',
'logo_container',
'ltx_role_refnum', // Arxiv
'ltx_tag_bibitem',
'ltx_error',
'masthead',
'marketing',
'media-inquiry',
'-menu',
'menu-',
// 'meta-', syntax highlighting
'metadata',
'might-like',
'minibio',
'more-about',
'_modal',
'-modal',
'more-',
'morenews',
'morestories',
'more_wrapper',
'most-read',
'move-helper',
'mw-editsection',
'mw-cite-backlink',
'mw-indicators',
'mw-jump-link',
'nav-',
'nav_',
// 'navbar',
// 'navigation',
'navigation-post',
'next-',
'newsgallery',
'news-story-title',
// 'newsletter', used on Substack
'newsletter_',
'newsletterbanner',
'newslettercontainer',
'newsletter-form',
'newsletter-signup',
'newslettersignup',
'newsletterwidget',
'newsletterwrapper',
'not-found',
'notessection',
'nomobile',
'noprint',
'open-slideshow',
'originally-published', // Mercury News
'other-blogs',
'outline-view',
// 'overlay',
'pagehead',
'page-header',
'page-title',
'paywall_message',
'-partners',
'permission-',
'plea',
'popular',
// 'popup', Gwern
'popup_links',
// 'popover',
'pop_stories',
'pop-up',
'post-author',
'post-bottom',
'post__category',
'postcomment',
'postdate',
'post-date',
'post_date',
'post-details',
'post-feeds',
'postinfo',
'post-info',
'post_info',
'post-inline-date',
'post-links',
'postlist',
'post_list',
'post_meta',
'post-meta',
'postmeta',
'post_more',
'postnavi',
'post-navigation',
'postpath',
'post-preview',
'postsnippet',
'post_snippet',
'post-snippet',
'post-subject',
'posttax',
'post-tax',
'post_tax',
'posttag',
'post_tag',
'post-tag',
'post_time',
'posttitle',
'post-title',
'post_title',
'post__title',
'post-ufi-button',
// 'preview', used on Obsidian Publish
'prev-post',
'prevnext',
'prev_next',
'prev-next',
'previousnext',
'press-inquiries',
'print-none',
'print-header',
'print:hidden',
'privacy-notice',
'privacy-settings',
'profile',
// 'promo',
'promo_article',
'promo-bar',
'promo-box',
'pubdate',
'pub_date',
'pub-date',
'publish_date',
'publish-date',
'publication-date',
'publicationName', // Medium
'qr-code',
'qr_code',
'quick_up',
'_rail',
'ratingssection',
'read_also',
'readmore',
'read-next',
'read_next',
'read_time',
'read-time',
'reading_time',
'reading-time',
'reading-list',
'recent-',
'recent-articles',
'recentpost',
'recent_post',
'recent-post',
'recommend',
'redirectedfrom',
'recirc',
'register',
'related',
'relevant',
'reversefootnote',
'_rss',
'rss-link',
'screen-reader-text',
'scroll_to',
'scroll-to',
'_search',
'-search',
'section-nav',
'series-banner',
// 'share',
// '-share', scitechdaily.com
'share-box',
'sharedaddy',
'share-icons',
'sharelinks',
'share-post',
'share-print',
'share-section',
'show-for-print',
'sidebartitle',
// 'sidebar_',
'sidebar-content',
'sidebar-wrapper',
'sideitems',
'sidebar-author',
'sidebar-item',
'side-box',
'side-logo',
'sign-in-gate',
'similar-',
'similar_',
'similars-',
'site-index',
'site-header',
'siteheader',
'site-logo',
'site-name',
'site-wordpress',
// 'skip-',
// 'skip-link', TechCrunch
'skip-content',
'skip-to-content',
// 'skip-link',
'c-skip-link',
'_skip-link',
'-slider',
'slug-wrap',
// 'social',
'social-author',
'social-shar',
'social-date',
'speechify-ignore',
'speedbump',
'sponsor',
'springercitation',
'sr-only',
// '-stats',
'_stats',
// 'sticky',
'story-date',
'story-navigation',
'storyreadtime', // Medium
'storysmall',
'storypublishdate', // Medium
'subject-label',
'subhead',
'submenu',
// 'subscribe',
'-subscribe-',
'subscriber-drive',
'subscription-',
'_tags',
'tags__item',
'tag_list',
'taxonomy',
// 'table-content',
'table-of-contents',
'tabs-',
// 'teaser', Nature
'terminaltout',
'time-rubric',
'timestamp',
'time-read',
'time-to-read',
'tip_off',
'tiptout',
'-tout-',
// '-toc',
'toc-container',
'toggle-caption',
// 'toolbar', prism.js
'tooltip',
'topbar',
'topic-list',
'topic-subnav',
// 'top-section',
'top-wrapper',
'tree-item',
'trending',
'trust-feat',
'trust-badge',
'trust-project',
'twitter',
'u-hide',
'upsell',
'viewbottom',
'visually-hidden',
'welcomebox',
'widget_pages',
// 'widget-'
];
// Selectors for footnotes and citations
exports.FOOTNOTE_INLINE_REFERENCES = [
'sup.reference',
'cite.ltx_cite',
'sup[id^="fnr"]',
'span[id^="fnr"]',
'span[class*="footnote_ref"]',
'span.footnote-link',
'a.citation',
'a[id^="ref-link"]',
'a[href^="#fn"]',
'a[href^="#cite"]',
'a[href^="#reference"]',
'a[href^="#footnote"]',
'a[href^="#r"]', // Common in academic papers
'a[href^="#b"]', // Common for bibliography references
'a[href*="cite_note"]',
'a[href*="cite_ref"]',
'a.footnote-anchor', // Substack
'span.footnote-hovercard-target a', // Substack
'a[role="doc-biblioref"]', // Science.org
'a[id^="fnref"]',
'a[id^="ref-link"]', // Nature.com
].join(',');
exports.FOOTNOTE_LIST_SELECTORS = [
'div.footnote ol',
'div.footnotes ol',
'div[role="doc-endnotes"]',
'div[role="doc-footnotes"]',
'ol.footnotes-list',
'ol.footnotes',
'ol.references',
'ol[class*="article-references"]',
'section.footnotes ol',
'section[role="doc-endnotes"]',
'section[role="doc-footnotes"]',
'section[role="doc-bibliography"]',
'ul.footnotes-list',
'ul.ltx_biblist',
'div.footnote[data-component-name="FootnoteToDOM"]' // Substack
].join(',');
// Elements that are allowed to be empty
// These are not removed even if they have no content
exports.ALLOWED_EMPTY_ELEMENTS = new Set([
'area',
'audio',
'base',
'br',
'circle',
'col',
'defs',
'ellipse',
'embed',
'figure',
'g',
'hr',
'iframe',
'img',
'input',
'line',
'link',
'mask',
'meta',
'object',
'param',
'path',
'pattern',
'picture',
'polygon',
'polyline',
'rect',
'source',
'stop',
'svg',
'td',
'th',
'track',
'use',
'video',
'wbr'
]);
// Attributes to keep
exports.ALLOWED_ATTRIBUTES = new Set([
'alt',
'allow',
'allowfullscreen',
'aria-label',
'checked',
'colspan',
'controls',
'data-latex',
'data-src',
'data-srcset',
'data-lang',
'dir',
'display',
'frameborder',
'headers',
'height',
'href',
'lang',
'role',
'rowspan',
'src',
'srcset',
'title',
'type',
'width',
// MathML attributes
'accent',
'accentunder',
'align',
'columnalign',
'columnlines',
'columnspacing',
'columnspan',
'data-mjx-texclass',
'depth',
'displaystyle',
'fence',
'frame',
'framespacing',
'linethickness',
'lspace',
'mathsize',
'mathvariant',
'maxsize',
'minsize',
'movablelimits',
'notation',
'rowalign',
'rowlines',
'rowspacing',
'rowspan',
'rspace',
'scriptlevel',
'separator',
'stretchy',
'symmetric',
'voffset',
'xmlns'
]);
exports.ALLOWED_ATTRIBUTES_DEBUG = new Set([
'class',
'id',
]);
//# sourceMappingURL=constants.js.map
;