UNPKG

defuddle

Version:

Extract article content and metadata from web pages.

862 lines 18.4 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.ALLOWED_ATTRIBUTES_DEBUG = exports.ALLOWED_ATTRIBUTES = exports.ALLOWED_EMPTY_ELEMENTS = exports.FOOTNOTE_LIST_SELECTORS = exports.FOOTNOTE_INLINE_REFERENCES = exports.PARTIAL_SELECTORS = exports.TEST_ATTRIBUTES = exports.EXACT_SELECTORS = exports.INLINE_ELEMENTS = exports.PRESERVE_ELEMENTS = exports.BLOCK_ELEMENTS = exports.MOBILE_WIDTH = exports.ENTRY_POINT_ELEMENTS = void 0; // Entry point elements // These are the elements that will be used to find the main content exports.ENTRY_POINT_ELEMENTS = [ '#post', '.post-content', '.article-content', '#article-content', '.article_post', '.article-wrapper', '.entry-content', '.content-article', '.post', '.markdown-body', 'article', '[role="article"]', 'main', '[role="main"]', 'body' // ensures there is always a match ]; exports.MOBILE_WIDTH = 600; exports.BLOCK_ELEMENTS = ['div', 'section', 'article', 'main', 'aside', 'header', 'footer', 'nav', 'content']; // Elements that should not be unwrapped exports.PRESERVE_ELEMENTS = new Set([ 'pre', 'code', 'table', 'thead', 'tbody', 'tr', 'td', 'th', 'ul', 'ol', 'li', 'dl', 'dt', 'dd', 'figure', 'figcaption', 'picture', 'details', 'summary', 'blockquote', 'form', 'fieldset' ]); // Inline elements that should not be unwrapped exports.INLINE_ELEMENTS = new Set([ 'a', 'span', 'strong', 'em', 'i', 'b', 'u', 'code', 'br', 'small', 'sub', 'sup', 'mark', 'date', 'del', 'ins', 'q', 'abbr', 'cite', 'relative-time', 'time', 'font' ]); // Selectors to be removed exports.EXACT_SELECTORS = [ // scripts, styles 'noscript', 'script:not([type^="math/"])', 'style', 'meta', 'link', // ads '.ad:not([class*="gradient"])', '[class^="ad-" i]', '[class$="-ad" i]', '[id^="ad-" i]', '[id$="-ad" i]', '[role="banner" i]', '[alt*="advert" i]', '.promo', '.Promo', '#barrier-page', // ft.com '.alert', // comments '[id="comments" i]', '[id="comment" i]', // header, nav 'header', '.header:not(.banner)', '#header', '#Header', '#banner', '#Banner', 'nav', '.navigation', '#navigation', '.hero', '[role="navigation" i]', '[role="dialog" i]', '[role*="complementary" i]', '[class*="pagination" i]', '.menu', '#menu', '#siteSub', // '.fixed', see issue #44 '.previous', // metadata '.author', '.Author', '[class$="_bio"]', '#categories', '.contributor', '.date', '#date', '[data-date]', '.entry-meta', '.meta', '.tags', '#tags', '.toc', '.Toc', '#toc', '.headline', '#headline', '#title', '#Title', '#articleTag', '[href*="/category"]', '[href*="/categories"]', '[href*="/tag/"]', '[href*="/tags/"]', '[href*="/topics"]', '[href*="author"]', '[href*="#toc"]', '[href="#top"]', '[href="#Top"]', '[href="#page-header"]', '[href="#content"]', '[href="#site-content"]', '[href="#main-content"]', '[href^="#main"]', '[src*="author"]', // footer 'footer', // inputs, forms, elements '.aside', 'aside', 'button', // '[role="button"]', Medium images 'canvas', 'date', 'dialog', 'fieldset', 'form', 'input:not([type="checkbox"])', 'label', 'option', 'select', 'textarea', 'time', 'relative-time', // hidden '[hidden]', '[aria-hidden="true"]:not([class*="math"])', '[style*="display: none"]:not([class*="math"])', '[style*="display:none"]:not([class*="math"])', '[style*="visibility: hidden"]', '[style*="visibility:hidden"]', '.hidden', '.invisible', // iframes 'instaread-player', 'iframe:not([src*="youtube"]):not([src*="youtu.be"]):not([src*="vimeo"]):not([src*="twitter"]):not([src*="x.com"]):not([src*="datawrapper"])', // logos '[class="logo" i]', '#logo', '#Logo', // newsletter '#newsletter', '#Newsletter', '.subscribe', // hidden for print '.noprint', '[data-print-layout="hide" i]', '[data-block="donotprint" i]', // footnotes, citations '[class*="clickable-icon" i]', 'li span[class*="ltx_tag" i][class*="ltx_tag_item" i]', 'a[href^="#"][class*="anchor" i]', 'a[href^="#"][class*="ref" i]', // link lists '[data-container*="most-viewed" i]', // sidebar '.sidebar', '.Sidebar', '#sidebar', '#Sidebar', '#sitesub', // skip links '[data-link-name*="skip" i]', '[aria-label*="skip" i]', '#skip-link', // other '.copyright', '#copyright', '#rss', '#feed', '.gutter', '#primaryaudio', // NPR '#NYT_ABOVE_MAIN_CONTENT_REGION', '[data-testid="photoviewer-children-figure"] > span', // New York Times 'table.infobox', '.pencraft:not(.pc-display-contents)', // Substack '[data-optimizely="related-articles-section" i]', // The Economist '[data-orientation="vertical"]' ]; // Attributes to test against for partial matches exports.TEST_ATTRIBUTES = [ 'class', 'id', 'data-test', 'data-testid', 'data-test-id', 'data-qa', 'data-cy' ]; // Removal patterns tested against attributes above // Case insensitive, partial matches allowed exports.PARTIAL_SELECTORS = [ 'a-statement', 'access-wall', 'activitypub', 'actioncall', 'addcomment', 'advert', // '-ad-', howtogeek.com 'adlayout', 'ad-tldr', 'ad-placement', 'ads-container', '_ad_', 'after_content', 'after_main_article', 'afterpost', 'allterms', '-alert-', 'alert-box', 'appendix', '_archive', 'around-the-web', 'aroundpages', 'article-author', 'article-badges', 'article-banner', 'article-bottom-section', 'article-bottom', 'article-category', 'article-card', 'article-citation', 'article__copy', 'article_date', 'article-date', 'article-end ', 'article_header', 'article-header', 'article__header', 'article__hero', 'article__info', 'article-info', 'article-meta', 'article_meta', 'article__meta', 'articlename', 'article-subject', 'article_subject', 'article-snippet', 'article-separator', 'article--share', 'article--topics', 'articletags', 'article-tags', 'article_tags', 'articletitle', 'article-title', 'article_title', 'articletopics', 'article-topics', // 'article-type', 'article--lede', // The Verge 'articlewell', 'associated-people', 'audio-card', // 'author', Gwern // '-author', 'author-bio', 'author-box', 'author-info', 'author_info', 'authorm', 'author-mini-bio', 'author-name', 'author-publish-info', 'authored-by', 'avatar', 'back-to-top', 'backlink_container', 'backlinks-section', // 'banner', 'bio-block', 'biobox', 'blog-pager', 'bookmark-', '-bookmark', 'bottominfo', 'bottomnav', 'bottom-of-article', 'bottom-wrapper', 'brand-bar', 'breadcrumb', 'brdcrumb', 'button-wrapper', 'buttons-container', 'btn-', '-btn', 'byline', 'captcha', 'card-text', 'card-media', 'card-post', // 'carousel', 'carouselcontainer', 'carousel-container', 'cat_header', 'catlinks', '_categories', 'card-author', 'card-content', 'chapter-list', // The Economist 'collections', 'comments', // '-comment', Syntax highlighting 'commentbox', 'comment-button', 'commentcomp', 'comment-content', 'comment-count', 'comment-form', 'comment-number', 'comment-respond', 'comment-thread', 'comment-wrap', 'complementary', 'consent', 'contact-', 'content-card', // The Verge 'content-topics', 'contentpromo', 'context-bar', 'context-widget', // Reuters 'core-collateral', 'cover-', 'created-date', 'creative-commons_', 'c-subscribe', '_cta', '-cta', 'cta-', 'cta_', 'current-issue', // The Nation 'custom-list-number', 'dateline', 'dateheader', 'date-header', 'date-pub', // 'dialog', 'disclaimer', 'disclosure', 'discussion', 'discuss_', 'disqus', 'donate', 'donation', 'dropdown', // Ars Technica 'eletters', 'emailsignup', 'engagement-widget', 'enhancement', 'entry-author-info', 'entry-categories', 'entry-date', // 'entry-meta', 'entry-title', 'entry-utility', '-error', 'error-', 'eyebrow', 'expand-reduce', 'external-anchor', 'externallinkembedwrapper', // The New Yorker 'extra-services', 'extra-title', 'facebook', 'fancy-box', 'favorite', 'featured-content', 'feature_feed', 'feedback', 'feed-links', 'field-site-sections', 'fixheader', 'floating-vid', // 'follow', 'follower', 'footer', 'footnote-back', 'footnoteback', 'form-group', 'for-you', 'frontmatter', 'further-reading', 'fullbleedheader', 'gated-', 'gh-feed', 'gist-meta', // 'global', // 'google', 'goog-', 'graph-view', 'hamburger', 'header_logo', 'header-logo', 'header-pattern', // The Verge // 'headlines', Mercurynews 'hero-list', // '-hidden', 'hide-for-print', 'hide-print', 'hide-when-no-script', 'hidden-print', 'hidden-sidenote', 'hidden-accessibility', 'infoline', 'instacartIntegration', 'interlude', 'interaction', 'itemendrow', 'invisible', 'jumplink', 'jump-to-', 'keepreading', 'keep-reading', 'keep_reading', // 'keyword', // used in syntax highlighting 'keyword_wrap', 'kicker', 'labstab', // Arxiv '-labels', 'language-name', 'lastupdated', 'latest-content', '-ledes-', // The Verge '-license', 'license-', 'lightbox-popup', 'like-button', 'link-box', 'links-grid', // BBC 'links-title', // BBC 'listing-dynamic-terms', // Boston Review 'list-tags', 'listinks', 'loading', 'loa-info', 'logo_container', 'ltx_role_refnum', // Arxiv 'ltx_tag_bibitem', 'ltx_error', 'masthead', 'marketing', 'media-inquiry', '-menu', 'menu-', // 'meta-', syntax highlighting 'metadata', 'might-like', 'minibio', 'more-about', '_modal', '-modal', 'more-', 'morenews', 'morestories', 'more_wrapper', 'most-read', 'move-helper', 'mw-editsection', 'mw-cite-backlink', 'mw-indicators', 'mw-jump-link', 'nav-', 'nav_', // 'navbar', // 'navigation', 'navigation-post', 'next-', 'newsgallery', 'news-story-title', // 'newsletter', used on Substack 'newsletter_', 'newsletterbanner', 'newslettercontainer', 'newsletter-form', 'newsletter-signup', 'newslettersignup', 'newsletterwidget', 'newsletterwrapper', 'not-found', 'notessection', 'nomobile', 'noprint', 'open-slideshow', 'originally-published', // Mercury News 'other-blogs', 'outline-view', // 'overlay', 'pagehead', 'page-header', 'page-title', 'paywall_message', '-partners', 'permission-', 'plea', 'popular', // 'popup', Gwern 'popup_links', // 'popover', 'pop_stories', 'pop-up', 'post-author', 'post-bottom', 'post__category', 'postcomment', 'postdate', 'post-date', 'post_date', 'post-details', 'post-feeds', 'postinfo', 'post-info', 'post_info', 'post-inline-date', 'post-links', 'postlist', 'post_list', 'post_meta', 'post-meta', 'postmeta', 'post_more', 'postnavi', 'post-navigation', 'postpath', 'post-preview', 'postsnippet', 'post_snippet', 'post-snippet', 'post-subject', 'posttax', 'post-tax', 'post_tax', 'posttag', 'post_tag', 'post-tag', 'post_time', 'posttitle', 'post-title', 'post_title', 'post__title', 'post-ufi-button', // 'preview', used on Obsidian Publish 'prev-post', 'prevnext', 'prev_next', 'prev-next', 'previousnext', 'press-inquiries', 'print-none', 'print-header', 'print:hidden', 'privacy-notice', 'privacy-settings', 'profile', // 'promo', 'promo_article', 'promo-bar', 'promo-box', 'pubdate', 'pub_date', 'pub-date', 'publish_date', 'publish-date', 'publication-date', 'publicationName', // Medium 'qr-code', 'qr_code', 'quick_up', '_rail', 'ratingssection', 'read_also', 'readmore', 'read-next', 'read_next', 'read_time', 'read-time', 'reading_time', 'reading-time', 'reading-list', 'recent-', 'recent-articles', 'recentpost', 'recent_post', 'recent-post', 'recommend', 'redirectedfrom', 'recirc', 'register', 'related', 'relevant', 'reversefootnote', '_rss', 'rss-link', 'screen-reader-text', 'scroll_to', 'scroll-to', '_search', '-search', 'section-nav', 'series-banner', // 'share', // '-share', scitechdaily.com 'share-box', 'sharedaddy', 'share-icons', 'sharelinks', 'share-post', 'share-print', 'share-section', 'show-for-print', 'sidebartitle', // 'sidebar_', 'sidebar-content', 'sidebar-wrapper', 'sideitems', 'sidebar-author', 'sidebar-item', 'side-box', 'side-logo', 'sign-in-gate', 'similar-', 'similar_', 'similars-', 'site-index', 'site-header', 'siteheader', 'site-logo', 'site-name', 'site-wordpress', // 'skip-', // 'skip-link', TechCrunch 'skip-content', 'skip-to-content', // 'skip-link', 'c-skip-link', '_skip-link', '-slider', 'slug-wrap', // 'social', 'social-author', 'social-shar', 'social-date', 'speechify-ignore', 'speedbump', 'sponsor', 'springercitation', 'sr-only', // '-stats', '_stats', // 'sticky', 'story-date', 'story-navigation', 'storyreadtime', // Medium 'storysmall', 'storypublishdate', // Medium 'subject-label', 'subhead', 'submenu', // 'subscribe', '-subscribe-', 'subscriber-drive', 'subscription-', '_tags', 'tags__item', 'tag_list', 'taxonomy', // 'table-content', 'table-of-contents', 'tabs-', // 'teaser', Nature 'terminaltout', 'time-rubric', 'timestamp', 'time-read', 'time-to-read', 'tip_off', 'tiptout', '-tout-', // '-toc', 'toc-container', 'toggle-caption', // 'toolbar', prism.js 'tooltip', 'topbar', 'topic-list', 'topic-subnav', // 'top-section', 'top-wrapper', 'tree-item', 'trending', 'trust-feat', 'trust-badge', 'trust-project', 'twitter', 'u-hide', 'upsell', 'viewbottom', 'visually-hidden', 'welcomebox', 'widget_pages', // 'widget-' ]; // Selectors for footnotes and citations exports.FOOTNOTE_INLINE_REFERENCES = [ 'sup.reference', 'cite.ltx_cite', 'sup[id^="fnr"]', 'span[id^="fnr"]', 'span[class*="footnote_ref"]', 'span.footnote-link', 'a.citation', 'a[id^="ref-link"]', 'a[href^="#fn"]', 'a[href^="#cite"]', 'a[href^="#reference"]', 'a[href^="#footnote"]', 'a[href^="#r"]', // Common in academic papers 'a[href^="#b"]', // Common for bibliography references 'a[href*="cite_note"]', 'a[href*="cite_ref"]', 'a.footnote-anchor', // Substack 'span.footnote-hovercard-target a', // Substack 'a[role="doc-biblioref"]', // Science.org 'a[id^="fnref"]', 'a[id^="ref-link"]', // Nature.com ].join(','); exports.FOOTNOTE_LIST_SELECTORS = [ 'div.footnote ol', 'div.footnotes ol', 'div[role="doc-endnotes"]', 'div[role="doc-footnotes"]', 'ol.footnotes-list', 'ol.footnotes', 'ol.references', 'ol[class*="article-references"]', 'section.footnotes ol', 'section[role="doc-endnotes"]', 'section[role="doc-footnotes"]', 'section[role="doc-bibliography"]', 'ul.footnotes-list', 'ul.ltx_biblist', 'div.footnote[data-component-name="FootnoteToDOM"]' // Substack ].join(','); // Elements that are allowed to be empty // These are not removed even if they have no content exports.ALLOWED_EMPTY_ELEMENTS = new Set([ 'area', 'audio', 'base', 'br', 'circle', 'col', 'defs', 'ellipse', 'embed', 'figure', 'g', 'hr', 'iframe', 'img', 'input', 'line', 'link', 'mask', 'meta', 'object', 'param', 'path', 'pattern', 'picture', 'polygon', 'polyline', 'rect', 'source', 'stop', 'svg', 'td', 'th', 'track', 'use', 'video', 'wbr' ]); // Attributes to keep exports.ALLOWED_ATTRIBUTES = new Set([ 'alt', 'allow', 'allowfullscreen', 'aria-label', 'checked', 'colspan', 'controls', 'data-latex', 'data-src', 'data-srcset', 'data-lang', 'dir', 'display', 'frameborder', 'headers', 'height', 'href', 'lang', 'role', 'rowspan', 'src', 'srcset', 'title', 'type', 'width', // MathML attributes 'accent', 'accentunder', 'align', 'columnalign', 'columnlines', 'columnspacing', 'columnspan', 'data-mjx-texclass', 'depth', 'displaystyle', 'fence', 'frame', 'framespacing', 'linethickness', 'lspace', 'mathsize', 'mathvariant', 'maxsize', 'minsize', 'movablelimits', 'notation', 'rowalign', 'rowlines', 'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'separator', 'stretchy', 'symmetric', 'voffset', 'xmlns' ]); exports.ALLOWED_ATTRIBUTES_DEBUG = new Set([ 'class', 'id', ]); //# sourceMappingURL=constants.js.map