UNPKG

article-parser

Version:

To extract main article from given URL

134 lines (119 loc) 2.64 kB
// utils -> linker

import { DOMParser } from 'linkedom'
import { findBestMatch } from 'string-similarity'

/**
 * Tell whether a string parses as an absolute http or https URL.
 *
 * @param url {string}
 * @returns {boolean} false for anything `new URL()` rejects or any other scheme
 */
export const isValid = (url = '') => {
  try {
    const { protocol } = new URL(url)
    // Compare the exact scheme: a prefix test would also accept any
    // scheme that merely begins with "http".
    return protocol === 'http:' || protocol === 'https:'
  } catch (err) {
    return false
  }
}

/**
 * Pick the candidate string that `findBestMatch` rates closest to the title.
 *
 * NOTE(review): string-similarity appears to reject an empty candidate list
 * by throwing, so callers should pass at least one candidate - confirm
 * against the library documentation.
 *
 * @param candidates {string[]}
 * @param title {string}
 * @returns {string} the best matching candidate
 */
export const chooseBestUrl = (candidates = [], title = '') => {
  const ranking = findBestMatch(title, candidates)
  return ranking.bestMatch.target
}

/**
 * Resolve a (possibly relative) URL against a base URL.
 *
 * @param fullUrl {string} base URL used for resolution
 * @param relativeUrl {string} URL to resolve
 * @returns {string} the resolved absolute URL, or '' when resolution fails
 */
export const absolutify = (fullUrl = '', relativeUrl = '') => {
  try {
    const result = new URL(relativeUrl, fullUrl)
    return result.toString()
  } catch (err) {
    // Best effort: an unresolvable pair yields an empty string.
    return ''
  }
}

// Query-string keys removed by purify().
const blacklistKeys = [
  'CNDID',
  '__twitter_impression',
  '_hsenc',
  '_openstat',
  'action_object_map',
  'action_ref_map',
  'action_type_map',
  'amp',
  'fb_action_ids',
  'fb_action_types',
  'fb_ref',
  'fb_source',
  'fbclid',
  'ga_campaign',
  'ga_content',
  'ga_medium',
  'ga_place',
  'ga_source',
  'ga_term',
  'gs_l',
  'hmb_campaign',
  'hmb_medium',
  'hmb_source',
  'mbid',
  'mc_cid',
  'mc_eid',
  'mkt_tok',
  'referrer',
  'spJobID',
  'spMailingID',
  'spReportId',
  'spUserID',
  'utm_brand',
  'utm_campaign',
  'utm_cid',
  'utm_content',
  'utm_int',
  'utm_mailing',
  'utm_medium',
  'utm_name',
  'utm_place',
  'utm_pubreferrer',
  'utm_reader',
  'utm_social',
  'utm_source',
  'utm_swu',
  'utm_term',
  'utm_userid',
  'utm_viz_id',
  'wt_mc_o',
  'yclid',
  'WT.mc_id',
  'WT.mc_ev',
  'WT.srch',
  'pk_source',
  'pk_medium',
  'pk_campaign'
]

/**
 * Remove the blacklisted query parameters and the fragment from a URL.
 *
 * @param url {string}
 * @returns {string|null} the cleaned URL, or null when `url` cannot be parsed
 */
export const purify = (url) => {
  try {
    const pureUrl = new URL(url)
    for (const key of blacklistKeys) {
      pureUrl.searchParams.delete(key)
    }
    // Clear the fragment through the URL API rather than by string
    // replacement, so an empty fragment ("...#") is dropped as well.
    pureUrl.hash = ''
    return pureUrl.toString()
  } catch (err) {
    return null
  }
}

/**
 * Rewrite link and image URLs in an HTML string to absolute URLs.
 *
 * Every `<a>` with an `href` gets the resolved URL and `target="_blank"`.
 * Every `<img>` gets its `src` set from `data-src` when that attribute is
 * present, otherwise from its existing `src`, resolved against `url`.
 *
 * @param html {string} HTML markup to process
 * @param url {string} base URL used to resolve relative references
 * @returns {string} concatenated outerHTML of the parsed document's children
 */
export const normalize = (html, url) => {
  const doc = new DOMParser().parseFromString(html, 'text/html')

  Array.from(doc.getElementsByTagName('a')).forEach((element) => {
    const href = element.getAttribute('href')
    if (href) {
      element.setAttribute('href', absolutify(url, href))
      element.setAttribute('target', '_blank')
    }
  })

  Array.from(doc.getElementsByTagName('img')).forEach((element) => {
    // A `data-src` attribute takes precedence over `src` when it exists.
    const src = element.getAttribute('data-src') ?? element.getAttribute('src')
    if (src) {
      element.setAttribute('src', absolutify(url, src))
    }
  })

  return Array.from(doc.children).map(element => element.outerHTML).join('')
}

/**
 * Return the host of a URL without a leading "www." label.
 *
 * @param url {string} absolute URL
 * @returns {string} the URL's `host` with any leading "www." removed
 * @throws {TypeError} when `url` cannot be parsed by `new URL()`
 */
export const getDomain = (url) => {
  const { host } = new URL(url)
  // Anchor the match: only a leading "www." is a prefix to strip; a "www."
  // elsewhere in the host is part of the name and must be kept.
  return host.replace(/^www\./, '')
}