UNPKG

article-parser

Version:

Extract clean article data from a given URL.

116 lines (101 loc) 2.53 kB
/** * url-resolver * @ndaidong **/ var URL = require('url'); var bella = require('bellajs'); var config = require('./config'); var isInBlackList = (url) => { if (!bella.isString(url)) { return false; } return config.blackList.some((c) => { return url.match(c); }); }; var isAdsDomain = (url) => { if (!bella.isString(url)) { return false; } return config.adsDomain.some((c) => { return url.match(c); }); }; var isExceptDomain = (url) => { if (!bella.isString(url)) { return false; } return config.exceptDomain.some((c) => { return url.match(c); }); }; var isValidURL = (str) => { if (!str) { return false; } if (isInBlackList(str)) { return false; } /* eslint-disable*/ let pattern = /^(?:(?:https?|ftp):\/\/)(?:\S+(?::\S*)?@)?(?:(?!(?:10|127)(?:\.\d{1,3}){3})(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|(?:(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)(?:\.(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)*(?:\.(?:[a-z\u00a1-\uffff]{2,}))\.?)(?::\d{2,5})?(?:[/?#]\S*)?$/i; /* eslint-enable*/ if (!pattern.test(str)) { return false; } return true; }; var removeUTM = (url) => { if (!isValidURL(url)) { return false; } if (url.includes('#')) { let a1 = url.split('#'); url = a1[0]; } let arr = url.split('?'); if (arr.length > 1) { let s = arr[1]; return [arr[0], s.split('&').filter((v) => { return !(/^utm_/).test(v) && !(/^pk_/).test(v); }).join('&')].join('?'); } return url; }; var absolutify = (fullUrl, relativeUrl) => { if (!isValidURL(fullUrl) || !bella.isString(relativeUrl)) { return ''; } let parsed = URL.parse(fullUrl); let first = [parsed.protocol, parsed.host].join('//'); return URL.resolve(first, relativeUrl); }; var purify = (url) => { url = removeUTM(url); if (!isValidURL(url)) { return false; } let g = URL.parse(url); let u = [g.protocol, '//', g.host, g.pathname].join(''); let isBad = isAdsDomain(url) || 
!g.search || g.search.indexOf('=') === -1; if (isBad) { return u; } return u + g.search; }; var getDomain = (url) => { if (!isValidURL(url)) { return false; } let g = URL.parse(url); return g.host; }; module.exports = { isValidURL, isAdsDomain, isExceptDomain, isInBlackList, absolutify, purify, removeUTM, getDomain };