UNPKG

x-ray-scraper

Version:
156 lines (138 loc) 2.88 kB
const has = Object.prototype.hasOwnProperty; const cheerio = require('cheerio'); const isObject = require('isobject'); const isUrlFromLib = require('is-url'); const url = require('url'); function isUrl(url) { if (typeof url !== 'string') return false; return isUrlFromLib(url); } /** * Get the root, if there is one. * * @param {Mixed} * @return {Boolean|String} */ function root(selector) { return ( (typeof selector === 'string' || Array.isArray(selector)) && !~selector.indexOf('@') && !isUrl(selector) && selector ); } /** * Compact an array, * removing empty objects * * @param {Array} arr * @return {Array} */ function compact(arr) { return arr.filter(function(val) { if (!val) return false; if (val.length !== undefined) return val.length !== 0; for (var key in val) if (has.call(val, key)) return true; return false; }); } /** * Check if the string is HTML */ function isHTML(str) { str = (str || '').toString().trim(); return str[0] === '<' && str[str.length - 1] === '>'; } /** * Checks if a given string is a valid URL * * @param {String} src * @return {Boolean} */ function isValidUrl(src) { try { url.parse(src); return true; } catch (e) { return false; } } /** * Change all the URLs into absolute urls * * @param {String} path * @param {Cheerio} $ * @return {$} */ function absolute(path, $) { const parts = url.parse(path); let remote = parts.protocol + '//' + parts.host; // apply <base> tag transformation const base = $('head').find('base'); let href; if (base.length === 1) { href = base.attr('href'); if (href) { remote = href; } } $( [ 'a[href]', 'img[src]', 'script[src]', 'link[href]', 'source[src]', 'track[src]', 'img[src]', 'frame[src]', 'iframe[src]' ].join(',') ).each(abs); function abs(i, el) { const $el = $(el); let key = null; let src = null; const hasHref = $el.attr('href'); const hashSrc = $el.attr('src'); if (hasHref) { key = 'href'; src = hasHref; } else if (hashSrc) { key = 'src'; src = hashSrc; } else { return; } src = src.trim(); if (~src.indexOf('://')) { return; } else if (isValidUrl(src)) { let current; if (href && src.indexOf('/') !== 0) { current = url.resolve(remote, href); src = url.resolve(current, src); } else { current = url.resolve(remote, parts.pathname); src = url.resolve(current, src); } } $el.attr(key, src); } return $; } function load(html, url) { html = html || ''; let $ = html.html ? html : cheerio.load(html); if (url) $ = absolute(url, $); return $; } module.exports = { root, isUrl, isHTML, compact, isObject, absolute, load };