x-ray-scraper
Version:
Scraper next gen based on x-ray (2.3.2)
156 lines (138 loc) • 2.88 kB
JavaScript
const has = Object.prototype.hasOwnProperty;
const cheerio = require('cheerio');
const isObject = require('isobject');
const isUrlFromLib = require('is-url');
const url = require('url');
function isUrl(url) {
if (typeof url !== 'string') return false;
return isUrlFromLib(url);
}
/**
* Get the root, if there is one.
*
* @param {Mixed}
* @return {Boolean|String}
*/
function root(selector) {
return (
(typeof selector === 'string' || Array.isArray(selector)) &&
!~selector.indexOf('@') &&
!isUrl(selector) &&
selector
);
}
/**
* Compact an array,
* removing empty objects
*
* @param {Array} arr
* @return {Array}
*/
function compact(arr) {
return arr.filter(function(val) {
if (!val) return false;
if (val.length !== undefined) return val.length !== 0;
for (var key in val) if (has.call(val, key)) return true;
return false;
});
}
/**
* Check if the string is HTML
*/
function isHTML(str) {
str = (str || '').toString().trim();
return str[0] === '<' && str[str.length - 1] === '>';
}
/**
* Checks if a given string is a valid URL
*
* @param {String} src
* @return {Boolean}
*/
function isValidUrl(src) {
try {
url.parse(src);
return true;
} catch (e) {
return false;
}
}
/**
* Change all the URLs into absolute urls
*
* @param {String} path
* @param {Cheerio} $
* @return {$}
*/
function absolute(path, $) {
const parts = url.parse(path);
let remote = parts.protocol + '//' + parts.host;
// apply <base> tag transformation
const base = $('head').find('base');
let href;
if (base.length === 1) {
href = base.attr('href');
if (href) {
remote = href;
}
}
$(
[
'a[href]',
'img[src]',
'script[src]',
'link[href]',
'source[src]',
'track[src]',
'img[src]',
'frame[src]',
'iframe[src]'
].join(',')
).each(abs);
function abs(i, el) {
const $el = $(el);
let key = null;
let src = null;
const hasHref = $el.attr('href');
const hashSrc = $el.attr('src');
if (hasHref) {
key = 'href';
src = hasHref;
} else if (hashSrc) {
key = 'src';
src = hashSrc;
} else {
return;
}
src = src.trim();
if (~src.indexOf('://')) {
return;
} else if (isValidUrl(src)) {
let current;
if (href && src.indexOf('/') !== 0) {
current = url.resolve(remote, href);
src = url.resolve(current, src);
} else {
current = url.resolve(remote, parts.pathname);
src = url.resolve(current, src);
}
}
$el.attr(key, src);
}
return $;
}
function load(html, url) {
html = html || '';
let $ = html.html ? html : cheerio.load(html);
if (url) $ = absolute(url, $);
return $;
}
module.exports = {
root,
isUrl,
isHTML,
compact,
isObject,
absolute,
load
};