x-ray-scraper
Version:
Scraper next gen based on x-ray (2.3.2)
191 lines (170 loc) • 5.1 kB
JavaScript
const Promise = require('bluebird');
const { isObject, compact, root } = require('./utils/commonHelpers');
const parse = require('./utils/parseHelper');
const debug = require('debug')('resolve');
const isArray = Array.isArray;
/**
* Select the attribute based on `attr`
*
* @param {Cheerio} $
* @param {String} attr
* @return {String}
*/
function attribute($el, attr) {
switch (attr) {
case 'html':
return $el.html();
case 'text':
return $el.text();
default:
return $el.attr(attr);
}
}
/**
* Filter the value(s)
*
* @param {Object} obj
* @param {Cheerio} $
* @param {String} scope
* @param {String|Array} selector
* @param {Object} filters
* @return {Array|String}
*/
function filter(obj, $, scope, selector, value, filters) {
const ctx = { $: $, selector: obj.selector, attribute: obj.attribute };
return (obj.filters || []).reduce((out, filter) => {
const fn = filters[filter.name];
if (typeof fn === 'function') {
const args = [out].concat(filter.args || []);
const filtered = fn.apply(ctx, args);
debug('%s.apply(ctx, %j) => %j', filter.name, args, filtered);
return filtered;
} else {
throw new Error('Invalid filter: ' + filter.name);
}
}, value);
}
/**
* Selector abstraction, deals
* with various instances of $
*
* @param {Cheerio} $
* @param {String} selector
* @return {Cheerio}
*/
function select($, selector) {
if ($.is && $.is(selector)) return $;
return $.find ? $.find(selector) : $(selector);
}
/**
* Find the node(s)
*
* @param {Cheerio} $
* @param {String} scope
* @param {String|Array} selector
* @param {String} attr
* @return {Array|String}
*/
function find($, scope, selector, attr) {
if (scope) {
if (isArray(selector)) {
const $scope = select($, scope);
return $scope
.map(i => {
const $el = $scope.eq(i);
const $selector = select($el, selector[0]);
return $selector.map(i => attribute($selector.eq(i), attr)).get();
})
.get();
}
const $scope = select($, scope);
return attribute($scope.find(selector).eq(0), attr);
} else {
if (isArray(selector)) {
const $selector = select($, selector[0]);
return $selector.map(i => attribute($selector.eq(i), attr)).get();
}
const $selector = select($, selector);
return attribute($selector.eq(0), attr);
}
}
/**
* Initialize `resolve`
*
* @param {$} cheerio object
* @param {String} scope
* @param {String|Array} selector
* @param {Object} filters
* @return {Array|String}
*/
function resolve($, scope, selector, filters = {}) {
debug('resolve($j, %j)', scope, selector);
const isAnArray = isArray(selector);
let obj = parse(isAnArray ? selector[0] : selector);
obj.attribute = obj.attribute || 'text';
if (!obj.selector) {
obj.selector = scope;
scope = null;
}
let value = find(
$,
scope,
isAnArray ? [obj.selector] : obj.selector,
obj.attribute
);
debug('resolved($j, %j) => %j', scope, selector, value);
if (isAnArray) {
return value.map(v => filter(obj, $, scope, selector, v, filters));
}
return filter(obj, $, scope, selector, value, filters);
}
function resolveStringSelector($, scope, selector, filters) {
const value = resolve($, root(scope), selector, filters);
return Promise.resolve(value);
}
function resolveXraySelector(xray, $) {
return xray($);
}
function multipleResolve($, node, scope) {
const $scope = $.find ? $.find(scope) : $(scope);
if (!$scope.length) return Promise.resolve([]);
const promises = $scope.map(i => node($scope.eq(i))).get();
return Promise.all(promises).then(compact);
}
function resolveArraySelector($, xray, scope, selector, filters) {
if (typeof selector[0] === 'string') {
return resolveStringSelector($, scope, selector, filters);
} else if (typeof selector[0] === 'object') {
const node = xray(scope, selector[0]);
return multipleResolve($, node, scope);
} else if (typeof selector[0] === 'function') {
return multipleResolve($, selector[0], scope);
}
throw new Error("can 't resolve array selector");
}
function resolveObjectSelectorRecursive($, xray, scope, selector, filters) {
return Promise.reduce(
Object.keys(selector),
(accum, k) =>
resolveSelector($, xray, selector[k], scope, filters).then(value => {
if (value !== undefined && value !== '') {
accum[k] = value;
}
return accum;
}),
{}
);
}
function resolveSelector($, xray, selector, scope, filters) {
if (typeof selector === 'string') {
return resolveStringSelector($, scope, selector, filters);
} else if (typeof selector === 'function') {
return resolveXraySelector(selector, $);
} else if (Array.isArray(selector)) {
return resolveArraySelector($, xray, scope, selector, filters);
} else if (isObject(selector)) {
return resolveObjectSelectorRecursive($, xray, scope, selector, filters);
}
throw new Error("can 't resolve selector");
}
module.exports = { resolve, resolveSelector };