x-ray-scraper
Version:
Scraper next gen based on x-ray (2.3.2)
124 lines (103 loc) • 3.08 kB
JavaScript
const fs = require('fs');
const enstore = require('enstore');
const Crawler = require('./crawler');
const { resolveSelector } = require('./resolve');
const { getSendToStreamFn } = require('./utils/streamHelpers');
const assignParameters = require('./utils/parameterHelper');
const { getLoadedSource, getNextUrl } = require('./source');
const Request = require('./request');
function Xray(driver) {
const crawler = Crawler(driver);
let filters = {};
function xray(source, scope, selector) {
({ source, scope, selector } = assignParameters(source, scope, selector));
const store = enstore();
const request = Request(crawler);
let paginate = false;
let limit = Infinity;
let pageNumber = 1;
let abort = false;
let stream = false;
let pages = [];
let sendToStream;
function crawl(source2) {
sendToStream = sendToStream || getSendToStreamFn(paginate, stream);
return getLoadedSource(source2 || source, scope, filters, request).then(
$ => {
return resolveSelector($, xray, selector, scope, filters).then(
result => {
pages = Array.isArray(result)
? [...pages, ...result]
: [...pages, result];
const nextUrl = getNextUrl(
$,
paginate,
--limit,
filters,
result,
abort,
++pageNumber
);
if (nextUrl) {
sendToStream(result);
return crawl(nextUrl);
}
sendToStream(result, true);
return Promise.resolve(paginate ? pages : result);
}
);
}
);
}
crawl.abort = function(validator) {
if (!validator) return abort;
abort = validator;
return crawl;
};
crawl.paginate = function(paginateParam) {
if (!paginateParam) return paginate;
paginate = paginateParam;
return crawl;
};
crawl.limit = function(limitParam) {
if (!limitParam) return limit;
limit = limitParam;
return crawl;
};
crawl.stream = function() {
stream = store.createWriteStream();
crawl(source).catch(error => stream.emit('error', error));
return store.createReadStream();
};
crawl.then = function(resolve, reject) {
return crawl(source).then(resolve).catch(reject);
};
crawl.write = function(path) {
if (!path) return crawl.stream();
stream = fs.createWriteStream(path);
crawl(source).catch(error => stream.emit('error', error));
return stream;
};
return crawl;
}
[
'concurrency',
'throttle',
'timeout',
'driver',
'delay',
'limit',
'abort'
].forEach(method => {
xray[method] = function() {
if (!arguments.length) return crawler[method]();
crawler[method].apply(crawler, arguments);
return this;
};
});
xray.setFilters = function(f) {
filters = f;
};
return xray;
}
module.exports = Xray;