UNPKG

x-ray-scraper

Version:
124 lines (103 loc) 3.08 kB
const fs = require('fs'); const enstore = require('enstore'); const Crawler = require('./crawler'); const { resolveSelector } = require('./resolve'); const { getSendToStreamFn } = require('./utils/streamHelpers'); const assignParameters = require('./utils/parameterHelper'); const { getLoadedSource, getNextUrl } = require('./source'); const Request = require('./request'); function Xray(driver) { const crawler = Crawler(driver); let filters = {}; function xray(source, scope, selector) { ({ source, scope, selector } = assignParameters(source, scope, selector)); const store = enstore(); const request = Request(crawler); let paginate = false; let limit = Infinity; let pageNumber = 1; let abort = false; let stream = false; let pages = []; let sendToStream; function crawl(source2) { sendToStream = sendToStream || getSendToStreamFn(paginate, stream); return getLoadedSource(source2 || source, scope, filters, request).then( $ => { return resolveSelector($, xray, selector, scope, filters).then( result => { pages = Array.isArray(result) ? [...pages, ...result] : [...pages, result]; const nextUrl = getNextUrl( $, paginate, --limit, filters, result, abort, ++pageNumber ); if (nextUrl) { sendToStream(result); return crawl(nextUrl); } sendToStream(result, true); return Promise.resolve(paginate ? pages : result); } ); } ); } crawl.abort = function(validator) { if (!validator) return abort; abort = validator; return crawl; }; crawl.paginate = function(paginateParam) { if (!paginateParam) return paginate; paginate = paginateParam; return crawl; }; crawl.limit = function(limitParam) { if (!limitParam) return limit; limit = limitParam; return crawl; }; crawl.stream = function() { stream = store.createWriteStream(); crawl(source).catch(error => stream.emit('error', error)); return store.createReadStream(); }; crawl.then = function(resolve, reject) { return crawl(source).then(resolve).catch(reject); }; crawl.write = function(path) { if (!path) return crawl.stream(); stream = fs.createWriteStream(path); crawl(source).catch(error => stream.emit('error', error)); return stream; }; return crawl; } [ 'concurrency', 'throttle', 'timeout', 'driver', 'delay', 'limit', 'abort' ].forEach(method => { xray[method] = function() { if (!arguments.length) return crawler[method](); crawler[method].apply(crawler, arguments); return this; }; }); xray.setFilters = function(f) { filters = f; }; return xray; } module.exports = Xray;