UNPKG

@viclafouch/fetch-crawler

Version:

A Node.js web crawler using the Fetch API

405 lines (345 loc) 11.1 kB
"use strict";

var _abortController = _interopRequireDefault(require("abort-controller"));
var _url = require("url");
var _utils = require("./utils");
var _nodeFetch = _interopRequireDefault(require("node-fetch"));
var _cheerio = _interopRequireDefault(require("cheerio"));

function _interopRequireDefault(obj) {
  return obj && obj.__esModule ? obj : { default: obj };
}

/**
 * A breadth-first web crawler built on node-fetch (HTTP) and cheerio (HTML parsing).
 *
 * Options (all optional):
 *   url                  {String}   Entry-point URL (required at runtime).
 *   maxRequest           {Number}   Max pages to crawl; -1 = unlimited (default -1).
 *   skipStrictDuplicates {Boolean}  Skip URLs already seen (default true).
 *   sameOrigin           {Boolean}  Only follow links on the entry origin (default true).
 *   maxDepth             {Number}   Max link depth from the entry page (default 3).
 *   parallel             {Number}   Concurrent requests (default 5; forced to 1 when
 *                                   timeBetweenRequest > 0).
 *   debugging            {Boolean}  Verbose console output (default false).
 *   retryCount           {Number}   Retries per request (default 2).
 *   retryTimeout         {Number}   Per-attempt timeout in ms (default 5000).
 *   timeBetweenRequest   {Number}   Delay in ms before each request (default 0).
 *   preRequest, onSuccess, onError, evaluatePage, onRedirection {Function} user hooks.
 */
class Crawler {
  constructor(options = {}) {
    this._options = Object.assign({}, {
      maxRequest: -1,
      skipStrictDuplicates: true,
      sameOrigin: true,
      maxDepth: 3,
      parallel: 5,
      debugging: false,
      retryCount: 2,
      retryTimeout: 5000,
      timeBetweenRequest: 0
    }, options);
    // A delay between requests only makes sense with strictly sequential crawling.
    if (this._options.timeBetweenRequest > 0) this._options.parallel = 1;
    this.hostdomain = '';
    this.linksToCrawl = new Map(); // url -> depth, pending queue
    this.linksCrawled = new Map(); // url -> undefined, used as an ordered set
    this._actions = {
      preRequest: this._options.preRequest || (x => x),
      onSuccess: this._options.onSuccess || null,
      onError: this._options.onError || null,
      evaluatePage: this._options.evaluatePage || null,
      onRedirection: this._options.onRedirection || (({ previousUrl }) => previousUrl)
    };
  }

  /**
   * node-fetch wrapper with a per-attempt timeout (via AbortController) and
   * automatic retries.
   * @param {...*} args Arguments forwarded to node-fetch (url[, options]).
   * @return {Promise<Response>} Resolves with the response, rejects after
   *   `retryCount` failed attempts (AbortError message is rewritten to mention
   *   the timeout).
   */
  fetch(...args) {
    let retries = 0;
    const _retry = () => {
      const controller = new _abortController.default();
      const timeout = setTimeout(() => controller.abort(), this._options.retryTimeout);
      return (0, _nodeFetch.default)(...args, { signal: controller.signal })
        .then(response => {
          clearTimeout(timeout);
          return response;
        })
        .catch(error => {
          clearTimeout(timeout);
          if (retries < this._options.retryCount) {
            retries++;
            return _retry();
          }
          if (error.name === 'AbortError') {
            error.message = `Request canceled (timeout = ${this._options.retryTimeout}ms)`;
          }
          throw error;
        });
    };
    return _retry();
  }

  /**
   * Init the app. Begin with the first link, and start the pulling.
   * @return {Promise<pending>}
   * @throws {Error} When `options.url` is not a valid URL.
   */
  async init() {
    try {
      if (!(0, _utils.isUrl)(this._options.url)) throw new Error();
      const link = new _url.URL(this._options.url);
      this.hostdomain = link.origin;
      if (!this.hostdomain) throw new Error();
    } catch (error) {
      console.error(error);
      throw new Error('URL provided (' + this._options.url + ') is not valid');
    }
    this.linksCrawled.set(this._options.url);
    await this.pull(this._options.url, 1);
    if (this.linksToCrawl.size > 0) await this.crawl();
  }

  /**
   * Get all links from the page.
   * @param {Cheerio} $
   * @param {String} actualHref URL of the page being scraped (base for relative links).
   * @return {Array<String>} Absolute, de-duplicated URLs with fragments stripped.
   */
  collectAnchors($, actualHref) {
    let linksCollected = [];
    try {
      linksCollected = $('a')
        .map((i, e) => (0, _utils.relativePath)($(e).attr('href') || '', actualHref)) // Cheerio map method
        .filter((i, href) => (0, _utils.isUrl)(href)) // Cheerio filter method
        .map((i, href) => {
          const url = new _url.URL(href);
          url.hash = ''; // drop #fragment so the same page is not queued twice
          return url.href;
        })
        .get(); // Cheerio get method to transform as an array
    } catch (error) {
      console.error(`Something wrong happened with this url: ${actualHref}`);
      console.error(error);
    }
    return [...new Set(linksCollected)]; // Avoid duplication
  }

  /**
   * Check if link can be crawled (Same origin ? Already collected ? preRequest !false ?).
   * NOTE: invokes the user preRequest hook; prefer addToQueue's inlined checks
   * when the sanitized link is also needed, to avoid calling the hook twice.
   * @param {String} link
   * @return {Promise<Boolean>} true when the link must be skipped.
   */
  async skipRequest(link) {
    const allowOrigin = this.checkSameOrigin(link);
    if (!allowOrigin) return true;
    const urlSanitazed = await this.shouldRequest(link);
    if (!urlSanitazed) return true;
    if (this._options.skipStrictDuplicates && this.linkAlreadyCollected(urlSanitazed)) return true;
    return false;
  }

  /**
   * If preRequest is provided by the user, get new link or false.
   * @param {String} link
   * @return {Promise<String|Boolean>} The (possibly rewritten) link, or false to skip.
   */
  async shouldRequest(link) {
    if (this._actions.preRequest instanceof Function) {
      try {
        const preRequest = await this._actions.preRequest(link);
        if (preRequest === false && this._options.debugging) console.info(`\x1b[33m Ignoring ${link} \x1b[m`);
        if (typeof preRequest === 'string' || preRequest === false) return preRequest;
        throw new Error('preRequest function must return a String or False');
      } catch (error) {
        console.error('Please try/catch your preRequest function');
        console.error(error.message);
      }
    }
    return link;
  }

  /**
   * Check if link has the same origin as the host link.
   * @param {String} url
   * @return {Boolean}
   */
  checkSameOrigin(url) {
    if (this._options.sameOrigin) return new _url.URL(url).origin === this.hostdomain;
    return true;
  }

  /**
   * If evaluatePage is provided by the user, await for it.
   * @param {Cheerio} $
   * @return {Promise} The hook's result, or null when absent / it threw.
   */
  async evaluate($) {
    let result = null;
    if (this._actions.evaluatePage && this._actions.evaluatePage instanceof Function) {
      try {
        result = await this._actions.evaluatePage($);
      } catch (error) {
        console.error('Please try/catch your evaluatePage function');
        console.error(error);
      }
    }
    return result;
  }

  /**
   * Add links collected to queue.
   * Fix: the user preRequest hook is now invoked exactly ONCE per link — the
   * previous implementation called it twice (inside skipRequest and again via
   * shouldRequest), which broke side-effectful hooks.
   * @param {Array<String>} urlCollected
   * @param {Number} depth
   * @return {Promise}
   */
  async addToQueue(urlCollected, depth = 0) {
    if (depth > this._options.maxDepth) return;
    for (const url of urlCollected) {
      if (!this.checkSameOrigin(url)) continue;
      const linkEdited = await this.shouldRequest(url);
      if (!linkEdited) continue; // preRequest returned false: skip
      if (this._options.skipStrictDuplicates && this.linkAlreadyCollected(linkEdited)) continue;
      this.linksToCrawl.set(linkEdited, depth);
    }
  }

  /**
   * Crawl links from 'linksToCrawl' and wait for having 'canceled' to true.
   * Runs up to `parallel` pulls at once; resolves when the queue drains or the
   * maxRequest budget is exhausted, rejects on the first pull error.
   * @return {Promise}
   */
  crawl() {
    return new Promise((resolve, reject) => {
      let canceled = false;
      let currentCrawlers = 0;
      const pullQueue = () => {
        if (canceled) return;
        while (currentCrawlers < this._options.parallel && this.linksToCrawl.size > 0) {
          canceled = !this.checkMaxRequest();
          if (canceled) {
            // No crawler in flight: settle immediately; otherwise the last
            // finishing crawler resolves below.
            currentCrawlers === 0 && resolve();
            break;
          }
          currentCrawlers++;
          const currentLink = this.linksToCrawl.keys().next().value;
          const currentDepth = this.linksToCrawl.get(currentLink);
          this.linksToCrawl.delete(currentLink);
          this.linksCrawled.set(currentLink);
          this.pull(currentLink, currentDepth).then(() => {
            currentCrawlers--;
            if (currentCrawlers === 0 && (this.linksToCrawl.size === 0 || canceled)) resolve();
            else pullQueue();
          }).catch(error => {
            canceled = true;
            reject(error);
          });
        }
      };
      pullQueue();
    });
  }

  /**
   * Pull result and links from a page and add them to the queue.
   * @param {String} link
   * @param {Number} depth
   * @return {Promise<pending>}
   */
  async pull(link, depth) {
    if (this._options.timeBetweenRequest) await new Promise(resolve => setTimeout(resolve, this._options.timeBetweenRequest));
    this._options.debugging && console.info(`\x1b[1;32m [${this.linksCrawled.size}${this._options.maxRequest !== -1 ? '/' + this._options.maxRequest : ''}] Crawling ${link} ...\x1b[m`);
    const { result, linksCollected, url, wrongRedirection, error } = await this.scrapePage(link);
    if (error) {
      await this.scrapeError({ urlScraped: url, error });
    } else if (!wrongRedirection) {
      await this.scrapeSucceed({ urlScraped: url, result });
      await this.addToQueue(linksCollected, depth + 1);
    }
  }

  /**
   * Know if a link will be crawled or has already been crawled.
   * @param {String} url
   * @return {Boolean}
   */
  linkAlreadyCollected(url) {
    return this.linksCrawled.has(url) || this.linksToCrawl.has(url);
  }

  /**
   * Know if we have exceeded the number of request max provided in the options.
   * @return {Boolean} true while more requests are allowed.
   */
  checkMaxRequest() {
    if (this._options.maxRequest === -1) return true;
    return this.linksCrawled.size < this._options.maxRequest;
  }

  /**
   * If onSuccess action's has been provided, await for it.
   * @param {Object<{urlScraped: string, result: any}>}
   * @return {Promise<pending>}
   */
  async scrapeSucceed({ urlScraped, result }) {
    if (this._actions.onSuccess && this._actions.onSuccess instanceof Function) {
      try {
        await this._actions.onSuccess({ result, url: urlScraped });
      } catch (error) {
        console.error('Please try/catch your onSuccess function');
        console.error(error); // fix: the hook's error was previously swallowed silently
      }
    }
  }

  /**
   * If onError action's has been provided, await for it.
   * @param {Object<{urlScraped: string, error: Error}>}
   * @return {Promise<pending>}
   */
  async scrapeError({ urlScraped, error }) {
    if (this._actions.onError && this._actions.onError instanceof Function) {
      try {
        await this._actions.onError({ error, url: urlScraped });
      } catch (error) {
        console.error('Please try/catch your onError function');
        console.error(error); // fix: the hook's error was previously swallowed silently
      }
    }
  }

  /**
   * Scrap a page, evaluate and get new links to visit.
   * @param {String} url
   * @return {Promise<{linksCollected: array, result: any, url: string}>}
   *   or {wrongRedirection: true} when onRedirection rejected the redirect,
   *   or {url, error} on any failure (non-200 responses are thrown as errors).
   */
  async scrapePage(url) {
    try {
      const response = await this.fetch(url);
      if (response.redirected) {
        url = await this._options.onRedirection({ previousUrl: url, response });
        if (!url) return { wrongRedirection: true };
      } else if (response.status !== 200) throw response;
      const textResponse = await response.text();
      const $ = _cheerio.default.load(textResponse);
      const [result, linksCollected] = await Promise.all([this.evaluate($), this.collectAnchors($, url)]);
      return { linksCollected, result, url };
    } catch (error) {
      return { url, error };
    }
  }

  /**
   * Starting the crawl.
   * @param {{debugging: Boolean, maxRequest: Number, parallel: Number, maxDepth: Number, sameOrigin: Boolean, skipStrictDuplicates: Boolean, retryCount: Number, retryTimeout: Number, timeBetweenRequest: Number }} options Options of the crawler.
   * @return {Promise<{startCrawlingAt: Date, finishCrawlingAt: Date, linksVisited: Number}>}
   */
  static async launch(options) {
    const startCrawlingAt = new Date();
    const crawler = new Crawler(options);
    await crawler.init();
    const finishCrawlingAt = new Date();
    return { startCrawlingAt, finishCrawlingAt, linksVisited: crawler.linksCrawled.size };
  }
}

module.exports = Crawler;