supercrawler
A web crawler. Supercrawler automatically crawls websites. Define custom handlers to parse content. Obeys robots.txt, rate limits and concurrency limits.
571 lines (488 loc) • 16 kB
JavaScript
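// Usage sketch (illustrative only): it relies on the constructor options,
// addHandler() and events defined in this file, and assumes this module and
// the Url class resolve at the paths used below.
//
//   var Crawler = require("./Crawler"),
//       Url = require("./Url");
//
//   var crawler = new Crawler({
//     interval: 1000,              // wait at least 1s between requests
//     concurrentRequestsLimit: 2   // at most two requests in flight
//   });
//
//   // Handle HTML pages; return an array of discovered URLs to enqueue.
//   crawler.addHandler("text/html", function (context) {
//     console.log("Fetched", context.url, "as", context.contentType);
//     return [];
//   });
//
//   crawler.on("crawledurl", function (url, errorCode, statusCode) {
//     console.log("Crawled", url, errorCode, statusCode);
//   });
//
//   crawler.getUrlList()
//     .insertIfNotExists(new Url({ url: "https://example.com/" }))
//     .then(function () {
//       crawler.start();
//     });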
var Crawler,
util = require("util"),
EventEmitter = require("events").EventEmitter,
FifoUrlList = require("./FifoUrlList"),
Url = require("./Url"),
Promise = require("bluebird"),
urlMod = require("url"),
NodeCache = require("node-cache"),
request = Promise.promisify(require("request")),
robotsParser = require("robots-parser"),
mime = require('mime-types'),
_ = require("lodash"),
error = require("./error"),
DEFAULT_INTERVAL = 1000,
DEFAULT_CONCURRENT_REQUESTS_LIMIT = 5,
DEFAULT_ROBOTS_CACHE_TIME = 1000 * 60 * 60,
DEFAULT_USER_AGENT = "Mozilla/5.0 (compatible; supercrawler/1.0; +https://github.com/brendonboshell/supercrawler)";
/**
* Object represents an instance of a crawler, i.e. an HTTP client that
* automatically crawls webpages according to the settings passed to it.
*
* @param {Object} [opts] Object of configuration options.
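* @param {UrlList} [opts.urlList] Queue implementation; defaults to a \FifoUrlList.
* @param {number} [opts.interval=1000] Number of milliseconds between requests.
* @param {number} [opts.concurrentRequestsLimit=5] Maximum number of requests in flight at once.
* @param {number} [opts.robotsCacheTime=3600000] Number of milliseconds to cache robots.txt files for.
* @param {string|Function} [opts.userAgent] User agent string, or a function of the URL that returns one.
* @param {Object} [opts.request] Extra options merged into every request (see getRequestOptions).
* @param {Boolean} [opts.robotsIgnoreServerError=false] Treat a robots.txt 500 response as "no restrictions".
* @param {Boolean} [opts.robotsEnabled=true] Set to false to skip robots.txt checks entirely.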
*/
Crawler = function (opts) {
if (!(this instanceof Crawler)) {
return new Crawler(opts);
}
if (typeof opts === "undefined") {
opts = {};
}
this._urlList = opts.urlList || new FifoUrlList();
this._interval = opts.interval || DEFAULT_INTERVAL;
this._concurrentRequestsLimit = opts.concurrentRequestsLimit ||
DEFAULT_CONCURRENT_REQUESTS_LIMIT;
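// robotsCacheTime is given in milliseconds; node-cache expects stdTTL in
// seconds, hence the division below.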
this._robotsCache = new NodeCache({
stdTTL: (opts.robotsCacheTime || DEFAULT_ROBOTS_CACHE_TIME) / 1000
});
this._userAgent = opts.userAgent || DEFAULT_USER_AGENT;
this._request = opts.request || {};
this._handlers = [];
this._outstandingRequests = 0;
this._robotsIgnoreServerError = opts.robotsIgnoreServerError || false;
this._robotsEnabled = (opts.robotsEnabled !== false);
};
util.inherits(Crawler, EventEmitter);
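// The crawler is an EventEmitter. Events emitted in this file:
//   "crawlurl" (url)                                        - a URL is about to be crawled
//   "crawledurl" (url, errorCode, statusCode, errorMessage) - a URL has been processed
//   "redirect" (url, location)                              - a 3xx response was received
//   "links" (url, links)                                    - links discovered by the handlers
//   "httpError" (err, url)                                  - a 4xx/5xx response was received
//   "handlersError" (err)                                   - a content handler threw or rejected
//   "urllistempty" / "urllistcomplete"                      - the queue has no more URLs to crawl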
/**
* Returns the instance of the \UrlList object that is being used. Unless one
* was specified in the constructor, this will be of the \FifoUrlList type.
*
* @return {UrlList} Instance of \UrlList type object.
*/
Crawler.prototype.getUrlList = function () {
return this._urlList;
};
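// A custom urlList passed to the constructor is duck-typed. Minimal sketch of
// the interface, inferred from how the crawler calls it in this file (a
// memory-backed toy for illustration, not a real implementation):
//
//   function ArrayUrlList() { this._urls = []; this._next = 0; }
//   ArrayUrlList.prototype.insertIfNotExists = function (url) {
//     var exists = this._urls.some(function (u) {
//       return u.getUrl() === url.getUrl();
//     });
//     if (!exists) { this._urls.push(url); }
//     return Promise.resolve(url);
//   };
//   ArrayUrlList.prototype.upsert = function (url) {
//     // replace (or append) the stored record for url.getUrl()
//     return Promise.resolve(url);
//   };
//   ArrayUrlList.prototype.getNextUrl = function () {
//     if (this._next >= this._urls.length) {
//       // _crawlTick treats a RangeError rejection as "the queue is empty"
//       return Promise.reject(new RangeError("The list has been exhausted."));
//     }
//     return Promise.resolve(this._urls[this._next++]);
//   };
//   // insertIfNotExistsBulk(urls) is optional; when it is missing, the
//   // crawler falls back to calling insertIfNotExists once per link.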
/**
* Get the interval setting, that is the number of milliseconds that the
* crawler waits before performing another request.
*
* @return {number} Interval in milliseconds.
*/
Crawler.prototype.getInterval = function () {
return this._interval;
};
/**
* Get the maximum number of requests that can be in progress at any one time.
*
* @return {number} Maximum number of requests
*/
Crawler.prototype.getConcurrentRequestsLimit = function () {
return this._concurrentRequestsLimit;
};
/**
* Get the user agent that is used to make requests. If a function was supplied
* as the userAgent option, it is called with the URL being requested.
*
* @param {string} url URL the user agent will be used for.
* @return {string} User agent
*/
Crawler.prototype.getUserAgent = function (url) {
if (typeof this._userAgent === 'function') {
return this._userAgent(url);
}
return this._userAgent;
};
/**
* Custom options to be passed to the request library.
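*
* For example, passing { request: { headers: { Accept: "text/html" } } } to
* the constructor causes those options to be deep-merged (via lodash merge)
* with the defaults built in _downloadUrl, so the extra Accept header is sent
* with every request.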
*
* @return {Object} Object of request options to be merged with the defaults.
*/
Crawler.prototype.getRequestOptions = function () {
return this._request;
};
/**
* Start the crawler. Pages will be crawled according to the configuration
* provided to the Crawler's constructor.
*
* @return {Boolean} True if crawl started; false if crawl already running.
*/
Crawler.prototype.start = function () {
var concurrentRequestsLimit,
i;
// TODO can only start when there are no outstanding requests.
if (this._started) {
return false;
}
concurrentRequestsLimit = this.getConcurrentRequestsLimit();
this._started = true;
for (i = 0; i < concurrentRequestsLimit; i++) {
this._crawlTick();
}
return true;
};
/**
* Prevent crawling of any further URLs.
*/
Crawler.prototype.stop = function () {
this._started = false;
};
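/**
* Register a handler that is fired for documents of a matching content type.
* Called with a single \Function argument, the handler matches all content
* types. Otherwise, contentType may be an array of exact types or a string
* prefix matched on a "/" boundary (see _fireHandlers). The handler receives
* a context object ({ body, url, contentType }) and may return an array of
* discovered URLs (or a Promise of one) to be added to the queue.
*
* Illustrative example (the handler body is an assumption, not part of this
* module):
*
*   crawler.addHandler("text/html", function (context) {
*     // parse context.body and return the links found on the page
*     return ["https://example.com/about"];
*   });
*
* @param {string|Array|Function} contentType Content type(s) to match, or the handler itself.
* @param {Function} handler Handler function.
* @return {Boolean} True.
*/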
Crawler.prototype.addHandler = function (contentType, handler) {
// if this method is called as addHandler(\Function), that means the
// handler will deal with all content types.
if (arguments.length === 1) {
return this.addHandler("*", arguments[0]);
}
this._handlers.push({
contentType: contentType,
handler: handler
});
return true;
};
/**
* Check if we are allowed to send a request and, if we are, send it. If we
* are not, reschedule the request for NOW + INTERVAL in the future.
*/
Crawler.prototype._crawlTick = function () {
var urlList,
nextRequestDate,
concurrentRequestsLimit,
nowDate,
self = this;
// Crawling has stopped, so don't start any new requests
if (!this._started) {
return;
}
urlList = this.getUrlList();
nextRequestDate = this._getNextRequestDate();
concurrentRequestsLimit = this.getConcurrentRequestsLimit();
nowDate = new Date();
// Check if we are allowed to send the request yet. If we aren't allowed,
// schedule the request for LAST_REQUEST_DATE + INTERVAL.
if (nextRequestDate - nowDate > 0) {
this._scheduleNextTick();
return;
}
// lastRequestDate must always be set SYNCHRONOUSLY! This is because there
// will be multiple calls to _crawlTick.
this._lastRequestDate = nowDate;
urlList.getNextUrl().then(function (urlObj) {
var url = urlObj.getUrl();
// We keep track of the number of outstanding requests. If this is >= 1, the
// queue is still subject to change, so we do not wish to emit
// urllistcomplete until those changes are synced with the \UrlList.
self._outstandingRequests++;
return self._processUrl(url).then(function (resultUrl) {
return urlList.upsert(resultUrl);
}).finally(function () {
self._outstandingRequests--;
});
}).catch(RangeError, function () {
self.emit("urllistempty");
if (self._outstandingRequests === 0) {
self.emit("urllistcomplete");
}
}).finally(function () {
// We must schedule the next check. Note that _scheduleNextTick gets called
// exactly once PER CALL to _crawlTick.
self._scheduleNextTick();
});
};
/**
* Start the crawl process for a specific URL. This method will first check
* robots.txt to make sure it is allowed to crawl the URL.
*
* @param {string} url The URL to crawl.
* @return {Promise} Promise of result URL object.
*/
Crawler.prototype._processUrl = function (url) {
var self = this,
response,
urlList;
urlList = this.getUrlList();
this.emit("crawlurl", url);
// perform url download
var downloadPromise = null;
if (this._robotsEnabled) {
downloadPromise = this._downloadAndCheckRobots(url).then(function () {
return self._downloadUrl(url, false);
});
} else {
downloadPromise = this._downloadUrl(url, false);
}
return downloadPromise.then(function (_response) {
var contentType,
statusCode,
location;
response = _response;
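// If the server did not send a Content-Type header, fall back to guessing
// one from the URL's file extension via mime-types.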
contentType = response.headers["content-type"] || mime.lookup(url);
statusCode = response.statusCode;
location = response.headers.location;
// If this is a redirect, we follow the location header.
// Otherwise, we get the discovered URLs from the content handlers.
if (statusCode >= 300 && statusCode < 400) {
self.emit("redirect", url, location);
return [urlMod.resolve(url, location)];
} else {
return self._fireHandlers(contentType, response.body, url).catch(function (err) {
self.emit("handlersError", err);
err = new error.HandlersError("A handlers error occurred. " + err.message);
return Promise.reject(err);
});
}
}).then(function (links) {
var insertProm;
self.emit("links", url, links);
if (typeof urlList.insertIfNotExistsBulk === "undefined") {
insertProm = Promise.map(links, function (link) {
return urlList.insertIfNotExists(new Url({
url: link
}));
});
} else {
insertProm = urlList.insertIfNotExistsBulk(links.map(function (link) {
return new Url({
url: link
});
}));
}
return insertProm;
}).then(function () {
return new Url({
url: url,
errorCode: null,
statusCode: response.statusCode
});
}).catch(error.RobotsNotAllowedError, function (err) {
return new Url({
url: url,
errorCode: "ROBOTS_NOT_ALLOWED",
errorMessage: err.message
});
}).catch(error.HttpError, function (err) {
self.emit("httpError", err, url);
return new Url({
url: url,
errorCode: "HTTP_ERROR",
statusCode: err.statusCode
});
}).catch(error.RequestError, function (err) {
return new Url({
url: url,
errorCode: "REQUEST_ERROR",
errorMessage: err.message
});
}).catch(error.HandlersError, function (err) {
return new Url({
url: url,
errorCode: "HANDLERS_ERROR",
errorMessage: err.message
});
}).catch(function (err) {
return new Url({
url: url,
errorCode: "OTHER_ERROR",
errorMessage: err.message
});
}).then(function (url) {
self.emit("crawledurl", url.getUrl(), url.getErrorCode(), url.getStatusCode(), url.getErrorMessage());
return url;
});
};
/**
* Fire any matching handlers for a particular page that has been crawled.
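*
* Matching examples (illustrative): a handler registered for "*" matches any
* content type; "text" matches both "text/html" and "text/plain"; "text/html"
* matches "text/html; charset=utf8" because parameters after ";" are stripped
* before matching; an array such as ["text/html", "application/xhtml+xml"]
* matches those exact types only.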
*
* @param {string} contentType Content type, e.g. "text/html; charset=utf8"
* @param {string} body Body content.
* @param {string} url Page URL, absolute.
* @return {Promise} Promise returning an array of discovered links.
*/
Crawler.prototype._fireHandlers = function (contentType, body, url) {
var ctx;
contentType = contentType.replace(/;.*$/g, "");
ctx = {
body: body,
url: url,
contentType: contentType
};
return Promise.reduce(this._handlers, function (arr, handlerObj) {
var handlerContentType = handlerObj.contentType,
handlerFun = handlerObj.handler,
match = false;
if (handlerContentType === "*") {
match = true;
} else if (Array.isArray(handlerContentType) && (handlerContentType).indexOf(contentType) > -1) {
match = true;
} else if ((contentType + "/").indexOf(handlerContentType + "/") === 0) {
match = true;
}
if (!match) {
return Promise.resolve(arr);
}
return Promise.try(function () {
return handlerFun(ctx);
}).then(function (subArr) {
if (!(subArr instanceof Array)) {
subArr = [];
}
return arr.concat(subArr);
});
}, []);
};
/**
* Download a particular URL. Generally speaking, we do not want to follow
* redirects, because we just add the destination URLs to the queue and crawl
* them later. But, when requesting /robots.txt, we do follow the redirects.
* This is an edge case.
*
* @param {string} url URL to fetch.
* @param {Boolean} followRedirect True if redirect should be followed.
* @return {Promise} Promise of result.
*/
Crawler.prototype._downloadUrl = function (url, followRedirect) {
var defaultOptions,
requestOptions;
defaultOptions = {
url: url,
forever: true,
headers: {
"User-Agent": this.getUserAgent(url)
},
encoding: null,
followRedirect: Boolean(followRedirect),
gzip: true
};
requestOptions = _.merge(defaultOptions, this.getRequestOptions());
return request(requestOptions).catch(function (err) {
err = new error.RequestError("A request error occurred. " + err.message);
return Promise.reject(err);
}).then(function (response) {
var err;
if (response.statusCode >= 400) {
err = new error.HttpError("HTTP status code is " + response.statusCode);
err.statusCode = response.statusCode;
return Promise.reject(err);
}
return response;
});
};
/**
* For a specific URL, download the robots.txt file and check the URL against
* it.
*
* @param {string} url URL to be checked.
* @return {Promise} Promise resolves if allowed, rejects if not allowed.
*/
Crawler.prototype._downloadAndCheckRobots = function (url) {
var self = this;
return this._getOrDownloadRobots(url).then(function (robotsTxt) {
var robots,
isAllowed;
robots = robotsParser(self._getRobotsUrl(url), robotsTxt);
isAllowed = robots.isAllowed(url, self.getUserAgent(url));
if (!isAllowed) {
return Promise.reject(new error.RobotsNotAllowedError("The URL " +
url + " is not allowed to be crawled due to robots.txt exclusion"));
}
});
};
/**
* Fetch the robots.txt file from our cache or, if the cache has expired,
* send a request to the server to download it.
*
* @param {string} url URL to get robots.txt for.
* @return {Promise} Promise returning the string result of robots.txt.
*/
Crawler.prototype._getOrDownloadRobots = function (url) {
var robotsUrl,
robotsTxt,
ignoreServerError,
self = this;
// Check if this robots.txt file already exists in the cache.
robotsUrl = this._getRobotsUrl(url);
robotsTxt = this._robotsCache.get(robotsUrl);
ignoreServerError = this._robotsIgnoreServerError;
if (typeof robotsTxt !== "undefined") {
return Promise.resolve(robotsTxt);
}
// We want to add /robots.txt to the crawl queue. This is because we may
// parse the robots.txt file with a content handler, in order to extract
// its Sitemap: directives. (And then we'll crawl those sitemaps too!)
return this.getUrlList().insertIfNotExists(new Url({
url: robotsUrl
})).then(function () {
// robots.txt doesn't exist in the cache, so we have to hit the
// server to get it.
return self._downloadUrl(robotsUrl, true);
}).catch(error.HttpError, function (err) {
var robotsStatusCode = err.statusCode;
// If robots.txt returns a dismissible status code, we assume
// there are no restrictions.
switch (robotsStatusCode) {
case 404:
case 410:
case 500:
if (robotsStatusCode === 500 && !ignoreServerError) {
break;
}
return Promise.resolve({
statusCode: 200,
body: ""
});
}
// But for any other status code, we stop crawling the entire website.
return Promise.reject(new error.RobotsNotAllowedError("No crawling is " +
"allowed because robots.txt could not be crawled. Status code " +
robotsStatusCode));
}).then(function (response) {
var body,
robotsTxt;
body = response.body;
robotsTxt = body.toString();
self._robotsCache.set(robotsUrl, robotsTxt);
return robotsTxt;
});
};
/**
* Given any URL, find the corresponding URL for the /robots.txt file. Robots
* files are unique per (host, protocol, port) combination.
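*
* For example, "https://example.com:8080/some/page?q=1" maps to
* "https://example.com:8080/robots.txt": the path and query string are
* dropped, while the scheme, host and port are kept.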
*
* @param {string} url Any URL.
* @return {string} URL of robots.txt, e.g. https://example.com/robots.txt
*/
Crawler.prototype._getRobotsUrl = function (url) {
var parsedUrl,
robotsUrl;
parsedUrl = urlMod.parse(url);
// There's a robots.txt for every (host, protocol, port) combination
robotsUrl = urlMod.format({
host: parsedUrl.host,
protocol: parsedUrl.protocol,
port: parsedUrl.port || null,
pathname: "/robots.txt"
});
return robotsUrl;
};
/**
* Get the \Date that we are allowed to send another request. If we haven't
* already sent a request, this will return the current date.
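*
* For example, with an interval of 1000 ms and a last request sent at
* 12:00:00.000, this returns 12:00:01.000; _scheduleNextTick then delays the
* next _crawlTick by however much of that second remains.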
*
* @return {Date} Date of next request.
*/
Crawler.prototype._getNextRequestDate = function () {
var interval,
lastRequestDate,
nextRequestDate;
interval = this.getInterval();
lastRequestDate = this._lastRequestDate;
if (!lastRequestDate) {
nextRequestDate = new Date();
} else {
nextRequestDate = new Date(lastRequestDate.getTime() + interval);
}
return nextRequestDate;
};
/**
* Work out when we are allowed to send another request, and schedule a call
* to _crawlTick.
*/
Crawler.prototype._scheduleNextTick = function () {
var nextRequestDate,
nowDate,
delayMs,
self = this;
nextRequestDate = this._getNextRequestDate();
nowDate = new Date();
delayMs = Math.max(0, nextRequestDate - nowDate);
setTimeout(function () {
self._crawlTick();
}, delayMs);
};
module.exports = Crawler;