UNPKG

flexible

Version:

Easily build flexible, scalable, and distributed web crawlers.

391 lines (333 loc) 12.2 kB
'use strict'; /** * Flexible Web-Crawler Module * (https://github.com/eckardto/flexible.git) * * This file is part of Flexible. * * Flexible is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Flexible is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Flexible. If not, see <http://www.gnu.org/licenses/>. */ var async = require('async'); var request = require('request'); var iconv = require('iconv-lite'); var htmlparser = require('htmlparser'); var traverse = require('traverse'); var url = require('url'); var util = require('util'); var events = require('events'); var querystring = require('querystring'); var queue = require('./queue.js'); var router = require('./router.js'); /** * Initiate a crawler and start crawling. 
*/ module.exports = function (options) { if (typeof options === 'string') { options = {url: options}; } else if (!options.url && options.uri) { options.url = options.uri; } if (options.url && !options.domains) { options.domains = [url.parse(options.url).hostname]; } var crawler = (new Crawler(options)) .use(queue()).use(router()); async.waterfall([ function (next) { if (!options.url) {next(null);} else {crawler.navigate(options.url, next);} }, function (next) {crawler.crawl(next);} ], function (error) { if (error) { crawler.emit('error', error); crawler._complete(); } }); return crawler; }; module.exports.Crawler = Crawler; module.exports.queue = queue; module.exports.pgQueue = require('./pg-queue.js'); module.exports.router = router; util.inherits(Crawler, events.EventEmitter); function Crawler(options) { events.EventEmitter.call(this); this._middleware = []; this._domains = options.domains; this._completed = false; this._paused = false; this._max_concurrency = options .max_concurrency || 4; this._max_crawl_queue_length = options .max_crawl_queue_length || 10; this._interval = options.interval || 250; this._encoding = options.encoding; this._proxy = options.proxy; this._headers = options.headers || { 'user-agent': 'Node/Flexible 0.1.12 ' + '(https://github.com/eckardto/flexible)' }; this._timeout = options.timeout; this._follow_redirect = options .follow_redirect || true; this._max_redirects = options .max_redirects || 10; this._auth = options.auth; this._pool = options.pool; this._jar = options.jar; var self = this; this._crawl_queue = async .queue(function (doc, callback) { self._process(doc, callback); }, this._max_concurrency); this._crawl_queue.drain = function () { self._complete(); }; /** * Crawl (recursive) */ this.crawl = function (callback) { this._crawl(function (error) { if (!self._crawl_queue.tasks.length && !self._crawl_queue.running()) { self._complete(); } if (error) { if (callback) {callback(error);} else {self.emit('error', error);} } else if 
(callback) {callback(null);} }); }; } /** * Use a component. */ Crawler.prototype.use = function (component) { // Plug in the component. component(this); return this; }; /** * Navigate to a location. */ Crawler.prototype.navigate = function (location, callback) { var parsed_location = url.parse(location); if (!parsed_location.protocol) { location = 'http://' + location; } if (this._domains && this._domains[0] && this._domains.indexOf(parsed_location.hostname) === -1) { if (callback) { callback(new Error('Location is not allowed.')); } } else { // Add to the queue. this.queue.add(location, function (error) { if (callback) {callback(error);} }); } return this; }; Crawler.prototype._process = function (doc, callback) { var self = this; async.waterfall([ // Delay according to crawler interval. function (next) {setTimeout(next, self._interval);}, // Download, while parsing, the doc. function (next) { var req = request({ url: doc.url, encoding: self._encoding ? null : undefined, headers: self._headers, proxy: self._proxy, timeout: self._timeout, followRedirect: self._follow_redirect, maxRedirects: self._max_redirects, auth: self._auth, pool: self._pool, jar: self._jar }).on('response', function (res) { if (!res.headers['content-type']) { res.request.end(); return next(new Error('Missing the content-type.')); } if (res.headers['content-type'].indexOf('html') === -1) { res.request.end(); return next(new Error('Unsupported content-type.')); } var handler = new htmlparser.DefaultHandler(function (error, dom) { if (error) {return next(error);} if (req.uri && req.uri.query) { req.params = querystring.parse(req.uri.query); } else {req.params = {};} next(null, req.toJSON(), res.toJSON(), body, dom); }), parser = new htmlparser.Parser(handler); var body = ''; res.on('data', function (chunk) { if (self._encoding) { chunk = iconv.decode(chunk, self._encoding); } body += chunk.toString(); parser.parseChunk(chunk); }); res.on('error', next); res.on('end', function () 
{parser.done();}); }).on('error', next); }, // Discover, and navigate to, locations. function (req, res, body, dom, next) { var locations = []; traverse(dom).forEach(function (node) { if (!node.attribs || !node.attribs.href) {return;} var href = node.attribs.href; var protocol = url.parse(href).protocol; if (href === '/') {href = res.request.uri.hostname;} else if (!protocol) { if (href.substring(0, 2) === '//') { href = 'http:' + href; } else if (href.charAt(0) === '/') { href = res.request.uri.protocol + '//' + res.request.uri.hostname + href; } else { href = res.request.uri.protocol + '//' + res.request.uri.hostname + '/' + href; } } else if (protocol.indexOf('http') === -1) { // Only crawl locations using HTTP. return; } var start = href .substring(0, href.indexOf('.') + 1); href = start + href.replace(start, '') .replace('//', '/'); if (href.charAt(href.length - 1) === '/') { href = href.substring(0, href.length - 1); } locations.push(href); }); async.forEach(locations, function (location, callback) { self.navigate(location, function (error) { if (error) {self.emit('error', error);} else {self.emit('navigated', location);} callback(null); }); }, function () {next(null, req, res, body, dom);}); } ], function (error, req, res, body, dom) { doc.request = req; doc.response = res; doc.body = body; doc.dom = dom; if (error) {callback(error, doc);} else { self.crawl(function (error) { callback(error, doc); }); } }); return this; }; Crawler.prototype._crawl = function (callback) { var self = this, fill = true; async.whilst(function () { return fill && self._crawl_queue.length() < self._max_crawl_queue_length; }, function (callback) { self.queue.get(function (error, doc) { if (error) {return callback(error);} if (!doc) {return callback(fill = false);} self._crawl_queue.push(doc, function (error, doc) { if (error) { error.doc = doc; self.emit('error', error); if (!error.message) { error.message = 'An unknown error has occurred.'; } } self.queue.end(doc, function 
(end_error, doc) { if (end_error) { end_error.doc = doc; return self.emit('error', end_error); } if (error) {return;} async.waterfall([ function (next) {next(null, self, doc);} ].concat(self._middleware.concat([ function (crawler, doc, next) { self.emit('document', doc); next(null); } ])), function (error) { if (error) {self.emit('error', error);} }); }); }); callback(null); }); }, callback); return this; }; /** * Pause crawling. */ Crawler.prototype.pause = function () { if (!this._paused) { this.crawl = function (callback) { var self = this; self.once('resumed', function () { self._crawl(callback); }); }; this._paused = true; this.emit('paused'); } return this; }; /** * Resume crawling. */ Crawler.prototype.resume = function () { if (!this._completed && this._paused) { this.crawl = this._crawl; this._paused = false; this.emit('resumed'); } return this; }; /** * Abort crawling. */ Crawler.prototype.abort = function () { if (!this._completed) { if (this._paused) {return this.resume();} this.crawl = function (callback) { callback(null); }; this._crawl_queue.tasks.length = 0; if (!this._crawl_queue.running()) { this._complete(); } } return this; }; Crawler.prototype._complete = function () { if (!this._completed) { this._completed = true; this.emit('complete'); } return this; };