node-web-crawler
Version:
Node Web Crawler is a web spider written with Nodejs. It gives you the full power of jQuery on the server to parse a big number of pages as they are downloaded, asynchronously. Scraping should be simple and fun!
481 lines (397 loc) • 15.3 kB
JavaScript
;
var path = require('path');
var util = require('util');
var EventEmitter = require('events').EventEmitter;
var request = require('request');
request = request.defaults({jar: true});
var j = request.jar();
var _ = require('lodash');
var jschardet = require('jschardet');
var cheerio = require('cheerio');
var zlib = require('zlib');
var fs = require('fs');
var Pool = require('generic-pool').Pool;
// Fallback on iconv-lite if we didn't succeed compiling iconv
// https://github.com/sylvinus/node-crawler/pull/29
var iconv, iconvLite;
try {
iconv = require('iconv').Iconv;
} catch (e) {}
if (!iconv) {
iconvLite = require('iconv-lite');
}
function useCache (options) {
return (
options.uri &&
(options.cache || options.skipDuplicates) &&
(options.method === 'GET' || options.method === 'HEAD'));
}
function checkJQueryNaming (options) {
if ('jquery' in options) {
options.jQuery = options.jquery;
delete options.jquery;
}
return options;
}
function readJqueryUrl (url, callback) {
if (url.match(/^(file\:\/\/|\w+\:|\/)/)) {
fs.readFile(url.replace(/^file\:\/\//,''),'utf-8', function(err,jq) {
callback(err, jq);
});
} else {
callback(null, url);
}
}
function Crawler (options) {
var self = this;
self.init(options);
}
// augment the prototype for node events using util.inherits
util.inherits(Crawler, EventEmitter);
Crawler.prototype.init = function init (options) {
var self = this;
var defaultOptions = {
autoWindowClose: true,
cache: false,
forceUTF8: false,
incomingEncoding: null, //TODO remove or optimize
jQuery: true,
maxConnections: 10,
method: 'GET',
onDrain: false,
priority: 5,
priorityRange: 10,
rateLimits: 0,
referer: false,
retries: 3,
retryTimeout: 10000,
skipDuplicates: true
};
//return defaultOptions with overriden properties from options.
self.options = _.extend(defaultOptions, options);
// you can use jquery or jQuery
self.options = checkJQueryNaming(self.options);
// if using rateLimits we want to use only one connection with delay in between requests
if (self.options.rateLimits !== 0) {
self.options.maxConnections = 1;
}
// Don't make these options persist to individual queries
self.globalOnlyOptions = ['maxConnections', 'priorityRange', 'onDrain'];
//Setup a worker pool w/ https://github.com/coopernurse/node-pool
self.pool = Pool({
name : 'crawler',
max : self.options.maxConnections,
priorityRange: self.options.priorityRange,
create : function(callback) {
callback(1);
},
destroy : function() {}
});
self.plannedQueueCallsCount = 0;
self.queueItemSize = 0;
self.cache = {};
self.on('pool:release', function(options) {
self._release(options);
});
self.on('pool:drain', function() {
if (self.options.onDrain) {
self.options.onDrain();
}
});
};
Crawler.prototype._release = function _release (options) {
var self = this;
self.queueItemSize--;
if (options._poolReference) {
self.pool.release(options._poolReference);
}
// Pool stats are behaving weird - have to implement our own counter
if (self.queueItemSize + self.plannedQueueCallsCount === 0) {
self.emit('pool:drain');
}
};
Crawler.prototype._inject = function _inject (response, options, callback) {
var $;
var self = this;
if (options.jQuery === 'cheerio' || options.jQuery.name === 'cheerio' || options.jQuery === true) {
var defaultCheerioOptions = {
normalizeWhitespace: false,
xmlMode: false,
decodeEntities: true
};
var cheerioOptions = options.jQuery.options || defaultCheerioOptions;
$ = cheerio.load(response.body, cheerioOptions);
callback(null, $);
}
else if (options.jQuery.jsdom) {
var jsdom = options.jQuery.jsdom;
var scriptLocation = path.resolve(__dirname, '../vendor/jquery-2.1.1.min.js');
//Use promises
readJqueryUrl(scriptLocation, function(err, jquery) {
try {
jsdom.env({
url: options.uri,
html: response.body,
src: [jquery],
done: function (errors, window) {
$ = window.jQuery;
callback(errors, $);
try {
window.close();
window = null;
} catch (err) {
console.log('Couldn\'t window.close : ' + err);
}
}
});
} catch (e) {
options.callback(e);
self.emit('pool:release', options);
}
});
}
// Jquery is set to false are not set
else {
callback(null);
}
};
Crawler.prototype.queue = function queue (options) {
var self = this;
// Did you get a single object or string? Make it compatible.
options = _.isString(options) || _.isPlainObject(options) ? [ options ] : options;
if (options !== undefined && options.length == 1) {
self._pushToQueue(
_.isString(options[0]) ? { uri: options[0] } : options[0]
);
// Did you get multiple requests? Queue the URLs.
} else if (options !== undefined) {
self.queue(
_.isString(options[0]) ? { uri: options[0] } : options[0]
);
self.queue(options.slice(1))
}
};
Crawler.prototype._pushToQueue = function _pushToQueue (options) {
var self = this;
self.queueItemSize++;
// you can use jquery or jQuery
options = checkJQueryNaming(options);
_.defaults(options, self.options);
// Remove all the global options from our options
// TODO we are doing this for every _pushToQueue, find a way to avoid this
_.each(self.globalOnlyOptions, function(globalOnlyOption) {
delete options[globalOnlyOption];
});
// If duplicate skipping is enabled, avoid queueing entirely for URLs we already crawled
if (options.skipDuplicates && self.cache[options.uri]) {
return self.emit('pool:release', options);
}
// acquire connection - callback function is called
// once a resource becomes available
self.pool.acquire(function(error, poolReference) {
options._poolReference = poolReference;
// this is and operation error
if (error) {
console.error('pool acquire error:',error);
options.callback(error);
return;
}
//Static HTML was given, skip request
if (options.html) {
self._onContent(null, options, {body:options.html}, false);
} else if (typeof options.uri === 'function') {
options.uri( function(uri) {
options.uri = uri;
self._makeCrawlerRequest(options);
});
} else {
self._makeCrawlerRequest(options);
}
}, options.priority);
};
Crawler.prototype._makeCrawlerRequest = function _makeCrawlerRequest (options) {
var self = this;
if (typeof options.rateLimits === 'number' && options.rateLimits !== 0) {
setTimeout(function() {
self._executeCrawlerRequest(options);
}, options.rateLimits);
} else {
self._executeCrawlerRequest(options);
}
};
Crawler.prototype._executeCrawlerRequest = function _executeCrawlerRequest (options) {
var self = this;
var cacheData = self.cache[options.uri];
//If a query has already been made to self URL, don't callback again
if (useCache(options) && cacheData) {
// Make sure we actually have cached data, and not just a note
// that the page was already crawled
if (_.isArray(cacheData)) {
self._onContent(null, options, cacheData[0], true);
} else {
self.emit('pool:release', options);
}
} else {
self._buildHttpRequest(options);
}
};
Crawler.prototype._buildHttpRequest = function _buildHTTPRequest (options) {
var self = this;
if (options.debug) {
console.log(options.method+' '+options.uri+' ...');
}
options.jQuery = 'cheerio';
options.userAgent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:24.0)';
// Cloning keeps the opts parameter clean:
// - some versions of "request" apply the second parameter as a
// property called "callback" to the first parameter
// - keeps the query object fresh in case of a retry
// Doing parse/stringify instead of _.clone will do a deep clone and remove functions
var ropts = JSON.parse(JSON.stringify(options));
if (!ropts.headers) { ropts.headers={}; }
if (ropts.forceUTF8) {
if (!ropts.headers['Accept-Charset'] && !ropts.headers['accept-charset']) {
ropts.headers['Accept-Charset'] = 'utf-8;q=0.7,*;q=0.3';
}
if (!ropts.encoding) {
ropts.encoding=null;
}
}
if (typeof ropts.encoding === 'undefined') {
ropts.headers['Accept-Encoding'] = 'gzip';
ropts.encoding = null;
}
if (ropts.userAgent) {
ropts.headers['User-Agent'] = ropts.userAgent;
}
if (ropts.referer) {
ropts.headers.Referer = ropts.referer;
}
if (ropts.proxies && ropts.proxies.length) {
ropts.proxy = ropts.proxies[0];
}
var requestArgs = ['uri','url','qs','method','headers','body','form','json','multipart','followRedirect',
'followAllRedirects', 'maxRedirects','encoding','pool','timeout','proxy','auth','oauth','strictSSL',
'jar','aws'];
var req = request(_.pick.apply(this,[ropts].concat(requestArgs)), function(error,response) {
if (error) {
return self._onContent(error, options);
}
response.uri = response.request.href;
// Won't be needed after https://github.com/mikeal/request/pull/303 is merged
if (
response.headers['content-encoding'] &&
response.headers['content-encoding'].toLowerCase().indexOf('gzip') >= 0
) {
zlib.gunzip(response.body, function (error, body) {
if (error) {
return self.onContent(error, options);
}
if (!options.forceUTF8) {
response.body = body.toString(req.encoding);
} else {
response.body = body;
}
self._onContent(error,options,response,false);
});
} else {
self._onContent(error,options,response,false);
}
});
req.setMaxListeners(0);
};
Crawler.prototype._onContent = function _onContent (error, options, response, fromCache) {
var self = this;
if (error) {
if (options.debug) {
console.log('Error '+error+' when fetching '+
options.uri+(options.retries?' ('+options.retries+' retries left)':''));
}
if (options.retries) {
self.plannedQueueCallsCount++;
setTimeout(function() {
options.retries--;
self.plannedQueueCallsCount--;
// If there is a "proxies" option, rotate it so that we don't keep hitting the same one
if (options.proxies) {
options.proxies.push(options.proxies.shift());
}
self.queue(options);
},options.retryTimeout);
} else if (options.callback) {
options.callback(error);
}
return self.emit('pool:release', options);
}
if (!response.body) { response.body=''; }
if (options.debug) {
console.log('Got '+(options.uri||'html')+' ('+response.body.length+' bytes)...');
}
if (options.forceUTF8) {
//TODO check http header or meta equiv?
var iconvObj;
if (!options.incomingEncoding) {
var detected = jschardet.detect(response.body);
if (detected && detected.encoding) {
if (options.debug) {
console.log(
'Detected charset ' + detected.encoding +
' (' + Math.floor(detected.confidence * 100) + '% confidence)'
);
}
if (detected.encoding !== 'utf-8' && detected.encoding !== 'ascii') {
if (iconv) {
iconvObj = new iconv(detected.encoding, 'UTF-8//TRANSLIT//IGNORE');
response.body = iconvObj.convert(response.body).toString();
// iconv-lite doesn't support Big5 (yet)
} else if (detected.encoding !== 'Big5') {
response.body = iconvLite.decode(response.body, detected.encoding);
}
} else if (typeof response.body !== 'string') {
response.body = response.body.toString();
}
} else {
response.body = response.body.toString('utf8'); //hope for the best
}
} else { // do not hope to best use custom encoding
if (iconv) {
iconvObj = new iconv(options.incomingEncoding, 'UTF-8//TRANSLIT//IGNORE');
response.body = iconvObj.convert(response.body).toString();
// iconv-lite doesn't support Big5 (yet)
} else if (options.incomingEncoding !== 'Big5') {
response.body = iconvLite.decode(response.body, options.incomingEncoding);
}
}
} else {
response.body = response.body.toString();
}
if (useCache(options) && !fromCache) {
if (options.cache) {
self.cache[options.uri] = [response];
//If we don't cache but still want to skip duplicates we have to maintain a list of fetched URLs.
} else if (options.skipDuplicates) {
self.cache[options.uri] = true;
}
}
if (!options.callback) {
return self.emit('pool:release', options);
}
response.options = options;
// This could definitely be improved by *also* matching content-type headers
var isHTML = response.body.match(/^\s*</);
if (isHTML && options.jQuery && options.method !== 'HEAD') {
self._inject(response, options, function(errors, $) {
self._onInject(errors, options, response, $);
});
} else {
options.callback(null,response);
self.emit('pool:release', options);
}
};
Crawler.prototype._onInject = function _onInject (errors, options, response, $) {
var self = this;
options.callback(errors, response, $);
self.emit('pool:release', options);
};
module.exports = Crawler;
module.exports.VERSION = '0.3.1';