flexible
Version:
Easily build flexible, scalable, and distributed web crawlers.
117 lines (105 loc) • 3.38 kB
JavaScript
;
var pg = require('pg');
module.exports = function (options) {
if (typeof options === 'string') {
options = {uri: options};
} else if (options.url) {
options.uri = options.url;
}
return function (crawler) {
crawler.queue = new Queue(options);
crawler.on('complete', function () {
crawler.queue._client.end();
});
};
};
/**
 * PostgreSQL-backed URL queue.
 *
 * Creates a `pg.Client` for `options.uri` and starts connecting immediately.
 *
 * @param {Object} [options]
 * @param {string} [options.uri] Connection string handed to `pg.Client`.
 * @param {number} [options.get_interval=250] Delay in milliseconds between
 *   polls when `get` finds nothing to hand out.
 * @param {number} [options.max_get_attempts=4] Number of additional polls
 *   `get` makes before reporting an empty queue.
 */
function Queue(options) {
  var settings = options || {};
  // Only a missing (undefined/null) value selects the default, so an explicit
  // 0 ("re-poll without delay" / "never re-poll") is honoured.
  this._get_interval = settings.get_interval != null ?
    settings.get_interval : 250;
  this._max_get_attempts = settings.max_get_attempts != null ?
    settings.max_get_attempts : 4;
  this._client = new pg.Client(settings.uri);
  this._client.connect();
}
/**
 * Create the `queue` table if it does not exist yet.
 *
 * @param {function} callback Called with the error (if any) reported by the
 *   CREATE TABLE query; no other arguments are forwarded.
 */
Queue.prototype._setup = function (callback) {
  var createTable = [
    'CREATE TABLE IF NOT EXISTS queue',
    '(url text UNIQUE, processing boolean, completed boolean)'
  ].join(' ');
  this._client.query(createTable, function onCreated(error) {
    // Forward only the error; the query result is deliberately dropped.
    callback(error);
  });
};
/**
 * Add a doc to the queue.
 *
 * Inserts `location` as a row that is neither processing nor completed.
 * Error code 42P01 (table missing) runs `_setup` and then retries the insert;
 * error code 23505 (the URL is already present) is reported as success.
 *
 * @param {string} location URL to enqueue.
 * @param {function} callback Called as `callback(error)` on failure or
 *   `callback(null, doc)` on success.
 */
Queue.prototype.add = function (location, callback) {
  var queue = this;
  var doc = {queue: queue, url: location, processing: false, completed: false};
  var values = [doc.url, doc.processing, doc.completed];

  function onInserted(error) {
    if (!error) {
      callback(null, doc);
      return;
    }
    switch (error.code) {
      case '42P01':
        // No table yet: create it, then repeat the whole add.
        queue._setup(function (setupError) {
          if (setupError) {
            callback(setupError);
          } else {
            queue.add(location, callback);
          }
        });
        break;
      case '23505':
        // Duplicate URL: treated the same as a fresh insert.
        callback(null, doc);
        break;
      default:
        callback(error);
    }
  }

  queue._client.query('INSERT INTO queue VALUES ($1, $2, $3)', values,
    onInserted);
};
/**
 * Get a doc to process.
 *
 * Marks one row that is neither processing nor completed as processing and
 * hands it to the caller. When no row is available the query is repeated
 * every `_get_interval` milliseconds, up to `_max_get_attempts` extra times.
 * If the table is missing (error code 42P01) it is created once via `_setup`
 * and polling continues.
 *
 * @param {function} callback Called as `callback(error, null)` on failure,
 *   `callback(null, doc)` with `{url, processing: true, completed: false}`
 *   when a row was claimed, or `callback(null, null)` when nothing became
 *   available.
 */
Queue.prototype.get = function (callback) {
  // The outer WHERE repeats the "unclaimed" test on purpose: if another
  // client claims the same row first, this UPDATE then matches nothing
  // instead of claiming the row a second time.
  var query = 'UPDATE queue SET processing = true WHERE ' +
    'url IN (SELECT url FROM queue WHERE NOT processing ' +
    'AND NOT completed LIMIT 1) AND NOT processing ' +
    'AND NOT completed RETURNING url';
  var attempts = 0, setupTried = false, self = this;
  (function get() {
    self._client.query(query, function (error, results) {
      if (error) {
        if (error.code === '42P01' && !setupTried) {
          // Create the table only once per call so a persistent failure
          // surfaces as an error instead of looping forever.
          setupTried = true;
          self._setup(function (setupError) {
            if (setupError) {callback(setupError, null);}
            else {get();}
          });
        } else {callback(error, null);}
        return;
      }
      var row = results.rows[0];
      if (row) {
        callback(null, {
          url: row.url,
          processing: true,
          completed: false
        });
      } else if (attempts < self._max_get_attempts) {
        ++attempts;
        setTimeout(get, self._get_interval);
      } else {callback(null, null);}
    });
  })();
};
/**
 * End processing of a doc.
 *
 * Flags `doc` as completed (and no longer processing) and writes the same
 * state to the row whose `url` matches `doc.url`.
 *
 * @param {Object} doc Doc to finish; its `processing` and `completed` fields
 *   are updated in place before the query is sent.
 * @param {function} callback Called as `callback(error, doc)` once the
 *   update query has finished.
 */
Queue.prototype.end = function (doc, callback) {
  var markCompleted = [
    'UPDATE queue SET processing = false,',
    'completed = true WHERE url = $1'
  ].join(' ');
  doc.processing = false;
  doc.completed = true;
  this._client.query(markCompleted, [doc.url], function onUpdated(error) {
    callback(error, doc);
  });
};