bot-marvin

A highly scalable web crawler.

var parent_dir = require('path').resolve(__dirname); // process.getAbsolutePath() is not a Node API; path.resolve() does the intended job
var request = require('request');
var fs = require('fs');
var check = require('check-types');
var robots = require('robots');

/**
 * Represents a robots.txt parser.
 * @author Tilak Patidar <tilakpatidar@gmail.com>
 * @constructor
 * @param {Message} message_obj
 * @param {Number} parallel_requests - maximum number of robots.txt fetches in flight
 */
var Robots = function Robots(message_obj, parallel_requests) {
  var message = message_obj;
  var log = message.get('log');
  var pool = message.get('pool');
  var urls = message.get('robots_links');
  var that = this;
  var requests = parallel_requests;
  var requests_made = 0; // number of workers currently in flight
  var called = false;    // guards against invoking the parse() callback twice

  /**
   * Parses robots.txt for the loaded urls.
   * @public
   * @param {Function} fn - callback(err, bots)
   */
  this.parse = function(fn) {
    try {
      msg('Preparing robots.txt files, this will take time . . .', 'info');
      var err;
      loadCache(function init_loadCache() {
        // Work-queue iterator: each invocation takes one url off the front of
        // the list, fetches its robots.txt, then tops the pool of in-flight
        // workers back up to the concurrency limit.
        function robots_iterator() {
          if (called) {
            return;
          }
          msg('requests_made ' + requests_made + ' of ' + requests, 'info');
          var req_url = urls.splice(0, 1)[0];
          if (!check.assigned(req_url)) {
            // The queue was drained while this worker was scheduled.
            --requests_made;
            if (urls.length === 0 && !called) {
              called = true;
              return fn(err, that.bots);
            }
            return;
          }
          if (check.assigned(that.bots[req_url])) {
            // Already parsed (or loaded from the cache); skip the fetch.
            --requests_made;
            if (urls.length === 0 && !called) {
              called = true;
              return fn(err, that.bots);
            }
            for (var i = requests_made; i < requests; i++) {
              ++requests_made;
              process.nextTick(robots_iterator);
            }
            return;
          }
          var parser = new robots.RobotsParser();
          // Reduce e.g. "http://example.com/a/b" to "http://example.com".
          var req_url_new = req_url.split('/').slice(0, 3).join('/');
          var init_time = new Date().getTime();
          if (!check.assigned(that.bots[req_url])) {
            parser.setUrl(req_url_new + '/robots.txt', function parser_setUrl(parser_obj, success) {
              if (success) {
                msg('Robots.txt parsed for ' + req_url, 'success');
                addToCache(req_url_new, parser_obj, function() {});
                that.bots[req_url] = parser_obj; // saving robots obj
              } else {
                msg('No Robots.txt found for ' + req_url, 'error');
                parser_obj = {
                  'NO_ROBOTS': true // marker object: this host has no robots.txt
                };
                addToCache(req_url_new, parser_obj, function() {});
                that.bots[req_url] = parser_obj; // saving robots obj
              }
              --requests_made;
              if (urls.length === 0 && !called) {
                called = true;
                return fn(err, that.bots);
              }
              for (var i = requests_made; i < requests; i++) {
                ++requests_made;
                process.nextTick(robots_iterator);
              }
              return;
            });
          } else {
            // Unreachable in practice: the same check already returned above.
            --requests_made;
            if (urls.length === 0 && !called) {
              called = true;
              return fn(err, that.bots);
            }
            for (var i = requests_made; i < requests; i++) {
              ++requests_made;
              process.nextTick(robots_iterator);
            }
            return;
          }
        }
        // Prime the queue with a single worker; it fans out to the full
        // concurrency level as responses come back.
        ++requests_made;
        process.nextTick(robots_iterator);
      });
    } catch (err) {
      return fn(err, null);
    }
  };

  /**
   * Loads already-parsed robots.txt data from the MongoDB store.
   * @private
   * @param {Function} fn - callback
   */
  var loadCache = function loadCache(fn) {
    pool.robots_collection.find({}).toArray(function loadCacheFn(err, docs) {
      if (!check.assigned(docs) || check.emptyArray(docs) || check.assigned(err)) {
        msg('Robots.txt cache is empty.', 'info');
        return fn();
      }
      for (var i = docs.length - 1; i >= 0; i--) {
        var data = docs[i]['robot'];
        var domain = docs[i]['_id'];
        if (check.assigned(data)) {
          that.bots[domain] = data;
          msg('Robots.txt loaded from cache for ' + domain, 'success');
        }
        if (i === 0) {
          return fn();
        }
      }
    });
  };

  /**
   * Parsed data is stored in this JSON object, keyed by url.
   * @public
   */
  this.bots = {};

  /**
   * Adds parsed data for a url into MongoDB.
   * @private
   * @param {String} key - url (scheme + host)
   * @param {Object} value - parsed data from robots.txt
   * @param {Function} fn - callback
   */
  function addToCache(key, value, fn) {
    pool.robots_collection.insert({
      '_id': key,
      'robot': value
    }, function addToCache(err) {
      return fn();
    });
  }

  // Thin wrapper around the shared logger. arguments.callee.caller (available
  // in non-strict mode only) tags each log line with the calling function's name.
  function msg() {
    log.put(arguments[0], arguments[1], __filename.split('/').pop(), arguments.callee.caller.name.toString());
  }
};

module.exports = Robots;
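
For orientation, here is a minimal sketch of how this class might be driven in isolation. The message, logger, and MongoDB-pool stubs below are hypothetical stand-ins for bot-marvin's real components, and the './robots' filename is assumed; only the Robots constructor, parse(), and the robots package's canFetch() come from the module above.

// usage-sketch.js -- hypothetical harness for the Robots class.
var Robots = require('./robots'); // assumed filename for the module above

// Stub logger matching the log.put(text, level, file, caller) call in msg().
var log = {
  put: function(text, level, file, caller) {
    console.log('[' + level + '] ' + file + ':' + caller + ' ' + text);
  }
};

// Stub MongoDB pool exposing the robots_collection used for caching;
// an in-memory object stands in for a real collection.
var cache = {};
var pool = {
  robots_collection: {
    find: function() {
      return {
        toArray: function(cb) {
          cb(null, Object.keys(cache).map(function(k) {
            return { _id: k, robot: cache[k] };
          }));
        }
      };
    },
    insert: function(doc, cb) {
      cache[doc._id] = doc.robot;
      cb(null);
    }
  }
};

// Stub Message: the constructor only ever calls message.get(key).
var message = {
  get: function(key) {
    return {
      log: log,
      pool: pool,
      robots_links: ['https://example.com/some/page']
    }[key];
  }
};

var bots = new Robots(message, 2); // at most 2 robots.txt fetches in flight
bots.parse(function(err, parsed) {
  if (err) return console.error(err);
  var parser = parsed['https://example.com/some/page'];
  if (parser && !parser.NO_ROBOTS && parser.canFetch) {
    // canFetch(userAgent, path, callback) comes from the 'robots' package.
    parser.canFetch('bot-marvin', '/some/page', function(allowed) {
      console.log('crawl allowed?', allowed);
    });
  }
});

Note that entries loaded back from the MongoDB cache are plain serialized objects rather than live RobotsParser instances, which is why the sketch checks for parser.canFetch before calling it.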