UNPKG

witchypoo

Version:

Stores a unique list of domain names and their page rank at time of crawling

226 lines (188 loc) 6.35 kB
module.exports = function (program) { var http = require('http'), https = require('https'), async = require('async'), hansel = require('hansel'), badwords = require('badwords/regexp'), seenDomains = {}, whiteLists = {}, gretel, db; db = require('./database')(program.connectionString); function validateResponceLength(queueItem, response, callback){ var client = queueItem.protocol === 'https' ? https : http; client.get(queueItem.protocol + '://' + queueItem.host, function(response) { response.on("data", function(chunk) { response.pause(); response.destroy(); callback(chunk.length > 100); }); }); } function validateSafeness(data, callback){ var html = data.toString(); callback(!html.match(badwords)); } function isSubdomainOf(host, uri) { reversedHost = host.toLowerCase().split('').reverse().join(''); reversedUri = uri.toLowerCase().split('').reverse().join('').substr(0, host.length); return reversedHost === reversedUri; } function isWhiteListDomain(host){ var isWhiteListed = false; for (var i = 0; i < whiteLists.domains.length; i++) { if(isSubdomainOf(whiteLists.domains[i], host)){ isWhiteListed = true; break; } } return isWhiteListed; } function isInvalidSubDomain(host){ var isInvalid = false, keys = Object.keys(seenDomains); for (var i = 0; i < keys.length; i++) { if(isSubdomainOf(keys[i], host)){ isInvalid = true; break; } } return isInvalid; } function isWhiteListSubDomain(host){ var isWhiteListed = false; for (var i = 0; i < whiteLists.subDomains.length; i++) { if(host.indexOf(whiteLists.subDomains[i]) === 0){ isWhiteListed = true; break; } } return isWhiteListed; } function hasBeenSeen(queueItem){ if(seenDomains[queueItem.host]){ return true; } if(isWhiteListDomain(queueItem.host) || isWhiteListSubDomain(queueItem.host)){ seenDomains[queueItem.host] = true; return false; } if(isInvalidSubDomain(queueItem.host)){ seenDomains[queueItem.host] = true; return true; } seenDomains[queueItem.host] = true; return false; } function passToHansel(domainObject){ hansel.getPageRank(domainObject.uri, function(error, results) { if(error){ return console.log(error.stack || error); } domainObject.pageRank = results[0].pageRank; saveDomain(domainObject); }); } function validateDomain(queueItem, data, response){ if(hasBeenSeen(queueItem)){ return; } validateResponceLength(queueItem, response, function(isValid){ validateSafeness(data, function(isSafe){ passToHansel( { uri: queueItem.host, isValid: isValid, isSafe: isSafe, responeCode: response.statusCode } ); }); }); } function setupGretel(){ gretel = require('gretel')(program.startUris); process.on( 'SIGINT', function() { gretel.save(program.queuePath, function(error){ if(error){ console.log(error.stack || error); process.exit(1); } process.exit(0); }); }); gretel.on('complete ', function() { console.log( "All breadcrumbs have been followed..." ); }); gretel.on('fetchcomplete', validateDomain); gretel.load(program.queuePath, function(error){ if(error){ return console.log(error.stack || error); } gretel.start(); }); } function loadDomains(callback){ console.log( "Loading recorded domains..." ); db.Domain.find({}, 'uri', function(error, domains){ if(error){ return callback(error); } async.map( domains, function(domain, callback){ seenDomains[domain.uri] = true; callback(); }, callback ); }); } function loadWhiteLists(callback){ console.log( "Loading white lists..." ); db.WhiteList.find({}, 'whiteListType whiteList', function(error, lists){ if(error){ return callback(error); } async.map( lists, function(list, callback){ whiteLists[list.whiteListType] = list.whiteList; callback(); }, callback ); }); } function saveDomain(domainObject){ console.log(domainObject); db.Domain.update({uri: domainObject.uri}, domainObject, {upsert: true}, function(error){ if(error){ console.log(error.stack || error); } }); } function exitProcessWithError(error){ console.log(error.stack || error); if(gretel){ gretel.save(program.queuePath); } console.log('EXITING PROCESS'); return process.exit(1); } return { passToHansel: passToHansel, validateResponceLength: validateResponceLength, validateSafeness: validateSafeness, isSubdomainOf: isSubdomainOf, isWhiteListDomain: isWhiteListDomain, isWhiteListSubDomain: isWhiteListSubDomain, hasBeenSeen: hasBeenSeen, validateDomain: validateDomain, setupGretel: setupGretel, loadDomains: loadDomains, loadWhiteLists: loadWhiteLists, saveDomain: saveDomain, exitProcessWithError: exitProcessWithError }; };