UNPKG

phantalyzer

Version:

A PhantomJS script for running Wappalyzer over many sites using a headless Webkit browser

github.com/mlconnor/phantalyzer

mlconnor/phantalyzer

178 lines (159 loc) • 5.45 kB

JavaScript

/**
 * Phantalyzer report generator.
 *
 * Inputs (all given on the command line):
 *   --dataDir    directory holding one text file per site, named
 *                `site_<n>_*.txt` where <n> is the 1-based position of the
 *                site in the CSV site list
 *   --regexFile  JSON file mapping an output column name to a rule object
 *                with the fields read below: `pattern`, `modifiers`, and the
 *                optional `hit` / `miss` output templates
 *   --csvFile    CSV site list; rows before the header row (the first row
 *                containing the --urlColumn value) are skipped
 *
 * Output: CSV on stdout. The first row holds the rule names, followed by one
 * row per site with the result of each rule. Sites without a data file
 * produce a row of empty cells.
 */
var fs = require('fs');
var U = require('underscore');
var path = require('path');
var csv = require('finite-csv');
var program = require('commander');

program
  .version('0.0.1')
  .option('-d, --dataDir <path>', 'Data directory')
  .option('-x, --regexFile <path>', 'Regex file path')
  .option('-c, --csvFile <path>', 'CSV file containing site list')
  .option('-m, --maxRows [count]', 'Max number of records to process.', parseMaxRows, 100000)
  .option('-u, --urlColumn <name>', 'Name of the CSV column that holds the site URL.', 'url')
  .parse(process.argv);

if (program.dataDir == null || program.regexFile == null || program.csvFile == null) {
  program.help();
}

/* process the data directory */
// statSync (not lstatSync) so that a symlink pointing at a directory is accepted.
if (!(fs.existsSync(program.dataDir) && fs.statSync(program.dataDir).isDirectory())) {
  fail('directory ' + program.dataDir + ' does not exist or is not a directory');
}
var dataFiles = indexDataFiles(fs.readdirSync(program.dataDir));

/* process the regex file */
if (!fs.existsSync(program.regexFile)) {
  fail('json regex file ' + program.regexFile + ' does not exist');
}
var regexObj = null;
try {
  regexObj = JSON.parse(fs.readFileSync(program.regexFile, 'utf8'));
} catch (err) {
  fail("json regex file " + program.regexFile + " was not valid JSON. Check out JSON lint online to validate it.");
}

// Compile every rule once up front instead of once per site, and report a
// broken rule by name instead of dying with a raw stack trace.
var ruleNames = U.keys(regexObj);
var rules = ruleNames.map(function (name) {
  var def = regexObj[name];
  if (def === null || typeof def !== 'object') {
    fail('rule "' + name + '" in ' + program.regexFile + ' is not an object');
  }
  try {
    return { def: def, regex: new RegExp(def.pattern, def.modifiers) };
  } catch (err) {
    fail('rule "' + name + '" in ' + program.regexFile + ' is not a valid regular expression: ' + err.message);
  }
});

/* process the csv file */
if (!fs.existsSync(program.csvFile)) {
  fail('CSV file ' + program.csvFile + ' does not exist');
}
// Normalise every run of line breaks that starts with a carriage return to a
// single "\n" before handing the text to the parser.
var csvText = fs.readFileSync(program.csvFile, 'utf8').replace(/\r[\r\n]*/g, "\n");

var records;
try {
  records = csv.parseCSV(csvText);
} catch (err) {
  console.error("csv file " + program.csvFile + " was not valid CSV. " + (err instanceof Error ? err.message : String(err)));
  if (err && err.stack) {
    console.error(err.stack);
  }
  process.exit(1);
}

/* the header is the first row that contains a column equal to urlColumn */
var headerRow = -1;
for (var r = 0; r < records.length && headerRow < 0; r++) {
  for (var c = 0; c < records[r].length; c++) {
    if (String(records[r][c]) === String(program.urlColumn)) {
      headerRow = r;
      break;
    }
  }
}
if (headerRow < 0) {
  fail("csv file " + program.csvFile + ": Unable to find a row in the data with a column matching " + program.urlColumn);
}

var sites = csv_to_obj(records.slice(headerRow)).slice(0, program.maxRows);

/* evaluate every rule against every site's data file */
var rows = [];
for (var i = 0; i < sites.length; i++) {
  // the i+1 is due to a bug in the crawler: its files are numbered from 1
  var siteKey = String(i + 1);
  var row;
  if (Object.prototype.hasOwnProperty.call(dataFiles, siteKey)) {
    var text = fs.readFileSync(path.join(program.dataDir, dataFiles[siteKey]), 'utf8');
    row = rules.map(function (rule) {
      return evaluateRule(rule, text);
    });
  } else {
    // no data file for this site: emit one empty cell per rule
    row = rules.map(function () {
      return "";
    });
  }
  rows.push(row);
}

/* emit the report; the header goes through the same escaping as the data */
var lines = [ruleNames.map(toCsvField).join(',')];
rows.forEach(function (cells) {
  lines.push(cells.map(toCsvField).join(','));
});
console.log(lines.join("\n"));

/**
 * Prints a message to stderr and terminates the process with exit code 1.
 */
function fail(message) {
  console.error(message);
  process.exit(1);
}

/**
 * Option parser for --maxRows.
 *
 * A dedicated one-argument function is used rather than passing parseInt
 * directly, because commander may call the callback with a second argument
 * (the previous/default value) that parseInt would interpret as a radix.
 */
function parseMaxRows(value) {
  var count = parseInt(value, 10);
  if (isNaN(count) || count < 0) {
    fail('maxRows must be a non-negative integer, got "' + value + '"');
  }
  return count;
}

/**
 * Builds a lookup from site number (as a decimal string) to the first
 * directory entry named `site_<number>_....txt` (extension matched
 * case-insensitively). Entries are considered in the order given.
 */
function indexDataFiles(entries) {
  var index = {};
  entries.forEach(function (entry) {
    var parts = /^site_(\d+)_/.exec(entry);
    if (parts && /\.txt$/i.test(entry) && !Object.prototype.hasOwnProperty.call(index, parts[1])) {
      index[parts[1]] = entry;
    }
  });
  return index;
}

/**
 * Applies one compiled rule to the text of a data file and returns the cell
 * value for the report.
 *
 *  - no match:            the rule's `miss` text, or "" when there is none
 *  - match, no `hit`:     the matched text (match[0]) unchanged
 *  - match with `hit`:    the `hit` template with {{0}}, {{1}}, ... replaced
 *                         by the corresponding entries of the match array and
 *                         {{count}} replaced by the length of that array
 *
 * Placeholders are expanded in a single pass with a replacer function, so
 * every occurrence is substituted and text captured from the page is never
 * re-interpreted (neither as a placeholder nor as a `$&`-style replacement
 * pattern). A placeholder whose index is outside the match array is left
 * as-is; a group that did not participate in the match becomes "".
 */
function evaluateRule(rule, text) {
  var def = rule.def;
  rule.regex.lastIndex = 0; // the regex object is shared between sites
  var match = text.match(rule.regex);
  if (!match) {
    return def.miss != null ? String(def.miss) : "";
  }
  if (def.hit == null) {
    return match[0];
  }
  return String(def.hit).replace(/\{\{(0|[1-9]\d*|count)\}\}/g, function (placeholder, name) {
    if (name === 'count') {
      return String(match.length);
    }
    var index = parseInt(name, 10);
    if (index >= match.length) {
      return placeholder;
    }
    return match[index] == null ? "" : match[index];
  });
}

/**
 * Formats one value as a CSV field: line breaks become spaces, and a value
 * containing a double quote or a comma is wrapped in double quotes with the
 * embedded quotes doubled.
 */
function toCsvField(value) {
  var field = String(value).replace(/[\r\n]/g, " ");
  if (/[",]/.test(field)) {
    field = '"' + field.replace(/"/g, '""') + '"';
  }
  return field;
}

/**
 * Converts parsed CSV rows into one object per data row, keyed by the column
 * names found in the first row. Cells missing from a short row are filled
 * with "".
 *
 * Plain objects are used (not arrays) so that a column named e.g. "length"
 * is stored like any other column instead of throwing a RangeError.
 */
function csv_to_obj(records) {
  if (records.length === 0) {
    return [];
  }
  var header = records[0];
  return records.slice(1).map(function (values) {
    var item = {};
    for (var col = 0; col < header.length; col++) {
      item[header[col]] = col < values.length ? values[col] : "";
    }
    return item;
  });
}

/**
 * Escapes the characters that are special in a regular expression so that
 * `str` can be embedded in a pattern and matched literally.
 */
function escapeRegExp(str) {
  return str.replace(/[\-\[\]\/\{\}\(\)\*\+\?\.\\\^\$\|]/g, "\\$&");
}