phantalyzer
Version:
A PhantomJS script for running Wappalyzer over many sites using a headless Webkit browser
178 lines (159 loc) • 5.45 kB
JavaScript
var fs = require('fs');
var U = require('underscore');
var path = require('path');
var csv = require('finite-csv');
var program = require('commander');
program
.version('0.0.1')
.option('-d, --dataDir <path>', 'Data directory')
.option('-x, --regexFile <path>', 'Regex file path')
.option('-c, --csvFile <path>', 'CVS file containing site list')
.option('-m, --maxRows [count]', 'Max number of records to process.', parseInt, 100000)
.option('-u, --urlColumn <name>', 'Max number of records to process.', 'url')
.parse(process.argv);
//console.log(program);
//console.log('foo usage=' + program.usage);
if ( program.dataDir == undefined || program.regexFile == undefined || program.csvFile == undefined ) {
program.help();
}
/* process the data directory */
if ( ! (fs.existsSync(program.dataDir) && fs.lstatSync(program.dataDir).isDirectory() ) ) {
console.error('directory ' + program.dataDir + ' does not exist or is not a directory');
process.exit(1);
}
var files = fs.readdirSync(program.dataDir);
/* process the regex file */
if ( ! fs.existsSync(program.regexFile) ) {
console.error('json regex file ' + program.regexFile + ' does not exist');
process.exit(1);
}
var regexFile = fs.readFileSync(program.regexFile, 'utf8');
var regexObj = null;
try {
regexObj = JSON.parse(regexFile);
} catch (msg) {
console.error("json regex file " + program.regexFile + " was not valid JSON. Check out JSON lint online to validate it.");
process.exit(1);
}
/* process the csv file */
if ( ! fs.existsSync(program.csvFile) ) {
console.error('CSV file ' + program.csvFile + ' does not exist');
process.exit(1);
}
var csvFile = fs.readFileSync(program.csvFile, 'utf8');
csvFile = csvFile.replace(/\cm[\r\n]*/g, "\n");
var sites = [];
try {
//var csvRecs = csv.parseCSV("a,b,c\n1,2,3");
var records = csv.parseCSV(csvFile);
var skipRows = -1;
/* let's look for the header. the header is the first row that contains a column with the urlColumn in it */
outer:
for ( var rI = 0; rI < records.length; rI++ ) {
var record = records[rI];
for ( var cI = 0; cI < record.length; cI++ ) {
if ( record[cI] == program.urlColumn ) {
skipRows = rI;
break outer;
}
}
}
if ( skipRows < 0 ) {
throw "Unable to find a row in the data with a column matching " + program.urlColumn;
}
sites = csv_to_obj(records.slice(skipRows));
//console.log(sites);
//console.log(sites[0]);
sites = sites.slice(0,program.maxRows);
} catch (msg) {
console.error("csv file " + program.csvFile + " was not valid CSV. " + msg);
console.log(msg.stack);
process.exit(1);
}
//console.log("site count=" + sites.length);
var rows = [];
for ( var i = 0; i < sites.length; i++ ) {
var site = sites[i];
//console.log("record [" + i + "] ", site[' URL ' ]);
var file = U.filter(files, function(entry) {
// the i+1 is due to a bug in the crawler
return entry.indexOf('site_' + (i+1) + '_') == 0 && entry.match(/\.txt$/i);
});
var row = [];
if ( file.length > 0 ) {
var fullPath = program.dataDir + path.sep + file[0];
var infile = fs.readFileSync(program.dataDir + path.sep + file[0], 'utf8');
for (var key in regexObj) {
var def = regexObj[key];
var exp = new RegExp(def.pattern, def.modifiers);
var match = infile.match(exp);
var result = "";
//console.log("checking " + def.pattern + ", match=", match);
if ( match ) {
result = match[0];
//console.log('KEY=' + key);
if ( def.hit != undefined ) {
result = def.hit;
for (var k = 0; k < match.length; k++ ) {
result = result.replace('\{\{' + k + '\}\}', match[k]);
}
}
result = result.replace('\{\{count\}\}', match.length);
//console.log('output=' + result);
} else {
if ( def.miss != undefined ) {
result = def.miss;
}
}
row.push(result);
//console.log(result + " == " + key);
}
//console.log("processing file " + files[0]);
} else {
for ( var key in regexObj) {
row.push("");
}
//console.log("no file found for site_" + (i+1) + "_...");
}
rows.push(row);
//console.log('res=', row);
}
for ( var r = 0; r < rows.length; r++ ) {
for ( var c = 0; c < rows[r].length; c++ ) {
/* i'm going to remove all carriage returns here */
rows[r][c] = rows[r][c].replace(/[\r\n]/g, " ");
/* if we see a double quote then we will wrap the whole column with quotes and escape the quotes within */
if ( rows[r][c].match(/[",]/) ) {
rows[r][c] = '"' + rows[r][c].replace(/"/g, '""') + '"';
}
}
rows[r] = rows[r].join(',');
}
rows.unshift(U.keys(regexObj));
console.log(rows.join("\n"));
/**
* This awesome function will return an
* array of rows with the key values of
* each row matching the column header
* which should be provided in the first row.
*/
function csv_to_obj(records) {
var objects = [];
var header = [];
for ( var i = 0; i < records.length; i++ ) {
var values = records[i];
if ( i == 0 ) {
header = values;
} else {
var item = [];
for ( var recI = 0; recI < header.length; recI++ ) {
item[header[recI]] = recI < values.length ? values[recI] : "";
}
objects.push(item);
}
}
return objects;
}
function escapeRegExp(str) {
return str.replace(/[\-\[\]\/\{\}\(\)\*\+\?\.\\\^\$\|]/g, "\\$&");
}