slog
Log parser with configurable column processors.
// Logs are a time series. This system works with CSV rows, arranged sequentially.
// Header rows naming columns are ignored (e.g. a first row of "time,value" or similar).
// The first two columns of a logged row *MUST* be <timestamp>,<value>.
// Any number of other columns may be added. A #columnProcessor must be defined for
// these columns if their data is to be analyzed; columns without a #columnProcessor
// are ignored.
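// For example (hypothetical rows; the third column would need a #columnProcessor
// to be analyzed):
//
//   1417714067108,23,GET /index.html
//   1417714067204,31,GET /favicon.ico
//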
//
var clio = require('clio')({
useLines : false,
options : {
"-f --file" : "The log file to parse.",
"-rmin" : "The minimum range value.",
"-rmax" : "The maximum range value.",
"-cp --colProc" : "Establish column processors",
"-v --verbose" : "Whether to export results"
}
});
clio.parse();
var fs = require('fs');
var util = require('util');
var path = require('path');
var moment = require("moment");
var rmdir = require("rimraf");
var child = require('child_process');
var exec = child.exec;
// Note that no argument checking is done. Format:
// > node logparser -f targetlog -rmin <min> -rmax <max> [-cp <col>:<prefix>] [-o <file>] [-v]
//
var filename = clio.get("-f");
// Range bounds arrive from the CLI as strings; the range loops below need numbers.
//
var rmin = parseInt(clio.get("-rmin"), 10);
var rmax = parseInt(clio.get("-rmax"), 10);
var normalizedFilename = filename.replace(/\//g, "_").replace(/[^_\w]/g, "");
// Alter this to change the number of lines of the main log file each worker receives.
// TODO: as cli option.
//
var splitOnLines = 1000000;
// Definitions for custom column processors are pushed onto this. See below.
//
var columnProcessors = [];
// Column processors write to this array, indexed by the column the processor worked against.
//
var columnData = [];
// ##functionInstance
//
// Builds a Function from a raw body string. The body executes inside a
// `with(this)` block, so properties of the call context are in scope as locals.
//
var functionInstance = function(fbody) {
return Function(
"with(this) { return (function(){" + fbody + "})(); };"
);
};
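// For example (illustrative only):
//
//   functionInstance("return input + (output || 0);").call({ input : 2, output : 3 }); // -> 5
//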
// ##pad
//
// Pads #sub with #padstr until it is at least #len characters long. #d === "r"
// pads right; anything else pads left.
//
var pad = function(d, padstr, len, sub) {
var s = String(sub);
while(s.length < len) {
s = d === "r" ? s + padstr : padstr + s;
}
return s;
};
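// e.g. pad("r", " ", 5, "ab") -> "ab   "; pad("l", " ", 5, "ab") -> "   ab"
//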
// ##padRight
//
var padRight = function(p, l, s) {
return pad("r", p, l, s);
};
// ##asyncEach
//
// Non-blocking *serial* iteration through an array of methods.
//
// @param {Function} fn The iterator method.
// @param {Function} [finalCb] A method to call when the stack is cleared. Passed the
// result object.
// @param {Array} [targ] The array to iterate through. Defaults to an empty array.
//
var asyncEach = function(fn, finalCb, targ) {
targ = targ || [];
finalCb = finalCb || function(){};
var results = {
errored : false,
last : null,
stack : []
};
var len = targ.length;
var idx = 0;
// Nothing to iterate; fire the final callback immediately.
//
if(!len) {
return finalCb.call(this, results);
}
// Call the sent iterator method for each member of #targ. If #fn returns
// false (not falsy...===false) set #idx to #len, terminating further iteration.
// Otherwise, update the #results object. Ultimately, fire #finalCb.
//
var iter = function() {
if(false === fn.call(this, targ[idx], idx, results, function(err, res) {
++idx;
results.errored = results.errored || err;
results.last = res;
results.stack.push(res);
if(idx < len) {
process.nextTick(iter);
} else {
finalCb.call(this, results);
}
})) {
idx = len;
finalCb.call(this, results);
}
}
iter();
};
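// For example, serially stat two files (illustrative only):
//
// asyncEach(function(file, idx, res, next) {
//     fs.stat(file, next);
// }, function(results) {
//     console.log(results.errored, results.stack);
// }, ["a.log", "b.log"]);
//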
// ##asyncParEach
//
// Non-blocking *parallel* execution of an array of methods.
//
// @param {Function} fn The iterator method.
// @param {Function} [finalCb] A method to call when the stack is cleared. Passed the
// result object.
// @param {Array} [targ] The array to iterate through. Defaults to an empty array.
//
var asyncParEach = function(fn, finalCb, targ) {
targ = targ || [];
finalCb = finalCb || function(){};
var results = {
errored : false,
last : null,
stack : []
};
var len = targ.length;
var idx = 0;
var cnt = 0;
// Nothing to iterate; fire the final callback immediately.
//
if(!len) {
return finalCb.call(this, results);
}
while(idx < len) {
fn.call(this, targ[idx], idx, results, function(err, res, ridx) {
results.errored = results.errored || err;
results.last = res;
if(ridx !== void 0) {
results.stack[ridx] = res;
} else {
results.stack.push(res);
}
++cnt;
if(cnt === len) {
finalCb.call(this, results);
}
});
++idx;
}
};
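// For example, read two files in parallel, with results ordered by index
// (illustrative only):
//
// asyncParEach(function(file, idx, res, next) {
//     fs.readFile(file, function(err, buf) {
//         next(err, buf, idx);
//     });
// }, function(results) {
//     console.log(results.stack); // [<contents of a.log>, <contents of b.log>]
// }, ["a.log", "b.log"]);
//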
// ##writeVerboseResults
//
// Called if the `-v` or `-o` option is set.
//
var writeVerboseResults = function(data, out) {
var timeFormat = "MMMM Do YYYY, h:mm:ss a";
var i;
var seconds = (data.end - data.start) / 1000;
var str = "";
// Show results... eventually write to a data file and/or create an html page.
//
str += "\
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++`\
+ FILE: @blue" + filename + "@@\tCPUS: @red" + require('os').cpus().length + "@@ +`\
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++`\
+++++++++++++++++++++++++++++++++++++@yellow@_blackStats@@++++++++++++++++++++++++++++++++++++++`\
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++`\
@greenStart Time: " + moment(data.start).format(timeFormat) + "`\
End Time: " + moment(data.end).format(timeFormat) + "`\
Total Seconds: " + seconds + "`\
Total Datapoints: " + data.totalPoints + "`\
Throughput: " + (data.totalPoints / seconds).toFixed(3) + "/second`\
Outliers under (" + rmin + "): " + data.outliers.under + " (%" + (data.outliers.under / data.totalPoints * 100).toFixed(3) + ")`\
Outliers over (" + rmax + "): " + data.outliers.over + " (%" + (data.outliers.over / data.totalPoints * 100).toFixed(3) + ")`\
@@++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++`\
++++++++++++++++++++++@yellow@_blackDistribution (Milliseconds : Count)@@+++++++++++++++++++++++`\
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++`";
for(i=rmin; i <= rmax; i++) {
str += "+ " + i + " :\t" + padRight(" ", 10, data.range[i]) + "\t@black@_cyan(%" + (100/data.totalPoints*data.range[i]).toFixed(3) + ")@@`";
}
str += "\
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++`\
+++++++++++++++++++++++++++++++++@yellow@_blackPercentiles@@++++++++++++++++++++++++++++++++++++`\
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++`";
for(i=rmin; i <= rmax; i++) {
str += "+ " + i + " :\t" + padRight(" ", 10, data.percentiles[i]) + "\t@black@_cyan(" + (100.000 - data.percentiles[i]).toFixed(3) + ")@@`";
}
//////////////////////////////////////////////////////////////////////////////////////////////////
// //
// Generate display of custom processor results //
// //
//////////////////////////////////////////////////////////////////////////////////////////////////
columnProcessors.forEach(function(cp) {
if(cp.writer) {
str += cp.writer.call({
columnIndex : cp.colIdx,
columnData : columnData[cp.colIdx]
});
}
});
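// A minimal writer body sketch (responses.writer.js, hypothetical): the body runs
// with `this.columnIndex` and `this.columnData` in scope and returns a string to
// append to the report:
//
//   return "+ Column " + columnIndex + " total: " + columnData + "`";
//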
str += "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++`";
// Sanity check. If line count for the file (`wc -l`) matches # of datapoints, the reduction
// is valid.
//
exec("wc -l " + filename, function(e, r) {
str += "+ Sanity check (should match) -> line count: @black@_white" + parseInt(r.replace(filename, "")) + "<>" + data.totalPoints + " @@:datapoints`";
str += "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++`";
if(out) {
str = clio.detokenize(str);
}
(out || clio).write(str);
});
};
// Create a temp directory, split the log file into smaller files within it, and
// fetch the resulting file list. The terminal function then creates workers.
//
asyncEach(function(f, idx, res, next) {
f(res, next);
}, function(r) {
if(r.errored) {
throw new Error("Unable to read shard directory");
}
// The last action was a request for the shard directory listing.
// Sorted, as listings are not necessarily so.
//
var files = r.last.sort();
var colProc = clio.get("-cp");
// If there are column processors, push processor data onto #columnProcessors.
//
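// Each `-cp` value takes the form <columnIndex>:<filePrefix>. For example,
// `-cp 2:responses` (a hypothetical prefix) maps column #2 through
// responses.mapper.js and, if present, responses.reducer.js and responses.writer.js.
//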
(colProc ? util.isArray(colProc) ? colProc : [colProc] : []).forEach(function(f) {
var cs = f.split(":");
var v = {
colIdx : parseInt(cs[0], 10),
mapFile : cs[1] + ".mapper.js"
};
// Reducers and writers are optional; a missing file is simply skipped.
//
try {
v.reducer = fs.readFileSync(cs[1] + ".reducer.js", "utf8");
} catch(e) {}
v.reducer = v.reducer ? functionInstance(v.reducer) : void 0;
try {
v.writer = fs.readFileSync(cs[1] + ".writer.js", "utf8");
} catch(e) {}
v.writer = v.writer ? functionInstance(v.writer) : void 0;
columnProcessors.push(v);
});
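// A minimal reducer body sketch (responses.reducer.js, hypothetical): the body
// runs with `this.input` (one worker's accumulated value for the column) and
// `this.output` (the running reduction) in scope, and returns the new reduction:
//
//   return (output || 0) + input;
//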
// For each file create a worker and send it the file to work on; when the worker
// finishes, pass its results along for aggregation and analysis.
//
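// A sketch of the worker contract (bin/worker.js; internals assumed, not shown in
// this file): the worker receives the message sent below and must reply with a
// result object shaped as documented in the final callback:
//
//   process.on('message', function(m) {
//       // ...read m.filename, bucket column #1 values into range[m.rmin..m.rmax],
//       // count outliers, track min/max timestamps, run m.columnProcessors mappers...
//       process.send({
//           range : range,
//           outliers : outliers,
//           start : start,
//           end : end,
//           accumulatedColumns : accumulatedColumns
//       });
//   });
//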
asyncParEach(function(file, idx, res, next) {
var w = child.fork('bin/worker');
w.send({
filename : normalizedFilename + '/' + file,
rmin : rmin,
rmax : rmax,
columnProcessors : columnProcessors
});
w.on('message', function(m) {
next(null, m, idx);
});
}, function(rs) {
// #rs is a stack of such objects:
//
// #range is a frequency array, indexed by values from column #1 of the log.
// Below, the value `2` appeared `90344` times.
// #outliers #under === number below #rmin; #over === number above #rmax.
// #start & #end === min, max timestamp in log file (Column #0)
// #accumulatedColumns : If a column processor exists for Column #2, you would
// see the values below, where [object] === the output of processor#reducer file.
//
// { range : [
// 123,
// 0,
// 90344
// ],
// outliers : {under: 0, over: 135},
// start : timestamp,
// end : timestamp,
// accumulatedColumns : [null, null, [object], null]
// }
//
// @see bin/worker.js
//
var data = rs.stack;
// Remove the temp shard folder (rimraf, as the folder is non-empty).
//
rmdir(normalizedFilename, function(err) {
if(err) {
throw err;
}
var range = [];
var outliers = {
over : 0,
under : 0
};
var percentiles = [];
var start = Infinity;
var end = -Infinity;
var r = data.length;
var pc = 0;
var x = rmax;
var totalPoints = 0;
var rr;
var oo;
var aa;
var i;
var rv;
// Initialize the final #range with zeros.
//
do {
range[x] = 0;
--x;
} while(x >= rmin);
// Run through each worker result, accumulating the data set's start time, end
// time, outlier counts, and per-bucket totals.
while(r--) {
rr = data[r].range;
oo = data[r].outliers;
aa = data[r].accumulatedColumns || [];
outliers.over += oo.over;
outliers.under += oo.under;
start = Math.min(start, data[r].start);
end = Math.max(end, data[r].end);
for(i=rmin; i <= rmax; i++) {
range[i] += rr[i];
totalPoints += rr[i];
}
// For each of the columns which have a processor, send the column
// reducer the current column value, and set #columnData to the result.
//
columnProcessors.forEach(function(m) {
if(m.reducer) {
columnData[m.colIdx] = m.reducer.call({
input : aa[m.colIdx],
output : columnData[m.colIdx]
});
} else {
columnData[m.colIdx] = columnData[m.colIdx] || [];
columnData[m.colIdx].push(aa[m.colIdx]);
}
});
}
// Outliers form part of total datapoint set.
//
totalPoints += outliers.over + outliers.under;
// Calculate percentile rank.
//
// ((scores lower than candidate) + (scores equal to candidate)) / (total scores) * 100
//
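// e.g. with range counts [10, 20, 70] across 100 total points, the cumulative
// percentile ranks are 10.000, 30.000 and 100.000.
//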
for(i=rmin; i <= rmax; i++) {
rv = range[i] === void 0 ? 0 : parseInt(range[i]);
percentiles[i] = ((pc + rv) / totalPoints * 100).toFixed(3);
pc += rv;
}
var fileout = clio.get("-o");
if(clio.get("-v") || fileout) {
writeVerboseResults({
range : range,
percentiles : percentiles,
outliers : outliers,
totalPoints : totalPoints,
start : start,
end : end
}, fileout);
}
});
}, files);
}, [
// These methods run serially
//
function(res, next) {
rmdir(normalizedFilename, function(err) {
if(err) {
return next(err);
}
fs.mkdir(normalizedFilename, "0777", next);
});
},
function(res, next) {
if(res.errored) {
throw new Error("Unable to mkdir");
}
// Splits #filename(file) every #splitOnLines lines and writes the shards
// into the #normalizedFilename folder. split names shards alphabetically
// (aa, ab, ac, ...), hence the sort applied to the listing above.
//
exec("split -l " + splitOnLines + " " + filename + " " + normalizedFilename + "/", next);
},
function(res, next) {
if(res.errored) {
throw new Error("Unable to create shards");
}
// Fetch the shard directory listing into a structure that we can iterate through.
//
fs.readdir(normalizedFilename, next);
}
]);