synopsize
Version:
Print out a synopsis of values from a stream of newline-delimited JSON objects
163 lines (162 loc) • 6.01 kB
JavaScript
;
Object.defineProperty(exports, "__esModule", { value: true });
const optimist = require("optimist");
const tarry_1 = require("tarry");
const json_1 = require("streaming/json");
const sv_1 = require("@chbrown/sv");
const index_1 = require("../index");
function exit(error) {
if (error) {
console.error(`ERROR: ${error.toString()}`);
process.exit(1);
}
console.error('DONE');
process.exit(0);
}
function printCounts(counts, sampleLength, showCount = true) {
const uniqueValues = Array.from(counts.keys());
// if there aren't many unique values, print them all
const isSample = uniqueValues.length > sampleLength;
const values = isSample ? (0, tarry_1.sample)(uniqueValues, sampleLength) : uniqueValues;
if (isSample) {
console.log(` ${sampleLength} random examples:`);
}
for (let value of values) {
if (showCount) {
const count = counts.get(value);
console.log(` ${value}: ${count}`);
}
else {
console.log(` ${value}`);
}
}
}
function printSynopsis(synopsis, sampleLength) {
const { typeName, values, nonEmptyValues, minimum, maximum, counts } = synopsis;
console.log(` Type: ${typeName}`);
// print totals summary
const hasEmptyValues = nonEmptyValues.length < values.length;
if (hasEmptyValues) {
const totalEmptyValues = values.length - nonEmptyValues.length;
const emptyValuesRatio = totalEmptyValues / values.length;
console.log(` ${totalEmptyValues} missing values (out of ${values.length} total), or ${(emptyValuesRatio * 100).toFixed(2)}%`);
}
else {
console.log(` No missing values (${values.length} total)`);
}
const valueRef = hasEmptyValues ? '(non-null) value' : 'value';
// print values / counts summary
// We want to avoid printing too many values -- that's not a synopsis.
// But we don't want to assume that all numeric values are continuous,
// or to show _only_ aggregate statistics.
if (nonEmptyValues.length === 0) {
// redundant (remove?)
console.log(' No values to show');
}
else if (counts.size === 1) {
console.log(` There is only one unique ${valueRef}: ${minimum}`);
}
else if (counts.size === nonEmptyValues.length) {
console.log(` All ${valueRef}s are unique and range from ${minimum} to ${maximum}`);
printCounts(counts, sampleLength, false);
}
else {
// the values aren't all unique
console.log(` There are ${counts.size} unique ${valueRef}s, which range from ${minimum} to ${maximum}`);
printCounts(counts, sampleLength);
}
}
function consumeStream(inputStream, callback) {
inputStream.once('readable', () => {
const initialBytes = inputStream.read(100);
inputStream.unshift(initialBytes);
if (initialBytes.toString().match(/^\s*\{/)) {
// it's JSON
const keySet = new Set();
const records = [];
const parser = inputStream.pipe(new json_1.Parser());
parser.on('error', error => callback(error))
.on('data', record => {
Object.keys(record).forEach(key => keySet.add(key));
records.push(record);
})
.on('end', () => {
const columns = [...keySet].map(name => {
const values = records.map(record => {
// use '' as the value for missing values for parity with SV
if (record[name] === undefined || record[name] === null) {
record[name] = '';
}
return String(record[name]);
});
return { name, values };
});
callback(null, columns);
});
}
else {
// it's CSV/TSV
const records = [];
// keep reference to parser available so that we have access to its inferences
const parser = inputStream.pipe(new sv_1.Parser());
parser.on('error', error => callback(error))
.on('data', record => records.push(record))
.on('end', () => {
// TODO: customize sv.Parser so that we can get out string[] rows if we want
const columns = parser.config.columns.map(name => {
return { name, values: records.map(record => record[name]) };
});
callback(null, columns);
});
}
});
}
function main() {
const argvparser = optimist
.usage([
'Usage: synopsize <my_data.csv',
' synopsize <apiRslt.json',
].join('\n'))
.options({
help: {
alias: 'h',
describe: 'print this help message',
type: 'boolean',
},
version: {
describe: 'print version',
type: 'boolean',
},
sample: {
describe: 'maximum number of example values to show for each column',
type: 'number',
default: 10,
},
});
const argv = argvparser.argv;
const sampleLength = argv.sample;
if (argv.help) {
argvparser.showHelp();
}
else if (argv.version) {
console.log(require('../package').version);
}
else if (process.stdin['isTTY']) {
exit(new Error('Data must be piped in on STDIN'));
}
else {
consumeStream(process.stdin, (error, columns) => {
if (error)
exit(error);
columns.forEach((column, index) => {
console.log(`[${index}] "${column.name}"`);
const synopsis = (0, index_1.synopsize)(column.values);
printSynopsis(synopsis, sampleLength);
});
});
}
}
if (require.main === module) {
main();
}