dtl-js
Version:
Data Transformation Language - JSON templates and data transformation
874 lines (807 loc) • 31.8 kB
JavaScript
/* =================================================
* Copyright (c) 2015-2023 Jay Kuri
*
* This file is part of DTL.
*
* DTL is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* DTL is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with DTL; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
* =================================================
*/
const { version } = require('../../../package.json');
const program = require('commander');
let DTL = require('../lib/DTL.js');
const util = require('util');
const JSON5 = require('json5');
const YAML = require('yaml');
const fs = require('fs');
const stream = require('stream');
const jsonlines = require('jsonlines');
const csv_parse = require('csv-parse');
const csv_stringify = require('csv-stringify');
const colorize = require('json-colorizer');
const prettyoutput = require('prettyoutput');
const readline = require('readline');
function list(str) {
return str.split(/,\s?/);
}
let json_stringify = function(obj, indent) {
return JSON.stringify(obj, undefined, indent);
}
let json_parse = function(str) {
return JSON5.parse(str);
}
// five input modes:
// 1: input data is single json
// 2: input data is json stream (one object per line)
// 3: input data is csv - array of arrays
// 4: input data as csv - array of objects (fields taken from first line)
//
// two processing options
// 1: process input as one item into transform
// 2: process input as array of single items, one item at a time into the transform
//
//
//
function output_help() {
console.log('Usage: dtl [options] inputfile');
console.log('');
console.log(" -e --execute '$.' - provide transform on command line");
console.log(' -f --transform-file - Load transform from file');
console.log(' -n --transform - name of transform to use (default to "out")');
console.log(' -a --apply_to_array - Apply transform directly to each item in input');
console.log(' (rather than input as a whole)');
console.log('');
console.log(' -p --pretty [n] - Produce pretty json output (indented by n spaces)');
console.log(' -c --condensed - Produce condensed json output');
console.log(' -N --preserve-undefined - Preserve undefined as null in json output');
console.log(' --init <init_file> - Initialize DTL with the contents of init_file');
console.log('');
console.log(' -m --read-mode <mode> - read mode');
console.log(' - line - process line-by-line (default for csv)');
console.log(' - all - process data as one large data struct');
console.log('');
console.log(' -d --delim <char> - Output delimiter character (for csv and unix)');
console.log(' -id --idelim <char> - Input delimiter character (for csv and unix)');
console.log('');
console.log(' -o --output-file <file> - place output into file instead of stdout');
console.log(' -I --input-type - input file type');
console.log(' - auto - try to auto-determine file type (default)');
console.log(' - csv - input file is csv');
console.log(' - csv-objects - input is csv with column headers, turn each row');
console.log(' into an object');
console.log(' - json - input file is json');
console.log(' - json-lines - input file is multiple json records, one to a line.');
console.log(' - yaml - input file is yaml');
console.log(' - unix - input data is whitespace separated plaintext');
console.log(' like standard unix cli tools');
console.log('');
console.log(' -O --output-type - type of output to produce');
console.log(' - auto - default - produce the same type of output as input.');
console.log(' - csv - produce csv data - expects result of transform to be ');
console.log(' an array of objects or an array of arrays');
console.log(' - csv-objects - produce csv data with column headers based on objects');
console.log(' - json - produce json');
console.log(' - json-lines - produce multiple json, one to a line');
console.log(' - yaml - produce yaml');
console.log(' - unix - produce space separated values');
console.log('');
console.log(' -u --unix - take unix-style plaintext as input, produce plaintext');
console.log(' unix mode is the default if taking input on stdin');
console.log('');
console.log(' -C --csv-columns - When producing CSV data, use the provided columns and');
console.log(' order to produce the CSV. Will output a header with');
console.log(' comma-separated columns provided. If the results are');
console.log(' objects, the columns will be filled with data from the');
console.log(' matching keys in the output objects. If the results');
console.log(' are arrays, the columns are assumed to be correct and');
console.log(' this serves only to create the header line.');
console.log('');
console.log(' -S --skipheader - When outputting CSV, do not output a header line');
console.log('');
console.log(' -s --strict-json - When reading JSON files, be strict. By default dtl');
console.log(' parses JSON files with JSON5. This flag forces parsing');
console.log(' with vanilla JSON.');
console.log('');
console.log(' -V --version - Show DTL version and exit');
console.log('');
}
function output_version() {
// get the version out of our package.json
// const packageJSON = JSON.parse(fs.readFileSync('../../../package.json', 'utf8'));
//console.log(util.inspect(packageJSON));
console.log(version);
}
program.option('-e --execute <transform>', 'Use transform from command line instead of from file');
program.option('-f --transform-file <transform_file>', 'Load transforms from file');
program.option('-n --transform <transform_name>', 'name of transform to use, default is to use "out" transform');
program.option('-N --preserve-undefined', 'preserve undefined as null in JSON output');
program.option('-m --read-mode <read_mode>', 'Processing mode: "line" = line at a time, "all" = all lines at once', /^(line|all)$/);
program.option('-o --output-file <output_file>', 'Place output in a file instead of stdout', 'stdout');
program.option('--init --init-file <init_file>', 'Initialize DTL with the contents of init_file');
program.option('-C --csv-columns <columns>', 'Comma separated list of columns to use for output', list);
program.option('-d --delim <output_delim>', 'Output csv delimiter character');
program.option('-id --idelim <input_delim>', 'Input csv delimiter character');
program.option('-I --input-type <input_type>', 'Input file type, default is auto', /^(auto|csv|csv\-objects|json|json\-lines|yaml|unix)$/);
program.option('-O --output-type <output_type>', 'Output file type to produce', /^(auto|csv|csv\-objects|json|json\-lines|yaml|unix)$/);
program.option('-u --unix', 'Take unix-style text data as input');
program.option('-p --pretty [spaces]', 'Pretty-print json (spaces = number of spaces for each indentation, default 4)');
program.option('-c --condensed', 'Produce condensed json output (good for piping into other programs)');
program.option('-i --interactive', 'Operate in interactive mode');
program.option('-a --apply_to_array', 'Apply transform directly to each item in input');
program.option('-s --strict-json', 'Be strict about JSON (do not use JSON5 for parsing)');
program.option('-v --verbose', 'Be verbose');
program.option('-h --get-help', 'Get Help');
program.option('-S --skipheader', 'When outputting CSV, do not output a header line');
program.option('-V --version', 'Show DTL version');
// unfortunately, this is the only way to override --version in commander.
program.on('option:version', function() {
output_version();
process.exit();
});
let files = program.parseOptions(process.argv).args.slice(2);
if (program.getHelp) {
output_help();
process.exit();
}
const identity_transform = { "out": "(: $. :)" };
let options = {
mode: "file",
read_mode: "line",
input_type: "auto",
output_type: "auto",
output_mode: "all_at_once",
output_file: "stdout",
transform: identity_transform,
preserve_undefined: false,
output_delimiter: ",",
input_delimiter: ",",
pretty: 0,
add_header: true,
color: false,
condensed: true,
verbose: false,
transform_name: "out"
}
let matches, extension, filename;
if (typeof program.interactive != 'undefined') {
options.mode = 'REPL';
}
if (typeof program.verbose != 'undefined') {
options.verbose = true;
}
if (process.stdout.isTTY) {
options.color = true;
options.pretty = 4;
options.condensed = false;
}
if (!program.condensed) {
if (typeof program.pretty != 'undefined') {
if (typeof program.pretty != 'boolean') {
options.pretty = parseInt(program.pretty);
} else {
options.pretty = 4;
}
}
options.condensed = false;
} else {
options.pretty = 0;
options.color = false;
options.condensed = true;
}
if (typeof program.csvColumns != 'undefined') {
options.columns = program.csvColumns;
}
if (program.skipheader) {
options.add_header = false;
}
if (typeof program.strictJson != 'undefined') {
json_parse = function(str) {
return JSON.parse(str);
};
}
if (typeof program.preserveUndefined != 'undefined') {
// TODO - allow auto columns ?
options.preserve_undefined = true;
let replacer = function(k, v) {
if (v === undefined) {
return null;
} else {
return v;
}
}
json_stringify = function(obj, indent) {
return JSON.stringify(obj, replacer, indent);
};
}
if (program.unix) {
options.input_type = 'unix';
options.output_type = 'unix';
}
if (typeof program.outputType != 'undefined') {
options.output_type = program.outputType;
}
if (typeof program.inputType != 'undefined') {
options.input_type = program.inputType;
}
if (typeof program.outputFile != 'undefined') {
options.output_file = program.outputFile;
}
if (typeof program.execute != 'undefined') {
if (/^{.*}\s*$/.test(program.execute)) {
options.transform = json_parse(program.execute);
} else if (/^\(:.*:\)$/.test(program.execute)) {
options.transform = { out: program.execute };
} else {
options.transform = { out: "(: " + program.execute + " :)" };
}
}
if (typeof program.initFile != 'undefined') {
// transform should be a filename. Try to load file.
try {
let resolvedPath = path.resolve(program.initFile);
const initializer = require(resolvedPath);
DTL = initializer(DTL);
} catch (e) {
console.error('Unable to load '+ program.initFIle + ': ', e.message);
process.exit(1);
}
}
if (typeof program.transformFile != 'undefined') {
// transform should be a filename. Try to load file.
try {
options.transform = json_parse(fs.readFileSync(program.transformFile));
//console.log(options.transform);
} catch (e) {
console.error('Unable to load '+ program.transformFile + ': ', e.message);
process.exit(1);
}
}
if (typeof program.transform != 'undefined') {
options.transform_name = program.transform;
}
if (typeof options.transform[options.transform_name] == 'undefined') {
console.error('Unable to find transform named '+ options.transform_name + ' in provided transform');
process.exit(1);
}
if (program.apply_to_array) {
let txname = DTL.apply({}, { "out": "(: &('tx_' uuid()) :)"});
options.transform[txname] = "(: map($. '(: $item -> `" + options.transform_name + "` :)') :)"
options.transform_name = txname;
}
if (typeof files[0] == 'string') {
// console.log('setting filename to: ', files[0]);
filename = files[0];
} else if (typeof files[0] == 'undefined') {
filename = '-';
}
if (filename !== '-' ) {
try {
fs.accessSync(filename, fs.constants.R_OK);
} catch (e) {
console.error("Unable to access " + filename +":", e.message);
process.exit(2);
}
} else {
if (options.input_type == 'auto') {
options.input_type = 'unix';
options.output_type = 'unix';
}
}
// figure out the input type, as that determines our mode.
// TODO: turn this into a function
//
if (options.input_type == 'auto') {
matches = /.*\.([^.]+)$/.exec(filename);
if (matches !== null) {
extension = matches[1];
switch(extension) {
case 'json':
options.input_type = 'json';
options.read_mode = 'all';
break;
case 'jsonl':
case 'jsonlines':
options.input_type = 'json-lines';
options.read_mode = 'line';
break;
case 'csv':
options.input_type = 'csv-objects';
break;
case 'tsv':
options.input_type = 'csv-objects';
options.input_delimiter = '\t';
break;
case 'yaml':
options.input_type = 'yaml';
break;
case 'unix':
options.input_type = 'unix';
options.read_mode = 'line';
break;
}
}
// if we are here, and input_type is still auto, we failed to guess the file type
if (options.input_type == 'auto') {
options.input_type = 'json';
options.read_mode = 'all';
options.file_type_guessed = true;
// console.error('Unable to determine input file type, please provide the appropriate -I flag');
// process.exit(3);
}
}
if (options.input_type == 'unix') {
options.read_mode = 'line';
if (typeof program.idelim == 'undefined') {
options.input_delimiter = /[\s\t]+/;
}
}
if (program.readMode) {
if (program.readMode == 'line') {
options.read_mode = 'line';
} else {
options.read_mode = 'all';
}
}
if (options.output_type == 'unix') {
options.output_mode = 'line_at_a_time';
if (typeof program.delim == 'undefined') {
options.output_delimiter = " ";
}
if (options.input_type == 'unix' && options.transform == identity_transform) {
// our input type is unix text and the transform has not been set by any args
// so we set it to output the input unchanged.
options.transform = { out: "(: $0 :)" };
}
}
if (options.output_type == 'auto') {
matches = /.*\.([^.]+)$/.exec(options.output_file);
if (matches !== null) {
extension = matches[1];
switch(extension) {
case 'json':
options.output_type = 'json';
break;
case 'jsonl':
case 'jsonlines':
options.output_type = 'json-lines';
break;
case 'csv':
options.output_type = 'csv';
break;
case 'tsv':
options.output_type = 'csv';
options.output_delimiter = '\t';
break;
case 'yaml':
options.output_type = 'yaml';
break;
}
}
if (options.output_type == 'auto') {
// we failed to match anything. Default to json output
options.output_type = 'json'
}
}
if (options.output_type == 'json-lines' && typeof program.pretty == 'undefined') {
console.error("pretty", program.pretty)
// pretty must be disabled for json-lines to be properly formatted
// so we disable pretty unless it's been explicitly set
options.pretty = 0;
}
// allow manually specified delimiters to override the defaults
if (typeof program.delim != 'undefined') {
//console.log('delim', util.inspect(program.delim));
options.output_delimiter = program.delim;
}
if (typeof program.idelim != 'undefined') {
//console.log('idelim', util.inspect(program.idelim));
options.input_delimiter = program.idelim;
}
if (options.output_type == 'json-lines' || (options.output_type == 'csv' && Array.isArray(options.columns)) ) {
options.output_mode = 'line_at_a_time';
}
// at this point, we know where our data is coming from, we've failed if we don't
// understand our options or can't access our files... so we proceed into actually
// setting up the data handling
// inputTransform is the transform that processes input from the
// file or stdin.
//
// DTLTransform is the transform that actually runs the DTL
//
// outputTransform is the thing that handles the final output
//
// raw data gets piped into inputTransform, and that is piped into
// DTLTransform
//
// DTLTransform is then piped into the outputTransform
let DTLTransform = new stream.Transform({
readableObjectMode: true,
writableObjectMode: true,
transform: function(input_data, encoding, callback) {
//console.log('DTL Transform', util.inspect(input_data, { depth: null}));
// //console.log(util.inspect(options.transform, { depth: null}));
let result = DTL.apply(input_data, options.transform, options.transform_name);
//console.log('DTL result', util.inspect(result, { depth: null}));
if(typeof this.results == 'undefined') {
this.results = [];
}
this.push(result);
callback();
}
});
// inputTransform is where the raw file data goes
// the code below decides what processing happens
let inputTransform, outputTransform;
// now we set up the file handling
if (options.input_type == 'json-lines') {
// simplest case first
// console.log('processing input as json-lines');
inputTransform = jsonlines.parse();
inputTransform.pipe(DTLTransform);
} else if (options.input_type == 'unix') {
//console.log('unix input');
// we have a unix stream. Need to process it with readline
inputTransform = new stream.Transform({
readableObjectMode: true,
writableObjectMode: false,
transform: function(chunk, encoding, callback) {
this.push(chunk);
callback();
}
});
let readlineTransform = readline.createInterface({
input: inputTransform,
//output: DTLTransform
});
readlineTransform.on('line', function(line) {
// split the line on the input delimeter (by default whitespace)
let data = line.split(options.input_delimiter);
// add entire line as $0
data.unshift(line);
DTLTransform.write(data);
});
readlineTransform.on('close', function() {
//console.log('ENDING');
DTLTransform.end();
});
} else if (options.input_type == 'json') {
// we have a single json, but we still need to emit a data event into our
// transform - so we setup inputTransform to collect the data entirely
// and then push it to the next thing in the chain.
inputTransform = new stream.Transform({
readableObjectMode: true,
writableObjectMode: false,
transform: function(chunk, encoding, callback) {
if(typeof this.stream_data == 'undefined') {
this.stream_data = Buffer.from(chunk);
} else {
this.stream_data = Buffer.concat([this.stream_data, chunk]);
}
callback();
},
flush: function(callback) {
let result;
try {
result = json_parse(this.stream_data.toString('utf8'));
if (options.read_mode == 'line' && Array.isArray(result)) {
result.forEach(function(item) {
this.push(item)
}.bind(this));
} else {
this.push(result);
}
} catch(e) {
let file_accessed = filename;
if (file_accessed == '-') {
file_accesses = 'standard input';
}
if (options.file_type_guessed) {
console.error('Unable to parse ' + file_accessed + ' as JSON:\n');
console.error(e.message);
console.error('\nIf it is not JSON data, use the -I flag to set the input file type.');
process.exit(2);
} else {
console.error('JSON parse error on ' + file_accessed);
console.error(e.message);
process.exit(2);
}
}
callback();
}
});
inputTransform.pipe(DTLTransform);
} else if (options.input_type == 'csv' || options.input_type == 'csv-objects') {
// csv processing is a bit more complicated - we have two modes
// line-by-line and file-at-once... and these require different handling.
let csv_parser;
let csv_options = {
trim: true,
delimiter: options.input_delimiter
};
if (options.input_type == 'csv-objects') {
csv_options.columns = true;
}
if (options.read_mode == 'line') {
// set up csv transform - straight from csv-parse - no problem.
inputTransform = csv_parse(csv_options);
inputTransform.on('error', function(err){
console.log('csv Processing Error:', err.message);
});
inputTransform.pipe(DTLTransform);
} else {
// file-at-once requires a transform like file-at-once json
inputTransform = csv_parse(csv_options);
inputTransform.records = [];
csv_parser = new stream.Transform({
readableObjectMode: true,
writableObjectMode: true,
transform: function(record, encoding, callback) {
if(typeof this.records == 'undefined') {
this.records = [];
}
this.records.push(record);
callback();
},
flush: function(callback) {
this.push(this.records);
callback();
}
});
inputTransform.on('error', function(err){
console.log('csv Processing Error:', err.message);
});
inputTransform.pipe(csv_parser).pipe(DTLTransform);
}
} else if (options.input_type == 'yaml') {
// we have a single json, but we still need to emit a data event into our
// transform - so we setup inputTransform to collect the data entirely
// and then push it to the next thing in the chain.
inputTransform = new stream.Transform({
readableObjectMode: true,
writableObjectMode: false,
transform: function(chunk, encoding, callback) {
if(typeof this.stream_data == 'undefined') {
this.stream_data = Buffer.from(chunk);
} else {
this.stream_data = Buffer.concat([this.stream_data, chunk]);
}
callback();
},
flush: function(callback) {
let result;
try {
result = YAML.parse(this.stream_data.toString('utf8'));
this.push(result);
} catch(e) {
console.error('yaml data failed to parse!!: ', e.message);
}
callback();
}
});
inputTransform.pipe(DTLTransform);
}
// output handling.
// Possibilities:
// 1) Output JSON
// 2) Output JSONLines
// 3) Output CSV
//
// Output modes:
// 1) one item at a time - less memory, faster.
// 2) All at once - more memory, slower.
// What determines which one we use is our output type, plus, in the case of CSV, whether
// we have the columns predefined.
// 1) Output is JSON - All at once mode.
// 2) Output is JSONLines - One item at a time mode.
// 3) Output is CSV - no columns provided - All at once mode.
// 4) Output is CSV - columns provided - One item at a time mode.
// for now, we always output JSON data
let outputStream;
let collector;
// process:
// line_at_a_time mode - Take a single piece of data and send it
// to the output element which will handle it.
//
// all_at_once - take the data and add it to the collected data
// Then, when no more data is available, send it to the output
// element.
//
//
if (options.output_mode == 'all_at_once') {
// define a collector
collector = new stream.Transform({
readableObjectMode: true,
writableObjectMode: true,
transform: function(record, encoding, callback) {
if(typeof this.records == 'undefined') {
this.records = [];
}
this.records.push(record);
callback();
},
flush: function(callback) {
if (Array.isArray(this.records)) {
if (this.records.length > 1) {
this.push(this.records);
} else {
this.push(this.records[0]);
}
} else {
this.push([])
}
callback();
}
});
DTLTransform.pipe(collector);
} else {
collector = DTLTransform;
}
if (options.output_type == 'csv' || options.output_type == 'csv-objects') {
// depending on whether we have columns or not, we may get records one-line at a time
// or all at once.
let extractor = collector;
let csv_options = {
delimiter: options.output_delimiter,
header: !!options.add_header
};
if (Array.isArray(options.columns)) {
csv_options.columns = options.columns;
} else {
// if we don't have columns, we can't output a header
// this way.
csv_options.header = false;
}
if (options.output_mode == 'all_at_once') {
// if we are all at once, we likely need to figure out our columns from the
// input data.
let get_columns = {
"out": "(: sort(reduce($. 'get_keys' [] )) :)",
"get_keys": "(: union($memo keys(flatten($item))) :)"
};
let flatten = {
"out": "(: flatten($.) :)"
};
extractor = new stream.Transform({
readableObjectMode: true,
writableObjectMode: true,
transform: function(obj, encoding, callback) {
let object = obj;
let columns;
// We need an array of data to process.
// If we didn't get an array, put it in an array,
// so we can process it correctly.
if (!Array.isArray(obj)) {
object = [ obj ];
}
if (Array.isArray(options.columns)) {
columns = options.columns;
} else {
columns = DTL.apply(object, get_columns);
}
if (options.add_header) {
this.push(columns);
}
for (let i = 0, len = object.length; i < len; i++) {
let output = object[i];
if (!Array.isArray(object[i])) {
output = [];
let flattened_obj = DTL.apply(object[i], flatten);
columns.forEach(function(key) {
output.push(flattened_obj[key]);
});
}
this.push(output);
}
callback();
}
});
outputStream = csv_stringify(csv_options);
extractor.pipe(outputStream);
collector.pipe(extractor);
} else {
csv_options.columns = options.columns;
outputStream = csv_stringify(csv_options);
collector.pipe(outputStream);
}
} else if (options.output_type == 'yaml') {
// this handles yaml
outputStream = new stream.Transform({
writableObjectMode: true,
transform: function(object, encoding, callback) {
let yaml_out;
if (options.pretty && options.output_file == 'stdout') {
yaml_out = prettyoutput(object, {maxDepth: 1000}, 2);
} else {
yaml_out = YAML.stringify(object);
}
this.push(yaml_out + "\n");
callback();
}
});
collector.pipe(outputStream);
} else if (options.output_type == 'unix') {
// this handles unix textual output
let extractor;
let object_to_plaintext_tx = {
"out": "(: map(flatten($.) '(: &($index `:` $item) :)') :)"
};
outputStream = new stream.Transform({
writableObjectMode: true,
transform: function(data, encoding, callback) {
let text_out;
if (Array.isArray(data)) {
text_out = data.join(options.output_delimiter);
this.push(text_out + "\n");
} else if (typeof data == 'object') {
let new_data = DTL.apply(data, object_to_plaintext_tx);
text_out = new_data.join(options.output_delimiter);
this.push(text_out + "\n");
} else {
this.push(data + "\n");
}
callback();
}
});
if (options.read_mode == 'all') {
extractor = new stream.Transform({
writableObjectMode: true,
readableObjectMode: true,
transform: function(data, encoding, callback) {
if (Array.isArray(data)) {
data.forEach(item => {
this.push(item);
});
} else {
this.push(data);
}
callback();
}
});
collector.pipe(extractor);
extractor.pipe(outputStream);
} else {
collector.pipe(outputStream);
}
} else {
// this handles json
outputStream = new stream.Transform({
writableObjectMode: true,
transform: function(object, encoding, callback) {
let json;
if (options.pretty && options.output_file == 'stdout') {
json = colorize(json_stringify(object, options.pretty));
} else {
json = json_stringify(object, options.pretty);
}
this.push(json + "\n");
callback();
}
});
collector.pipe(outputStream);
}
// write to stdout or to file:
if (options.output_file != 'stdout') {
let fileOut = fs.createWriteStream(options.output_file);
outputStream.pipe(fileOut);
} else {
outputStream.pipe(process.stdout);
}
// now we handle loading the input:
if (filename != '-') {
let readStream = fs.createReadStream(filename);
readStream.pipe(inputTransform);
} else {
process.stdin.pipe(inputTransform);
}