@engine9/packet-tools
Version:
Tools for dealing with Engine9 packets
742 lines (667 loc) • 21.5 kB
JavaScript
/* eslint-disable no-await-in-loop */
const fs = require('node:fs');
const fsp = fs.promises;
const path = require('node:path');
const zlib = require('node:zlib');
const {
Readable, Transform, PassThrough,
} = require('node:stream');
const { pipeline } = require('node:stream/promises');
const { stringify } = require('csv');
const debug = require('debug')('FileWorker');
// const through2 = require('through2');
const csv = require('csv');
const JSON5 = require('json5');// Useful for parsing extended JSON
const languageEncoding = require('detect-file-encoding-and-language');
const S3Worker = require('./S3');
const ParquetWorker = require('./Parquet');
const { streamPacket } = require('./tools');
const { bool, getTempDir } = require('./tools');
function Worker({ accountId }) { this.accountId = accountId; }
class LineReaderTransform extends Transform {
constructor(options = {}) {
super({ ...options, readableObjectMode: true });
this.buffer = '';
}
// eslint-disable-next-line no-underscore-dangle
_transform(chunk, encoding, callback) {
this.buffer += chunk.toString();
const lines = this.buffer.split(/\r?\n/);
this.buffer = lines.pop();
lines.forEach((line) => this.push(line));
callback();
}
// eslint-disable-next-line no-underscore-dangle
_flush(callback) {
if (this.buffer) {
this.push(this.buffer);
}
callback();
}
}
Worker.prototype.csvToObjectTransforms = function (options) {
const transforms = [];
const delimiter = options.delimiter || ',';
const headerMapping = options.headerMapping || function (d) { return d; };
let lastLine = null;
let head = null;
const skipLinesWithError = bool(options.skip_lines_with_error, false);
const parserOptions = {
relax: true,
skip_empty_lines: true,
delimiter,
max_limit_on_data_read: 10000000,
skip_lines_with_error: skipLinesWithError,
};
if (options.skip) parserOptions.from_line = options.skip;
if (options.relax_column_count) parserOptions.relax_column_count = true;
if (options.quote_escape) {
parserOptions.escape = options.quote_escape;
}
debug('Parser options=', parserOptions);
const parser = csv.parse(parserOptions);
parser.on('error', (error) => {
debug('fileToObjectStream: Error parsing csv file');
debug(lastLine);
throw new Error(error);
});
const blankAndHeaderCheck = new Transform({
objectMode: true,
transform(row, enc, cb) {
// Blank rows
if (row.length === 0) return cb();
if (row.length === 1 && !row[0]) return cb();
if (!head) {
head = row.map(headerMapping);
return cb();
}
const o = {};
head.forEach((_h, i) => {
const h = _h.trim();
if (h) {
o[h] = row[i];
}
});
lastLine = row.join(delimiter);
return cb(null, o);
},
});
transforms.push(parser);
transforms.push(blankAndHeaderCheck);
return { transforms };
};
Worker.prototype.detectEncoding = async function (options) {
if (options.encoding_override) return { encoding: options.encoding_override };
// Limit to only the top N bytes -- for perfomance
// Be wary, though, as gzip files may require a certain minimum number of bytes to decompress
const bytes = 64 * 1024;
const buff = Buffer.alloc(bytes);
const fd = await fsp.open(options.filename);
await fd.read(buff, 0, bytes);
let finalBuff = buff;
if (options.filename.slice(-3) === '.gz') {
// This code deals with scenarios where the buffer coming in may not be exactly the gzip
// needed chunk size.
finalBuff = await new Promise((resolve, reject) => {
const bufferBuilder = [];
const decompressStream = zlib.createGunzip()
.on('data', (chunk) => {
bufferBuilder.push(chunk);
}).on('close', () => {
resolve(Buffer.concat(bufferBuilder));
}).on('error', (err) => {
if (err.errno !== -5) {
// EOF: expected
reject(err);
}
});
decompressStream.write(buff);
decompressStream.end();
});
}
return languageEncoding(finalBuff);
};
Worker.prototype.detectEncoding.metadata = {
options: {
filename: { required: true },
},
};
/*
Internal method to transform a file into a stream of objects.
*/
Worker.prototype.fileToObjectStream = async function (options) {
const { filename, columns, limit: limitOption } = options;
// handle stream item
if (options.stream) {
if (Array.isArray(options.stream)) {
return { stream: Readable.from(options.stream) };
}
// probably already a stream
if (typeof options.stream === 'object') return { stream: options.stream };
throw new Error(`Invalid stream type:${typeof options.stream}`);
}
let limit;
if (limitOption) limit = parseInt(limitOption, 10);
if (!filename) throw new Error('fileToObjectStream: filename is required');
let postfix = options.sourcePostfix || filename.toLowerCase().split('.').pop();
if (postfix === 'zip') {
debug('Invalid filename:', { filename });
throw new Error('Cowardly refusing to turn a .zip file into an object stream, turn into a csv first');
}
const streamInfo = await this.stream({
filename,
columns,
limit,
});
const { encoding } = streamInfo;
let { stream } = streamInfo;
if (!stream) throw new Error(`No stream found in fileToObjectStream from filename ${filename}`);
if (encoding === 'object') {
// already an object
return { stream };
}
let count = 0;
debug(`Reading file ${filename} with encoding:`, encoding);
const head = null;
let transforms = [];
if (postfix === 'gz') {
const gunzip = zlib.createGunzip();
transforms.push(gunzip);
gunzip.setEncoding(encoding);
// encoding = null;// Default encoding
postfix = filename.toLowerCase().split('.');
postfix = postfix[postfix.length - 2];
debug(`Using gunzip parser because postfix is .gz, encoding=${encoding}`);
} else {
stream.setEncoding(encoding);
}
if (postfix === 'csv') {
const csvTransforms = this.csvToObjectTransforms({ ...options });
transforms = transforms.concat(csvTransforms.transforms);
} else if (postfix === 'txt') {
const csvTransforms = this.csvToObjectTransforms({ ...options, delimiter: '\t' });
transforms = transforms.concat(csvTransforms.transforms);
} else if (postfix === 'jsonl') {
/* Type of JSON that has the names in an array in the first record,
and the values in JSON arrays thereafter
*/
let headers = null;
const lineReader = new LineReaderTransform();
const jsonlTransform = new Transform({
objectMode: true,
transform(d, enc, cb) {
if (!d) return cb();
let obj;
try {
obj = JSON5.parse(d);
} catch (e) {
debug('Invalid line:');
debug(d);
throw e;
}
/* JSONL could potentially start with an array of names,
in which case we need to map the subsequent values
*/
if (headers === null) {
if (Array.isArray(obj)) {
headers = obj;
return cb();
}
headers = false;
}
if (headers) {
const mapped = {};
headers.forEach((name, i) => { mapped[name] = obj[i]; });
this.push(mapped);
} else {
this.push(obj);
}
return cb();
},
});
transforms.push(lineReader);
transforms.push(jsonlTransform);
} else {
throw new Error(`Unsupported file type: ${postfix}`);
}
const countAndDebug = new Transform({
objectMode: true,
transform(d, enc, cb) {
if (count === 0) { debug('Sample object from file:', d); }
count += 1;
if ((count < 5000 && count % 1000 === 0) || (count % 50000 === 0)) {
debug(`fileToObjectStream transformed ${count} lines`);
}
this.push(d);
cb();
},
flush(cb) {
// If there's no records at all, push a dummy record, and specify 0 records
debug(`Completed reading file, records=${count}`);
if (count === 0) {
const o = { _is_placeholder: true };
if (head) head.forEach((c) => { o[c] = null; });
this.push(o);
}
cb();
},
});
transforms.push(countAndDebug);
transforms.forEach((t) => {
stream = stream.pipe(t);
});
return { stream };
};
Worker.prototype.getFileWriterStream = async function (options = {}) {
const accountId = options.accountId || this.accountId;
if (!accountId) throw new Error('getFileWriterStream has no accountId');
const targetFormat = options.targetFormat || 'csv';
const tempDir = await getTempDir({ accountId });
let { fileExtendedType } = options;
if (fileExtendedType) fileExtendedType += '.';
else fileExtendedType = '';
// So, this could change, but it's easier to read
// dates in a filename than UUIDs, so this is
// a unique-ish filename generator
const uniqueNumberedDate = `${new Date().toISOString().replace(/[^0-9]*/g, '')}.${Math.floor(Math.random() * 1000)}`;
let filename = `${tempDir}${path.sep}${uniqueNumberedDate}.${fileExtendedType}${targetFormat}`;
if (bool(options.gzip, false)) filename += '.gz';
const stream = fs.createWriteStream(filename);
debug('FileWriterStream writing to file ', filename);
return { filename, stream };
};
Worker.prototype.getOutputStreams = async function (options) {
const { filename, stream: fileWriterStream } = await this.getFileWriterStream(options);
let { transform } = options;
if (typeof options.transform === 'function') {
if (options.transform.length === 3) {
transform = new Transform({
objectMode: true,
async transform(item, encoding, cb) {
options.transform(item, encoding, cb);
},
});
} else {
transform = new Transform({
objectMode: true,
async transform(item, encoding, cb) {
cb(null, options.transform(item));
},
});
}
} else if (options.transform) {
transform = options.transform;
}
const { flatten } = options;
let flattenTransform = null;
if (bool(flatten, false)) {
flattenTransform = new Transform({
objectMode: true,
async transform(item, enc, cb) {
// first item establishes the keys to use
let o = {};
Object.keys(item).forEach((k) => {
let v = item[k];
if (!o[k]) {
if (typeof v === 'object') {
while (Array.isArray(v)) [v] = v;// get first array item
o = { ...o, ...v };
} else {
o[k] = v;
}
}
});
cb(null, o);
},
});
}
const stats = {
records: 0,
};
let stringifier;
if (options.targetFormat === 'jsonl') {
stringifier = new Transform({
objectMode: true,
transform(d, encoding, cb) {
cb(false, `${JSON.stringify(d)}\n`);
},
});
} else {
stringifier = stringify({ header: true });
}
let gzip = new PassThrough();
if (options.gzip) {
gzip = zlib.createGzip();
}
const streams = [
transform,
flattenTransform,
new Transform({
objectMode: true,
transform(d, enc, cb) {
stats.records += 1;
cb(null, d);
},
}),
stringifier,
gzip,
fileWriterStream,
].filter(Boolean);
return { filename, streams, stats };
};
Worker.prototype.objectStreamToFile = async function (options) {
const { filename, streams, stats } = await this.getOutputStreams(options);
const { stream: inStream } = options;
streams.unshift(inStream);
await pipeline(
streams,
);
return { filename, records: stats.records };
};
Worker.prototype.transform = async function (options) {
const worker = this;
const filename = worker.getFilename(options);
debug(`Transforming ${filename}`);
options.filename = filename;
let { stream } = await worker.fileToObjectStream(options);
if (typeof stream.pipe !== 'function') {
debug(stream);
throw new Error('No pipe in stream');
}
let t = options.transform;
// No longer need this
delete options.transform;
if (!t) {
t = function (d, enc, cb) {
d.is_test_transform = true;
cb(null, d);
};
}
if (!Array.isArray(t)) t = [t];
Object.keys(t).forEach((key) => {
let f = t[key];
if (typeof f === 'function') {
f = new Transform({
objectMode: true,
transform: f,
});
}
stream = stream.pipe(f);
});
const { targetFormat } = options;
if (!targetFormat && (filename.toLowerCase().slice(-4) === '.csv' || filename.toLowerCase().slice(-7) === '.csv.gz')) {
options.targetFormat = 'csv';
}
return worker.objectStreamToFile({ ...options, stream });
};
Worker.prototype.transform.metadata = {
options: {
sourcePostfix: { description: "Override the source postfix, if for example it's a csv" },
encoding: { description: 'Manual override of source file encoding' },
names: { description: 'Target field names (e.g. my_new_field,x,y,z)' },
values: { description: "Comma delimited source field name, or Handlebars [[ ]] merge fields (e.g. 'my_field,x,y,z', '[[field1]]-[[field2]]', etc)" },
targetFilename: { description: 'Custom name of the output file (default auto-generated)' },
targetFormat: { description: 'Output format -- csv supported, or none for txt (default)' },
targetRowDelimiter: { description: 'Row delimiter (default \n)' },
targetFieldDelimiter: { description: 'Field delimiter (default \t or ,)' },
},
};
Worker.prototype.testTransform = async function (options) {
return this.transform({
...options,
transform(d, enc, cb) { d.transform_time = new Date(); cb(null, d); },
});
};
Worker.prototype.testTransform.metadata = {
options: {
filename: true,
},
};
/* Get a stream from an actual stream, or an array, or a file, or a packet */
Worker.prototype.stream = async function (
options,
) {
const {
stream: inputStream, packet, type, columns, limit,
filename: filenameOpt,
} = options;
let filename = filenameOpt;
if (inputStream) {
if (Array.isArray(inputStream)) {
return { stream: Readable.from(inputStream) };
}
// probably already a stream
if (typeof inputStream === 'object') return { stream: inputStream, encoding: 'object' };
throw new Error(`Invalid stream type:${typeof inputStream}`);
} else if (filename) {
if (filename.startsWith('engine9-accounts/')) {
filename = `${process.env.ENGINE9_ACCOUNT_DIR}/${filename.slice('engine9-accounts/'.length)}`;
debug(`Prepending file with ${process.env.ENGINE9_ACCOUNT_DIR}, filename=${filename}`);
} else {
debug(`Not prepending filename:${filename}`);
}
let encoding; let stream;
if (filename.slice(-8) === '.parquet') {
const pq = new ParquetWorker(this);
stream = (await pq.stream({ filename, columns, limit })).stream;
encoding = 'object';
} else if (filename.indexOf('s3://') === 0) {
const s3Worker = new S3Worker(this);
stream = (await s3Worker.stream({ filename, columns, limit })).stream;
encoding = 'UTF-8';
} else {
// Check if the file exists, and fast fail if not
// Otherwise the stream hangs out as a handle
try {
await fsp.stat(filename);
} catch (e) {
debug(`Error reading file ${filename}, current directory: ${process.cwd()},__dirname:${__dirname}`);
throw e;
}
stream = fs.createReadStream(filename);
encoding = (await this.detectEncoding({ filename })).encoding;
}
return { stream, encoding };
} else if (packet) {
let { stream: packetStream } = await streamPacket({ packet, type, limit });
const { transforms } = this.csvToObjectTransforms({});
transforms.forEach((t) => {
packetStream = packetStream.pipe(t);
});
return { stream: packetStream };
} else {
throw new Error('stream must be passed a stream, filename, or packet');
}
};
Worker.prototype.sample = async function (opts) {
opts.limit = opts.limit || 10;
const { stream } = await this.fileToObjectStream(opts);
return stream.toArray();
};
Worker.prototype.sample.metadata = {
options: {
filename: {},
},
};
Worker.prototype.write = async function (opts) {
const { filename, content } = opts;
if (filename.indexOf('s3://') === 0) {
const s3Worker = new S3Worker(this);
const parts = filename.split('/');
const directory = parts.slice(0, -1).join('/');
const file = parts.slice(-1)[0];
// debug(JSON.stringify({ parts, directory, file }));
await s3Worker.write({
directory,
file,
content,
});
} else {
await fsp.writeFile(filename, content);
}
return { success: true, filename };
};
Worker.prototype.write.metadata = {
options: {
filename: { description: 'Location to write content to, can be local or s3://' },
content: {},
},
};
async function streamToString(stream) {
// lets have a ReadableStream as a stream variable
const chunks = [];
// eslint-disable-next-line no-restricted-syntax
for await (const chunk of stream) {
chunks.push(Buffer.from(chunk));
}
return Buffer.concat(chunks).toString('utf-8');
}
/*
Retrieves and parsed
*/
Worker.prototype.json = async function (opts) {
const { stream } = await this.stream(opts);
const str = await streamToString(stream);
try {
return JSON5.parse(str);
} catch (e) {
debug(e);
throw new Error(`Unparseable JSON received: ${opts.filename || '(native stream)'}`);
}
};
Worker.prototype.json.metadata = {
options: {
filename: { description: 'Get a javascript object from a file' },
},
};
Worker.prototype.list = async function ({ directory }) {
if (!directory) throw new Error('directory is required');
if (directory.indexOf('s3://') === 0) {
const s3Worker = new S3Worker(this);
return s3Worker.list({ directory });
}
const a = await fsp.readdir(directory, { withFileTypes: true });
return a.map((f) => ({
name: f.name,
type: f.isDirectory() ? 'directory' : 'file',
}));
};
Worker.prototype.list.metadata = {
options: {
directory: { required: true },
},
};
Worker.prototype.listAll = async function ({ directory }) {
if (!directory) throw new Error('directory is required');
if (directory.indexOf('s3://') === 0) {
const s3Worker = new S3Worker(this);
return s3Worker.listAll({ directory });
}
const a = await fsp.readdir(directory, { recursive: true });
return a.map((f) => `${directory}/${f}`);
};
Worker.prototype.listAll.metadata = {
options: {
directory: { required: true },
},
};
Worker.prototype.empty = async function ({ directory }) {
if (!directory) throw new Error('directory is required');
if (directory.indexOf('s3://') === 0) {
// currently not emptying S3 this way -- dangerous
throw new Error('Cannot empty an s3:// directory');
}
const removed = [];
// eslint-disable-next-line no-restricted-syntax
for (const file of await fsp.readdir(directory)) {
removed.push(file);
await fsp.unlink(path.join(directory, file));
}
return { directory, removed };
};
Worker.prototype.empty.metadata = {
options: {
directory: { required: true },
},
};
Worker.prototype.move = async function ({ filename, target }) {
if (!target) throw new Error('directory is required');
if (target.indexOf('s3://') === 0) {
const s3Worker = new S3Worker(this);
const parts = target.split('/');
return s3Worker.put({ filename, directory: parts.slice(0, -1).join('/'), file: parts.slice(-1)[0] });
}
await fsp.mkdir(path.dirname(target), { recursive: true });
await fsp.rename(filename, target);
return { filename: target };
};
Worker.prototype.move.metadata = {
options: {
filename: {},
target: {},
},
};
Worker.prototype.stat = async function ({ filename }) {
if (!filename) throw new Error('filename is required');
if (filename.indexOf('s3://') === 0) {
const s3Worker = new S3Worker(this);
return s3Worker.stat({ filename });
}
const {
ctime,
birthtime,
size,
} = await fsp.stat(filename);
const modifiedAt = new Date(ctime);
let createdAt = birthtime;
if (createdAt === 0 || !createdAt) createdAt = ctime;
createdAt = new Date(createdAt);
return {
createdAt,
modifiedAt,
size,
};
};
Worker.prototype.stat.metadata = {
options: {
filename: {},
},
};
Worker.prototype.head = async function (options) {
const { stream } = await this.fileToObjectStream(options);
const chunks = [];
const limit = options.limit || 3;
let counter = 0;
// eslint-disable-next-line no-restricted-syntax
for await (const chunk of stream) {
chunks.push(chunk);
counter += 1;
if (counter >= limit) break;
}
return chunks;
};
Worker.prototype.head.metadata = {
options: {
filename: { required: true },
},
};
Worker.prototype.count = async function (options) {
const { stream } = await this.fileToObjectStream(options);
const sample = [];
const limit = options.limit || 5;
let records = 0;
// eslint-disable-next-line no-restricted-syntax
for await (const chunk of stream) {
records += 1;
if (records < limit) {
sample.push(chunk);
}
}
return { sample, records };
};
Worker.prototype.count.metadata = {
options: {
filename: { required: true },
},
};
module.exports = Worker;