@engine9-io/input-tools
Tools for dealing with Engine9 inputs
const fs = require('node:fs');
const { Transform, Writable } = require('node:stream');
const { pipeline } = require('node:stream/promises');
const { throttle } = require('throttle-debounce');
const parallelTransform = require('parallel-transform');
const debug = require('debug')('@engine9-io/input-tools');
const debugThrottle = throttle(1000, debug, { noLeading: false, noTrailing: false });
const { Mutex } = require('async-mutex');
const csv = require('csv');
const handlebars = require('handlebars');
const ValidatingReadable = require('./ValidatingReadable');
const FileUtilities = require('./file/FileUtilities');
const {
getTempFilename, getBatchTransform, getFile, streamPacket,
} = require('./file/tools');
class ForEachEntry {
constructor({ accountId } = {}) {
this.fileUtilities = new FileUtilities({ accountId });
}
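// Lazily creates (under a mutex) a named object-mode output stream; records pushed
// to it are validated, counted, CSV-stringified with headers, and written to a temp
// file. Returns the stream along with its completion promises and file info.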
getOutputStream({ name, postfix = '.timeline.csv', validatorFunction = () => true }) {
this.outputStreams = this.outputStreams || {};
if (this.outputStreams[name]?.items) return this.outputStreams[name].items;
this.outputStreams[name] = this.outputStreams[name] || {
mutex: new Mutex(),
};
return this.outputStreams[name].mutex.runExclusive(async () => {
const fileInfo = {
filename: await getTempFilename({ postfix }),
records: 0,
};
debug(`Output file requested, writing output to: ${fileInfo.filename}`);
const outputStream = new ValidatingReadable({
objectMode: true,
}, validatorFunction);
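// Records are pushed into this stream by the caller's transform, so _read is a no-op.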
// eslint-disable-next-line no-underscore-dangle
outputStream._read = () => {};
const writeStream = fs.createWriteStream(fileInfo.filename);
const finishWritingOutputPromise = new Promise((resolve, reject) => {
writeStream.on('finish', () => {
resolve();
}).on('error', (err) => {
reject(err);
});
});
this.outputStreams[name].items = {
stream: outputStream,
promises: [finishWritingOutputPromise],
files: [fileInfo],
};
outputStream
.pipe(new Transform({
objectMode: true,
transform(o, enc, cb) {
fileInfo.records += 1;
cb(null, o);
},
}))
.pipe(csv.stringify({ header: true }))
.pipe(writeStream);
return this.outputStreams[name].items;
});
}
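// Streams an input CSV (from a filename or a packet), parses it, groups rows into
// batches, and runs the supplied async transform over each batch with bounded
// concurrency. Bindings are resolved into extra arguments passed to the transform.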
async process({
packet,
filename,
transform: userTransform,
batchSize = 500,
concurrency = 10,
bindings = {},
}) {
let inStream = null;
if (filename) {
debug(`Processing file ${filename}`);
inStream = (await this.fileUtilities.stream({ filename })).stream;
} else if (packet) {
debug(`Processing person file from packet ${packet}`);
inStream = (await streamPacket({ packet, type: 'person' })).stream;
}
if (typeof userTransform !== 'function') throw new Error('async transform function is required');
if (userTransform.length > 1) throw new Error('transform should be an async function that accepts one argument');
let records = 0;
let batches = 0;
const outputFiles = {};
const transformArguments = {};
// Promises that must settle before returning, such as finishing writes to disk
let bindingPromises = [];
// New output streams may be created for bindings; they must be ended once the input file has been fully processed
const newStreams = [];
const bindingNames = Object.keys(bindings);
// Resolve each binding into the argument that will be passed to the user transform
await Promise.all(bindingNames.map(async (bindingName) => {
const binding = bindings[bindingName];
if (!binding.path) throw new Error(`Invalid binding: path is required for binding ${bindingName}`);
if (binding.path === 'output.timeline') {
const { stream: streamImpl, promises, files } = await this.getOutputStream({
name: bindingName,
postfix: binding.options?.postfix || '.timeline.csv',
validatorFunction: (data) => {
if (!data) return true;
if (typeof data !== 'object') throw new Error('Invalid timeline data push, must be an object');
// Is this necessary?
if (!data.person_id) throw new Error('Invalid timeline data push, must have a person_id, even if 0');
if (!data.ts) data.ts = new Date().toISOString();
return true;
},
});
newStreams.push(streamImpl);
transformArguments[bindingName] = streamImpl;
bindingPromises = bindingPromises.concat(promises || []);
outputFiles[bindingName] = files;
} else if (binding.path === 'output.stream') {
const { stream: streamImpl, promises, files } = await this.getOutputStream({
name: bindingName,
postfix: binding.options?.postfix || '.timeline.csv',
});
newStreams.push(streamImpl);
transformArguments[bindingName] = streamImpl;
bindingPromises = bindingPromises.concat(promises || []);
outputFiles[bindingName] = files;
} else if (binding.path === 'file') {
transformArguments[bindingName] = await getFile(binding);
} else if (binding.path === 'handlebars') {
transformArguments[bindingName] = handlebars;
} else {
throw new Error(`Unsupported binding path for binding ${bindingName}: ${binding.path}`);
}
}));
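// Pipeline: parse CSV rows, batch them, run the user transform over each batch in parallel, and count results in a terminal Writable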
await pipeline(
inStream,
csv.parse({
relax: true,
skip_empty_lines: true,
max_limit_on_data_read: 10000000,
columns: true,
}),
getBatchTransform({ batchSize }).transform,
parallelTransform(
concurrency,
(batch, cb) => {
userTransform({ ...transformArguments, batch })
.then((d) => cb(null, d))
.catch(cb);
},
),
new Writable({
objectMode: true,
write(batch, enc, cb) {
batches += 1;
records += batch?.length || 0;
debugThrottle(`Processed ${batches} batches for a total of ${records} outbound records`);
cb();
},
}),
);
debug('Completed all batches');
newStreams.forEach((s) => s.push(null));
await Promise.all(bindingPromises);
return { outputFiles };
}
}
module.exports = ForEachEntry;
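A minimal usage sketch follows. It assumes this module is the package entry point; the account id, input file, binding name, and transform body are illustrative, not part of the package.

const ForEachEntry = require('@engine9-io/input-tools');

async function run() {
  // Hypothetical account id and input file
  const forEach = new ForEachEntry({ accountId: 'example-account' });
  const { outputFiles } = await forEach.process({
    filename: './people.csv',
    batchSize: 500,
    concurrency: 10,
    bindings: {
      // 'timeline' is an arbitrary binding name; output.timeline streams rows to a temp CSV
      timeline: { path: 'output.timeline' },
    },
    // The transform receives one object: the current batch plus one property per binding
    transform: async ({ batch, timeline }) => {
      batch.forEach((row) => {
        // person_id is required by the timeline validator, even if 0; entry_type is illustrative
        timeline.push({ person_id: row.person_id || 0, entry_type: 'SIGNUP' });
      });
    },
  });
  console.log(outputFiles); // e.g. { timeline: [{ filename, records }] }
}

run().catch((e) => { console.error(e); process.exit(1); });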