@engine9/packet-tools
Version:
Tools for dealing with Engine9 packets
607 lines (539 loc) • 20.4 kB
JavaScript
const fs = require('node:fs');
const path = require('node:path');
const os = require('node:os');
const { PassThrough, Readable, Transform } = require('node:stream');
const { stringify } = require('csv');
const debug = require('debug')('packet-tools');
const progress = require('debug')('info:packet-tools');
const unzipper = require('unzipper');
const {
v4: uuidv4, v5: uuidv5, v7: uuidv7, validate: uuidIsValid,
} = require('uuid');
const archiver = require('archiver');
const handlebars = require('handlebars');
const { mkdirp } = require('mkdirp');
const etl = require('etl');
const JSON5 = require('json5');
const {
S3Client,
HeadObjectCommand,
GetObjectCommand,
} = require('@aws-sdk/client-s3');
const { TIMELINE_ENTRY_TYPES } = require('./timelineTypes');
/**
 * Normalize a loosely-typed value into an array of trimmed, non-empty strings.
 *
 * - a string is treated as a comma-separated list
 * - a truthy number becomes a single-element list
 * - any other truthy value is assumed to be array-like (it must support `.map`)
 * - falsy values (including 0) produce an empty list
 *
 * @param {string|number|Array} s - value to normalize
 * @param {boolean} [nonZeroLength] - when truthy, an empty result is replaced by `[0]`
 * @returns {Array} the cleaned values
 */
function getStringArray(s, nonZeroLength) {
  let values;
  if (typeof s === 'string') {
    values = s.split(',');
  } else if (typeof s === 'number' && s) {
    values = [String(s)];
  } else {
    values = s || [];
  }
  const cleaned = values
    .map((entry) => entry.toString().trim())
    .filter((entry) => Boolean(entry));
  // Callers that need at least one element get a numeric 0 placeholder
  if (nonZeroLength && cleaned.length === 0) return [0];
  return cleaned;
}
/**
 * Resolve the temp directory for an account and make sure it exists.
 * The directory is `<os tmpdir>/<accountId>/<YYYY-MM-DD>`, where the date is
 * taken from the current time's ISO (UTC) representation.
 *
 * @param {object} options
 * @param {string} [options.accountId='engine9'] - account sub-directory name
 * @returns {Promise<string>} the directory path
 */
async function getTempDir({ accountId = 'engine9' }) {
  const today = new Date().toISOString().substring(0, 10);
  const tempDir = [os.tmpdir(), accountId, today].join(path.sep);
  try {
    await mkdirp(tempDir);
  } catch (e) {
    // An already-existing directory is fine; anything else is a real failure
    const alreadyExists = e.code === 'EEXIST';
    if (!alreadyExists) throw e;
  }
  return tempDir;
}
/**
 * Get a new temp filename, creating any necessary directories.
 *
 * @param {object} [options]
 * @param {string} [options.accountId] - forwarded to getTempDir
 * @param {string} [options.targetFilename] - exact file name to use. A value
 *   starting with `/` or `\` is treated as a full path and returned untouched;
 *   otherwise the file is placed in a time-stamped sub-directory of the temp dir.
 * @param {string} [options.prefix] - prepended (followed by `_`) to generated names
 * @param {string} [options.postfix] - appended to generated names, defaults to `.txt`
 * @param {string} [options.targetFormat] - `csv` selects a `.csv` postfix when none is given
 * @param {string} [options.source] - source file, used to generate a friendly
 *   name; when present it replaces any postfix
 * @returns {Promise<string>} the filename
 */
async function getTempFilename(options = {}) {
  let dir = await getTempDir(options);
  const target = options.targetFilename;
  if (target) {
    if (target.indexOf('/') === 0 || target.indexOf('\\') === 0) {
      // assume a full directory path has been specified
      return target;
    }
    // make a distinct directory (ten-second resolution), so we don't overwrite the file
    dir = `${dir}/${new Date().toISOString().slice(0, -6).replace(/[^0-9]/g, '_')}`;
    // mkdirp resolves to the first directory it had to create, or undefined
    // when the whole path already exists -- never rely on it for the path.
    await mkdirp(dir);
    return `${dir}/${target}`;
  }
  const { prefix, targetFormat } = options;
  let { postfix } = options;
  if (!postfix && targetFormat === 'csv') postfix = '.csv';
  if (options.source) {
    // Keep only filename-safe characters from the last segment of the source
    postfix = `_${options.source.split('/').pop()}`;
    postfix = postfix.replace(/['"\\]/g, '').replace(/[^a-zA-Z0-9_.-]/g, '_');
  }
  const prefixPart = prefix ? `${prefix}_` : '';
  return `${dir}/${prefixPart}${uuidv7()}${postfix || '.txt'}`;
}
/**
 * Open the zip archive at `_path` and write its first entry to a file named
 * `firstFile` (relative to the current working directory).
 *
 * NOTE(review): despite the name, this does not return a listing of the
 * archive -- confirm the intended behavior with callers.
 *
 * @param {string} _path - path to a zip file on disk
 * @returns {Promise<void>} resolves once the entry has been fully written
 * @throws {Error} when the archive contains no entries
 */
async function list(_path) {
  const directory = await unzipper.Open.file(_path);
  const [firstEntry] = directory.files;
  if (!firstEntry) throw new Error(`No files found in archive ${_path}`);
  return new Promise((resolve, reject) => {
    firstEntry
      .stream()
      // pipe() does not forward source errors, so listen on the read side too
      .on('error', reject)
      .pipe(fs.createWriteStream('firstFile'))
      .on('error', reject)
      .on('finish', resolve);
  });
}
/**
 * Extract a single entry from a zip archive into a newly generated temp file.
 *
 * @param {string} _path - path to a zip file on disk
 * @param {string} _file - exact path of the entry inside the archive
 * @returns {Promise<{filename: string}>} the temp file the entry was written to
 * @throws {Error} when the archive has no entry with that path
 */
async function extract(_path, _file) {
  // unzipper.Open is a namespace of openers, not a function -- use Open.file
  const directory = await unzipper.Open.file(_path);
  const file = directory.files.find((d) => d.path === _file);
  if (!file) throw new Error(`File ${_file} not found in ${_path}`);
  const tempFilename = await getTempFilename({ source: _file });
  return new Promise((resolve, reject) => {
    file
      .stream()
      // pipe() does not forward source errors, so listen on the read side too
      .on('error', reject)
      .pipe(fs.createWriteStream(tempFilename))
      .on('error', reject)
      // hand the generated name back, otherwise the caller cannot find the file
      .on('finish', () => resolve({ filename: tempFilename }));
  });
}
/**
 * Append manifest entries for new files onto `existingFiles` (mutated in place).
 *
 * Each new file is resolved against the current working directory and given
 * an in-packet path of `<type>/<basename>`.
 *
 * @param {Array<object>} existingFiles - entries collected so far
 * @param {string|Array<string>} _newFiles - file path(s); a string may be comma separated
 * @param {object} [options]
 * @param {string} [options.type='unknown'] - entry type, also used as the in-packet folder
 * @param {string} [options.dateCreated] - ISO timestamp, defaults to now
 * @throws {Error} when an entry with the same in-packet path already exists
 */
function appendFiles(existingFiles, _newFiles, options) {
  const newFiles = getStringArray(_newFiles);
  if (newFiles.length === 0) return;
  const type = options?.type || 'unknown';
  const dateCreated = options?.dateCreated || new Date().toISOString();
  newFiles.forEach((p) => {
    const originalFilename = path.resolve(process.cwd(), p);
    const itemPath = `${type}/${path.basename(originalFilename)}`;
    if (existingFiles.some((f) => f.path === itemPath)) {
      throw new Error(`Error adding files, duplicate path found for path: ${itemPath}`);
    }
    existingFiles.push({
      type,
      originalFilename,
      isNew: true,
      dateCreated,
      path: itemPath,
    });
  });
}
/**
 * Create a packet: a zip archive holding the supplied files, grouped by type,
 * plus a generated `manifest.json`.
 *
 * @param {object} options
 * @param {string} [options.accountId='engine9'] - recorded in the manifest
 * @param {string} [options.pluginId=''] - recorded as `source.pluginId` in the manifest
 * @param {string} [options.target=''] - target filename, one is generated if not specified
 * @param {string|Array<string>} [options.messageFiles] - file with contents of message, used for delivery
 * @param {string|Array<string>} [options.personFiles] - files with data on people
 * @param {string|Array<string>} [options.timelineFiles] - activity entries
 * @param {string|Array<string>} [options.statisticsFiles] - files with aggregate statistics
 * @returns {Promise<{filename: string, bytes: number}>} the zip path and the archiver byte count
 * @throws {Error} when the misspelled `peopleFiles` option is supplied
 */
async function create(options) {
  const {
    accountId = 'engine9',
    pluginId = '',
    target = '',
    messageFiles = [],
    personFiles = [],
    timelineFiles = [],
    statisticsFiles = [],
  } = options;
  if (options.peopleFiles) throw new Error('Unknown option: peopleFiles, did you mean personFiles?');

  const files = [];
  const dateCreated = new Date().toISOString();
  appendFiles(files, messageFiles, { type: 'message', dateCreated });
  appendFiles(files, personFiles, { type: 'person', dateCreated });
  appendFiles(files, timelineFiles, { type: 'timeline', dateCreated });
  appendFiles(files, statisticsFiles, { type: 'statistics', dateCreated });

  const zipFilename = target || await getTempFilename({ postfix: '.packet.zip' });
  const manifest = {
    accountId,
    source: {
      pluginId,
    },
    dateCreated,
    files,
  };

  // create a file to stream archive data to.
  const output = fs.createWriteStream(zipFilename);
  const archive = archiver('zip', {
    zlib: { level: 9 }, // Sets the compression level.
  });
  return new Promise((resolve, reject) => {
    debug(`Setting up write stream to ${zipFilename}`);
    // 'close' fires once the archive is fully written and the file descriptor is closed
    output.on('close', () => {
      debug('archiver has been finalized and the output file descriptor has closed, calling success');
      debug(zipFilename);
      return resolve({
        filename: zipFilename,
        bytes: archive.pointer(),
      });
    });
    // Without this, a failure to open or write the zip would surface as an
    // unhandled 'error' event instead of rejecting the promise
    output.on('error', reject);
    // warnings could be file not founds, etc, but we error even on those
    archive.on('warning', reject);
    // good practice to catch this error explicitly
    archive.on('error', reject);

    archive.pipe(output);
    files.forEach(({ path: name, originalFilename }) => archive.file(originalFilename, { name }));
    // Local-only bookkeeping fields must not leak into the manifest
    files.forEach((f) => {
      delete f.originalFilename;
      delete f.isNew;
    });
    archive.append(Buffer.from(JSON.stringify(manifest, null, 4), 'utf8'), { name: 'manifest.json' });
    archive.finalize();
  });
}
/**
 * Open a packet's zip central directory, from local disk or from S3.
 *
 * @param {object} options
 * @param {string} options.packet - local file path, or an `s3://bucket/key` URL
 * @returns {Promise<object>} the unzipper directory (its `files` are the zip entries)
 */
async function getPacketDirectory({ packet }) {
  if (packet.indexOf('s3://') !== 0) {
    return unzipper.Open.file(packet);
  }

  // s3://<bucket>/<key...>  ->  ['s3:', '', '<bucket>', ...key segments]
  const parts = packet.split('/');
  const Bucket = parts[2];
  const Key = parts.slice(3).join('/');
  const s3Client = new S3Client({});
  debug('Getting ', { Bucket, Key });

  return unzipper.Open.custom({
    async size() {
      const info = await s3Client.send(
        new HeadObjectCommand({
          Bucket,
          Key,
        }),
      );
      progress(`Retrieving file of size ${info.ContentLength / (1024 * 1024)} MB`);
      return info.ContentLength;
    },
    stream(offset, length) {
      const ptStream = new PassThrough();
      // `length` is a byte count, while an HTTP range is "first-last" byte
      // positions; an omitted length means "read to the end of the object".
      const end = length ? offset + length : '';
      s3Client.send(
        new GetObjectCommand({
          Bucket,
          Key,
          Range: `bytes=${offset}-${end}`,
        }),
      )
        .then((response) => {
          // pipe() does not forward source errors to the destination
          response.Body.on('error', (error) => {
            ptStream.emit('error', error);
          });
          response.Body.pipe(ptStream);
        })
        .catch((error) => {
          ptStream.emit('error', error);
        });
      return ptStream;
    },
  });
}
/**
 * Read and parse `manifest.json` from a packet.
 *
 * @param {object} options
 * @param {string} options.packet - packet location, as accepted by getPacketDirectory
 * @returns {Promise<object>} the parsed manifest
 * @throws {Error} when no packet is given or the packet has no manifest.json
 */
async function getManifest({ packet }) {
  if (!packet) throw new Error('no packet option specififed');
  const directory = await getPacketDirectory({ packet });
  const file = directory.files.find((d) => d.path === 'manifest.json');
  // Fail with a readable message rather than a TypeError on `undefined.buffer`
  if (!file) throw new Error(`No manifest.json found in packet ${packet}`);
  const content = await file.buffer();
  return JSON.parse(content.toString());
}
/**
 * Load the single file of a given type from a packet.
 *
 * @param {object} options
 * @param {string} options.packet - packet location
 * @param {string} options.type - manifest file type to look up (exactly one file must match)
 * @returns {Promise<string|*>} the file content as a string, or the parsed
 *   value when the file name ends in `.json` / `.json5`
 * @throws {Error} when no packet is given, when zero or several files match,
 *   when the entry is missing from the archive, or when JSON parsing fails
 */
async function getFile({ packet, type }) {
  if (!packet) throw new Error('no packet option specififed');
  const manifest = await getManifest({ packet });
  const files = manifest.files?.filter((d) => d.type === type);
  if (!files?.length) throw new Error(`No files of type ${type} found in packet`);
  if (files.length > 1) throw new Error(`Multiple files of type ${type} found in packet`);
  const filePath = files[0].path;
  const directory = await getPacketDirectory({ packet });
  const handle = directory.files.find((d) => d.path === filePath);
  if (!handle) throw new Error(`File ${filePath} is listed in the manifest but missing from the packet`);
  // buffer() is async: await it first, then stringify the resulting Buffer
  const content = (await handle.buffer()).toString();
  if (filePath.endsWith('.json') || filePath.endsWith('.json5')) {
    try {
      return JSON5.parse(content);
    } catch (e) {
      debug(`Erroring parsing json content from ${filePath}`, content);
      throw e;
    }
  }
  return content;
}
/**
 * Open a read stream on the single file of a given type inside a packet.
 *
 * @param {object} options
 * @param {string} options.packet - packet location
 * @param {string} options.type - manifest file type to look up (exactly one file must match)
 * @returns {Promise<{stream: object, path: string}>} the entry stream and its in-packet path
 * @throws {Error} when no packet is given, or when zero or several files match
 */
async function stream({ packet, type }) {
  if (!packet) throw new Error('no packet option specififed');
  const { files: manifestFiles } = await getManifest({ packet });
  const matches = manifestFiles?.filter((entry) => entry.type === type);
  const matchCount = matches?.length;
  if (!matchCount) throw new Error(`No files of type ${type} found in packet`);
  if (matchCount > 1) throw new Error(`Multiple files of type ${type} found in packet`);
  const [{ path: filePath }] = matches;
  const zipDirectory = await getPacketDirectory({ packet });
  const entryHandle = zipDirectory.files.find((entry) => entry.path === filePath);
  return { stream: entryHandle.stream(), path: filePath };
}
/**
 * Copy the single file of a given type out of a packet into a temp file that
 * keeps the entry's base name.
 *
 * @param {object} options
 * @param {string} options.packet - packet location
 * @param {string} [options.type='person'] - manifest file type to download
 * @returns {Promise<{filename: string}>} path of the written temp file
 */
async function downloadFile({ packet, type = 'person' }) {
  const { stream: fileStream, path: filePath } = await stream({ packet, type });
  const filename = await getTempFilename({ targetFilename: filePath.split('/').pop() });
  return new Promise((resolve, reject) => {
    fileStream
      // pipe() does not forward source errors, so listen on the read side too
      .on('error', reject)
      .pipe(fs.createWriteStream(filename))
      .on('error', reject)
      .on('finish', () => {
        resolve({ filename });
      });
  });
}
/**
 * Create an object-mode stream that callers push timeline entries into; the
 * entries are normalized, CSV-encoded (with a header row) and written to a
 * new temp `.csv` file.
 *
 * The returned stream never produces data on its own: push objects into it
 * and push `null` to finish.
 *
 * @returns {Promise<{stream: Readable, promises: Array<Promise>, files: Array<string>}>}
 *   the input stream, a promise that settles when the file write finishes or
 *   fails, and the file being written
 */
async function getTimelineOutputStream() {
  const timelineFile = await getTempFilename({ postfix: '.csv' });
  debug(`Writing timeline file:${timelineFile}`);

  // Push-driven source: read() is a no-op because data only arrives via push()
  const source = new Readable({
    objectMode: true,
    read() {},
  });

  const toTimelineRow = new Transform({
    objectMode: true,
    transform(entry, encoding, done) {
      debug(`Pushing person_id ${entry.person_id}`, encoding, done);
      const row = {
        uuid: uuidv7(),
        entry_type: entry.entry_type || 'UNKNOWN',
        person_id: entry.person_id || 0,
        reference_id: entry.reference_id || 0,
      };
      this.push(row);
      done();
    },
  });

  const fileWriter = fs.createWriteStream(timelineFile);
  const writeFinished = new Promise((resolve, reject) => {
    fileWriter.on('finish', () => resolve());
    fileWriter.on('error', (failure) => reject(failure));
  });

  source
    .pipe(toTimelineRow)
    .pipe(stringify({ header: true }))
    .pipe(fileWriter);

  return {
    stream: source,
    promises: [writeFinished],
    files: [timelineFile],
  };
}
/**
 * Stream the person file of a local packet through `transform` in batches.
 *
 * @param {object} options
 * @param {string} options.packet - path to the packet zip on local disk
 * @param {Function} options.transform - called with `{ batch, handlebars, ...bindings }`
 *   for every batch; may be async
 * @param {number} [options.batchSize=500] - number of records per batch
 * @param {object} [options.bindings={}] - named extra arguments for `transform`;
 *   each needs a `type` of `packet.output.timeline`, `packet.message` or `handlebars`
 * @param {number} [options.start=0] - index of the first record to process
 * @param {number} [options.end] - index to stop at, non-inclusive; falsy means no limit
 * @returns {Promise<{timelineFiles: Array<string>}>} resolves once processing is
 *   done and every binding output has been fully written
 * @throws {Error} on a binding without a type, an unsupported binding type,
 *   or a packet without a person file
 */
async function forEachPersonImpl({
  packet,
  transform,
  batchSize = 500,
  bindings = {},
  start = 0, // which record to start with, defaults to 0
  end, // record to end with, non-inclusive
}) {
  const manifest = await getManifest({ packet });
  const personFile = (manifest.files || []).find((p) => p.type === 'person');
  if (!personFile) throw new Error('No person file found in packet');
  let timelineFiles = [];
  const transformArguments = {};
  // An array of promises that must be completed, such as writing to disk
  let bindingPromises = [];
  // new Streams may be created, and they have to be completed when the file is completed
  const newStreams = [];

  await Promise.all(Object.keys(bindings).map(async (bindingName) => {
    const binding = bindings[bindingName];
    if (!binding.type) throw new Error(`type is required for binding ${bindingName}`);
    if (binding.type === 'packet.output.timeline') {
      const { stream: streamImpl, promises, files } = await getTimelineOutputStream({});
      newStreams.push(streamImpl);
      transformArguments[bindingName] = streamImpl;
      bindingPromises = bindingPromises.concat(promises || []);
      timelineFiles = timelineFiles.concat(files);
    } else if (binding.type === 'packet.message') {
      transformArguments[bindingName] = await getFile({ packet, type: 'message' });
    } else if (binding.type === 'handlebars') {
      transformArguments[bindingName] = handlebars;
    } else {
      throw new Error(`Unsupported binding type for binding ${bindingName}: ${binding.type}`);
    }
  }));

  let recordCounter = 0;
  return new Promise((resolve, reject) => {
    fs.createReadStream(path.resolve(process.cwd(), packet))
      // pipe() does not forward source errors (e.g. a missing packet file)
      .on('error', reject)
      .pipe(unzipper.Parse())
      // we should not return null here, as it will cancel the pipe,
      // so we disable the consistent-return rule
      // eslint-disable-next-line consistent-return
      .pipe(etl.map(async (entry) => {
        if (entry.path === personFile.path) {
          return entry
            .pipe(etl.csv())
            // eslint-disable-next-line array-callback-return
            .pipe(etl.map(function (item) {
              // Count every record, including skipped ones; otherwise a
              // non-zero `start` would never be reached
              const recordIndex = recordCounter;
              recordCounter += 1;
              if (recordIndex < start) return;
              if (end && recordIndex >= end) return;
              this.push(item);
            }))
            .pipe(etl.collect(batchSize))
            .pipe(etl.map(async function (batch) {
              const out = await transform({ batch, handlebars, ...transformArguments });
              this.push(out);
            }))
            .promise()
            .then(() => {}, reject);
        }
        entry.autodrain();
        // don't return null, as it will cancel the pipe
      }))
      .promise()
      .then(() => {
        // close new streams, then wait for their output to be flushed to disk
        newStreams.forEach((s) => s.push(null));
        return Promise.all(bindingPromises);
      })
      .then(() => resolve({ timelineFiles }), reject);
  });
}
/**
 * Run `transform` over every batch of person records in a packet.
 *
 * The record range is split into chunks of `maxRecordsPerProcess`, each
 * handled by forEachPersonImpl; with the current constants that is a single
 * chunk covering records 0 to 1,000,000.
 *
 * @param {object} options
 * @param {string} options.packet - packet location
 * @param {Function} options.transform - batch callback, see forEachPersonImpl
 * @param {number} [options.batchSize=500] - number of records per batch
 * @param {object} [options.bindings={}] - extra named arguments for `transform`
 * @returns {Promise<Array<object>|{no_data: boolean, no_person_file: boolean}>}
 *   one result per chunk, or a no-data marker when the packet has no person file
 * @throws {Error} when `packet` is missing or `transform` is not a function
 */
async function forEachPerson({
  packet,
  transform,
  batchSize = 500,
  bindings = {},
}) {
  if (!packet) throw new Error('no packet specified');
  if (typeof transform !== 'function') throw new Error('transform function is required');

  const { files: manifestFiles } = await getManifest({ packet });
  const hasPersonFile = (manifestFiles || []).some((f) => f.type === 'person');
  if (!hasPersonFile) return { no_data: true, no_person_file: true };

  const totalPersonRecords = 1000000;
  const maxRecordsPerProcess = 1000000;
  const chunkStarts = [];
  for (let offset = 0; offset < totalPersonRecords; offset += maxRecordsPerProcess) {
    chunkStarts.push(offset);
  }
  const jobs = chunkStarts.map((start) => forEachPersonImpl({
    packet,
    transform,
    batchSize,
    bindings,
    start,
    end: Math.min(start + maxRecordsPerProcess, totalPersonRecords),
  }));
  return Promise.all(jobs);
}
/**
 * Split an integer into 8 bytes, least-significant byte first.
 *
 * Division is used instead of bit shifts for everything but the lowest byte,
 * so values above 32 bits are handled.
 *
 * @param {number} _v - the integer to convert
 * @returns {Array<number>} 8 byte values (0-255), little-endian
 */
function intToByteArray(_v) {
  let remaining = _v;
  return Array.from({ length: 8 }, () => {
    // eslint-disable-next-line no-bitwise
    const lowByte = remaining & 0xff;
    remaining = (remaining - lowByte) / 256;
    return lowByte;
  });
}
/**
 * Derive a deterministic (v5) UUID for a plugin from a namespace-like string
 * and a value within it, joined as `<namespace>::<value>`.
 *
 * @param {string} uniqueNamespaceLikeDomainName - e.g. a domain name
 * @param {string} valueWithinNamespace - identifier within that namespace
 * @returns {string} the UUID
 */
function getPluginUUID(uniqueNamespaceLikeDomainName, valueWithinNamespace) {
  // Random custom namespace for plugins -- not secure, just a namespace
  const pluginNamespace = 'f9e1024d-21ac-473c-bac6-64796dd771dd';
  const name = `${uniqueNamespaceLikeDomainName}::${valueWithinNamespace}`;
  return uuidv5(name, pluginNamespace);
}
/**
 * Derive a deterministic (v5) UUID for an input from its plugin id and the
 * remote system's input id, joined as `<pluginId>:<remoteInputId>`.
 *
 * @param {string} pluginId - plugin UUID
 * @param {string|number} remoteInputId - the input's id in the remote system
 * @returns {string} the UUID
 * @throws {Error} when either id is blank or pluginId is not a valid UUID
 */
function getInputUUID(pluginId, remoteInputId) {
  if (!pluginId) {
    throw new Error('getInputUUID: Cowardly rejecting a blank plugin_id');
  }
  if (!uuidIsValid(pluginId)) {
    throw new Error(`Invalid pluginId:${pluginId}, should be a UUID`);
  }
  if (!remoteInputId) {
    throw new Error('getInputUUID: Cowardly rejecting a blank remote_input_id, set a default');
  }
  // Random custom namespace for inputs -- not secure, just a namespace
  const inputNamespace = '3d0e5d99-6ba9-4fab-9bb2-c32304d3df8e';
  const name = `${pluginId}:${remoteInputId}`;
  return uuidv5(name, inputNamespace);
}
/**
 * Build a UUID whose leading 48 bits hold a millisecond timestamp.
 *
 * @param {Date|string|number} [date] - when given, its epoch milliseconds
 *   overwrite the first 6 bytes of the UUID
 * @param {string} [inputUuid] - UUID supplying the remaining bytes; a fresh
 *   uuidv7 is generated when omitted
 * @returns {string} the resulting UUID
 * @throws {Error} when `date` is given but cannot be parsed
 *
 * NOTE(review): the bytes are formatted through uuidv4({ random }), which
 * presumably stamps its own version/variant bits -- confirm that downstream
 * consumers do not depend on the version nibble.
 */
function getUUIDv7(date, inputUuid) {
  const baseUuid = inputUuid || uuidv7();
  const bytes = Buffer.from(baseUuid.replace(/-/g, ''), 'hex');
  if (date === undefined) return uuidv4({ random: bytes });

  const parsed = new Date(date);
  // isNaN behaves differently than Number.isNaN -- the coercion of the Date
  // to a number is exactly what we want here
  // eslint-disable-next-line no-restricted-globals
  if (isNaN(parsed)) throw new Error(`getUUIDv7 got an invalid date:${date || '<blank>'}`);

  // Big-endian 8-byte timestamp; only the low 6 bytes fit the UUID prefix
  const bigEndian = intToByteArray(parsed.getTime()).reverse();
  for (let i = 0; i < 6; i += 1) {
    bytes[i] = bigEndian[i + 2];
  }
  return uuidv4({ random: bytes });
}
/**
 * Read the timestamp embedded in the first 48 bits (12 hex digits) of a UUID.
 * The UUID is assumed to be a v7; for anything else the result is meaningless.
 *
 * @param {string} uuid - the UUID to inspect
 * @returns {Date} the embedded timestamp
 */
function getUUIDTimestamp(uuid) {
  const hexDigits = `${uuid}`.split('-').join('');
  const millis = parseInt(hexDigits.substring(0, 12), 16);
  return new Date(millis);
}
// Fields needed to derive an entry UUID when no remote id of any kind is given
const requiredTimelineEntryFields = ['ts', 'entry_type_id', 'input_id', 'person_id'];

/**
 * Determine the UUID for a timeline entry. In order of precedence:
 *
 * 1. `remote_entry_uuid` -- used as-is (must be a valid UUID)
 * 2. `remote_entry_id` -- hashed (v5) within the `input_id` namespace
 * 3. otherwise -- hashed (v5) from ts, person_id, entry_type_id and
 *    source_code_id within the `input_id` namespace
 *
 * For 2 and 3 the leading bytes are then replaced with the entry timestamp
 * via getUUIDv7. Because an outside-specified remote_entry_uuid may not
 * follow that layout, uuid sorting isn't guaranteed.
 *
 * @param {object} inputObject - the timeline entry
 * @param {object} [options]
 * @param {object} [options.defaults={}] - fallback values for fields missing on the entry
 * @returns {string} the entry UUID
 * @throws {Error} on an invalid remote_entry_uuid, a remote_entry_id without
 *   input_id, missing required fields, or an unparseable ts
 */
function getTimelineEntryUUID(inputObject, { defaults = {} } = {}) {
  const entry = { ...defaults, ...inputObject };
  const { remote_entry_uuid: remoteUuid, remote_entry_id: remoteId } = entry;

  if (remoteUuid) {
    if (uuidIsValid(remoteUuid)) return remoteUuid;
    throw new Error('Invalid remote_entry_uuid, it must be a UUID');
  }

  if (remoteId) {
    if (!entry.input_id) {
      throw new Error('Error generating timeline entry uuid -- remote_entry_id specified, but no input_id');
    }
    // temp ID first, then swap in the ts to match the v7 sorting
    return getUUIDv7(entry.ts, uuidv5(remoteId, entry.input_id));
  }

  // Only undefined counts as missing: 0 could be an entry type value
  const missing = requiredTimelineEntryFields.filter((field) => entry[field] === undefined);
  if (missing.length > 0) {
    throw new Error(`Missing required fields to append an entry_id:${missing.join(',')}`);
  }

  const ts = new Date(entry.ts);
  // isNaN behaves differently than Number.isNaN -- the coercion of the Date
  // to a number is exactly what we want here
  // eslint-disable-next-line no-restricted-globals
  if (isNaN(ts)) {
    throw new Error(`getTimelineEntryUUID got an invalid date:${entry.ts || '<blank>'}`);
  }

  const idString = `${ts.toISOString()}-${entry.person_id}-${entry.entry_type_id}-${entry.source_code_id || 0}`;
  // temp ID first, then swap in the ts to match the v7 sorting
  return getUUIDv7(ts, uuidv5(idString, entry.input_id));
}
/**
 * Resolve the numeric entry type id for a timeline entry.
 *
 * An explicit `entry_type_id` (on the entry, then on the defaults) wins;
 * otherwise `entry_type` is looked up in TIMELINE_ENTRY_TYPES.
 *
 * NOTE(review): a falsy `entry_type_id` such as 0 is treated as "not set"
 * here -- confirm that is intended.
 *
 * @param {object} o - the timeline entry
 * @param {object} [options]
 * @param {object} [options.defaults={}] - fallback entry_type_id / entry_type
 * @returns {*} the entry type id
 * @throws {Error} when neither an id nor a known entry_type is available
 */
function getEntryTypeId(o, { defaults = {} } = {}) {
  const explicitId = o.entry_type_id || defaults.entry_type_id;
  if (explicitId) return explicitId;

  const typeName = o.entry_type || defaults.entry_type;
  if (!typeName) {
    throw new Error('No entry_type, nor entry_type_id specified, specify a defaultEntryType');
  }

  const mappedId = TIMELINE_ENTRY_TYPES[typeName];
  if (mappedId === undefined) throw new Error(`Invalid entry_type: ${typeName}`);
  return mappedId;
}
// Public API: packet creation and reading helpers, temp-file and timeline
// utilities, UUID helpers, plus re-exports of the uuid functions and the
// timeline entry type table.
module.exports = {
list,
extract,
create,
forEachPerson,
stream,
getManifest,
getFile,
downloadFile,
getTempFilename,
getTimelineOutputStream,
getTimelineEntryUUID,
getPacketDirectory,
getPluginUUID,
getInputUUID,
getUUIDv7,
getUUIDTimestamp,
uuidIsValid,
uuidv4,
uuidv5,
uuidv7,
TIMELINE_ENTRY_TYPES,
getEntryTypeId,
};