@forzalabs/remora
A powerful CLI tool for seamless data translation.
"use strict";
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
return new (P || (P = Promise))(function (resolve, reject) {
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
step((generator = generator.apply(thisArg, _arguments || [])).next());
});
};
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
const os_1 = __importDefault(require("os"));
const fs_1 = __importDefault(require("fs"));
const promises_1 = __importDefault(require("fs/promises"));
const path_1 = __importDefault(require("path"));
const promises_2 = require("stream/promises");
const workerpool_1 = __importDefault(require("workerpool"));
const Affirm_1 = __importDefault(require("../core/Affirm"));
const UsageManager_1 = __importDefault(require("../engines/usage/UsageManager"));
const Helper_1 = __importDefault(require("../helper/Helper"));
const Environment_1 = __importDefault(require("../engines/Environment"));
const ProducerExecutor_1 = __importDefault(require("./ProducerExecutor"));
const Constants_1 = __importDefault(require("../Constants"));
const DriverHelper_1 = __importDefault(require("../drivers/DriverHelper"));
const ConsumerExecutor_1 = __importDefault(require("./ConsumerExecutor"));
const OutputExecutor_1 = __importDefault(require("./OutputExecutor"));
const ConsumerManager_1 = __importDefault(require("../engines/consumer/ConsumerManager"));
const ExecutorPerformance_1 = __importDefault(require("./ExecutorPerformance"));
const ExecutorProgress_1 = __importDefault(require("./ExecutorProgress"));
const Algo_1 = __importDefault(require("../core/Algo"));
const ConsumerOnFinishManager_1 = __importDefault(require("../engines/consumer/ConsumerOnFinishManager"));
const ExecutorScope_1 = __importDefault(require("./ExecutorScope"));
const ProcessENVManager_1 = __importDefault(require("../engines/ProcessENVManager"));
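/**
 * ExecutorOrchestrator coordinates a consumer run end-to-end: it stages producer source files,
 * distributes line-aligned file chunks across a pool of ExecutorWorker threads, merges the
 * per-worker outputs, applies cross-worker post-operations, and exports the final result.
 */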
class ExecutorOrchestratorClass {
constructor() {
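// Lazily create the shared worker pool, capping each worker thread's old-generation heap.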
this.init = () => {
if (!this._executorPool) {
const options = {
workerThreadOpts: {
resourceLimits: {
maxOldGenerationSizeMb: Constants_1.default.defaults.MIN_RUNTIME_HEAP_MB
}
}
};
const workerPath = this._getWorkerPath();
this._executorPool = workerpool_1.default.pool(path_1.default.join(workerPath, 'ExecutorWorker.js'), options);
}
};
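/**
 * Runs a full consumer execution: stages source files, splits each file into line-aligned
 * chunks, fans the chunks out to worker threads, merges and post-processes the results
 * (distinct / distinctOn), exports the output, and records usage.
 */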
this.launch = (request) => __awaiter(this, void 0, void 0, function* () {
var _a, _b;
(0, Affirm_1.default)(request, 'Invalid options');
const { consumer, details, logProgress, options } = request;
(0, Affirm_1.default)(consumer, 'Invalid consumer');
(0, Affirm_1.default)(details, 'Invalid execution details');
const tracker = new ExecutorPerformance_1.default();
const _progress = new ExecutorProgress_1.default(logProgress);
const { usageId } = UsageManager_1.default.startUsage(consumer, details);
const scope = { id: usageId, folder: `${consumer.name}_${usageId}`, workersId: [] };
try {
const start = performance.now();
this.init();
const executorResults = [];
let counter = performance.now();
const sourceFilesByProducer = yield this.readySourceFiles(consumer, scope);
tracker.measure('ready-producers', performance.now() - counter);
let globalWorkerIndex = 0;
for (const pair of sourceFilesByProducer) {
const { prod, cProd, response } = pair;
// Make sure the data file is there; if it's missing and the producer is optional, skip it
if (!fs_1.default.existsSync(response.files[0].fullUri)) {
if (!cProd.isOptional)
throw new Error(`Expected data file ${response.files[0].fullUri} of producer ${prod.name} in consumer ${consumer.name} is missing.`);
continue;
}
console.log('Starting operations on ', response.files[0].fullUri);
// Extract the dimensions for this producer just once
const firstLine = (yield DriverHelper_1.default.quickReadFile(response.files[0].fullUri, 1))[0];
const header = ProducerExecutor_1.default.processHeader(firstLine, prod);
const prodDimensions = ProducerExecutor_1.default.reconcileHeader(header, prod);
const totalFiles = response.files.length;
for (const [fileIndex, file] of response.files.entries()) {
const chunks = this.scopeWork(file.fullUri);
const workerThreads = [];
for (const chunk of chunks) {
// Spawn off thread
const workerId = `${usageId}_${globalWorkerIndex}`;
const currentWorkerIndex = globalWorkerIndex;
globalWorkerIndex++;
const workerData = {
producer: prod,
chunk,
consumer,
prodDimensions,
workerId,
scope,
options
};
_progress.register((currentWorkerIndex + 1).toString(), prod.name, fileIndex, totalFiles);
scope.workersId.push(workerId);
workerThreads.push(this._executorPool.exec('executor', [workerData], {
on: payload => this.onWorkAdvanced(payload, currentWorkerIndex, _progress)
}));
}
executorResults.push(...yield Promise.all(workerThreads));
}
}
// Terminate the pool only once every producer and file has been processed; terminating inside
// the loop would leave later iterations calling exec() on a dead pool. Clearing the reference
// lets init() build a fresh pool on the next launch.
yield this._executorPool.terminate();
this._executorPool = null;
_progress.complete();
if (executorResults.some(x => !Algo_1.default.hasVal(x)))
throw new Error(`${executorResults.filter(x => !Algo_1.default.hasVal(x)).length} worker(s) failed to produce valid results`);
yield this.reconcileExecutorThreadsResults(scope, executorResults, tracker);
// With more than one worker, operations that span multiple lines must be redone now that the per-worker files have been merged into one
const postOperation = { totalOutputCount: null };
if (executorResults.length > 1) {
if (((_a = consumer.options) === null || _a === void 0 ? void 0 : _a.distinct) === true) {
counter = performance.now();
const unifiedOutputCount = yield ConsumerExecutor_1.default.processDistinct(ExecutorScope_1.default.getMainPath(scope));
tracker.measure('process-distinct:main', performance.now() - counter);
postOperation.totalOutputCount = unifiedOutputCount;
}
if ((_b = consumer.options) === null || _b === void 0 ? void 0 : _b.distinctOn) {
counter = performance.now();
const unifiedOutputCount = yield ConsumerExecutor_1.default.processDistinctOn(consumer, ExecutorScope_1.default.getMainPath(scope));
tracker.measure('process-distinct-on:main', performance.now() - counter);
postOperation.totalOutputCount = unifiedOutputCount;
}
}
// Export to the destination
counter = performance.now();
const exportRes = yield OutputExecutor_1.default.exportResult(consumer, ConsumerManager_1.default.getExpandedFields(consumer), scope);
tracker.measure('export-result', performance.now() - counter);
// Perform on-success actions if any
if (consumer.outputs.some(x => x.onSuccess)) {
counter = performance.now();
yield ConsumerOnFinishManager_1.default.onConsumerSuccess(consumer, usageId);
tracker.measure('on-success-actions', performance.now() - counter);
}
yield this.performCleanupOperations(scope, tracker);
const finalResult = this.computeFinalResult(tracker, executorResults, usageId, exportRes.key);
finalResult.elapsedMS = performance.now() - start;
if (Algo_1.default.hasVal(postOperation.totalOutputCount))
finalResult.outputCount = postOperation.totalOutputCount;
UsageManager_1.default.endUsage(usageId, finalResult.outputCount, finalResult);
return finalResult;
}
catch (error) {
yield ConsumerOnFinishManager_1.default.onConsumerError(consumer, usageId);
yield this.performCleanupOperations(scope, tracker);
UsageManager_1.default.failUsage(usageId, Helper_1.default.asError(error).message);
throw error;
}
});
/**
* Calculates line-aligned chunk offsets for parallel file processing.
* Each chunk boundary is adjusted to the next newline to avoid breaking lines.
* Returns a single chunk for small files where parallelism overhead isn't worth it.
*/
this.scopeWork = (fileUri, numChunks) => {
const fileSize = fs_1.default.statSync(fileUri).size;
if (fileSize === 0)
return [];
// Small files: single chunk, parallelism overhead not worth it
if (fileSize < Constants_1.default.defaults.MIN_FILE_SIZE_FOR_PARALLEL) {
return [{ start: 0, end: fileSize, isFirstChunk: true, fileUri }];
}
// Calculate the optimal chunk count from the file size and CPU cores (one core is reserved for the main thread)
const cpus = numChunks !== null && numChunks !== void 0 ? numChunks : (os_1.default.cpus().length - 1);
const maxChunksBySize = Math.floor(fileSize / Constants_1.default.defaults.MIN_CHUNK_SIZE);
const effectiveChunks = Math.min(cpus, maxChunksBySize);
if (effectiveChunks <= 1)
return [{ start: 0, end: fileSize, isFirstChunk: true, fileUri }];
const targetChunkSize = Math.floor(fileSize / effectiveChunks);
const fd = fs_1.default.openSync(fileUri, 'r');
try {
const offsets = [];
let currentStart = 0;
// Emit effectiveChunks - 1 aligned boundaries; the final chunk is appended after the loop
for (let i = 0; i < effectiveChunks - 1; i++) {
const targetEnd = currentStart + targetChunkSize;
// Don't overshoot file size
if (targetEnd >= fileSize) {
break;
}
// Find next newline after target boundary
const alignedEnd = this.findNextNewline(fd, targetEnd, fileSize);
offsets.push({ start: currentStart, end: alignedEnd, isFirstChunk: i === 0, fileUri });
currentStart = alignedEnd;
}
// Final chunk goes to end of file
if (currentStart < fileSize) {
offsets.push({ start: currentStart, end: fileSize, isFirstChunk: offsets.length === 0, fileUri });
}
return offsets;
}
finally {
fs_1.default.closeSync(fd);
}
};
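// Illustrative example (hypothetical sizes; the real thresholds live in Constants): on an
// 8-core machine, a 700 MB file with a 64 MB minimum chunk size yields min(7, 10) = 7 chunks
// of ~100 MB, each boundary pushed forward to the next '\n' so no worker starts mid-line.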
/**
* Efficiently finds the next newline character starting from a position.
* Uses small buffer reads for speed.
*/
this.findNextNewline = (fd, position, fileSize) => {
const BUFFER_SIZE = 8192; // 8KB buffer for scanning
const buffer = Buffer.allocUnsafe(BUFFER_SIZE);
let currentPos = position;
while (currentPos < fileSize) {
const bytesToRead = Math.min(BUFFER_SIZE, fileSize - currentPos);
const bytesRead = fs_1.default.readSync(fd, buffer, 0, bytesToRead, currentPos);
if (bytesRead === 0)
break;
// Scan buffer for newline
for (let i = 0; i < bytesRead; i++) {
if (buffer[i] === 0x0A) { // \n
return currentPos + i + 1; // Position after the newline
}
}
currentPos += bytesRead;
}
// No newline found, return file end
return fileSize;
};
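// Example: a call at position 1000 scans forward in 8 KB reads; if the first '\n' sits at byte 1042, the function returns 1043 (the byte just past the newline).
/**
 * Resolves each producer declared by the consumer and stages its source files via ProducerExecutor.
 */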
this.readySourceFiles = (consumer, scope) => __awaiter(this, void 0, void 0, function* () {
const results = [];
for (let i = 0; i < consumer.producers.length; i++) {
const cProd = consumer.producers[i];
const prod = Environment_1.default.getProducer(cProd.name);
results.push({ prod, cProd, response: yield ProducerExecutor_1.default.ready(prod, scope) });
}
return results;
});
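// Resolves the folder containing ExecutorWorker.js, which differs between dev builds and the published package.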
this._getWorkerPath = () => {
// Get the current file's directory
const currentDir = __dirname;
if (ProcessENVManager_1.default.getEnvVariable('NODE_ENV') === 'dev' || ProcessENVManager_1.default.getEnvVariable('NODE_ENV') === 'development')
return path_1.default.resolve('./.build/workers');
const forcedPath = ProcessENVManager_1.default.getEnvVariable('REMORA_WORKERS_PATH');
if (forcedPath && forcedPath.length > 0)
return path_1.default.join(__dirname, forcedPath);
// Check if we're in a published npm package (no .build in path)
if (!currentDir.includes('.build')) {
// We're in the published package, workers are relative to package root
// __dirname is something like: /path/to/package/executors
// Workers are at /path/to/package/workers (sibling folder)
return path_1.default.join(__dirname, '../workers');
}
else {
// We're in development, workers are in ./.build/workers
return path_1.default.resolve('./.build/workers');
}
};
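/**
 * Merges the per-worker result files into the consumer's main dataset file.
 * A single worker's file is simply renamed; multiple files are streamed into one.
 */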
this.reconcileExecutorThreadsResults = (scope, executorResults, tracker) => __awaiter(this, void 0, void 0, function* () {
const mainPath = ExecutorScope_1.default.getMainPath(scope);
ConsumerExecutor_1.default._ensurePath(mainPath);
// Merge all the various files into a single one
if (executorResults.length > 1) {
const perf = performance.now();
const output = fs_1.default.createWriteStream(mainPath);
output.setMaxListeners(executorResults.length + 1);
for (const workerResult of executorResults) {
yield (0, promises_2.pipeline)(fs_1.default.createReadStream(workerResult.resultUri), output, { end: false });
}
// Wait for the stream to flush and close before recording the merge time
yield new Promise((resolve) => output.end(() => resolve()));
tracker.measure('merge-workers', performance.now() - perf);
}
else if (executorResults.length === 1) {
// If there is only one worker, then just rename the worker .dataset to the general consumer one
yield promises_1.default.rename(executorResults[0].resultUri, mainPath);
}
});
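// Removes the scoped temp folder created for this execution and records the time spent.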
this.performCleanupOperations = (scope, tracker) => __awaiter(this, void 0, void 0, function* () {
const start = performance.now();
yield ExecutorScope_1.default.clearScope(scope);
tracker.measure('cleanup-operations', performance.now() - start);
});
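/**
 * Aggregates per-worker metrics (cycles, input/output counts, per-operation timings) and folds
 * in the orchestrator-level measurements recorded by the performance tracker.
 */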
this.computeFinalResult = (tracker, executorResults, executionId, resultUri) => {
const result = {
cycles: Algo_1.default.max(executorResults.map(x => x.cycles)),
elapsedMS: Algo_1.default.sum(executorResults.map(x => x.elapsedMS)),
inputCount: Algo_1.default.sum(executorResults.map(x => x.inputCount)),
outputCount: Algo_1.default.sum(executorResults.map(x => x.outputCount)),
workerCount: executorResults.length,
executionId,
resultUri,
operations: {}
};
for (const res of executorResults) {
for (const opKey of Object.keys(res.operations)) {
const op = res.operations[opKey];
let label = result.operations[opKey];
if (!label) {
result.operations[opKey] = { avg: -1, max: -1, min: -1, elapsedMS: [] };
label = result.operations[opKey];
}
label.elapsedMS.push(op.elapsedMS);
}
}
// Calculate min, max, avg for each operation once, after all worker data has been collected
for (const opKey of Object.keys(result.operations)) {
const operation = result.operations[opKey];
if (operation.elapsedMS.length > 0) {
operation.min = Math.min(...operation.elapsedMS);
operation.max = Math.max(...operation.elapsedMS);
operation.avg = Algo_1.default.mean(operation.elapsedMS);
}
}
// Add tracker operations to result
const trackerOperations = tracker.getOperations();
for (const opKey of Object.keys(trackerOperations)) {
const trackerOp = trackerOperations[opKey];
const value = trackerOp.elapsedMS;
if (!result.operations[opKey]) {
result.operations[opKey] = { avg: value, max: value, min: value, elapsedMS: [] };
}
result.operations[opKey].elapsedMS.push(value);
}
return result;
};
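// Progress events emitted by a worker (workerpool's `on` event channel; see the exec call above) update the shared progress display.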
this.onWorkAdvanced = (packet, index, progress) => {
const { processed, total } = packet;
progress.update((index + 1).toString(), processed / total);
};
}
}
const ExecutorOrchestrator = new ExecutorOrchestratorClass();
exports.default = ExecutorOrchestrator;
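// Illustrative usage (the shapes of `consumer` and `details` come from the wider codebase and are assumed here):
//   const result = await ExecutorOrchestrator.launch({ consumer, details, logProgress: true, options: {} });
//   console.log(result.outputCount, result.elapsedMS);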