@forzalabs/remora
A powerful CLI tool for seamless data translation.
"use strict";
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
return new (P || (P = Promise))(function (resolve, reject) {
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
step((generator = generator.apply(thisArg, _arguments || [])).next());
});
};
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
const os_1 = __importDefault(require("os"));
const fs_1 = __importDefault(require("fs"));
const promises_1 = __importDefault(require("fs/promises"));
const path_1 = __importDefault(require("path"));
const promises_2 = require("stream/promises");
const workerpool_1 = __importDefault(require("workerpool"));
const Affirm_1 = __importDefault(require("../core/Affirm"));
const UsageManager_1 = __importDefault(require("../engines/usage/UsageManager"));
const Helper_1 = __importDefault(require("../helper/Helper"));
const Environment_1 = __importDefault(require("../engines/Environment"));
const ProducerExecutor_1 = __importDefault(require("./ProducerExecutor"));
const Constants_1 = __importDefault(require("../Constants"));
const DriverHelper_1 = __importDefault(require("../drivers/DriverHelper"));
const ConsumerExecutor_1 = __importDefault(require("./ConsumerExecutor"));
const OutputExecutor_1 = __importDefault(require("./OutputExecutor"));
const ConsumerManager_1 = __importDefault(require("../engines/consumer/ConsumerManager"));
const ExecutorPerformance_1 = __importDefault(require("./ExecutorPerformance"));
const ExecutorProgress_1 = __importDefault(require("./ExecutorProgress"));
const Algo_1 = __importDefault(require("../core/Algo"));
const ConsumerOnFinishManager_1 = __importDefault(require("../engines/consumer/ConsumerOnFinishManager"));
const ExecutorScope_1 = __importDefault(require("./ExecutorScope"));
const ProcessENVManager_1 = __importDefault(require("../engines/ProcessENVManager"));
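/**
 * ExecutorOrchestrator coordinates a consumer run end-to-end: it stages producer source files,
 * distributes line-aligned file chunks across a pool of ExecutorWorker threads, merges the
 * per-worker outputs, applies cross-worker post-operations, and exports the final result.
 */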
class ExecutorOrchestratorClass {
constructor() {
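// Lazily create the shared worker pool, capping each worker thread's old-generation heap.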
this.init = () => {
if (!this._executorPool) {
const options = {
workerThreadOpts: {
resourceLimits: {
maxOldGenerationSizeMb: Constants_1.default.defaults.MIN_RUNTIME_HEAP_MB
}
}
};
const workerPath = this._getWorkerPath();
this._executorPool = workerpool_1.default.pool(path_1.default.join(workerPath, 'ExecutorWorker.js'), options);
}
};
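/**
 * Runs a full consumer execution: stages source files, splits each file into line-aligned
 * chunks, fans the chunks out to worker threads, merges and post-processes the results
 * (distinct / distinctOn), exports the output, and records usage.
 */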
this.launch = (request) => __awaiter(this, void 0, void 0, function* () {
var _a, _b;
(0, Affirm_1.default)(request, 'Invalid options');
const { consumer, details, logProgress, options } = request;
(0, Affirm_1.default)(consumer, 'Invalid consumer');
(0, Affirm_1.default)(details, 'Invalid execution details');
const tracker = new ExecutorPerformance_1.default();
const _progress = new ExecutorProgress_1.default(logProgress);
const { usageId } = UsageManager_1.default.startUsage(consumer, details);
const scope = { id: usageId, folder: `${consumer.name}_${usageId}`, workersId: [] };
try {
const start = performance.now();
this.init();
const executorResults = [];
let counter = performance.now();
const sourceFilesByProducer = yield this.readySourceFiles(consumer, scope);
tracker.measure('ready-producers', performance.now() - counter);
let globalWorkerIndex = 0;
for (const pair of sourceFilesByProducer) {
const { prod, cProd, response } = pair;
// Make sure the data file is there; if it's missing and the producer is optional, skip it
if (!fs_1.default.existsSync(response.files[0].fullUri)) {
if (!cProd.isOptional)
throw new Error(`Expected data file ${response.files[0].fullUri} of producer ${prod.name} in consumer ${consumer.name} is missing.`);
continue;
}
console.log('Starting operations on ', response.files[0].fullUri);
// Extract the dimensions for this producer just once
const firstLine = (yield DriverHelper_1.default.quickReadFile(response.files[0].fullUri, 1))[0];
const header = ProducerExecutor_1.default.processHeader(firstLine, prod);
const prodDimensions = ProducerExecutor_1.default.reconcileHeader(header, prod);
const totalFiles = response.files.length;
for (const [fileIndex, file] of response.files.entries()) {
const chunks = this.scopeWork(file.fullUri);
const workerThreads = [];
for (const chunk of chunks) {
// Spawn off thread
const workerId = `${usageId}_${globalWorkerIndex}`;
const currentWorkerIndex = globalWorkerIndex;
globalWorkerIndex++;
const workerData = {
producer: prod,
chunk,
consumer,
prodDimensions,
workerId,
scope,
options
};
_progress.register((currentWorkerIndex + 1).toString(), prod.name, fileIndex, totalFiles);
scope.workersId.push(workerId);
workerThreads.push(this._executorPool.exec('executor', [workerData], {
on: payload => this.onWorkAdvanced(payload, currentWorkerIndex, _progress)
}));
}
executorResults.push(...yield Promise.all(workerThreads));
}
}
// Terminate the pool only once every producer and file has been processed; terminating inside
// the loop would leave later iterations calling exec() on a dead pool. Clearing the reference
// lets init() build a fresh pool on the next launch.
yield this._executorPool.terminate();
this._executorPool = null;
_progress.complete();
if (executorResults.some(x => !Algo_1.default.hasVal(x)))
throw new Error(`${executorResults.filter(x => !Algo_1.default.hasVal(x)).length} worker(s) failed to produce valid results`);
yield this.reconcileExecutorThreadsResults(scope, executorResults, tracker);
// With more than one worker, operations that span multiple lines must be redone now that the per-worker files have been merged into one
const postOperation = { totalOutputCount: null };
if (executorResults.length > 1) {
if (((_a = consumer.options) === null || _a === void 0 ? void 0 : _a.distinct) === true) {
counter = performance.now();
const unifiedOutputCount = yield ConsumerExecutor_1.default.processDistinct(ExecutorScope_1.default.getMainPath(scope));
tracker.measure('process-distinct:main', performance.now() - counter);
postOperation.totalOutputCount = unifiedOutputCount;
}
if ((_b = consumer.options) === null || _b === void 0 ? void 0 : _b.distinctOn) {
counter = performance.now();
const unifiedOutputCount = yield ConsumerExecutor_1.default.processDistinctOn(consumer, ExecutorScope_1.default.getMainPath(scope));
tracker.measure('process-distinct-on:main', performance.now() - counter);
postOperation.totalOutputCount = unifiedOutputCount;
}
}
// Export to the destination
counter = performance.now();
const exportRes = yield OutputExecutor_1.default.exportResult(consumer, ConsumerManager_1.default.getExpandedFields(consumer), scope);
tracker.measure('export-result', performance.now() - counter);
// Perform on-success actions if any
if (consumer.outputs.some(x => x.onSuccess)) {
counter = performance.now();
yield ConsumerOnFinishManager_1.default.onConsumerSuccess(consumer, usageId);
tracker.measure('on-success-actions', performance.now() - counter);
}
yield this.performCleanupOperations(scope, tracker);
const finalResult = this.computeFinalResult(tracker, executorResults, usageId, exportRes.key);
finalResult.elapsedMS = performance.now() - start;
if (Algo_1.default.hasVal(postOperation.totalOutputCount))
finalResult.outputCount = postOperation.totalOutputCount;
UsageManager_1.default.endUsage(usageId, finalResult.outputCount, finalResult);
return finalResult;
}
catch (error) {
yield ConsumerOnFinishManager_1.default.onConsumerError(consumer, usageId);
yield this.performCleanupOperations(scope, tracker);
UsageManager_1.default.failUsage(usageId, Helper_1.default.asError(error).message);
throw error;
}
});
/**
* Calculates line-aligned chunk offsets for parallel file processing.
* Each chunk boundary is adjusted to the next newline to avoid breaking lines.
* Returns a single chunk for small files where parallelism overhead isn't worth it.
*/
this.scopeWork = (fileUri, numChunks) => {
const fileSize = fs_1.default.statSync(fileUri).size;
if (fileSize === 0)
return [];
// Small files: single chunk, parallelism overhead not worth it
if (fileSize < Constants_1.default.defaults.MIN_FILE_SIZE_FOR_PARALLEL) {
return [{ start: 0, end: fileSize, isFirstChunk: true, fileUri }];
}
// Calculate the optimal chunk count from the file size and CPU cores (one core is reserved for the main thread)
const cpus = numChunks !== null && numChunks !== void 0 ? numChunks : (os_1.default.cpus().length - 1);
const maxChunksBySize = Math.floor(fileSize / Constants_1.default.defaults.MIN_CHUNK_SIZE);
const effectiveChunks = Math.min(cpus, maxChunksBySize);
if (effectiveChunks <= 1)
return [{ start: 0, end: fileSize, isFirstChunk: true, fileUri }];
const targetChunkSize = Math.floor(fileSize / effectiveChunks);
const fd = fs_1.default.openSync(fileUri, 'r');
try {
const offsets = [];
let currentStart = 0;
// Emit effectiveChunks - 1 aligned boundaries; the final chunk is appended after the loop
for (let i = 0; i < effectiveChunks - 1; i++) {
const targetEnd = currentStart + targetChunkSize;
// Don't overshoot file size
if (targetEnd >= fileSize) {
break;
}
// Find next newline after target boundary
const alignedEnd = this.findNextNewline(fd, targetEnd, fileSize);
offsets.push({ start: currentStart, end: alignedEnd, isFirstChunk: i === 0, fileUri });
currentStart = alignedEnd;
}
// Final chunk goes to end of file
if (currentStart < fileSize) {
offsets.push({ start: currentStart, end: fileSize, isFirstChunk: offsets.length === 0, fileUri });
}
return offsets;
}
finally {
fs_1.default.closeSync(fd);
}
};
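// Illustrative example (hypothetical sizes; the real thresholds live in Constants): on an
// 8-core machine, a 700 MB file with a 64 MB minimum chunk size yields min(7, 10) = 7 chunks
// of ~100 MB, each boundary pushed forward to the next '\n' so no worker starts mid-line.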
/**
* Efficiently finds the next newline character starting from a position.
* Uses small buffer reads for speed.
*/
this.findNextNewline = (fd, position, fileSize) => {
const BUFFER_SIZE = 8192; // 8KB buffer for scanning
const buffer = Buffer.allocUnsafe(BUFFER_SIZE);
let currentPos = position;
while (currentPos < fileSize) {
const bytesToRead = Math.min(BUFFER_SIZE, fileSize - currentPos);
const bytesRead = fs_1.default.readSync(fd, buffer, 0, bytesToRead, currentPos);
if (bytesRead === 0)
break;
// Scan buffer for newline
for (let i = 0; i < bytesRead; i++) {
if (buffer[i] === 0x0A) { // \n
return currentPos + i + 1; // Position after the newline
}
}
currentPos += bytesRead;
}
// No newline found, return file end
return fileSize;
};
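// Example: a call at position 1000 scans forward in 8 KB reads; if the first '\n' sits at byte 1042, the function returns 1043 (the byte just past the newline).
/**
 * Resolves each producer declared by the consumer and stages its source files via ProducerExecutor.
 */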
this.readySourceFiles = (consumer, scope) => __awaiter(this, void 0, void 0, function* () {
const results = [];
for (let i = 0; i < consumer.producers.length; i++) {
const cProd = consumer.producers[i];
const prod = Environment_1.default.getProducer(cProd.name);
results.push({ prod, cProd, response: yield ProducerExecutor_1.default.ready(prod, scope) });
}
return results;
});
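// Resolves the folder containing ExecutorWorker.js, which differs between dev builds and the published package.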
this._getWorkerPath = () => {
// Get the current file's directory
const currentDir = __dirname;
if (ProcessENVManager_1.default.getEnvVariable('NODE_ENV') === 'dev' || ProcessENVManager_1.default.getEnvVariable('NODE_ENV') === 'development')
return path_1.default.resolve('./.build/workers');
const forcedPath = ProcessENVManager_1.default.getEnvVariable('REMORA_WORKERS_PATH');
if (forcedPath && forcedPath.length > 0)
return path_1.default.join(__dirname, forcedPath);
// Check if we're in a published npm package (no .build in path)
if (!currentDir.includes('.build')) {
// We're in the published package, workers are relative to package root
// __dirname is something like: /path/to/package/executors
// Workers are at /path/to/package/workers (sibling folder)
return path_1.default.join(__dirname, '../workers');
}
else {
// We're in development, workers are in ./.build/workers
return path_1.default.resolve('./.build/workers');
}
};
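/**
 * Merges the per-worker result files into the consumer's main dataset file.
 * A single worker's file is simply renamed; multiple files are streamed into one.
 */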
this.reconcileExecutorThreadsResults = (scope, executorResults, tracker) => __awaiter(this, void 0, void 0, function* () {
const mainPath = ExecutorScope_1.default.getMainPath(scope);
ConsumerExecutor_1.default._ensurePath(mainPath);
// Merge all the various files into a single one
if (executorResults.length > 1) {
const perf = performance.now();
const output = fs_1.default.createWriteStream(mainPath);
output.setMaxListeners(executorResults.length + 1);
for (const workerResult of executorResults) {
yield (0, promises_2.pipeline)(fs_1.default.createReadStream(workerResult.resultUri), output, { end: false });
}
// Wait for the stream to flush and close before recording the merge time
yield new Promise((resolve) => output.end(() => resolve()));
tracker.measure('merge-workers', performance.now() - perf);
}
else if (executorResults.length === 1) {
// If there is only one worker, then just rename the worker .dataset to the general consumer one
yield promises_1.default.rename(executorResults[0].resultUri, mainPath);
}
});
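// Removes the scoped temp folder created for this execution and records the time spent.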
this.performCleanupOperations = (scope, tracker) => __awaiter(this, void 0, void 0, function* () {
const start = performance.now();
yield ExecutorScope_1.default.clearScope(scope);
tracker.measure('cleanup-operations', performance.now() - start);
});
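/**
 * Aggregates per-worker metrics (cycles, input/output counts, per-operation timings) and folds
 * in the orchestrator-level measurements recorded by the performance tracker.
 */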
this.computeFinalResult = (tracker, executorResults, executionId, resultUri) => {
const result = {
cycles: Algo_1.default.max(executorResults.map(x => x.cycles)),
elapsedMS: Algo_1.default.sum(executorResults.map(x => x.elapsedMS)),
inputCount: Algo_1.default.sum(executorResults.map(x => x.inputCount)),
outputCount: Algo_1.default.sum(executorResults.map(x => x.outputCount)),
workerCount: executorResults.length,
executionId,
resultUri,
operations: {}
};
for (const res of executorResults) {
for (const opKey of Object.keys(res.operations)) {
const op = res.operations[opKey];
let label = result.operations[opKey];
if (!label) {
result.operations[opKey] = { avg: -1, max: -1, min: -1, elapsedMS: [] };
label = result.operations[opKey];
}
label.elapsedMS.push(op.elapsedMS);
}
}
// Calculate min, max, avg for each operation once, after all worker data has been collected
for (const opKey of Object.keys(result.operations)) {
const operation = result.operations[opKey];
if (operation.elapsedMS.length > 0) {
operation.min = Math.min(...operation.elapsedMS);
operation.max = Math.max(...operation.elapsedMS);
operation.avg = Algo_1.default.mean(operation.elapsedMS);
}
}
// Add tracker operations to result
const trackerOperations = tracker.getOperations();
for (const opKey of Object.keys(trackerOperations)) {
const trackerOp = trackerOperations[opKey];
const value = trackerOp.elapsedMS;
if (!result.operations[opKey]) {
result.operations[opKey] = { avg: value, max: value, min: value, elapsedMS: [] };
}
result.operations[opKey].elapsedMS.push(value);
}
return result;
};
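// Progress events emitted by a worker (workerpool's `on` event channel; see the exec call above) update the shared progress display.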
this.onWorkAdvanced = (packet, index, progress) => {
const { processed, total } = packet;
progress.update((index + 1).toString(), processed / total);
};
}
}
const ExecutorOrchestrator = new ExecutorOrchestratorClass();
exports.default = ExecutorOrchestrator;
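// Illustrative usage (the shapes of `consumer` and `details` come from the wider codebase and are assumed here):
//   const result = await ExecutorOrchestrator.launch({ consumer, details, logProgress: true, options: {} });
//   console.log(result.outputCount, result.elapsedMS);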