@forzalabs/remora

A powerful CLI tool for seamless data translation.

"use strict";
Object.defineProperty(exports, "__esModule", { value: true });

const fs = require("fs");
const readline = require("readline");
const ProducerExecutor = require("./ProducerExecutor").default;
const ConsumerExecutor = require("./ConsumerExecutor").default;
const Affirm = require("../core/Affirm").default;
const OutputExecutor = require("./OutputExecutor").default;
const ConsumerManager = require("../engines/consumer/ConsumerManager").default;
const ExecutorPerformance = require("./ExecutorPerformance").default;
const ExecutorScope = require("./ExecutorScope").default;

class Executor {
    constructor() {
        this._REPORT_WORK_AFTER_LINES = 1000;

        /**
         * 1. Check and ready the local file for processing.
         * 2. Open the read stream and write stream.
         * 3. Process the file.
         * 4. Clean up and run after-execution actions.
         */
        this.run = async (request) => {
            Affirm(request, 'Invalid request');
            const { consumer, producer, prodDimensions, workerId, chunk, options, scope, reportWork } = request;
            const counter = performance.now();
            const result = {
                executionId: workerId,
                cycles: 1,
                elapsedMS: -1,
                inputCount: -1,
                outputCount: -1,
                resultUri: ExecutorScope.getWorkerPath(scope, workerId),
                operations: {}
            };
            ExecutorScope.ensurePath(result.resultUri);

            let totalOutputCount = 0, totalCycles = 1, perf = 0, lineIndex = 0;
            const readStream = this.openReadStream(chunk);
            const writeStream = this.openWriteStream(scope, workerId);
            const fields = ConsumerManager.getExpandedFields(consumer);
            const { isFirstChunk, start, end } = chunk;
            const totalBytes = end - start;
            let processedBytes = 0;

            // Process all the line-independent operations of the consumer in a single pass
            const lineStream = readline.createInterface({ input: readStream, crlfDelay: Infinity });
            for await (const line of lineStream) {
                if (lineIndex === 0 && isFirstChunk) {
                    if (!this.shouldProcessFirstLine(producer)) {
                        lineIndex++;
                        continue;
                    }
                }

                perf = performance.now();
                let record = ProducerExecutor.processLine({
                    dimensions: prodDimensions,
                    index: lineIndex,
                    line,
                    producer,
                    tracker: this._performance
                });
                this._performance.measure('process-line', performance.now() - perf);
                if (!record) {
                    lineIndex++;
                    continue;
                }

                perf = performance.now();
                record = ConsumerExecutor.processRecord({
                    record,
                    index: lineIndex,
                    consumer: consumer,
                    fields,
                    producer,
                    dimensions: prodDimensions,
                    requestOptions: options
                });
                this._performance.measure('process-record', performance.now() - perf);
                if (!record) {
                    lineIndex++;
                    continue;
                }

                perf = performance.now();
                const outputLine = OutputExecutor.outputRecord(record, consumer, fields);
                this._performance.measure('output-record', performance.now() - perf);
                writeStream.write(outputLine + '\n');
                totalOutputCount++;
                lineIndex++;

                // Report progress to the main thread
                if (reportWork && lineIndex % this._REPORT_WORK_AFTER_LINES === 0) {
                    processedBytes = Math.min(readStream.bytesRead, totalBytes);
                    reportWork({ processed: processedBytes, total: totalBytes, workerId: workerId });
                }
            }

            // Process the operations that work on multiple lines
            if (consumer.options?.distinct === true) {
                perf = performance.now();
                totalOutputCount = await ConsumerExecutor.processDistinct(result.resultUri);
                this._performance.measure('process-distinct', performance.now() - perf);
                totalCycles++;
            }
            if (consumer.options?.distinctOn) {
                perf = performance.now();
                totalOutputCount = await ConsumerExecutor.processDistinctOn(consumer, result.resultUri);
                this._performance.measure('process-distinct-on', performance.now() - perf);
                totalCycles++;
            }

            result.elapsedMS = performance.now() - counter;
            result.cycles = totalCycles;
            result.inputCount = lineIndex;
            result.outputCount = totalOutputCount;
            result.operations = this._performance.getOperations();
            return result;
        };

        this.openReadStream = (chunk) => {
            const { end, fileUri, start } = chunk;
            return fs.createReadStream(fileUri, { start, end });
        };

        this.openWriteStream = (scope, workerId) => {
            const workerPath = ExecutorScope.getWorkerPath(scope, workerId);
            return fs.createWriteStream(workerPath);
        };

        this.shouldProcessFirstLine = (producer) => {
            Affirm(producer, 'Invalid producer');
            const { settings: { fileType, hasHeaderRow } } = producer;
            switch (fileType) {
                case 'PARQUET':
                case 'XML':
                case 'XLS':
                case 'XLSX':
                case 'CSV':
                    return false;
                case 'TXT': {
                    if (hasHeaderRow) return false;
                    else return true;
                }
                case 'JSON':
                case 'JSONL':
                    return true;
            }
        };

        this._performance = new ExecutorPerformance();
    }
}

exports.default = Executor;
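
For orientation, here is a minimal sketch of how this worker might be driven, assuming the surrounding package wiring. The field names on request and chunk mirror what run() destructures above; the producer, consumer and scope objects (and the require path) are illustrative placeholders, not the package's real schemas.

// Hypothetical driver for Executor.run — placeholder shapes only.
const Executor = require("./Executor").default; // assumed file name

const executor = new Executor();

const request = {
    workerId: 0,
    // Byte range of the input file handled by this worker (read by openReadStream).
    chunk: { fileUri: "/tmp/input.csv", start: 0, end: 1024 * 1024, isFirstChunk: true },
    // Placeholder producer: only settings.fileType / settings.hasHeaderRow are read directly here.
    producer: { settings: { fileType: "CSV", hasHeaderRow: true } },
    prodDimensions: {},   // passed through to ProducerExecutor.processLine
    // Placeholder consumer: options.distinct / options.distinctOn gate the multi-line passes.
    consumer: { options: { distinct: false } },
    options: {},          // request options forwarded to ConsumerExecutor.processRecord
    scope: {},            // consumed by ExecutorScope.getWorkerPath / ensurePath
    reportWork: ({ processed, total, workerId }) =>
        console.log(`worker ${workerId}: ${processed}/${total} bytes`),
};

executor.run(request).then((result) => {
    console.log(result.inputCount, result.outputCount, result.elapsedMS, result.resultUri);
});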