/*
 * @forzalabs/remora
 * Version:
 * A powerful CLI tool for seamless data translation.
 * 177 lines (176 loc) • 9.07 kB
 * JavaScript
 */
;
/**
 * Compiler-emitted helper that drives a generator as if it were an async function.
 * An already-installed `this.__awaiter` is reused when present.
 *
 * @param {*} thisArg - `this` value the generator function is invoked with.
 * @param {Array|undefined} _arguments - Arguments for the generator function (defaults to none).
 * @param {Function|undefined} P - Promise constructor to use; falls back to the global `Promise`.
 * @param {Function} generator - Generator function whose yielded values are awaited.
 * @returns {Promise} Settles with the generator's return value, or rejects with what it throws.
 */
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    if (!P) {
        P = Promise;
    }
    // Yielded values that are not already instances of P get wrapped in a resolved P.
    const toPromise = (value) => {
        if (value instanceof P) {
            return value;
        }
        return new P((resolve) => { resolve(value); });
    };
    return new P((resolve, reject) => {
        // Resume the generator through `next` or `throw`; anything thrown while
        // resuming or while handling the result rejects the outer promise.
        const drive = (method, input) => {
            try {
                handle(generator[method](input));
            }
            catch (err) {
                reject(err);
            }
        };
        // Finish when the generator is done, otherwise wait for the yielded value.
        const handle = (result) => {
            if (result.done) {
                resolve(result.value);
                return;
            }
            toPromise(result.value).then(
                (value) => { drive("next", value); },
                (reason) => { drive("throw", reason); }
            );
        };
        generator = generator.apply(thisArg, _arguments || []);
        handle(generator.next());
    });
};
/**
 * Compiler-emitted helper used by the down-leveled `for await` loop in this file:
 * returns an async iterator for `o`. An already-installed `this.__asyncValues`
 * is reused when present.
 *
 * @param {object} o - Value to iterate. Its `Symbol.asyncIterator` method is used when
 *   defined; otherwise its synchronous iterator is wrapped.
 * @returns {object} The result of `o[Symbol.asyncIterator]()`, or a wrapper object whose
 *   `next`/`throw`/`return` methods return promises of `{ value, done }`.
 * @throws {TypeError} When the runtime does not define `Symbol.asyncIterator`.
 */
var __asyncValues = (this && this.__asyncValues) || function (o) {
if (!Symbol.asyncIterator) throw new TypeError("Symbol.asyncIterator is not defined.");
var m = o[Symbol.asyncIterator], i;
// Native async iterable: delegate to it. Otherwise replace `o` with its sync iterator
// (through a `__values` helper if one is in scope, else `o[Symbol.iterator]()`), build the
// wrapper `i` with next/throw/return, make `i` its own async iterator, and return it.
return m ? m.call(o) : (o = typeof __values === "function" ? __values(o) : o[Symbol.iterator](), i = {}, verb("next"), verb("throw"), verb("return"), i[Symbol.asyncIterator] = function () { return this; }, i);
// Installs `i[n]` only when the sync iterator has method `n`; the wrapper calls that method
// and settles a promise from the `{ done, value }` it returns.
function verb(n) { i[n] = o[n] && function (v) { return new Promise(function (resolve, reject) { v = o[n](v), settle(resolve, reject, v.done, v.value); }); }; }
// Waits for `v` (which may itself be a promise) and resolves with `{ value, done }`;
// a rejection of `v` is forwarded to `reject`.
function settle(resolve, reject, d, v) { Promise.resolve(v).then(function(v) { resolve({ value: v, done: d }); }, reject); }
};
/**
 * Compiler-emitted interop helper for default imports of CommonJS modules.
 * An already-installed `this.__importDefault` is reused when present.
 *
 * @param {*} mod - Value returned by `require`.
 * @returns {*} `mod` itself when it is truthy and flagged `__esModule`; otherwise
 *   a new object exposing `mod` under the `default` key.
 */
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
const fs_1 = __importDefault(require("fs"));
const readline_1 = __importDefault(require("readline"));
const ProducerExecutor_1 = __importDefault(require("./ProducerExecutor"));
const ConsumerExecutor_1 = __importDefault(require("./ConsumerExecutor"));
const Affirm_1 = __importDefault(require("../core/Affirm"));
const OutputExecutor_1 = __importDefault(require("./OutputExecutor"));
const ConsumerManager_1 = __importDefault(require("../engines/consumer/ConsumerManager"));
const ExecutorPerformance_1 = __importDefault(require("./ExecutorPerformance"));
const ExecutorScope_1 = __importDefault(require("./ExecutorScope"));
/**
 * Processes one chunk of a producer file: reads it line by line, runs each line
 * through the producer, consumer and output executors, writes the resulting lines
 * to the worker file, then applies the consumer's multi-line operations
 * (`distinct`, `distinctOn`) to that file.
 */
class Executor {
    constructor() {
        // Progress is reported when the line index is a multiple of this value.
        this._REPORT_WORK_AFTER_LINES = 1000;
        /**
         * 1. check and ready the local file for processing
         * 2. open read stream and write stream
         * 3. process the file
         * 4. flush and close the output file
         * 5. cleanup and after execution actions
         *
         * @param {object} request - Reads `consumer`, `producer`, `prodDimensions`, `workerId`,
         *   `chunk` (`fileUri`, `start`, `end`, `isFirstChunk`), `options`, `scope` and the
         *   optional `reportWork` callback.
         * @returns {Promise<object>} Result with `executionId`, `cycles`, `elapsedMS`,
         *   `inputCount` (lines read, including skipped ones), `outputCount` (lines written, or
         *   the count returned by the distinct operations), `resultUri` and `operations`.
         * @throws Rejects when `request` is falsy, when any executor throws, or when writing
         *   the output file fails; both streams are destroyed before the error propagates.
         */
        this.run = (request) => __awaiter(this, void 0, void 0, function* () {
            var _a, e_1, _b, _c;
            var _d, _e;
            (0, Affirm_1.default)(request, 'Invalid request');
            const { consumer, producer, prodDimensions, workerId, chunk, options, scope, reportWork } = request;
            const counter = performance.now();
            const result = {
                executionId: workerId,
                cycles: 1,
                elapsedMS: -1,
                inputCount: -1,
                outputCount: -1,
                resultUri: ExecutorScope_1.default.getWorkerPath(scope, workerId),
                operations: {}
            };
            ExecutorScope_1.default.ensurePath(result.resultUri);
            let totalOutputCount = 0, totalCycles = 1, perf = 0, lineIndex = 0;
            const readStream = this.openReadStream(chunk);
            const writeStream = this.openWriteStream(scope, workerId);
            // Record the first write error instead of letting it surface as an unhandled
            // 'error' event; it is rethrown from the processing loop or the flush step.
            let writeFailure = null;
            writeStream.on('error', (err) => {
                if (writeFailure === null) {
                    writeFailure = { error: err };
                }
            });
            try {
                const fields = ConsumerManager_1.default.getExpandedFields(consumer);
                const { isFirstChunk, start, end } = chunk;
                const totalBytes = end - start;
                let processedBytes = 0;
                // Process all the line-independent operations of the consumer in a single pass
                const lineStream = readline_1.default.createInterface({ input: readStream, crlfDelay: Infinity });
                try {
                    for (var _f = true, lineStream_1 = __asyncValues(lineStream), lineStream_1_1; lineStream_1_1 = yield lineStream_1.next(), _a = lineStream_1_1.done, !_a; _f = true) {
                        _c = lineStream_1_1.value;
                        _f = false;
                        const line = _c;
                        if (lineIndex === 0 && isFirstChunk) {
                            if (!this.shouldProcessFirstLine(producer)) {
                                lineIndex++;
                                continue;
                            }
                        }
                        perf = performance.now();
                        let record = ProducerExecutor_1.default.processLine({
                            dimensions: prodDimensions,
                            index: lineIndex,
                            line,
                            producer,
                            tracker: this._performance
                        });
                        this._performance.measure('process-line', performance.now() - perf);
                        if (!record) {
                            lineIndex++;
                            continue;
                        }
                        perf = performance.now();
                        record = ConsumerExecutor_1.default.processRecord({
                            record,
                            index: lineIndex,
                            consumer: consumer,
                            fields,
                            producer,
                            dimensions: prodDimensions,
                            requestOptions: options
                        });
                        this._performance.measure('process-record', performance.now() - perf);
                        if (!record) {
                            lineIndex++;
                            continue;
                        }
                        perf = performance.now();
                        const outputLine = OutputExecutor_1.default.outputRecord(record, consumer, fields);
                        this._performance.measure('output-record', performance.now() - perf);
                        // Stop early instead of processing the rest of the chunk into a broken stream
                        if (writeFailure !== null) {
                            throw writeFailure.error;
                        }
                        writeStream.write(outputLine + '\n');
                        totalOutputCount++;
                        lineIndex++;
                        // Report progress to the main thread
                        // NOTE(review): lines skipped by the `continue` statements above never reach
                        // this check, so a report is missed when a multiple of the interval falls on
                        // a skipped line - confirm whether that is intended.
                        if (reportWork && lineIndex % this._REPORT_WORK_AFTER_LINES === 0) {
                            processedBytes = Math.min(readStream.bytesRead, totalBytes);
                            reportWork({ processed: processedBytes, total: totalBytes, workerId: workerId });
                        }
                    }
                }
                catch (e_1_1) { e_1 = { error: e_1_1 }; }
                finally {
                    try {
                        if (!_f && !_a && (_b = lineStream_1.return)) yield _b.call(lineStream_1);
                    }
                    finally { if (e_1) throw e_1.error; }
                }
                // Flush and close the output file before anything reads it back: the
                // multi-line operations below (and the caller) work on `result.resultUri`.
                yield new Promise((resolve, reject) => {
                    if (writeFailure !== null) {
                        reject(writeFailure.error);
                        return;
                    }
                    writeStream.once('error', reject);
                    writeStream.once('finish', resolve);
                    writeStream.end();
                });
            }
            catch (err) {
                // Release both file handles on failure, then let the caller see the error
                readStream.destroy();
                writeStream.destroy();
                throw err;
            }
            // Process the operations that work on multiple lines
            if (((_d = consumer.options) === null || _d === void 0 ? void 0 : _d.distinct) === true) {
                perf = performance.now();
                totalOutputCount = yield ConsumerExecutor_1.default.processDistinct(result.resultUri);
                this._performance.measure('process-distinct', performance.now() - perf);
                totalCycles++;
            }
            if ((_e = consumer.options) === null || _e === void 0 ? void 0 : _e.distinctOn) {
                perf = performance.now();
                totalOutputCount = yield ConsumerExecutor_1.default.processDistinctOn(consumer, result.resultUri);
                this._performance.measure('process-distinct-on', performance.now() - perf);
                totalCycles++;
            }
            result.elapsedMS = performance.now() - counter;
            result.cycles = totalCycles;
            result.inputCount = lineIndex;
            result.outputCount = totalOutputCount;
            result.operations = this._performance.getOperations();
            return result;
        });
        /**
         * Opens a read stream over the chunk's byte range.
         * @param {object} chunk - Provides `fileUri`, `start` and `end`, passed straight to `fs.createReadStream`.
         * @returns {fs.ReadStream}
         */
        this.openReadStream = (chunk) => {
            const { end, fileUri, start } = chunk;
            return fs_1.default.createReadStream(fileUri, { start, end: end });
        };
        /**
         * Opens a write stream on the worker path resolved from `scope` and `workerId`.
         * @returns {fs.WriteStream}
         */
        this.openWriteStream = (scope, workerId) => {
            const workerPath = ExecutorScope_1.default.getWorkerPath(scope, workerId);
            return fs_1.default.createWriteStream(workerPath);
        };
        /**
         * Tells whether the first line of the first chunk is passed to the line processors.
         * @param {object} producer - Must expose `settings.fileType` and `settings.hasHeaderRow`.
         * @returns {boolean|undefined} `false` for PARQUET, XML, XLS, XLSX and CSV; for TXT the
         *   negation of `hasHeaderRow`; `true` for JSON and JSONL; `undefined` for any other
         *   file type (which `run` treats as "skip the first line").
         * @throws When `producer` is falsy.
         */
        this.shouldProcessFirstLine = (producer) => {
            (0, Affirm_1.default)(producer, 'Invalid producer');
            const { settings: { fileType, hasHeaderRow } } = producer;
            switch (fileType) {
                case 'PARQUET':
                case 'XML':
                case 'XLS':
                case 'XLSX':
                case 'CSV':
                    return false;
                case 'TXT':
                    return !hasHeaderRow;
                case 'JSON':
                case 'JSONL':
                    return true;
                // NOTE(review): no default branch - unlisted file types yield `undefined`.
            }
        };
        this._performance = new ExecutorPerformance_1.default();
    }
}
exports.default = Executor;