@forzalabs/remora

A powerful CLI tool for seamless data translation.

825 lines (824 LOC), 40.4 kB
"use strict"; var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } return new (P || (P = Promise))(function (resolve, reject) { function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } step((generator = generator.apply(thisArg, _arguments || [])).next()); }); }; var __asyncValues = (this && this.__asyncValues) || function (o) { if (!Symbol.asyncIterator) throw new TypeError("Symbol.asyncIterator is not defined."); var m = o[Symbol.asyncIterator], i; return m ? m.call(o) : (o = typeof __values === "function" ? __values(o) : o[Symbol.iterator](), i = {}, verb("next"), verb("throw"), verb("return"), i[Symbol.asyncIterator] = function () { return this; }, i); function verb(n) { i[n] = o[n] && function (v) { return new Promise(function (resolve, reject) { v = o[n](v), settle(resolve, reject, v.done, v.value); }); }; } function settle(resolve, reject, d, v) { Promise.resolve(v).then(function(v) { resolve({ value: v, done: d }); }, reject); } }; var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; Object.defineProperty(exports, "__esModule", { value: true }); const path_1 = __importDefault(require("path")); const fs_1 = __importDefault(require("fs")); const fs_2 = require("fs"); const readline_1 = require("readline"); const Constants_1 = __importDefault(require("../../Constants")); const DatasetManager_1 = __importDefault(require("./DatasetManager")); const DatasetRecord_1 = __importDefault(require("./DatasetRecord")); const DatasetRecordPool_1 = __importDefault(require("./DatasetRecordPool")); const Affirm_1 = __importDefault(require("../../core/Affirm")); const DriverFactory_1 = __importDefault(require("../../drivers/DriverFactory")); const Helper_1 = __importDefault(require("../../helper/Helper")); const Algo_1 = __importDefault(require("../../core/Algo")); const Environment_1 = __importDefault(require("../Environment")); const Logger_1 = __importDefault(require("../../helper/Logger")); const ProducerManager_1 = __importDefault(require("../producer/ProducerManager")); class Dataset { constructor(options) { var _a, _b; this.getPath = () => this._path; this.setPath = (path) => { this._path = path; return this; }; this.getFile = () => this._file; this.getExecutionId = () => this._executionId; this.getBatchSize = () => this._batchSize; this.setFirstLine = (firstLine) => { this._firstLine = firstLine; return this; }; this.getFirstLine = () => this._firstLine; this.getCount = () => this._count; this.setCount = (count) => { this._count = count; return this; }; this.getCycles = () => this._iterations; this.getDelimiter = () => this._delimiter; this.setDelimiter = (delimiter) => { this._delimiter = delimiter; return this; }; this.getOperations = () => this._operations; this.load = (source) => __awaiter(this, void 0, void 0, function* () { (0, Affirm_1.default)(source, 'Invalid source'); this._startOperation('load', { source: source.engine }); try { const driver = yield DriverFactory_1.default.instantiateSource(source); yield driver.download(this); } catch (error) { if (this._file.isOptional) { Logger_1.default.log(`Error loading dataset 
"${this.name}", creating default configuration and mock data because "isOptional" is true.`); if (!this.getDimensions() || this.getDimensions().length === 0) this.setDimensions(ProducerManager_1.default.getColumns(this._baseProducer).map((x, i) => { var _a; return ({ index: i, key: x.nameInProducer, name: x.aliasInProducer, hidden: false, type: (_a = x.dimension) === null || _a === void 0 ? void 0 : _a.type }); })); if (!this.getFirstLine() || this.getFirstLine().length === 0) { if (this._file.hasHeaderRow) this.setFirstLine(this.getDimensions().map(x => x.key).join(this.getDelimiter())); else this.setFirstLine(''); } } else throw error; } this._size = this._computeSize(); this._finishOperation('load'); return this; }); /** * Load data from an in-memory array of objects and create a local dataset file * @param data Array of objects to load into the dataset * @param dimensions Optional dimensions array. If not provided, will be inferred from the first object * @param delimiter Optional delimiter. Defaults to comma */ this.loadFromMemory = (data_1, producer_1, ...args_1) => __awaiter(this, [data_1, producer_1, ...args_1], void 0, function* (data, producer, discover = false) { var _a, _b; (0, Affirm_1.default)(data, 'Invalid data array'); (0, Affirm_1.default)(Array.isArray(data), 'Data must be an array'); if (data.length === 0) { console.warn('Loading empty array into dataset'); return this; } this._startOperation('load-from-memory', { recordCount: data.length }); try { this._delimiter = (_b = (_a = producer.settings) === null || _a === void 0 ? void 0 : _a.delimiter) !== null && _b !== void 0 ? _b : this._delimiter; // Discover the dimensions on your own const firstItem = data[0]; const firstLine = typeof firstItem === 'object' ? JSON.stringify(firstItem) : String(firstItem); const buildRes = yield DatasetManager_1.default.buildDimensionsFromFirstLine(firstLine, this._file, producer, discover); this._dimensions = buildRes.dimensions; this._updateRecordPoolDimensions(); // Clear existing file content this.clear(); // Convert objects to DatasetRecord format and write to file const records = []; for (const item of data) { // Create a row string by extracting values in dimension order const values = this._dimensions.map(dim => { const value = item[dim.key]; // Handle null/undefined values return value !== null && value !== undefined ? String(value) : ''; }); const rowString = values.join(this._delimiter); const record = new DatasetRecord_1.default(rowString, this._dimensions, this._delimiter); records.push(record); } // Write all records to the dataset file yield this.append(records); this._size = this._computeSize(); this._count = data.length; this._finishOperation('load-from-memory'); return this; } catch (error) { this._finishOperation('load-from-memory'); throw new Error(`Failed to load data from memory: ${error instanceof Error ? error.message : String(error)}`); } }); /** * Stream through the file in batches and apply a transformation */ this.transformStream = (transformer_1, ...args_1) => __awaiter(this, [transformer_1, ...args_1], void 0, function* (transformer, options = {}) { var _a, e_1, _b, _c; var _d, _e, _f, _g; const inputPath = options.inputPath || this._path; const outputPath = options.outputPath || this._tempPath; const fromLine = (_e = (_d = options.range) === null || _d === void 0 ? void 0 : _d.fromLine) !== null && _e !== void 0 ? _e : -1; const toLine = (_g = (_f = options.range) === null || _f === void 0 ? void 0 : _f.toLine) !== null && _g !== void 0 ? 
_g : Infinity; this.ensureFile(outputPath); if (!fs_1.default.existsSync(inputPath)) throw new Error(`Input file does not exist: ${inputPath}`); this._startOperation('transform-stream'); const readStream = (0, fs_2.createReadStream)(inputPath); const writeStream = (0, fs_2.createWriteStream)(outputPath); const rl = (0, readline_1.createInterface)({ input: readStream, crlfDelay: Infinity }); const dimensions = Algo_1.default.deepClone(this._dimensions); let batch = []; let lineCount = 0; let index = 0; try { for (var _h = true, rl_1 = __asyncValues(rl), rl_1_1; rl_1_1 = yield rl_1.next(), _a = rl_1_1.done, !_a; _h = true) { _c = rl_1_1.value; _h = false; const line = _c; try { if (index < fromLine) { index++; continue; } else if (index >= toLine) { index++; break; } index++; // Reuse record from pool and reinitialize it with new line data const record = this._recordPool.getNext(line, dimensions, this._delimiter); batch.push(record); lineCount++; if (batch.length >= this._batchSize) { const transformedBatch = yield transformer(batch); for (const transformedRecord of transformedBatch) { writeStream.write(transformedRecord.stringify() + '\n'); } batch = []; this._recordPool.reset(); } } catch (error) { Logger_1.default.log(`Error parsing line ${line}\n${lineCount}: ${error}`); lineCount++; } } } catch (e_1_1) { e_1 = { error: e_1_1 }; } finally { try { if (!_h && !_a && (_b = rl_1.return)) yield _b.call(rl_1); } finally { if (e_1) throw e_1.error; } } // Process remaining items in the last batch if (batch.length > 0) { const transformedBatch = yield transformer(batch); for (const transformedRecord of transformedBatch) { writeStream.write(transformedRecord.stringify() + '\n'); } } writeStream.end(); // Wait for write stream to finish yield new Promise((resolve, reject) => { writeStream.on('finish', resolve); writeStream.on('error', reject); }); // Replace original file with transformed file if (outputPath === this._tempPath) { fs_1.default.renameSync(this._tempPath, this._path); } this._count = lineCount; this._size = this._computeSize(); this._iterations++; this._finishOperation('transform-stream'); }); /** * Filter items in the file using batched streaming */ this.filter = (predicate_1, ...args_1) => __awaiter(this, [predicate_1, ...args_1], void 0, function* (predicate, options = {}) { this._startOperation('filter'); let globalIndex = 0; yield this.transformStream((batch) => __awaiter(this, void 0, void 0, function* () { const filteredBatch = []; for (const item of batch) { if (predicate(item, globalIndex)) { filteredBatch.push(item); } globalIndex++; } return filteredBatch; }), options); this._finishOperation('filter'); return this; }); /** * Map items in the file using batched streaming */ this.map = (mapper_1, ...args_1) => __awaiter(this, [mapper_1, ...args_1], void 0, function* (mapper, options = {}) { this._startOperation('map'); let globalIndex = 0; yield this.transformStream((batch) => __awaiter(this, void 0, void 0, function* () { const mappedBatch = []; for (const item of batch) { const mappedItem = yield mapper(item, globalIndex); mappedBatch.push(mappedItem); globalIndex++; } return mappedBatch; }), options); this._finishOperation('map'); return this; }); /** * FlatMap items in the file using batched streaming - maps each item to an array of items and flattens the result */ this.flatMap = (mapper_1, ...args_1) => __awaiter(this, [mapper_1, ...args_1], void 0, function* (mapper, options = {}) { this._startOperation('flat-map'); let globalIndex = 0; yield 
this.transformStream((batch) => __awaiter(this, void 0, void 0, function* () { const flatMappedBatch = []; for (const item of batch) { const mappedItems = yield mapper(item, globalIndex); flatMappedBatch.push(...mappedItems); globalIndex++; } return flatMappedBatch; }), options); this._finishOperation('flat-map'); return this; }); /** * Sort the dataset by one or more dimensions using batched streaming with external merge sort * @param compareFn Comparison function that takes two DatasetRecord objects and returns a number * @param options Optional parameters for sorting */ this.sort = (compareFn_1, ...args_1) => __awaiter(this, [compareFn_1, ...args_1], void 0, function* (compareFn, options = {}) { var _a, e_2, _b, _c; this._startOperation('sort'); const { batchSize = this._batchSize } = options; if (!fs_1.default.existsSync(this._path)) { throw new Error(`File does not exist: ${this._path}`); } // Phase 1: Sort individual batches and write them to temporary files const tempFiles = []; const readStream = (0, fs_2.createReadStream)(this._path); const rl = (0, readline_1.createInterface)({ input: readStream, crlfDelay: Infinity }); let batch = []; let batchIndex = 0; try { for (var _d = true, rl_2 = __asyncValues(rl), rl_2_1; rl_2_1 = yield rl_2.next(), _a = rl_2_1.done, !_a; _d = true) { _c = rl_2_1.value; _d = false; const line = _c; try { const record = new DatasetRecord_1.default(line, this._dimensions, this._delimiter); batch.push(record); if (batch.length >= batchSize) { // Sort the batch batch.sort(compareFn); // Write sorted batch to temporary file const tempFile = `${this._tempPath}_batch_${batchIndex}`; this.ensureFile(tempFile); const writeStream = (0, fs_2.createWriteStream)(tempFile); for (const record of batch) { writeStream.write(record.stringify() + '\n'); } writeStream.end(); yield new Promise((resolve, reject) => { writeStream.on('finish', resolve); writeStream.on('error', reject); }); tempFiles.push(tempFile); batch = []; batchIndex++; } } catch (error) { Logger_1.default.log(`Error parsing line during sort: ${error}`); } } } catch (e_2_1) { e_2 = { error: e_2_1 }; } finally { try { if (!_d && !_a && (_b = rl_2.return)) yield _b.call(rl_2); } finally { if (e_2) throw e_2.error; } } // Handle remaining items in the last batch if (batch.length > 0) { batch.sort(compareFn); const tempFile = `${this._tempPath}_batch_${batchIndex}`; this.ensureFile(tempFile); const writeStream = (0, fs_2.createWriteStream)(tempFile); for (const record of batch) { writeStream.write(record.stringify() + '\n'); } writeStream.end(); yield new Promise((resolve, reject) => { writeStream.on('finish', resolve); writeStream.on('error', reject); }); tempFiles.push(tempFile); } rl.close(); readStream.close(); // Phase 2: Merge sorted batches using k-way merge if (tempFiles.length === 0) { this._finishOperation('sort'); return this; } if (tempFiles.length === 1) { // Only one batch, just rename it fs_1.default.renameSync(tempFiles[0], this._path); } else { // Perform k-way merge yield this._performKWayMergeSort(tempFiles, this._path, compareFn); } // Clean up temporary files for (const tempFile of tempFiles) { if (fs_1.default.existsSync(tempFile)) { fs_1.default.unlinkSync(tempFile); } } this._iterations++; this._finishOperation('sort'); return this; }); /** * Convenience method to sort by a specific dimension * @param dimensionName The name of the dimension to sort by * @param ascending Whether to sort in ascending order (default: true) */ this.sortByDimension = (dimensionName_1, ...args_1) => 
__awaiter(this, [dimensionName_1, ...args_1], void 0, function* (dimensionName, ascending = true) { const dimension = this._dimensions.find(d => d.name === dimensionName); if (!dimension) { throw new Error(`Dimension "${dimensionName}" not found. Available dimensions: ${this._dimensions.map(d => d.name).join(', ')}`); } const compareFn = (a, b) => { const aValue = a.getValue(dimensionName); const bValue = b.getValue(dimensionName); // Handle null/undefined values if (aValue == null && bValue == null) return 0; if (aValue == null) return ascending ? -1 : 1; if (bValue == null) return ascending ? 1 : -1; // Try to parse as numbers for numeric comparison const aNum = Number(aValue); const bNum = Number(bValue); if (!isNaN(aNum) && !isNaN(bNum)) { const result = aNum - bNum; return ascending ? result : -result; } // String comparison const aStr = String(aValue); const bStr = String(bValue); const result = aStr.localeCompare(bStr); return ascending ? result : -result; }; return this.sort(compareFn); }); /** * Remove duplicate records from the dataset using batched streaming * @param keySelector Optional function to generate a key for comparison. If not provided, uses the entire record */ this.distinct = (keySelector) => __awaiter(this, void 0, void 0, function* () { this._startOperation('distinct'); if (!fs_1.default.existsSync(this._path)) { throw new Error(`File does not exist: ${this._path}`); } const seen = new Set(); yield this.transformStream((batch) => __awaiter(this, void 0, void 0, function* () { const distinctBatch = []; for (const record of batch) { // Generate a key for uniqueness check const recordKey = keySelector ? keySelector(record) : record.stringify(); if (!seen.has(recordKey)) { seen.add(recordKey); distinctBatch.push(record); } } return distinctBatch; })); this._finishOperation('distinct'); return this; }); /** * Remove duplicate records based on specific dimensions * @param dimensionNames Array of dimension names to use for uniqueness comparison */ this.distinctByDimensions = (dimensionNames) => __awaiter(this, void 0, void 0, function* () { // Validate that all dimension names exist const existingNames = this._dimensions.map(d => d.name); const missingDimensions = dimensionNames.filter(name => !existingNames.includes(name)); (0, Affirm_1.default)(missingDimensions.length === 0, `Cannot create distinct by dimensions. Missing dimensions: ${missingDimensions.join(', ')}`); const keySelector = (record) => { const values = dimensionNames.map(name => { const value = record.getValue(name); return value !== null && value !== undefined ? 
String(value) : ''; }); return values.join('|'); // Use pipe as separator to avoid collisions }; return this.distinct(keySelector); }); /** * Internal method to perform k-way merge of sorted files */ this._performKWayMergeSort = (tempFiles, outputPath, compareFn) => __awaiter(this, void 0, void 0, function* () { const readers = []; // Initialize readers for each temp file for (const file of tempFiles) { const readStream = (0, fs_2.createReadStream)(file); const rl = (0, readline_1.createInterface)({ input: readStream, crlfDelay: Infinity }); const iterator = rl[Symbol.asyncIterator](); readers.push({ file, rl, currentRecord: null, finished: false, iterator }); } // Read first record from each file for (const reader of readers) { try { const { value, done } = yield reader.iterator.next(); if (!done) reader.currentRecord = new DatasetRecord_1.default(value, this._dimensions, this._delimiter); else reader.finished = true; } catch (_a) { reader.finished = true; } } // Write merged results this.ensureFile(outputPath); const writeStream = (0, fs_2.createWriteStream)(outputPath); while (readers.some(r => !r.finished)) { // Find the reader with the smallest current record let minReader = null; for (const reader of readers) { if (!reader.finished && reader.currentRecord) { if (!minReader || !minReader.currentRecord || compareFn(reader.currentRecord, minReader.currentRecord) < 0) { minReader = reader; } } } if (minReader && minReader.currentRecord) { // Write the smallest record writeStream.write(minReader.currentRecord.stringify() + '\n'); // Read next record from the same reader try { const { value, done } = yield minReader.iterator.next(); if (!done) { minReader.currentRecord = new DatasetRecord_1.default(value, this._dimensions, this._delimiter); } else { minReader.finished = true; minReader.currentRecord = null; } } catch (_b) { minReader.finished = true; minReader.currentRecord = null; } } } writeStream.end(); yield new Promise((resolve, reject) => { writeStream.on('finish', resolve); writeStream.on('error', reject); }); // Close all readers for (const reader of readers) { reader.rl.close(); } }); /** * Stream through batches without modification (for reading) */ this.streamBatches = (processor) => __awaiter(this, void 0, void 0, function* () { var _a, e_3, _b, _c; if (!fs_1.default.existsSync(this._path)) { throw new Error(`File does not exist: ${this._path}`); } this._startOperation('stream-batches'); const readStream = (0, fs_2.createReadStream)(this._path); const rl = (0, readline_1.createInterface)({ input: readStream, crlfDelay: Infinity }); let batch = []; let batchIndex = 0; let lineCount = 0; try { for (var _d = true, rl_3 = __asyncValues(rl), rl_3_1; rl_3_1 = yield rl_3.next(), _a = rl_3_1.done, !_a; _d = true) { _c = rl_3_1.value; _d = false; const line = _c; try { const record = new DatasetRecord_1.default(line, this._dimensions, this._delimiter); batch.push(record); lineCount++; if (batch.length >= this._batchSize) { yield processor(batch, batchIndex); batch = []; batchIndex++; } } catch (error) { Logger_1.default.log(`Error parsing line ${line}\n${lineCount}: ${error}`); } } } catch (e_3_1) { e_3 = { error: e_3_1 }; } finally { try { if (!_d && !_a && (_b = rl_3.return)) yield _b.call(rl_3); } finally { if (e_3) throw e_3.error; } } // Process remaining items in the last batch if (batch.length > 0) { yield processor(batch, batchIndex); } this._iterations++; this._finishOperation('stream-batches'); }); /** * Check if file exists */ this.exists = () => 
fs_1.default.existsSync(this._path); /** * Create the file if it doesn't exist */ this.ensureFile = (filePath) => { const dir = path_1.default.dirname(filePath); if (!fs_1.default.existsSync(dir)) { fs_1.default.mkdirSync(dir, { recursive: true }); } if (!fs_1.default.existsSync(filePath)) { fs_1.default.writeFileSync(filePath, ''); } }; /** * Clear the file content */ this.clear = () => { if (fs_1.default.existsSync(this._path)) { fs_1.default.writeFileSync(this._path, ''); } return this; }; /** * Append data to the file */ this.append = (items) => __awaiter(this, void 0, void 0, function* () { this._startOperation('append'); const writeStream = (0, fs_2.createWriteStream)(this._path, { flags: 'a' }); for (const item of items) { writeStream.write(item.stringify() + '\n'); } writeStream.end(); yield new Promise((resolve, reject) => { writeStream.on('finish', resolve); writeStream.on('error', reject); }); this._finishOperation('append'); return this; }); /** * Read a specified number of lines from the file */ this.readLines = (numberOfLines) => __awaiter(this, void 0, void 0, function* () { var _a, e_4, _b, _c; if (!fs_1.default.existsSync(this._path)) return []; if (numberOfLines <= 0) return []; this._startOperation('read-lines', { numberOfLines }); const readStream = (0, fs_2.createReadStream)(this._path); const rl = (0, readline_1.createInterface)({ input: readStream, crlfDelay: Infinity }); const results = []; let lineCount = 0; try { for (var _d = true, rl_4 = __asyncValues(rl), rl_4_1; rl_4_1 = yield rl_4.next(), _a = rl_4_1.done, !_a; _d = true) { _c = rl_4_1.value; _d = false; const line = _c; try { const record = new DatasetRecord_1.default(line, this._dimensions, this._delimiter); results.push(record); lineCount++; if (lineCount >= numberOfLines) { break; } } catch (error) { Logger_1.default.log(`Error parsing line ${line}\n${lineCount}: ${error}`); lineCount++; } } } catch (e_4_1) { e_4 = { error: e_4_1 }; } finally { try { if (!_d && !_a && (_b = rl_4.return)) yield _b.call(rl_4); } finally { if (e_4) throw e_4.error; } } rl.close(); readStream.close(); this._finishOperation('read-lines'); return results; }); this.getDimensions = () => this._dimensions; this.setDimensions = (dimensions) => { this._dimensions = dimensions; return this; }; this.setSingleDimension = (newDimension, oldDimension) => { (0, Affirm_1.default)(newDimension, `Invalid new dimension`); (0, Affirm_1.default)(oldDimension, 'Invalid old dimension'); const current = this._dimensions.findIndex(x => x.index === oldDimension.index); (0, Affirm_1.default)(current !== -1, `Trying to update a dataset dimension that doesn't exist: ${oldDimension.name} index ${oldDimension.index}`); this._dimensions.splice(current, 1, newDimension); return this; }; /** * Update the record pool when dimensions change */ this._updateRecordPoolDimensions = () => { // Update all pooled records with current dimensions this._recordPool.updateDimensions(this._dimensions, this._delimiter); }; this.print = (...args_1) => __awaiter(this, [...args_1], void 0, function* (count = 3, full = false) { console.log(`DS ${this.name} (${this._count} | ${this._iterations})`); console.log(this._dimensions.map(x => x.name).join(this._delimiter)); const records = yield this.readLines(count); records.forEach((x, i) => console.log(`[${i}]`, full ? x : x.stringify())); console.log('----------'); }); this.printStats = () => { var _a, _b; const total = ((_b = (_a = this._operations) === null || _a === void 0 ?
void 0 : _a.map(x => x.elapsedMs)) !== null && _b !== void 0 ? _b : []).reduce((sum, ms) => sum + ms, 0); console.log(`DS[stats] ${this.name} (size: ${this._count} | cycles: ${this._iterations} | ms: ${Helper_1.default.formatDuration(total)})`); console.log(`Operations: ${this._operations.length}`); console.log(JSON.stringify(this._operations, null, 4)); }; /** * Destroy the dataset by removing all allocated memory and created files */ this.destroy = () => { this._startOperation('destroy'); try { // Remove the main dataset file if (fs_1.default.existsSync(this._path)) { fs_1.default.unlinkSync(this._path); } // Remove the temporary file if it exists if (fs_1.default.existsSync(this._tempPath)) { fs_1.default.unlinkSync(this._tempPath); } // Remove any batch temporary files that might still exist const tempDir = path_1.default.dirname(this._tempPath); if (fs_1.default.existsSync(tempDir)) { const files = fs_1.default.readdirSync(tempDir); const batchFiles = files.filter(file => file.startsWith(path_1.default.basename(this._tempPath) + '_batch_')); for (const batchFile of batchFiles) { const fullPath = path_1.default.join(tempDir, batchFile); if (fs_1.default.existsSync(fullPath)) { fs_1.default.unlinkSync(fullPath); } } // Try to remove the temp directory if it's empty try { if (fs_1.default.readdirSync(tempDir).length === 0) { fs_1.default.rmdirSync(tempDir); } } catch (_a) { // Directory not empty or other error, ignore } } this._finishOperation('destroy'); } catch (error) { this._finishOperation('destroy'); throw new Error(`Failed to destroy dataset: ${error instanceof Error ? error.message : String(error)}`); } }; this._startOperation = (name, metadata) => { const newOperation = { name, count: -1, elapsedMs: performance.now(), status: 'running', subOperations: [], metadata: metadata }; const runningOperation = this._findRunningOperation(); if (runningOperation) runningOperation.subOperations.push(newOperation); else this._operations.push(newOperation); }; this._finishOperation = (name) => { const finishedOperation = this._findRunningOperation(name); if (finishedOperation) { finishedOperation.status = 'completed'; finishedOperation.count = this._count; finishedOperation.elapsedMs = performance.now() - finishedOperation.elapsedMs; } else { const currentOperation = this._operations.find(x => x.status === 'running'); const currentName = currentOperation ? 
currentOperation.name : 'none'; console.warn(`Finished operation "${name}" but no running operation with that name was found (current running: "${currentName}")`); } }; this._findRunningOperation = (name) => { const searchInOperations = (operations) => { for (const operation of operations) { if (operation.status === 'running' && (name === undefined || operation.name === name)) { // If we're looking for a specific name, return it if (name !== undefined) { return operation; } // If we're looking for the deepest running operation (name is undefined), // check if this operation has deeper running sub-operations const deeperRunning = searchInOperations(operation.subOperations); if (deeperRunning) { return deeperRunning; } // If no deeper running operations, this is the deepest return operation; } if (operation.subOperations && operation.subOperations.length > 0) { const found = searchInOperations(operation.subOperations); if (found) { return found; } } } return null; }; return searchInOperations(this._operations); }; this._computeSize = () => fs_1.default.statSync(this._path).size / (1024 * 1024); const { name, baseProducer, file, batchSize, executionId } = options; this.name = name; this._file = file; this._executionId = executionId; this._baseProducer = baseProducer; const envBatchSize = parseInt(Environment_1.default.get('MAX_ITEMS_IN_MEMORY')); this._batchSize = (_a = batchSize !== null && batchSize !== void 0 ? batchSize : (Number.isNaN(envBatchSize) ? undefined : envBatchSize)) !== null && _a !== void 0 ? _a : Constants_1.default.defaults.MAX_ITEMS_IN_MEMORY; this._dimensions = []; this._firstLine = ''; this._delimiter = (_b = file.delimiter) !== null && _b !== void 0 ? _b : ','; this._count = 0; this._iterations = 0; this._operations = []; // Initialize record pool for optimization this._recordPool = new DatasetRecordPool_1.default(this._batchSize); const datasetName = this.name .replace(/[^a-zA-Z0-9_-]/g, '_') .replace(/_{2,}/g, '_') .replace(/^_+|_+$/g, '') .toLowerCase(); const execFolder = executionId ? path_1.default.join(datasetName, executionId) : datasetName; this._path = path_1.default.join('./remora', Constants_1.default.defaults.PRODUCER_TEMP_FOLDER, execFolder, '.dataset'); this._tempPath = path_1.default.join('./remora', Constants_1.default.defaults.PRODUCER_TEMP_FOLDER, execFolder, '.dataset_tmp'); this.ensureFile(this._path); } } exports.default = Dataset;
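
Because the listing above is compiled CommonJS output, the public surface of the Dataset class is easiest to see in use. The sketch below exercises only methods that appear in the source (loadFromMemory, distinctByDimensions, filter, sortByDimension, print, printStats, destroy). The require path, the option values, and the minimal shapes of the `file` and `producer` objects are assumptions inferred from the properties the constructor and loadFromMemory actually read; it also assumes the dimensions discovered from in-memory data keep the object keys as their names.

// Usage sketch, not documented API: the require path is hypothetical, and the
// file/producer objects carry only the properties the compiled class reads.
const Dataset = require('@forzalabs/remora').default; // assumed entry point

async function demo() {
    const dataset = new Dataset({
        name: 'orders-demo',
        baseProducer: undefined, // only consulted when load() falls back to mock data
        file: { delimiter: ',', hasHeaderRow: true, isOptional: false },
        batchSize: 1000, // otherwise MAX_ITEMS_IN_MEMORY from env, then the constant default
        executionId: 'exec-001',
    });

    // Writes the array to the backing ./remora temp file; dimensions are built
    // from the first record via DatasetManager.buildDimensionsFromFirstLine.
    await dataset.loadFromMemory(
        [
            { id: 1, amount: 20, country: 'IT' },
            { id: 2, amount: 5, country: 'US' },
            { id: 2, amount: 5, country: 'US' }, // duplicate on purpose
        ],
        { settings: { delimiter: ',' } }, // minimal producer stand-in (assumed sufficient)
    );

    // Each call below streams the file in batches (filter/distinct via
    // transformStream, sort via an external merge sort), so the dataset
    // never has to fit in memory at once.
    await dataset.distinctByDimensions(['id']);
    await dataset.filter(record => Number(record.getValue('amount')) > 1);
    await dataset.sortByDimension('amount', false); // descending

    await dataset.print(5); // header plus the first records
    dataset.printStats();   // per-operation timings, nested by sub-operation
    dataset.destroy();      // removes the .dataset and .dataset_tmp files
}

demo().catch(console.error);

One design point worth noting: sort spills each in-memory-sorted batch of batchSize records to a `.dataset_tmp_batch_N` file, and _performKWayMergeSort then merges the spill files with a k-way merge, which is what lets ordering work on datasets larger than available memory.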