// @forzalabs/remora: a powerful CLI tool for seamless data translation.
;
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
return new (P || (P = Promise))(function (resolve, reject) {
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
step((generator = generator.apply(thisArg, _arguments || [])).next());
});
};
var __asyncValues = (this && this.__asyncValues) || function (o) {
if (!Symbol.asyncIterator) throw new TypeError("Symbol.asyncIterator is not defined.");
var m = o[Symbol.asyncIterator], i;
return m ? m.call(o) : (o = typeof __values === "function" ? __values(o) : o[Symbol.iterator](), i = {}, verb("next"), verb("throw"), verb("return"), i[Symbol.asyncIterator] = function () { return this; }, i);
function verb(n) { i[n] = o[n] && function (v) { return new Promise(function (resolve, reject) { v = o[n](v), settle(resolve, reject, v.done, v.value); }); }; }
function settle(resolve, reject, d, v) { Promise.resolve(v).then(function(v) { resolve({ value: v, done: d }); }, reject); }
};
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
const path_1 = __importDefault(require("path"));
const fs_1 = __importDefault(require("fs"));
const fs_2 = require("fs");
const readline_1 = require("readline");
const Constants_1 = __importDefault(require("../../Constants"));
const DatasetManager_1 = __importDefault(require("./DatasetManager"));
const DatasetRecord_1 = __importDefault(require("./DatasetRecord"));
const DatasetRecordPool_1 = __importDefault(require("./DatasetRecordPool"));
const Affirm_1 = __importDefault(require("../../core/Affirm"));
const DriverFactory_1 = __importDefault(require("../../drivers/DriverFactory"));
const Helper_1 = __importDefault(require("../../helper/Helper"));
const Algo_1 = __importDefault(require("../../core/Algo"));
const Environment_1 = __importDefault(require("../Environment"));
const Logger_1 = __importDefault(require("../../helper/Logger"));
const ProducerManager_1 = __importDefault(require("../producer/ProducerManager"));
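/**
* File-backed dataset that streams records through batched pipelines to keep memory bounded.
* The example below is a usage sketch: the option values are illustrative and the
* `producer` / `source` objects are assumed to be supplied by the caller.
* @example
* const ds = new Dataset({
*     name: 'customers',                                        // illustrative name
*     baseProducer: producer,                                   // assumed producer instance
*     file: { delimiter: ',', hasHeaderRow: true, isOptional: false },
*     batchSize: 1000,
*     executionId: 'run-001'
* });
* await ds.load(source);                                        // `source` is resolved via DriverFactory
*/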
class Dataset {
constructor(options) {
var _a, _b;
this.getPath = () => this._path;
this.setPath = (path) => {
this._path = path;
return this;
};
this.getFile = () => this._file;
this.getExecutionId = () => this._executionId;
this.getBatchSize = () => this._batchSize;
this.setFirstLine = (firstLine) => {
this._firstLine = firstLine;
return this;
};
this.getFirstLine = () => this._firstLine;
this.getCount = () => this._count;
this.setCount = (count) => {
this._count = count;
return this;
};
this.getCycles = () => this._iterations;
this.getDelimiter = () => this._delimiter;
this.setDelimiter = (delimiter) => {
this._delimiter = delimiter;
return this;
};
this.getOperations = () => this._operations;
this.load = (source) => __awaiter(this, void 0, void 0, function* () {
(0, Affirm_1.default)(source, 'Invalid source');
this._startOperation('load', { source: source.engine });
try {
const driver = yield DriverFactory_1.default.instantiateSource(source);
yield driver.download(this);
}
catch (error) {
if (this._file.isOptional) {
Logger_1.default.log(`Error loading dataset "${this.name}", creating default configuration and mock data because "isOptional" is true.`);
if (!this.getDimensions() || this.getDimensions().length === 0)
this.setDimensions(ProducerManager_1.default.getColumns(this._baseProducer).map((x, i) => { var _a; return ({ index: i, key: x.nameInProducer, name: x.aliasInProducer, hidden: false, type: (_a = x.dimension) === null || _a === void 0 ? void 0 : _a.type }); }));
if (!this.getFirstLine() || this.getFirstLine().length === 0) {
if (this._file.hasHeaderRow)
this.setFirstLine(this.getDimensions().map(x => x.key).join(this.getDelimiter()));
else
this.setFirstLine('');
}
}
else
throw error;
}
this._size = this._computeSize();
this._finishOperation('load');
return this;
});
/**
* Load data from an in-memory array of objects and create a local dataset file
* @param data Array of objects to load into the dataset
* @param producer Producer whose settings (e.g. delimiter) are applied to the dataset
* @param discover Whether to discover dimension metadata from the first item (default false)
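* @example
* // Sketch: `producer` is assumed to carry optional settings.delimiter.
* await ds.loadFromMemory([
*     { id: '1', name: 'Ada' },
*     { id: '2', name: 'Grace' }
* ], producer, true); // discover dimensions from the first item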
*/
this.loadFromMemory = (data_1, producer_1, ...args_1) => __awaiter(this, [data_1, producer_1, ...args_1], void 0, function* (data, producer, discover = false) {
var _a, _b;
(0, Affirm_1.default)(data, 'Invalid data array');
(0, Affirm_1.default)(Array.isArray(data), 'Data must be an array');
if (data.length === 0) {
console.warn('Loading empty array into dataset');
return this;
}
this._startOperation('load-from-memory', { recordCount: data.length });
try {
this._delimiter = (_b = (_a = producer.settings) === null || _a === void 0 ? void 0 : _a.delimiter) !== null && _b !== void 0 ? _b : this._delimiter;
// Infer the dimensions from the first item
const firstItem = data[0];
const firstLine = typeof firstItem === 'object' ? JSON.stringify(firstItem) : String(firstItem);
const buildRes = yield DatasetManager_1.default.buildDimensionsFromFirstLine(firstLine, this._file, producer, discover);
this._dimensions = buildRes.dimensions;
this._updateRecordPoolDimensions();
// Clear existing file content
this.clear();
// Convert objects to DatasetRecord format and write to file
const records = [];
for (const item of data) {
// Create a row string by extracting values in dimension order
const values = this._dimensions.map(dim => {
const value = item[dim.key];
// Handle null/undefined values
return value !== null && value !== undefined ? String(value) : '';
});
const rowString = values.join(this._delimiter);
const record = new DatasetRecord_1.default(rowString, this._dimensions, this._delimiter);
records.push(record);
}
// Write all records to the dataset file
yield this.append(records);
this._size = this._computeSize();
this._count = data.length;
this._finishOperation('load-from-memory');
return this;
}
catch (error) {
this._finishOperation('load-from-memory');
throw new Error(`Failed to load data from memory: ${error instanceof Error ? error.message : String(error)}`);
}
});
/**
* Stream through the file in batches and apply a transformation
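* The transformer receives a batch of DatasetRecord objects and returns the
* (possibly shorter or longer) batch to write back.
* @example
* // Sketch: keep only records with a non-empty 'id' value ('id' is illustrative).
* await ds.transformStream(async (batch) => {
*     return batch.filter(r => r.getValue('id') !== '');
* }, { range: { fromLine: 0, toLine: 10000 } });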
*/
this.transformStream = (transformer_1, ...args_1) => __awaiter(this, [transformer_1, ...args_1], void 0, function* (transformer, options = {}) {
var _a, e_1, _b, _c;
var _d, _e, _f, _g;
const inputPath = options.inputPath || this._path;
const outputPath = options.outputPath || this._tempPath;
const fromLine = (_e = (_d = options.range) === null || _d === void 0 ? void 0 : _d.fromLine) !== null && _e !== void 0 ? _e : -1;
const toLine = (_g = (_f = options.range) === null || _f === void 0 ? void 0 : _f.toLine) !== null && _g !== void 0 ? _g : Infinity;
this.ensureFile(outputPath);
if (!fs_1.default.existsSync(inputPath))
throw new Error(`Input file does not exist: ${inputPath}`);
this._startOperation('transform-stream');
const readStream = (0, fs_2.createReadStream)(inputPath);
const writeStream = (0, fs_2.createWriteStream)(outputPath);
const rl = (0, readline_1.createInterface)({ input: readStream, crlfDelay: Infinity });
const dimensions = Algo_1.default.deepClone(this._dimensions);
let batch = [];
let lineCount = 0;
let writtenCount = 0;
let index = 0;
try {
for (var _h = true, rl_1 = __asyncValues(rl), rl_1_1; rl_1_1 = yield rl_1.next(), _a = rl_1_1.done, !_a; _h = true) {
_c = rl_1_1.value;
_h = false;
const line = _c;
try {
if (index < fromLine) {
index++;
continue;
}
else if (index >= toLine) {
index++;
break;
}
index++;
// Reuse record from pool and reinitialize it with new line data
const record = this._recordPool.getNext(line, dimensions, this._delimiter);
batch.push(record);
lineCount++;
if (batch.length >= this._batchSize) {
const transformedBatch = yield transformer(batch);
for (const transformedRecord of transformedBatch) {
writeStream.write(transformedRecord.stringify() + '\n');
writtenCount++;
}
batch = [];
this._recordPool.reset();
}
}
catch (error) {
Logger_1.default.log(`Error parsing line ${lineCount}: ${error}\nLine content: ${line}`);
lineCount++;
}
}
}
catch (e_1_1) { e_1 = { error: e_1_1 }; }
finally {
try {
if (!_h && !_a && (_b = rl_1.return)) yield _b.call(rl_1);
}
finally { if (e_1) throw e_1.error; }
}
// Process remaining items in the last batch
if (batch.length > 0) {
const transformedBatch = yield transformer(batch);
for (const transformedRecord of transformedBatch) {
writeStream.write(transformedRecord.stringify() + '\n');
writtenCount++;
}
}
writeStream.end();
// Wait for write stream to finish
yield new Promise((resolve, reject) => {
writeStream.on('finish', resolve);
writeStream.on('error', reject);
});
// Replace original file with transformed file
if (outputPath === this._tempPath) {
fs_1.default.renameSync(this._tempPath, this._path);
}
this._count = writtenCount; // records actually written, since the transformer may add or drop records
this._size = this._computeSize();
this._iterations++;
this._finishOperation('transform-stream');
});
/**
* Filter items in the file using batched streaming
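* @example
* // Sketch: 'status' is an illustrative dimension name.
* await ds.filter((record, index) => record.getValue('status') === 'active');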
*/
this.filter = (predicate_1, ...args_1) => __awaiter(this, [predicate_1, ...args_1], void 0, function* (predicate, options = {}) {
this._startOperation('filter');
let globalIndex = 0;
yield this.transformStream((batch) => __awaiter(this, void 0, void 0, function* () {
const filteredBatch = [];
for (const item of batch) {
if (predicate(item, globalIndex)) {
filteredBatch.push(item);
}
globalIndex++;
}
return filteredBatch;
}), options);
this._finishOperation('filter');
return this;
});
/**
* Map items in the file using batched streaming
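* @example
* // Sketch: assumes records can be rewritten before being returned; the
* // mutation API (e.g. a setValue method) is not shown in this file.
* await ds.map(async (record, index) => {
*     // transform the record here, then return it
*     return record;
* });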
*/
this.map = (mapper_1, ...args_1) => __awaiter(this, [mapper_1, ...args_1], void 0, function* (mapper, options = {}) {
this._startOperation('map');
let globalIndex = 0;
yield this.transformStream((batch) => __awaiter(this, void 0, void 0, function* () {
const mappedBatch = [];
for (const item of batch) {
const mappedItem = yield mapper(item, globalIndex);
mappedBatch.push(mappedItem);
globalIndex++;
}
return mappedBatch;
}), options);
this._finishOperation('map');
return this;
});
/**
* FlatMap items in the file using batched streaming - maps each item to an array of items and flattens the result
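* @example
* // Sketch: drop a record by returning [], duplicate it by returning it twice.
* await ds.flatMap(async (record, index) => {
*     return record.getValue('qty') === '0' ? [] : [record]; // 'qty' is illustrative
* });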
*/
this.flatMap = (mapper_1, ...args_1) => __awaiter(this, [mapper_1, ...args_1], void 0, function* (mapper, options = {}) {
this._startOperation('flat-map');
let globalIndex = 0;
yield this.transformStream((batch) => __awaiter(this, void 0, void 0, function* () {
const flatMappedBatch = [];
for (const item of batch) {
const mappedItems = yield mapper(item, globalIndex);
flatMappedBatch.push(...mappedItems);
globalIndex++;
}
return flatMappedBatch;
}), options);
this._finishOperation('flat-map');
return this;
});
/**
* Sort the dataset by one or more dimensions using batched streaming with external merge sort
* @param compareFn Comparison function that takes two DatasetRecord objects and returns a number
* @param options Optional parameters for sorting
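* @example
* // Sketch: numeric sort on an illustrative 'id' dimension.
* await ds.sort((a, b) => Number(a.getValue('id')) - Number(b.getValue('id')), { batchSize: 5000 });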
*/
this.sort = (compareFn_1, ...args_1) => __awaiter(this, [compareFn_1, ...args_1], void 0, function* (compareFn, options = {}) {
var _a, e_2, _b, _c;
this._startOperation('sort');
const { batchSize = this._batchSize } = options;
if (!fs_1.default.existsSync(this._path)) {
throw new Error(`File does not exist: ${this._path}`);
}
// Phase 1: Sort individual batches and write them to temporary files
const tempFiles = [];
const readStream = (0, fs_2.createReadStream)(this._path);
const rl = (0, readline_1.createInterface)({ input: readStream, crlfDelay: Infinity });
let batch = [];
let batchIndex = 0;
try {
for (var _d = true, rl_2 = __asyncValues(rl), rl_2_1; rl_2_1 = yield rl_2.next(), _a = rl_2_1.done, !_a; _d = true) {
_c = rl_2_1.value;
_d = false;
const line = _c;
try {
const record = new DatasetRecord_1.default(line, this._dimensions, this._delimiter);
batch.push(record);
if (batch.length >= batchSize) {
// Sort the batch
batch.sort(compareFn);
// Write sorted batch to temporary file
const tempFile = `${this._tempPath}_batch_${batchIndex}`;
this.ensureFile(tempFile);
const writeStream = (0, fs_2.createWriteStream)(tempFile);
for (const record of batch) {
writeStream.write(record.stringify() + '\n');
}
writeStream.end();
yield new Promise((resolve, reject) => {
writeStream.on('finish', resolve);
writeStream.on('error', reject);
});
tempFiles.push(tempFile);
batch = [];
batchIndex++;
}
}
catch (error) {
Logger_1.default.log(`Error parsing line during sort: ${error}`);
}
}
}
catch (e_2_1) { e_2 = { error: e_2_1 }; }
finally {
try {
if (!_d && !_a && (_b = rl_2.return)) yield _b.call(rl_2);
}
finally { if (e_2) throw e_2.error; }
}
// Handle remaining items in the last batch
if (batch.length > 0) {
batch.sort(compareFn);
const tempFile = `${this._tempPath}_batch_${batchIndex}`;
this.ensureFile(tempFile);
const writeStream = (0, fs_2.createWriteStream)(tempFile);
for (const record of batch) {
writeStream.write(record.stringify() + '\n');
}
writeStream.end();
yield new Promise((resolve, reject) => {
writeStream.on('finish', resolve);
writeStream.on('error', reject);
});
tempFiles.push(tempFile);
}
rl.close();
readStream.close();
// Phase 2: Merge sorted batches using k-way merge
if (tempFiles.length === 0) {
this._finishOperation('sort');
return this;
}
if (tempFiles.length === 1) {
// Only one batch, just rename it
fs_1.default.renameSync(tempFiles[0], this._path);
}
else {
// Perform k-way merge
yield this._performKWayMergeSort(tempFiles, this._path, compareFn);
}
// Clean up temporary files
for (const tempFile of tempFiles) {
if (fs_1.default.existsSync(tempFile)) {
fs_1.default.unlinkSync(tempFile);
}
}
this._iterations++;
this._finishOperation('sort');
return this;
});
/**
* Convenience method to sort by a specific dimension
* @param dimensionName The name of the dimension to sort by
* @param ascending Whether to sort in ascending order (default: true)
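* @example
* await ds.sortByDimension('created_at', false); // 'created_at' is illustrative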
*/
this.sortByDimension = (dimensionName_1, ...args_1) => __awaiter(this, [dimensionName_1, ...args_1], void 0, function* (dimensionName, ascending = true) {
const dimension = this._dimensions.find(d => d.name === dimensionName);
if (!dimension) {
throw new Error(`Dimension "${dimensionName}" not found. Available dimensions: ${this._dimensions.map(d => d.name).join(', ')}`);
}
const compareFn = (a, b) => {
const aValue = a.getValue(dimensionName);
const bValue = b.getValue(dimensionName);
// Handle null/undefined values
if (aValue == null && bValue == null)
return 0;
if (aValue == null)
return ascending ? -1 : 1;
if (bValue == null)
return ascending ? 1 : -1;
// Try to parse as numbers for numeric comparison
const aNum = Number(aValue);
const bNum = Number(bValue);
if (!isNaN(aNum) && !isNaN(bNum)) {
const result = aNum - bNum;
return ascending ? result : -result;
}
// String comparison
const aStr = String(aValue);
const bStr = String(bValue);
const result = aStr.localeCompare(bStr);
return ascending ? result : -result;
};
return this.sort(compareFn);
});
/**
* Remove duplicate records from the dataset using batched streaming
* @param keySelector Optional function to generate a key for comparison. If not provided, uses the entire record
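* Note: uniqueness keys are held in an in-memory Set, so the set of distinct
* keys must fit in memory even though records are streamed.
* @example
* // Sketch: dedupe on an illustrative 'email' dimension.
* await ds.distinct(record => record.getValue('email'));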
*/
this.distinct = (keySelector) => __awaiter(this, void 0, void 0, function* () {
this._startOperation('distinct');
if (!fs_1.default.existsSync(this._path)) {
throw new Error(`File does not exist: ${this._path}`);
}
const seen = new Set();
yield this.transformStream((batch) => __awaiter(this, void 0, void 0, function* () {
const distinctBatch = [];
for (const record of batch) {
// Generate a key for uniqueness check
const recordKey = keySelector ? keySelector(record) : record.stringify();
if (!seen.has(recordKey)) {
seen.add(recordKey);
distinctBatch.push(record);
}
}
return distinctBatch;
}));
this._finishOperation('distinct');
return this;
});
/**
* Remove duplicate records based on specific dimensions
* @param dimensionNames Array of dimension names to use for uniqueness comparison
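* @example
* await ds.distinctByDimensions(['email', 'country']); // illustrative dimension names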
*/
this.distinctByDimensions = (dimensionNames) => __awaiter(this, void 0, void 0, function* () {
// Validate that all dimension names exist
const existingNames = this._dimensions.map(d => d.name);
const missingDimensions = dimensionNames.filter(name => !existingNames.includes(name));
(0, Affirm_1.default)(missingDimensions.length === 0, `Cannot create distinct by dimensions. Missing dimensions: ${missingDimensions.join(', ')}`);
const keySelector = (record) => {
const values = dimensionNames.map(name => {
const value = record.getValue(name);
return value !== null && value !== undefined ? String(value) : '';
});
return values.join('|'); // Use pipe as separator to avoid collisions
};
return this.distinct(keySelector);
});
/**
* Internal method to perform k-way merge of sorted files
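* Keeps one reader per sorted run and repeatedly writes the smallest head
* record (per compareFn), then advances that reader; the linear scan over the
* k heads gives O(n * k) comparisons for n total records across k runs.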
*/
this._performKWayMergeSort = (tempFiles, outputPath, compareFn) => __awaiter(this, void 0, void 0, function* () {
const readers = [];
// Initialize readers for each temp file
for (const file of tempFiles) {
const readStream = (0, fs_2.createReadStream)(file);
const rl = (0, readline_1.createInterface)({ input: readStream, crlfDelay: Infinity });
const iterator = rl[Symbol.asyncIterator]();
readers.push({ file, rl, currentRecord: null, finished: false, iterator });
}
// Read first record from each file
for (const reader of readers) {
try {
const { value, done } = yield reader.iterator.next();
if (!done)
reader.currentRecord = new DatasetRecord_1.default(value, this._dimensions, this._delimiter);
else
reader.finished = true;
}
catch (_a) {
reader.finished = true;
}
}
// Write merged results
this.ensureFile(outputPath);
const writeStream = (0, fs_2.createWriteStream)(outputPath);
while (readers.some(r => !r.finished)) {
// Find the reader with the smallest current record
let minReader = null;
for (const reader of readers) {
if (!reader.finished && reader.currentRecord) {
if (!minReader || !minReader.currentRecord || compareFn(reader.currentRecord, minReader.currentRecord) < 0) {
minReader = reader;
}
}
}
if (minReader && minReader.currentRecord) {
// Write the smallest record
writeStream.write(minReader.currentRecord.stringify() + '\n');
// Read next record from the same reader
try {
const { value, done } = yield minReader.iterator.next();
if (!done) {
minReader.currentRecord = new DatasetRecord_1.default(value, this._dimensions, this._delimiter);
}
else {
minReader.finished = true;
minReader.currentRecord = null;
}
}
catch (_b) {
minReader.finished = true;
minReader.currentRecord = null;
}
}
}
writeStream.end();
yield new Promise((resolve, reject) => {
writeStream.on('finish', resolve);
writeStream.on('error', reject);
});
// Close all readers
for (const reader of readers) {
reader.rl.close();
}
});
/**
* Stream through batches without modification (for reading)
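* @example
* // Sketch: read-only pass over the dataset in batches.
* await ds.streamBatches(async (batch, batchIndex) => {
*     console.log(`batch ${batchIndex}: ${batch.length} records`);
* });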
*/
this.streamBatches = (processor) => __awaiter(this, void 0, void 0, function* () {
var _a, e_3, _b, _c;
if (!fs_1.default.existsSync(this._path)) {
throw new Error(`File does not exist: ${this._path}`);
}
this._startOperation('stream-batches');
const readStream = (0, fs_2.createReadStream)(this._path);
const rl = (0, readline_1.createInterface)({
input: readStream,
crlfDelay: Infinity
});
let batch = [];
let batchIndex = 0;
let lineCount = 0;
try {
for (var _d = true, rl_3 = __asyncValues(rl), rl_3_1; rl_3_1 = yield rl_3.next(), _a = rl_3_1.done, !_a; _d = true) {
_c = rl_3_1.value;
_d = false;
const line = _c;
try {
const record = new DatasetRecord_1.default(line, this._dimensions, this._delimiter);
batch.push(record);
lineCount++;
if (batch.length >= this._batchSize) {
yield processor(batch, batchIndex);
batch = [];
batchIndex++;
}
}
catch (error) {
Logger_1.default.log(`Error parsing line ${lineCount}: ${error}\nLine content: ${line}`);
}
}
}
catch (e_3_1) { e_3 = { error: e_3_1 }; }
finally {
try {
if (!_d && !_a && (_b = rl_3.return)) yield _b.call(rl_3);
}
finally { if (e_3) throw e_3.error; }
}
// Process remaining items in the last batch
if (batch.length > 0) {
yield processor(batch, batchIndex);
}
this._iterations++;
this._finishOperation('stream-batches');
});
/**
* Check if file exists
*/
this.exists = () => fs_1.default.existsSync(this._path);
/**
* Create the file if it doesn't exist
*/
this.ensureFile = (filePath) => {
const dir = path_1.default.dirname(filePath);
if (!fs_1.default.existsSync(dir)) {
fs_1.default.mkdirSync(dir, { recursive: true });
}
if (!fs_1.default.existsSync(filePath)) {
fs_1.default.writeFileSync(filePath, '');
}
};
/**
* Clear the file content
*/
this.clear = () => {
if (fs_1.default.existsSync(this._path)) {
fs_1.default.writeFileSync(this._path, '');
}
return this;
};
/**
* Append data to the file
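* @example
* // Sketch: `records` is an array of DatasetRecord instances built elsewhere.
* await ds.append(records);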
*/
this.append = (items) => __awaiter(this, void 0, void 0, function* () {
this._startOperation('append');
const writeStream = (0, fs_2.createWriteStream)(this._path, { flags: 'a' });
for (const item of items) {
writeStream.write(item.stringify() + '\n');
}
writeStream.end();
yield new Promise((resolve, reject) => {
writeStream.on('finish', resolve);
writeStream.on('error', reject);
});
this._finishOperation('append');
return this;
});
/**
* Read a specified number of lines from the file
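* @example
* const preview = await ds.readLines(10); // first 10 records as DatasetRecord[]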
*/
this.readLines = (numberOfLines) => __awaiter(this, void 0, void 0, function* () {
var _a, e_4, _b, _c;
if (!fs_1.default.existsSync(this._path))
return [];
if (numberOfLines <= 0)
return [];
this._startOperation('read-lines', { numberOfLines });
const readStream = (0, fs_2.createReadStream)(this._path);
const rl = (0, readline_1.createInterface)({
input: readStream,
crlfDelay: Infinity
});
const results = [];
let lineCount = 0;
try {
for (var _d = true, rl_4 = __asyncValues(rl), rl_4_1; rl_4_1 = yield rl_4.next(), _a = rl_4_1.done, !_a; _d = true) {
_c = rl_4_1.value;
_d = false;
const line = _c;
try {
const record = new DatasetRecord_1.default(line, this._dimensions, this._delimiter);
results.push(record);
lineCount++;
if (lineCount >= numberOfLines) {
break;
}
}
catch (error) {
Logger_1.default.log(`Error parsing line ${lineCount}: ${error}\nLine content: ${line}`);
lineCount++;
}
}
}
catch (e_4_1) { e_4 = { error: e_4_1 }; }
finally {
try {
if (!_d && !_a && (_b = rl_4.return)) yield _b.call(rl_4);
}
finally { if (e_4) throw e_4.error; }
}
rl.close();
readStream.close();
this._finishOperation('read-lines');
return results;
});
this.getDimensions = () => this._dimensions;
this.setDimensions = (dimensions) => {
this._dimensions = dimensions;
return this;
};
this.setSingleDimension = (newDimension, oldDimension) => {
(0, Affirm_1.default)(newDimension, `Invalid new dimension`);
(0, Affirm_1.default)(oldDimension, 'Invalid old dimension');
const current = this._dimensions.findIndex(x => x.index === oldDimension.index);
(0, Affirm_1.default)(current !== -1, `Trying to update a dataset dimension that doesn't exist: ${oldDimension.name} index ${oldDimension.index}`);
this._dimensions.splice(current, 1, newDimension);
return this;
};
/**
* Update the record pool when dimensions change
*/
this._updateRecordPoolDimensions = () => {
// Update all pooled records with current dimensions
this._recordPool.updateDimensions(this._dimensions, this._delimiter);
};
this.print = (...args_1) => __awaiter(this, [...args_1], void 0, function* (count = 3, full = false) {
console.log(`DS ${this.name} (${this._count} | ${this._iterations})`);
console.log(this._dimensions.map(x => x.name).join(this._delimiter));
const records = yield this.readLines(count);
records.forEach((x, i) => console.log(`[${i}]`, full ? x : x.stringify()));
console.log('----------');
});
this.printStats = () => {
var _a, _b;
const total = ((_b = (_a = this._operations) === null || _a === void 0 ? void 0 : _a.map(x => x.elapsedMs)) !== null && _b !== void 0 ? _b : []).reduce((sum, ms) => sum + ms, 0);
console.log(`DS[stats] ${this.name} (count: ${this._count} | cycles: ${this._iterations} | ms: ${Helper_1.default.formatDuration(total)})`);
console.log(`Operations: ${this._operations.length}`);
console.log(JSON.stringify(this._operations, null, 4));
};
/**
* Destroy the dataset by removing all allocated memory and created files
*/
this.destroy = () => {
this._startOperation('destroy');
try {
// Remove the main dataset file
if (fs_1.default.existsSync(this._path)) {
fs_1.default.unlinkSync(this._path);
}
// Remove the temporary file if it exists
if (fs_1.default.existsSync(this._tempPath)) {
fs_1.default.unlinkSync(this._tempPath);
}
// Remove any batch temporary files that might still exist
const tempDir = path_1.default.dirname(this._tempPath);
if (fs_1.default.existsSync(tempDir)) {
const files = fs_1.default.readdirSync(tempDir);
const batchFiles = files.filter(file => file.startsWith(path_1.default.basename(this._tempPath) + '_batch_'));
for (const batchFile of batchFiles) {
const fullPath = path_1.default.join(tempDir, batchFile);
if (fs_1.default.existsSync(fullPath)) {
fs_1.default.unlinkSync(fullPath);
}
}
// Try to remove the temp directory if it's empty
try {
if (fs_1.default.readdirSync(tempDir).length === 0) {
fs_1.default.rmdirSync(tempDir);
}
}
catch (_a) {
// Directory not empty or other error, ignore
}
}
this._finishOperation('destroy');
}
catch (error) {
this._finishOperation('destroy');
throw new Error(`Failed to destroy dataset: ${error instanceof Error ? error.message : String(error)}`);
}
};
this._startOperation = (name, metadata) => {
const newOperation = {
name,
count: -1,
elapsedMs: performance.now(), // start timestamp; _finishOperation replaces it with the elapsed duration
status: 'running',
subOperations: [],
metadata: metadata
};
const runningOperation = this._findRunningOperation();
if (runningOperation)
runningOperation.subOperations.push(newOperation);
else
this._operations.push(newOperation);
};
this._finishOperation = (name) => {
const finishedOperation = this._findRunningOperation(name);
if (finishedOperation) {
finishedOperation.status = 'completed';
finishedOperation.count = this._count;
finishedOperation.elapsedMs = performance.now() - finishedOperation.elapsedMs;
}
else {
const currentOperation = this._operations.find(x => x.status === 'running');
const currentName = currentOperation ? currentOperation.name : 'none';
console.warn(`Finished operation "${name}" but no running operation with that name was found (current running: "${currentName}")`);
}
};
this._findRunningOperation = (name) => {
const searchInOperations = (operations) => {
for (const operation of operations) {
if (operation.status === 'running' && (name === undefined || operation.name === name)) {
// If we're looking for a specific name, return it
if (name !== undefined) {
return operation;
}
// If we're looking for the deepest running operation (name is undefined),
// check if this operation has deeper running sub-operations
const deeperRunning = searchInOperations(operation.subOperations);
if (deeperRunning) {
return deeperRunning;
}
// If no deeper running operations, this is the deepest
return operation;
}
if (operation.subOperations && operation.subOperations.length > 0) {
const found = searchInOperations(operation.subOperations);
if (found) {
return found;
}
}
}
return null;
};
return searchInOperations(this._operations);
};
this._computeSize = () => fs_1.default.statSync(this._path).size / (1024 * 1024); // file size in MB
const { name, baseProducer, file, batchSize, executionId } = options;
this.name = name;
this._file = file;
this._executionId = executionId;
this._baseProducer = baseProducer;
const envBatchSize = parseInt(Environment_1.default.get('MAX_ITEMS_IN_MEMORY'));
this._batchSize = (_a = batchSize) !== null && _a !== void 0 ? _a : (Number.isNaN(envBatchSize) ? Constants_1.default.defaults.MAX_ITEMS_IN_MEMORY : envBatchSize);
this._dimensions = [];
this._firstLine = '';
this._delimiter = (_b = file.delimiter) !== null && _b !== void 0 ? _b : ',';
this._count = 0;
this._iterations = 0;
this._operations = [];
// Initialize record pool for optimization
this._recordPool = new DatasetRecordPool_1.default(this._batchSize);
const datasetName = this.name
.replace(/[^a-zA-Z0-9_-]/g, '_')
.replace(/_{2,}/g, '_')
.replace(/^_+|_+$/g, '')
.toLowerCase();
const execFolder = executionId ? path_1.default.join(datasetName, executionId) : datasetName;
this._path = path_1.default.join('./remora', Constants_1.default.defaults.PRODUCER_TEMP_FOLDER, execFolder, '.dataset');
this._tempPath = path_1.default.join('./remora', Constants_1.default.defaults.PRODUCER_TEMP_FOLDER, execFolder, '.dataset_tmp');
this.ensureFile(this._path);
}
}
exports.default = Dataset;