/*
 * @forzalabs/remora
 * Version: (not captured)
 * A powerful CLI tool for seamless data translation.
 * npm registry metadata: 212 lines (211 loc) • 12.6 kB • JavaScript
 */
;
// TypeScript-emitted async/await downlevel helper: drives the generator that
// the compiler produced for an `async` function, resolving each yielded value
// through the Promise implementation P until the generator completes.
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    // Wrap a plain value in P, or pass it through unchanged if it is already a P instance.
    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
    return new (P || (P = Promise))(function (resolve, reject) {
        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
        // Advance the generator one step: settle the outer promise on completion,
        // otherwise await the yielded value and feed the result back in.
        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
        step((generator = generator.apply(thisArg, _arguments || [])).next());
    });
};
// TypeScript-emitted interop helper: normalizes a CommonJS module so that it
// always exposes a `default` property, leaving real ES modules untouched.
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
const Affirm_1 = __importDefault(require("../../core/Affirm"));
const ParseManager_1 = __importDefault(require("../parsing/ParseManager"));
const Dataset_1 = __importDefault(require("./Dataset"));
const DeveloperEngine_1 = __importDefault(require("../ai/DeveloperEngine"));
const Constants_1 = __importDefault(require("../../Constants"));
const ProducerManager_1 = __importDefault(require("../producer/ProducerManager"));
/**
 * Manages Dataset objects: creates them from producer definitions and derives
 * dimension descriptors ({ key, name, index, hidden, type }) from the first
 * line of the underlying data file.
 * NOTE: this is compiled TypeScript output — the `_a`, `_b`, ... temporaries
 * are the emitted form of optional chaining (`?.`) and nullish coalescing (`??`).
 */
class DatasetManagerClass {
    constructor() {
        /**
         * Create a new Dataset for a producer. If an executionId is provided, the dataset files will
         * be isolated inside a sub-folder specific to that execution to avoid concurrency conflicts
         * when the same producer / consumer is executed multiple times in parallel.
         *
         * @param producer producer definition; must be truthy (Affirm throws otherwise).
         * @param options  optional { executionId, cProducer }; cProducer.isOptional
         *                 drives the dataset file's isOptional flag.
         * @returns a new Dataset instance.
         */
        this.create = (producer, options) => {
            var _a, _b;
            (0, Affirm_1.default)(producer, 'Invalid producer');
            // Pull the file-level settings straight off the producer definition.
            const { name, settings: { delimiter, fileKey, fileType, hasHeaderRow, sheetName, httpApi } } = producer;
            const executionId = options === null || options === void 0 ? void 0 : options.executionId;
            const cProducer = options === null || options === void 0 ? void 0 : options.cProducer;
            // Check if any dimension has sourceFilename flag set to true
            // (defaults to false when producer.dimensions is null/undefined).
            const hasSourceFilenameDimension = (_b = (_a = producer.dimensions) === null || _a === void 0 ? void 0 : _a.some(d => d.sourceFilename === true)) !== null && _b !== void 0 ? _b : false;
            const dataset = new Dataset_1.default({
                name,
                baseProducer: producer,
                file: {
                    fileKey,
                    fileType,
                    hasHeaderRow,
                    sheetName,
                    delimiter,
                    httpApi,
                    includeSourceFilename: hasSourceFilenameDimension,
                    // Optionality comes from the consumer-side producer reference, when provided.
                    isOptional: cProducer === null || cProducer === void 0 ? void 0 : cProducer.isOptional
                },
                executionId
            });
            return dataset;
        };
        /**
         * Derive the dataset's dimension descriptors from the first line of its file.
         * Behavior depends on file.fileType:
         *  - 'CSV': parses the header line via ParseManager._extractHeader.
         *  - 'PARQUET' / 'JSONL' / 'JSON': treats firstLine as one JSON record and
         *    matches producer columns against its keys (plus the synthetic
         *    $source_filename column when includeSourceFilename is set).
         *  - 'TXT': with a header row behaves like CSV; without one, maps producer
         *    columns positionally (or emits "Col N" placeholders when discovering).
         *  - 'XLS' / 'XLSX' / 'XML': not implemented — the promise resolves to
         *    undefined. NOTE(review): presumably callers never reach this method
         *    with these types — confirm.
         *
         * @param firstLine first line of the file (header line or first record); must have a value.
         * @param dsFile    dataset file descriptor ({ fileType, delimiter, hasHeaderRow, includeSourceFilename, ... }).
         * @param producer  producer definition used to resolve column metadata.
         * @param discover  when true, build placeholder/inferred dimensions instead of
         *                  matching against the producer's declared columns. Defaults to false.
         * @returns Promise of { dimensions, delimiter } (or undefined for unsupported file types).
         */
        this.buildDimensionsFromFirstLine = (firstLine_1, dsFile_1, producer_1, ...args_1) => __awaiter(this, [firstLine_1, dsFile_1, producer_1, ...args_1], void 0, function* (firstLine, dsFile, producer, discover = false) {
            var _a, _b, _c, _d, _e, _f, _g, _h, _j, _k, _l, _m, _o;
            Affirm_1.default.hasValue(firstLine, `Invalid first line`);
            (0, Affirm_1.default)(dsFile, `Invalid dataset file`);
            (0, Affirm_1.default)(producer, `Invalid producer`);
            const file = dsFile;
            switch (file.fileType) {
                case 'CSV': {
                    // Delimiter defaults to a comma when not configured.
                    const delimiterChar = (_a = file.delimiter) !== null && _a !== void 0 ? _a : ',';
                    const headerLine = firstLine;
                    const rawDimensions = ParseManager_1.default._extractHeader(headerLine, delimiterChar, producer, discover);
                    return {
                        dimensions: rawDimensions.map(x => ({
                            key: x.name,
                            name: x.saveAs,
                            index: x.index,
                            hidden: null,
                            type: x.type
                        })),
                        delimiter: delimiterChar
                    };
                }
                case 'PARQUET':
                case 'JSONL':
                case 'JSON': {
                    // For record-shaped formats the first line is a JSON object whose
                    // keys are the available columns.
                    const columns = ProducerManager_1.default.getColumns(producer);
                    const firstObject = JSON.parse(firstLine);
                    const keys = Object.keys(firstObject);
                    // If includeSourceFilename is enabled, the driver has added $source_filename column
                    // We need to add it to the keys list so dimensions can reference it
                    const includeSourceFilename = file.includeSourceFilename === true;
                    if (includeSourceFilename) {
                        keys.push(Constants_1.default.SOURCE_FILENAME_COLUMN);
                    }
                    if (discover) {
                        // Discovery mode: one dimension per key, type inferred from the sample value.
                        return {
                            delimiter: (_b = file.delimiter) !== null && _b !== void 0 ? _b : ',',
                            dimensions: keys.map((x, i) => ({
                                hidden: false,
                                index: i,
                                key: x,
                                name: x,
                                type: DeveloperEngine_1.default.inferDimensionType(firstObject === null || firstObject === void 0 ? void 0 : firstObject[x])
                            }))
                        };
                    }
                    const dimensions = [];
                    for (const pColumn of columns) {
                        // Handle sourceFilename dimension specially - it maps to the $source_filename column added by the driver
                        if (((_c = pColumn.dimension) === null || _c === void 0 ? void 0 : _c.sourceFilename) === true) {
                            if (includeSourceFilename) {
                                const sourceFilenameIndex = keys.findIndex(x => x === Constants_1.default.SOURCE_FILENAME_COLUMN);
                                dimensions.push({
                                    index: sourceFilenameIndex,
                                    key: Constants_1.default.SOURCE_FILENAME_COLUMN,
                                    name: pColumn.nameInProducer,
                                    hidden: null,
                                    type: (_e = (_d = pColumn.dimension) === null || _d === void 0 ? void 0 : _d.type) !== null && _e !== void 0 ? _e : 'string'
                                });
                            }
                            // sourceFilename columns never participate in key-based matching below.
                            continue;
                        }
                        // A producer-side alias takes precedence over the declared column name.
                        const columnKey = (_f = pColumn.aliasInProducer) !== null && _f !== void 0 ? _f : pColumn.nameInProducer;
                        const csvColumnIndex = keys.findIndex(x => x === columnKey);
                        (0, Affirm_1.default)(csvColumnIndex > -1, `The column "${pColumn.nameInProducer}" (with key "${columnKey}") of producer "${producer.name}" doesn't exist in the underlying dataset.`);
                        dimensions.push({
                            index: csvColumnIndex,
                            key: columnKey,
                            name: pColumn.nameInProducer,
                            hidden: null,
                            type: (_h = (_g = pColumn.dimension) === null || _g === void 0 ? void 0 : _g.type) !== null && _h !== void 0 ? _h : 'string'
                        });
                    }
                    const delimiterChar = (_j = file.delimiter) !== null && _j !== void 0 ? _j : ',';
                    return { dimensions, delimiter: delimiterChar };
                }
                case 'TXT': {
                    if (!file.hasHeaderRow) {
                        // If the file is a TXT and there isn't an header row, then I add a fake one that maps directly to the producer
                        const delimiterChar = (_k = file.delimiter) !== null && _k !== void 0 ? _k : ',';
                        const columns = ProducerManager_1.default.getColumns(producer);
                        const includeSourceFilename = file.includeSourceFilename === true;
                        if (discover) {
                            // Since I don't have an header, and I'm discovering, I just create placeholder dimensions based on the same number of columns of the txt
                            const colValues = firstLine.split(delimiterChar);
                            const dimensions = colValues.map((x, i) => ({
                                hidden: false,
                                index: i,
                                key: `Col ${i + 1}`,
                                name: `Col ${i + 1}`,
                                type: 'string'
                            }));
                            return {
                                delimiter: delimiterChar,
                                dimensions
                            };
                        }
                        // Filter out sourceFilename columns for index-based mapping, but track them for later
                        const regularColumns = columns.filter(x => { var _a; return ((_a = x.dimension) === null || _a === void 0 ? void 0 : _a.sourceFilename) !== true; });
                        const sourceFilenameColumn = columns.find(x => { var _a; return ((_a = x.dimension) === null || _a === void 0 ? void 0 : _a.sourceFilename) === true; });
                        // Positional mapping: the i-th producer column maps to the i-th value of each row.
                        const dimensions = regularColumns.map((x, i) => {
                            var _a, _b, _c;
                            return ({
                                key: (_a = x.aliasInProducer) !== null && _a !== void 0 ? _a : x.nameInProducer,
                                name: x.nameInProducer,
                                index: i,
                                hidden: null,
                                type: (_c = (_b = x.dimension) === null || _b === void 0 ? void 0 : _b.type) !== null && _c !== void 0 ? _c : 'string'
                            });
                        });
                        // Add sourceFilename dimension at the end if enabled
                        if (sourceFilenameColumn && includeSourceFilename) {
                            dimensions.push({
                                key: Constants_1.default.SOURCE_FILENAME_COLUMN,
                                name: sourceFilenameColumn.nameInProducer,
                                index: regularColumns.length, // Index after all regular columns
                                hidden: null,
                                type: (_m = (_l = sourceFilenameColumn.dimension) === null || _l === void 0 ? void 0 : _l.type) !== null && _m !== void 0 ? _m : 'string'
                            });
                        }
                        return {
                            dimensions,
                            delimiter: delimiterChar
                        };
                    }
                    else {
                        // TXT with a header row: parse the header exactly like a CSV one.
                        // Note this reads the delimiter from producer.settings rather than
                        // from the dataset file descriptor used by the other branches.
                        const delimiterChar = (_o = producer.settings.delimiter) !== null && _o !== void 0 ? _o : ',';
                        const rawDimensions = ParseManager_1.default._extractHeader(firstLine, delimiterChar, producer, discover);
                        return {
                            dimensions: rawDimensions.map(x => ({
                                key: x.name,
                                name: x.saveAs,
                                index: x.index,
                                hidden: null,
                                type: x.type
                            })),
                            delimiter: delimiterChar
                        };
                    }
                }
                // NOTE(review): the remaining file types are unimplemented; the
                // promise resolves to undefined for them (no error is raised).
                case 'XLS':
                    break;
                case 'XLSX':
                    break;
                case 'XML':
                    break;
            }
        });
    }
}
// Export one shared DatasetManager instance (module-level singleton).
exports.default = new DatasetManagerClass();