UNPKG

@forzalabs/remora

Version:

A powerful CLI tool for seamless data translation.

128 lines (127 loc) 7.66 kB
"use strict";
// Transpiler helpers: down-level `async/await` (ES2017) to generator-driven
// Promises, and normalize CommonJS/ESM default imports. Kept because the
// build target predates native async support.
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
    return new (P || (P = Promise))(function (resolve, reject) {
        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
        step((generator = generator.apply(thisArg, _arguments || [])).next());
    });
};
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
const Affirm_1 = __importDefault(require("../../core/Affirm"));
const DriverFactory_1 = __importDefault(require("../../drivers/DriverFactory"));
const Environment_1 = __importDefault(require("../Environment"));
const DatasetManager_1 = __importDefault(require("../dataset/DatasetManager"));
const Logger_1 = __importDefault(require("../../helper/Logger"));
/**
 * Engine that reads producer-backed data through the source driver configured
 * for the producer. Exposed as a singleton (`ProducerEngine`, default export).
 */
class ProducerEngineClass {
    constructor() {
        /**
         * Read a producer's backing file through its source driver.
         *
         * @param producer  Producer definition; must carry `source` (name of a
         *                  configured source) and `settings` (fileKey, fileType,
         *                  sheetName, hasHeaderRow, optional httpApi).
         * @param options   `{ readmode: 'lines' | 'all' | 'download', lines?: { from, to } }`.
         *                  `options.lines` is required when `readmode === 'lines'`.
         * @returns `{ data, dataset, dataType }` — `dataType` is `'lines-of-text'`
         *          for CSV/TXT/XLS/XLSX and `'array-of-json'` for
         *          PARQUET/JSONL/JSON/XML.
         * @throws Error when the producer's file type is not supported.
         */
        this.readFile = (producer, options) => __awaiter(this, void 0, void 0, function* () {
            (0, Affirm_1.default)(producer, 'Invalid producer');
            (0, Affirm_1.default)(options, 'Invalid options');
            if (options.readmode === 'lines')
                (0, Affirm_1.default)(options.lines, 'Invalid lines');
            const source = Environment_1.default.getSource(producer.source);
            (0, Affirm_1.default)(source, `No source found for producer "${producer.name}" with name "${producer.source}"`);
            const driver = yield DriverFactory_1.default.instantiateSource(source);
            (0, Affirm_1.default)(driver, `No driver found for producer "${producer.name}" with driver type "${source.engine}"`);
            const { settings: { fileKey, fileType, sheetName, hasHeaderRow } } = producer;
            let dataset = DatasetManager_1.default.create(producer);
            let lines = [];
            // `httpApi` is optional; written without `?.` to match the ES2015-era target.
            const httpApi = producer.settings == null ? void 0 : producer.settings.httpApi;
            switch (options.readmode) {
                case 'lines':
                    lines = yield driver.readLinesInRange({
                        fileKey,
                        fileType,
                        options: { lineFrom: options.lines.from, lineTo: options.lines.to, sheetName, hasHeaderRow },
                        httpApi,
                    });
                    break;
                case 'all':
                    lines = yield driver.readAll({ fileKey, fileType, options: { sheetName, hasHeaderRow }, httpApi });
                    break;
                case 'download':
                    // Download replaces the dataset wholesale; `lines` stays empty.
                    dataset = yield driver.download(dataset);
                    break;
            }
            const normalizedType = producer.settings.fileType == null
                ? void 0
                : producer.settings.fileType.toUpperCase();
            switch (normalizedType) {
                case 'CSV':
                case 'TXT':
                case 'XLS':
                case 'XLSX':
                    // Tabular formats are returned as raw text lines (the two
                    // original branches were identical and have been merged).
                    return { data: lines, dataset, dataType: 'lines-of-text' };
                case 'PARQUET':
                case 'JSONL':
                case 'JSON': {
                    if (lines.length === 1) {
                        // A single line may hold the whole file: either a stringified
                        // JSON array, or several JSON objects separated by newlines.
                        try {
                            const parsedAsArray = JSON.parse(lines[0]);
                            if (Array.isArray(parsedAsArray)) {
                                return { data: parsedAsArray, dataset, dataType: 'array-of-json' };
                            }
                        }
                        catch (error) {
                            // Not a single JSON array — fall through to the newline split.
                            console.warn('Failed to parse single line as JSON array, splitting by newline:', error);
                        }
                        // BUG FIX: was `split('\\n')`, which splits on the literal
                        // two-character sequence backslash+n, never on a real newline.
                        lines = lines[0].split('\n');
                    }
                    const json = lines.filter(line => line.trim() !== '').map(x => JSON.parse(x));
                    return { data: json, dataset, dataType: 'array-of-json' };
                }
                case 'XML': {
                    // The driver's _readXmlLines returns one JSON string per line;
                    // parse each into an object.
                    const json = lines.filter(line => line.trim() !== '').map(x => JSON.parse(x));
                    return { data: json, dataset, dataType: 'array-of-json' };
                }
                default:
                    throw new Error(`Invalid file type "${producer.settings.fileType}" for engine type "${source.engine}" for producer "${producer.name}": not supported`);
            }
        });
        /**
         * Read up to `sampleSize` rows from a producer, for previews/discovery.
         *
         * @param producer    Producer definition.
         * @param sampleSize  Maximum rows to read (default 10; must be > 0).
         * @param discover    Forwarded to `dataset.loadFromMemory` (presumably a
         *                    schema-discovery flag — defined elsewhere).
         * @returns Array of sampled rows.
         * @throws Error when the source engine is not supported.
         */
        this.readSampleData = (producer_1, ...args_1) => __awaiter(this, [producer_1, ...args_1], void 0, function* (producer, sampleSize = 10, discover = false) {
            (0, Affirm_1.default)(producer, 'Invalid producer');
            (0, Affirm_1.default)(sampleSize > 0, 'Sample size must be greater than 0');
            const source = Environment_1.default.getSource(producer.source);
            (0, Affirm_1.default)(source, `No source found for producer "${producer.name}" with name "${producer.source}"`);
            let dataset = DatasetManager_1.default.create(producer);
            switch (source.engine) {
                case 'aws-redshift': {
                    // NOTE(review): schema/table names are interpolated into the SQL
                    // string; assumes they come from trusted configuration — verify.
                    const sql = `SELECT * FROM "${source.authentication['schema']}"."${producer.settings.sqlTable}" LIMIT ${sampleSize}`;
                    (0, Affirm_1.default)(sql, `Invalid SQL from deployment compilation for producer "${producer.name}"`);
                    const driver = yield DriverFactory_1.default.instantiateSource(source);
                    (0, Affirm_1.default)(driver, `No driver found for producer "${producer.name}" with driver type "${source.engine}"`);
                    const res = yield driver.query(sql);
                    dataset = yield dataset.loadFromMemory(res.rows, producer, discover);
                    break;
                }
                case 'local':
                case 'aws-s3':
                case 'delta-share': {
                    // File-backed engines sample by reading the first `sampleSize` lines.
                    const fileData = yield this.readFile(producer, { readmode: 'lines', lines: { from: 0, to: sampleSize } });
                    dataset = yield dataset.loadFromMemory(fileData.data, producer, discover);
                    break;
                }
                default:
                    throw new Error(`Invalid engine type "${source.engine}" for producer "${producer.name}": not supported`);
            }
            const sampleData = [...yield dataset.readLines(sampleSize)];
            // BUG FIX: log stats BEFORE destroying — the original called
            // `printStats()` on an already-destroyed dataset.
            Logger_1.default.log(`Finished reading sample dataset:\n${dataset.printStats()}`);
            dataset.destroy();
            return sampleData;
        });
    }
}
const ProducerEngine = new ProducerEngineClass();
exports.default = ProducerEngine;