UNPKG

@forzalabs/remora

Version:

A powerful CLI tool for seamless data translation.

496 lines (495 loc) 27.1 kB
"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
const client_s3_1 = require("@aws-sdk/client-s3");
const Affirm_1 = __importDefault(require("../../core/Affirm"));
const SecretManager_1 = __importDefault(require("../../engines/SecretManager"));
const promises_1 = require("stream/promises");
const readline_1 = __importDefault(require("readline"));
const path_1 = __importDefault(require("path"));
const fs_1 = __importDefault(require("fs"));
const Algo_1 = __importDefault(require("../../core/Algo"));
const xlsx_1 = __importDefault(require("xlsx"));
const XMLParser_1 = __importDefault(require("../../engines/parsing/XMLParser"));
const Helper_1 = __importDefault(require("../../helper/Helper"));
const ParseHelper_1 = __importDefault(require("../../engines/parsing/ParseHelper"));
const DriverHelper_1 = __importDefault(require("../DriverHelper"));
const Logger_1 = __importDefault(require("../../helper/Logger"));
const Constants_1 = __importDefault(require("../../Constants"));
const XLSParser_1 = __importDefault(require("../../engines/parsing/XLSParser"));
const ExecutorScope_1 = __importDefault(require("../../executors/ExecutorScope"));

/** Message used by every guard that requires `init()` to have been called. */
const CLIENT_NOT_READY = 'S3 client not yet initialized, call "init()" first';

/** Character that marks a file key as a SQL-like pattern rather than a literal key. */
const WILDCARD = '%';

/**
 * Thin wrapper around the project's Affirm helper.
 * `Affirm_1.default` is looked up at call time (not cached at module load) so the
 * lookup behaves exactly like the direct `(0, Affirm_1.default)(...)` calls it replaces.
 */
function affirm(condition, message) {
    return (0, Affirm_1.default)(condition, message);
}

/** True when both ends of a line range carry a value according to `Algo.hasVal`. */
function hasRange(lineFrom, lineTo) {
    return Algo_1.default.hasVal(lineFrom) && Algo_1.default.hasVal(lineTo);
}

/** True when the file key contains the `%` wildcard. */
function isPattern(fileKey) {
    return fileKey.includes(WILDCARD);
}

/**
 * Derives the listing prefix from a file key pattern: the text before the first `%`.
 * A pattern without `%` is returned whole; a pattern starting with `%` (or an empty
 * pattern) yields an empty prefix.
 */
function prefixFromPattern(fileKeyPattern) {
    if (!fileKeyPattern)
        return '';
    return fileKeyPattern.split(WILDCARD)[0];
}

/**
 * Destroys a stream that will not be read any further.
 * Tolerates bodies that do not expose `destroy()`.
 * NOTE(review): for S3 response bodies this is expected to release the underlying
 * connection — confirm against the AWS SDK streaming documentation.
 */
function releaseStream(stream) {
    if (stream && typeof stream.destroy === 'function' && !stream.destroyed) {
        stream.destroy();
    }
}

/** Reads an async-iterable stream to the end and returns its chunks as one Buffer. */
async function collectStream(stream) {
    const chunks = [];
    for await (const chunk of stream) {
        chunks.push(chunk);
    }
    return Buffer.concat(chunks);
}

/**
 * Applies the optional line range to already materialised lines.
 * The upper bound is inclusive here (`slice(lineFrom, lineTo + 1)`).
 * NOTE(review): `_readLines` treats `lineTo` as exclusive — confirm which one is intended.
 */
function sliceInclusive(lines, lineFrom, lineTo) {
    return hasRange(lineFrom, lineTo) ? lines.slice(lineFrom, lineTo + 1) : lines;
}

/**
 * Source driver that reads producer files from an S3 bucket.
 *
 * All methods are arrow functions assigned in the constructor, so they stay bound to
 * the instance when passed around. `init()` must be called before any other method.
 * A file key containing `%` is treated as a pattern and resolved through `listFiles`.
 */
class S3SourceDriver {
    constructor() {
        /**
         * Builds the S3 client from `source.authentication` (bucket, region, accessKey,
         * secretKey, optional sessionToken); credential values go through
         * `SecretManager.replaceSecret`.
         * @returns {Promise<S3SourceDriver>} this driver, for chaining.
         */
        this.init = async (source) => {
            const auth = source.authentication;
            this._bucketName = auth['bucket'];
            const sessionToken = SecretManager_1.default.replaceSecret(auth['sessionToken']);
            const config = {
                region: auth['region'],
                credentials: {
                    accessKeyId: SecretManager_1.default.replaceSecret(auth['accessKey']),
                    secretAccessKey: SecretManager_1.default.replaceSecret(auth['secretKey']),
                    sessionToken: sessionToken ? sessionToken : undefined
                }
            };
            this._client = new client_s3_1.S3Client(config);
            // TODO: is there a way to test if the connection was successful? like a query or scan that I can do?
            return this;
        };

        /**
         * Reads every key concurrently through `_get` and flattens the per-file lines.
         * The position in `keys` is forwarded as the index so `_get` can drop repeated headers.
         */
        const readEachKey = async (request, keys) => {
            const perFile = await Promise.all(keys.map((key, i) => this._get({ ...request, fileKey: key }, i)));
            return perFile.flat();
        };

        /**
         * Reads all lines of `request.fileKey`, or of every file matching it when it is a pattern.
         * A pattern must match fewer than 50 files.
         * @returns {Promise<string[]>} the lines read.
         */
        this.readAll = async (request) => {
            affirm(this._client, CLIENT_NOT_READY);
            affirm(request, `Invalid download request`);
            affirm(request.fileKey, `Invalid file key for download request`);
            const { fileKey } = request;
            if (!isPattern(fileKey))
                return await this._get(request);
            const allFileKeys = await this.listFiles(fileKey);
            // NOTE(review): exactly 50 matches is rejected although the message says "more than 50" — confirm the intended limit.
            affirm(allFileKeys.length < 50, `Pattern ${fileKey} of producer requested to S3 matches more than 50 files (${allFileKeys.length}), this is more than the S3 allowed limit. Please refine your pattern, remove some files or use a separate bucket.`);
            return readEachKey(request, allFileKeys);
        };

        /**
         * Same as `readAll` but requires `request.options` (which may carry the line range)
         * and applies no limit on the number of files a pattern may match.
         * @returns {Promise<string[]>} the lines read.
         */
        this.readLinesInRange = async (request) => {
            affirm(this._client, CLIENT_NOT_READY);
            affirm(request, 'Invalid read request');
            affirm(request.options, 'Invalid read request options');
            // Fail with a clear message instead of a TypeError on `.includes` below.
            affirm(request.fileKey, 'Invalid file key for read request');
            const { fileKey } = request;
            if (!isPattern(fileKey))
                return await this._get(request);
            const allFileKeys = await this.listFiles(fileKey);
            return readEachKey(request, allFileKeys);
        };

        /**
         * Copies the dataset's file (or all files matching its pattern) into the dataset's
         * local path as one unified file, and records header line and line count on the dataset.
         * @returns {Promise<object>} the same dataset, after `setFirstLine` and `setCount`.
         */
        this.download = async (dataset) => {
            affirm(this._client, CLIENT_NOT_READY);
            affirm(dataset, 'Invalid dataset');
            const file = dataset.getFile();
            affirm(file, 'Invalid dataset file');
            affirm(file.fileKey, 'Invalid file key');
            affirm(file.fileType, `Invalid file type`);
            const includeSourceFilename = file.includeSourceFilename === true;
            const { fileKey } = file;

            /** Issues a GetObject for `key` and returns the response body. */
            const fetchBody = async (key, failureMessage) => {
                const command = new client_s3_1.GetObjectCommand({ Bucket: this._bucketName, Key: key });
                const response = await this._client.send(command);
                affirm(response.Body, failureMessage);
                return response.Body;
            };

            /** Streams one S3 object into the unified local file; resolves to what `appendToUnifiedFile` returns. */
            const downloadLocally = async (fileUrl, headerLine, appendMode = false, sourceFilename) => {
                const body = await fetchBody(fileUrl, 'Failed to fetch object from S3');
                let stream;
                switch (file.fileType) {
                    case 'XLS':
                    case 'XLSX':
                        stream = await XLSParser_1.default.parseXLSStream(body, file.sheetName);
                        break;
                    default:
                        stream = body;
                        break;
                }
                return DriverHelper_1.default.appendToUnifiedFile({
                    stream,
                    fileKey: fileUrl,
                    destinationPath: dataset.getPath(),
                    append: appendMode,
                    headerLine,
                    fileType: file.fileType,
                    hasHeaderRow: file.hasHeaderRow,
                    delimiter: dataset.getDelimiter(),
                    sourceFilename
                });
            };

            /** Extracts the first line from `stream`, stores it on the dataset and returns it. */
            const setFirstLineFromStream = async (stream) => {
                let firstLine = '';
                switch (file.fileType) {
                    case 'XLSX':
                    case 'XLS':
                        // The XLS parser is the only consumer of the stream on this path.
                        firstLine = await XLSParser_1.default.getHeaderXlsFromStream(stream, file.sheetName);
                        break;
                    case 'CSV':
                    case 'JSON':
                    case 'JSONL':
                    case 'TXT': {
                        // readline is attached only here: an interface starts consuming its input
                        // once created, so it must not share a stream with another reader.
                        const rl = readline_1.default.createInterface({ input: stream, crlfDelay: Infinity });
                        try {
                            for await (const line of rl) {
                                firstLine = line;
                                break;
                            }
                        }
                        finally {
                            rl.close();
                            // Only the first line is needed; drop the rest of the body.
                            releaseStream(stream);
                        }
                        break;
                    }
                    default:
                        // No header is read for other types; the body is not needed.
                        releaseStream(stream);
                        break;
                }
                // If including source filename, append a placeholder column name to the header
                if (file.includeSourceFilename) {
                    firstLine = firstLine + dataset.getDelimiter() + Constants_1.default.SOURCE_FILENAME_COLUMN;
                }
                dataset.setFirstLine(firstLine);
                return firstLine;
            };

            /** Fetches `key` once just to read its header line. */
            const readHeaderOf = async (key) => {
                const body = await fetchBody(key, 'Failed to fetch first file from S3');
                return setFirstLineFromStream(body);
            };

            if (isPattern(fileKey)) {
                const allFileKeys = await this.listFiles(fileKey);
                Logger_1.default.log(`Matched ${allFileKeys.length} files, copying locally and creating unified dataset.`);
                Affirm_1.default.hasItems(allFileKeys, `The file key "${fileKey}" doesn't have any matches in bucket "${this._bucketName}".`);
                // The header of the first matched file is used for the whole unified file
                const headerLine = await readHeaderOf(allFileKeys[0]);
                let totalLineCount = 0;
                // Download files sequentially to avoid file conflicts
                for (const [i, currentFileKey] of allFileKeys.entries()) {
                    // Pass the filename (just the basename) if includeSourceFilename is enabled
                    const sourceFilename = includeSourceFilename ? path_1.default.basename(currentFileKey) : undefined;
                    // Append mode for every file after the first
                    const written = await downloadLocally(currentFileKey, headerLine, i > 0, sourceFilename);
                    totalLineCount += written;
                }
                dataset.setCount(totalLineCount);
                return dataset;
            }

            const headerLine = await readHeaderOf(fileKey);
            const sourceFilename = includeSourceFilename ? path_1.default.basename(fileKey) : undefined;
            const totalLineCount = await downloadLocally(fileKey, headerLine, false, sourceFilename);
            dataset.setCount(totalLineCount);
            return dataset;
        };

        /**
         * Checks whether the producer's file exists: a pattern exists when it matches at
         * least one key, a literal key when HeadObject succeeds.
         * @returns {Promise<boolean>} false on a 404 / `NotFound` error; other errors are rethrown.
         */
        this.exist = async (producer) => {
            affirm(this._client, CLIENT_NOT_READY);
            affirm(producer, 'Invalid read producer');
            const fileKey = producer.settings.fileKey;
            affirm(fileKey, `Invalid file key for download request`);
            if (isPattern(fileKey)) {
                const allFileKeys = await this.listFiles(fileKey);
                return allFileKeys.length > 0;
            }
            try {
                await this._client.send(new client_s3_1.HeadObjectCommand({ Bucket: this._bucketName, Key: fileKey }));
                return true;
            }
            catch (error) {
                if (error.$metadata?.httpStatusCode === 404 || error.name === 'NotFound')
                    return false;
                throw error;
            }
        };

        /**
         * Reads non-empty lines from a text stream. With a range, only lines whose
         * zero-based index is in `[lineFrom, lineTo)` are kept and reading stops at `lineTo`.
         * The stream is destroyed afterwards, so callers must not reuse it.
         * @returns {Promise<string[]>}
         */
        this._readLines = async (stream, lineFrom, lineTo) => {
            const reader = readline_1.default.createInterface({ input: stream, crlfDelay: Infinity });
            const ranged = hasRange(lineFrom, lineTo);
            const lines = [];
            let lineCounter = 0;
            try {
                for await (const line of reader) {
                    const nonEmpty = line && line.length > 0;
                    if (!ranged) {
                        if (nonEmpty)
                            lines.push(line);
                        continue;
                    }
                    if (nonEmpty && lineCounter >= lineFrom && lineCounter < lineTo)
                        lines.push(line);
                    lineCounter++;
                    if (lineCounter >= lineTo)
                        break;
                }
            }
            finally {
                // Runs on error and on early exit too: close the interface and drop any unread body.
                reader.close();
                releaseStream(stream);
            }
            return lines;
        };

        /**
         * Buffers the whole stream, parses it as a workbook and returns the requested
         * sheet as CSV lines (range upper bound inclusive).
         * @returns {Promise<string[]>}
         */
        this._readExcelLines = async (stream, sheetName, lineFrom, lineTo) => {
            affirm(sheetName, `Invalid sheetname`);
            const buffer = await collectStream(stream);
            const excel = xlsx_1.default.read(buffer, { type: 'buffer' });
            affirm(excel.SheetNames.includes(sheetName), `The sheet "${sheetName}" doesn't exist in the excel (available: ${excel.SheetNames.join(', ')})`);
            const csv = xlsx_1.default.utils.sheet_to_csv(excel.Sheets[sheetName]);
            return sliceInclusive(csv.split('\n'), lineFrom, lineTo);
        };

        /**
         * Buffers the whole stream, converts the XML to JSON and returns one JSON string
         * per array item (or a single string for a non-array result); range upper bound inclusive.
         * @returns {Promise<string[]>}
         */
        this._readXmlLines = async (stream, lineFrom, lineTo) => {
            const buffer = await collectStream(stream);
            const jsonData = XMLParser_1.default.xmlToJson(buffer);
            // Convert JSON data to string lines. This might need adjustment based on XML structure.
            const lines = Array.isArray(jsonData)
                ? jsonData.map(item => JSON.stringify(item))
                : [JSON.stringify(jsonData)];
            return sliceInclusive(lines, lineFrom, lineTo);
        };

        /**
         * Fetches one object and reads it with the reader matching `request.fileType`.
         * @param request read request; `options` may carry lineFrom, lineTo, sheetName, hasHeaderRow.
         * @param index position of this file within a pattern match (undefined for a single file).
         * @returns {Promise<string[]>} the lines read; empty for an unrecognised file type.
         */
        this._get = async (request, index) => {
            const { fileKey, fileType, options } = request;
            const { lineFrom, lineTo, sheetName, hasHeaderRow } = options || {};
            const response = await this._client.send(new client_s3_1.GetObjectCommand({ Bucket: this._bucketName, Key: fileKey }));
            affirm(response.Body, 'Failed to fetch object from S3');
            const stream = response.Body;
            // The range is forwarded only when both bounds are set
            const range = hasRange(lineFrom, lineTo) ? [lineFrom, lineTo] : [];
            let lines = [];
            switch (fileType) {
                case 'CSV':
                case 'JSON':
                case 'JSONL':
                case 'TXT':
                    lines = await this._readLines(stream, ...range);
                    break;
                case 'XLS':
                case 'XLSX':
                    lines = await this._readExcelLines(stream, sheetName, ...range);
                    break;
                case 'XML':
                    lines = await this._readXmlLines(stream, ...range);
                    break;
                default:
                    // Nothing will read this body
                    releaseStream(stream);
                    break;
            }
            // If this is not the first file read in a pattern match AND the file type has an header,
            // then I need to remove the header from the resulting lines or the header will be duplicated
            if (index > 0 && ParseHelper_1.default.shouldHaveHeader(fileType, hasHeaderRow)) {
                lines = lines.slice(1);
            }
            return lines;
        };

        /**
         * Lists one page of keys and filters them with `Helper.matchPattern`.
         * @returns {Promise<{files: string[], nextContinuationToken: (string|undefined)}>}
         * @throws {Error} wrapping any failure, with the original error as `cause`.
         */
        this._listFiles = async (fileKeyPattern, maxKeys, continuationToken) => {
            affirm(this._client, CLIENT_NOT_READY);
            // The text before the first % narrows the listing on the S3 side
            const prefix = prefixFromPattern(fileKeyPattern);
            const listParams = {
                Bucket: this._bucketName,
                Prefix: prefix || undefined,
                // NOTE(review): verify this default against the ListObjectsV2 per-request page limit.
                MaxKeys: maxKeys || 10000,
                ContinuationToken: continuationToken
            };
            try {
                const response = await this._client.send(new client_s3_1.ListObjectsV2Command(listParams));
                const files = response.Contents?.map(obj => obj.Key).filter(key => key !== undefined) || [];
                const matchingFiles = Helper_1.default.matchPattern(fileKeyPattern, files);
                return { files: matchingFiles, nextContinuationToken: response.NextContinuationToken };
            }
            catch (error) {
                throw new Error(`Failed to list files in bucket "${this._bucketName}": ${error.message}`, { cause: error });
            }
        };

        /**
         * Lists every key matching the pattern, following continuation tokens.
         * @param maxKeys optional cap; when reached, at most `maxKeys` matches are returned.
         * @returns {Promise<string[]>}
         */
        this.listFiles = async (fileKeyPattern, maxKeys) => {
            const allFiles = [];
            let continuationToken = undefined;
            do {
                const page = await this._listFiles(fileKeyPattern, maxKeys, continuationToken);
                allFiles.push(...page.files);
                continuationToken = page.nextContinuationToken;
                // If maxKeys is specified and we've reached the limit, stop paging
                if (maxKeys && allFiles.length >= maxKeys) {
                    return allFiles.slice(0, maxKeys);
                }
            } while (continuationToken);
            return allFiles;
        };

        /**
         * Downloads a whole object into memory.
         * @returns {Promise<Buffer>}
         */
        this.downloadFile = async (fileKey) => {
            affirm(this._client, CLIENT_NOT_READY);
            affirm(fileKey, 'Invalid file key');
            const response = await this._client.send(new client_s3_1.GetObjectCommand({ Bucket: this._bucketName, Key: fileKey }));
            affirm(response.Body, 'Failed to fetch object from S3');
            const content = await response.Body.transformToByteArray();
            return Buffer.from(content);
        };

        /** Deletes the object `fileKey` from the driver's bucket. */
        this.deleteFile = async (fileKey) => {
            affirm(this._client, CLIENT_NOT_READY);
            affirm(fileKey, 'Invalid file key');
            await this._client.send(new client_s3_1.DeleteObjectCommand({ Bucket: this._bucketName, Key: fileKey }));
        };

        /** Copies `sourceFileKey` from the driver's bucket to `destinationBucket`/`destinationFileKey`. */
        this.copyFile = async (sourceFileKey, destinationBucket, destinationFileKey) => {
            affirm(this._client, CLIENT_NOT_READY);
            affirm(sourceFileKey, 'Invalid source file key');
            affirm(destinationBucket, 'Invalid destination bucket');
            affirm(destinationFileKey, 'Invalid destination file key');
            await this._client.send(new client_s3_1.CopyObjectCommand({
                // NOTE(review): the CopyObject docs ask for a URL-encoded CopySource; keys with
                // special characters may need encoding here — confirm callers don't pre-encode.
                CopySource: `${this._bucketName}/${sourceFileKey}`,
                Bucket: destinationBucket,
                Key: destinationFileKey
            }));
        };

        /**
         * Streams the producer's file(s) to the local paths given by `ExecutorScope`.
         * @param request object carrying `producer` and `scope`.
         * @returns {Promise<{files: Array<{fullUri: string}>}>} local paths of the copied files.
         */
        this.ready = async (request) => {
            affirm(request, 'Invalid producer');
            affirm(this._client, CLIENT_NOT_READY);
            const { producer, scope } = request;
            const { fileKey } = producer.settings;
            affirm(fileKey, 'Invalid file key');

            /** Pipes one S3 object into `localPath`, creating its directory when missing. */
            const streamToFile = async (s3Key, localPath) => {
                const command = new client_s3_1.GetObjectCommand({ Bucket: this._bucketName, Key: s3Key });
                const response = await this._client.send(command);
                affirm(response.Body, `Failed to fetch object "${s3Key}" from S3`);
                // Recursive mkdir is a no-op when the directory already exists
                fs_1.default.mkdirSync(path_1.default.dirname(localPath), { recursive: true });
                await (0, promises_1.pipeline)(response.Body, fs_1.default.createWriteStream(localPath));
            };

            /** Resolves the local path for `s3Key`, copies the object there and returns the path. */
            const materialize = async (s3Key) => {
                const localPath = ExecutorScope_1.default.getProducerPath(scope, producer, s3Key);
                ExecutorScope_1.default.ensurePath(localPath);
                await streamToFile(s3Key, localPath);
                return localPath;
            };

            if (!isPattern(fileKey)) {
                const localPath = await materialize(fileKey);
                return { files: [{ fullUri: localPath }] };
            }

            const allFileKeys = await this.listFiles(fileKey);
            Affirm_1.default.hasItems(allFileKeys, `The file key pattern "${fileKey}" doesn't have any matches in bucket "${this._bucketName}".`);
            // Stream each file to local temp storage sequentially to avoid overwhelming the connection
            const files = [];
            for (const s3Key of allFileKeys) {
                files.push({ fullUri: await materialize(s3Key) });
            }
            return { files };
        };
    }
}
exports.default = S3SourceDriver;