/**
 * @forzalabs/remora — compiled distribution file (S3 source driver).
 * Version: (not specified in this extract)
 * A powerful CLI tool for seamless data translation.
 * 496 lines (495 loc) • 27.1 kB • JavaScript
 */
;
// TypeScript `__awaiter` helper (tsc downlevel output): drives a generator-based
// coroutine so that `yield`ed values behave like `await`, settling the returned
// promise with the generator's final value. Reuses an existing `this.__awaiter`
// (e.g. from tslib) when one is already in scope.
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    // Wrap a yielded value in the promise constructor P unless it already is an instance of P.
    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
    return new (P || (P = Promise))(function (resolve, reject) {
        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
        // Advance the generator one step; recurse through fulfilled/rejected until done.
        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
        step((generator = generator.apply(thisArg, _arguments || [])).next());
    });
};
// TypeScript `__asyncValues` helper (tsc downlevel output): obtains an async
// iterator for `o`; when `o` only has a sync iterator, wraps it so each result
// is settled through Promise.resolve. Used by the downleveled for-await loops below.
var __asyncValues = (this && this.__asyncValues) || function (o) {
    if (!Symbol.asyncIterator) throw new TypeError("Symbol.asyncIterator is not defined.");
    var m = o[Symbol.asyncIterator], i;
    return m ? m.call(o) : (o = typeof __values === "function" ? __values(o) : o[Symbol.iterator](), i = {}, verb("next"), verb("throw"), verb("return"), i[Symbol.asyncIterator] = function () { return this; }, i);
    // Forward method `n` ("next"/"throw"/"return") of the sync iterator, settling its result asynchronously.
    function verb(n) { i[n] = o[n] && function (v) { return new Promise(function (resolve, reject) { v = o[n](v), settle(resolve, reject, v.done, v.value); }); }; }
    function settle(resolve, reject, d, v) { Promise.resolve(v).then(function(v) { resolve({ value: v, done: d }); }, reject); }
};
// TypeScript `__importDefault` helper (tsc downlevel output): interop shim for
// importing a CommonJS module as if it had a `default` export. A module already
// flagged `__esModule` is passed through untouched; anything else is wrapped
// in `{ default: mod }`. Reuses an existing `this.__importDefault` if present.
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
// Mark this module as transpiled ESM so __importDefault-style consumers pass it through untouched.
Object.defineProperty(exports, "__esModule", { value: true });
const client_s3_1 = require("@aws-sdk/client-s3");
const Affirm_1 = __importDefault(require("../../core/Affirm"));
const SecretManager_1 = __importDefault(require("../../engines/SecretManager"));
const promises_1 = require("stream/promises");
const readline_1 = __importDefault(require("readline"));
const path_1 = __importDefault(require("path"));
const fs_1 = __importDefault(require("fs"));
const Algo_1 = __importDefault(require("../../core/Algo"));
const xlsx_1 = __importDefault(require("xlsx"));
const XMLParser_1 = __importDefault(require("../../engines/parsing/XMLParser"));
const Helper_1 = __importDefault(require("../../helper/Helper"));
const ParseHelper_1 = __importDefault(require("../../engines/parsing/ParseHelper"));
const DriverHelper_1 = __importDefault(require("../DriverHelper"));
const Logger_1 = __importDefault(require("../../helper/Logger"));
const Constants_1 = __importDefault(require("../../Constants"));
const XLSParser_1 = __importDefault(require("../../engines/parsing/XLSParser"));
const ExecutorScope_1 = __importDefault(require("../../executors/ExecutorScope"));
/**
 * Source driver that reads, downloads and manages dataset files stored in an
 * AWS S3 bucket. Handles CSV/JSON/JSONL/TXT, XLS/XLSX and XML payloads, and
 * supports SQL-LIKE '%' wildcards in file keys (expanded via ListObjectsV2).
 *
 * Compiled (tsc-downleveled) output: async methods appear as `__awaiter`
 * generator wrappers and for-await loops as `__asyncValues` state machines.
 * All methods are instance arrow-function properties assigned in the constructor.
 */
class S3SourceDriver {
    constructor() {
        // Builds the S3Client from the source's authentication block, resolving
        // secret placeholders (access key, secret key, optional session token)
        // through SecretManager. Returns `this` for chaining.
        this.init = (source) => __awaiter(this, void 0, void 0, function* () {
            this._bucketName = source.authentication['bucket'];
            const sessionToken = SecretManager_1.default.replaceSecret(source.authentication['sessionToken']);
            const config = {
                region: source.authentication['region'],
                credentials: {
                    accessKeyId: SecretManager_1.default.replaceSecret(source.authentication['accessKey']),
                    secretAccessKey: SecretManager_1.default.replaceSecret(source.authentication['secretKey']),
                    sessionToken: sessionToken ? sessionToken : undefined
                }
            };
            this._client = new client_s3_1.S3Client(config);
            // TODO: is there a way to test if the connection was successful? like a query or scan that I can do?
            return this;
        });
        // Reads all lines of the requested file. A '%' in fileKey triggers
        // pattern expansion (hard-capped at 50 matches); matched files are
        // fetched in parallel and their lines concatenated in key order.
        this.readAll = (request) => __awaiter(this, void 0, void 0, function* () {
            (0, Affirm_1.default)(this._client, 'S3 client not yet initialized, call "connect()" first');
            (0, Affirm_1.default)(request, `Invalid download request`);
            (0, Affirm_1.default)(request.fileKey, `Invalid file key for download request`);
            const { fileKey } = request;
            if (fileKey.includes('%')) {
                const allFileKeys = yield this.listFiles(fileKey);
                (0, Affirm_1.default)(allFileKeys.length < 50, `Pattern ${fileKey} of producer requested to S3 matches more than 50 files (${allFileKeys.length}), this is more than the S3 allowed limit. Please refine your pattern, remove some files or use a separate bucket.`);
                // The index passed to _get lets files after the first drop their duplicated header row.
                const promises = allFileKeys.map((x, i) => this._get(Object.assign(Object.assign({}, request), { fileKey: x }), i));
                const results = yield Promise.all(promises);
                return results.flat();
            }
            else {
                return yield this._get(request);
            }
        });
        // Reads only the line range given in request.options (lineFrom/lineTo).
        // Same '%' pattern expansion as readAll.
        // NOTE(review): unlike readAll, no 50-file cap is enforced on pattern
        // matches here — confirm that asymmetry is intentional.
        this.readLinesInRange = (request) => __awaiter(this, void 0, void 0, function* () {
            (0, Affirm_1.default)(this._client, 'S3 client not yet initialized, call "connect()" first');
            (0, Affirm_1.default)(request, 'Invalid read request');
            (0, Affirm_1.default)(request.options, 'Invalid read request options');
            const { fileKey } = request;
            if (fileKey.includes('%')) {
                const allFileKeys = yield this.listFiles(fileKey);
                const promises = allFileKeys.map((x, i) => this._get(Object.assign(Object.assign({}, request), { fileKey: x }), i));
                const results = yield Promise.all(promises);
                return results.flat();
            }
            else {
                return yield this._get(request);
            }
        });
        // Downloads the dataset's file(s) into a single unified local file at
        // dataset.getPath(), setting the dataset's header line and total line
        // count. Pattern matches are appended sequentially after the first file.
        this.download = (dataset) => __awaiter(this, void 0, void 0, function* () {
            (0, Affirm_1.default)(this._client, 'S3 client not yet initialized, call "connect()" first');
            (0, Affirm_1.default)(dataset, 'Invalid dataset');
            const file = dataset.getFile();
            (0, Affirm_1.default)(file, 'Invalid dataset file');
            (0, Affirm_1.default)(file.fileKey, 'Invalid file key');
            (0, Affirm_1.default)(file.fileType, `Invalid file type`);
            const includeSourceFilename = file.includeSourceFilename === true;
            // Streams one S3 object into the unified local file; XLS/XLSX bodies
            // are first converted to a row stream by XLSParser. Returns the number
            // of lines appended (per DriverHelper.appendToUnifiedFile's contract).
            const downloadLocally = (fileUrl_1, headerLine_1, ...args_1) => __awaiter(this, [fileUrl_1, headerLine_1, ...args_1], void 0, function* (fileUrl, headerLine, appendMode = false, sourceFilename) {
                // Download and validate header in a single stream pass
                const command = new client_s3_1.GetObjectCommand({
                    Bucket: this._bucketName,
                    Key: fileUrl
                });
                const response = yield this._client.send(command);
                (0, Affirm_1.default)(response.Body, 'Failed to fetch object from S3');
                let stream;
                switch (file.fileType) {
                    case 'XLS':
                    case 'XLSX':
                        stream = yield XLSParser_1.default.parseXLSStream(response.Body, file.sheetName);
                        break;
                    default:
                        stream = response.Body;
                        break;
                }
                return DriverHelper_1.default.appendToUnifiedFile({
                    stream,
                    fileKey: fileUrl,
                    destinationPath: dataset.getPath(),
                    append: appendMode,
                    headerLine,
                    fileType: file.fileType,
                    hasHeaderRow: file.hasHeaderRow,
                    delimiter: dataset.getDelimiter(),
                    sourceFilename
                });
            });
            const { fileKey } = file;
            // Extracts the first line (header) from a stream according to the file
            // type, stores it on the dataset via setFirstLine and returns it.
            const setFirstLineFromStream = (stream) => __awaiter(this, void 0, void 0, function* () {
                var _a, e_1, _b, _c;
                // NOTE(review): this readline interface is created for every file
                // type but only consumed and closed in the text branch below; in the
                // XLS/XLSX branch it stays attached to the same stream handed to
                // getHeaderXlsFromStream and is never closed — confirm no leak/interference.
                const rl = readline_1.default.createInterface({ input: stream, crlfDelay: Infinity });
                let firstLine = '';
                switch (file.fileType) {
                    case 'XLSX':
                    case 'XLS':
                        firstLine = yield XLSParser_1.default.getHeaderXlsFromStream(stream, file.sheetName);
                        break;
                    case 'CSV':
                    case 'JSON':
                    case 'JSONL':
                    case 'TXT':
                        // Downleveled for-await: read only the first line, then break.
                        try {
                            for (var _d = true, rl_1 = __asyncValues(rl), rl_1_1; rl_1_1 = yield rl_1.next(), _a = rl_1_1.done, !_a; _d = true) {
                                _c = rl_1_1.value;
                                _d = false;
                                const line = _c;
                                firstLine = line;
                                break;
                            }
                        }
                        catch (e_1_1) { e_1 = { error: e_1_1 }; }
                        finally {
                            try {
                                if (!_d && !_a && (_b = rl_1.return)) yield _b.call(rl_1);
                            }
                            finally { if (e_1) throw e_1.error; }
                        }
                        rl.close();
                        break;
                }
                // If including source filename, append a placeholder column name to the header
                if (file.includeSourceFilename) {
                    firstLine = firstLine + dataset.getDelimiter() + Constants_1.default.SOURCE_FILENAME_COLUMN;
                }
                dataset.setFirstLine(firstLine);
                return firstLine;
            });
            if (fileKey.includes('%')) {
                const allFileKeys = yield this.listFiles(fileKey);
                Logger_1.default.log(`Matched ${allFileKeys.length} files, copying locally and creating unified dataset.`);
                Affirm_1.default.hasItems(allFileKeys, `The file key "${fileKey}" doesn't have any matches in bucket "${this._bucketName}".`);
                // Get header line from the first file
                const firstFileCommand = new client_s3_1.GetObjectCommand({
                    Bucket: this._bucketName,
                    Key: allFileKeys[0]
                });
                const firstFileResponse = yield this._client.send(firstFileCommand);
                (0, Affirm_1.default)(firstFileResponse.Body, 'Failed to fetch first file from S3');
                const firstFileStream = firstFileResponse.Body;
                const headerLine = yield setFirstLineFromStream(firstFileStream);
                let totalLineCount = 0;
                // Download files sequentially to avoid file conflicts
                for (let i = 0; i < allFileKeys.length; i++) {
                    const currentFileKey = allFileKeys[i];
                    // Pass the filename (just the basename) if includeSourceFilename is enabled
                    const sourceFilename = includeSourceFilename ? path_1.default.basename(currentFileKey) : undefined;
                    totalLineCount += yield downloadLocally(currentFileKey, headerLine, i > 0, sourceFilename); // Append mode for subsequent files
                }
                dataset.setCount(totalLineCount);
                return dataset;
            }
            else {
                // Get header line from the single file
                const firstFileCommand = new client_s3_1.GetObjectCommand({
                    Bucket: this._bucketName,
                    Key: fileKey
                });
                const firstFileResponse = yield this._client.send(firstFileCommand);
                (0, Affirm_1.default)(firstFileResponse.Body, 'Failed to fetch first file from S3');
                const firstFileStream = firstFileResponse.Body;
                const headerLine = yield setFirstLineFromStream(firstFileStream);
                // Pass the filename if includeSourceFilename is enabled
                const sourceFilename = includeSourceFilename ? path_1.default.basename(fileKey) : undefined;
                const totalLineCount = yield downloadLocally(fileKey, headerLine, false, sourceFilename);
                dataset.setCount(totalLineCount);
                return dataset;
            }
        });
        // Returns true when the producer's file key (or at least one pattern
        // match) exists in the bucket. A 404 / NotFound from HeadObject maps to
        // false; any other error is rethrown.
        this.exist = (producer) => __awaiter(this, void 0, void 0, function* () {
            var _a;
            (0, Affirm_1.default)(this._client, 'S3 client not yet initialized, call "connect()" first');
            (0, Affirm_1.default)(producer, 'Invalid read producer');
            const bucket = this._bucketName;
            const fileKey = producer.settings.fileKey;
            (0, Affirm_1.default)(fileKey, `Invalid file key for download request`);
            if (fileKey.includes('%')) {
                const allFileKeys = yield this.listFiles(fileKey);
                return allFileKeys.length > 0;
            }
            else {
                try {
                    yield this._client.send(new client_s3_1.HeadObjectCommand({ Bucket: bucket, Key: fileKey }));
                    return true;
                }
                catch (error) {
                    if (((_a = error.$metadata) === null || _a === void 0 ? void 0 : _a.httpStatusCode) === 404 || error.name === 'NotFound')
                        return false;
                    throw error;
                }
            }
        });
        // Collects non-empty lines from a text stream. When both bounds are
        // provided, lineFrom is inclusive and lineTo exclusive, and iteration
        // stops early once lineTo is reached; otherwise every non-empty line is kept.
        this._readLines = (stream, lineFrom, lineTo) => __awaiter(this, void 0, void 0, function* () {
            var _a, e_2, _b, _c;
            const reader = readline_1.default.createInterface({ input: stream, crlfDelay: Infinity });
            const lines = [];
            let lineCounter = 0;
            try {
                for (var _d = true, reader_1 = __asyncValues(reader), reader_1_1; reader_1_1 = yield reader_1.next(), _a = reader_1_1.done, !_a; _d = true) {
                    _c = reader_1_1.value;
                    _d = false;
                    const line = _c;
                    if (Algo_1.default.hasVal(lineFrom) && Algo_1.default.hasVal(lineTo)) {
                        if (lineCounter >= lineFrom && lineCounter < lineTo) {
                            if (line && line.length > 0)
                                lines.push(line);
                        }
                        lineCounter++;
                        if (lineCounter >= lineTo)
                            break;
                    }
                    else {
                        if (line && line.length > 0)
                            lines.push(line);
                    }
                }
            }
            catch (e_2_1) { e_2 = { error: e_2_1 }; }
            finally {
                try {
                    if (!_d && !_a && (_b = reader_1.return)) yield _b.call(reader_1);
                }
                finally { if (e_2) throw e_2.error; }
            }
            reader.close();
            return lines;
        });
        // Buffers the entire stream in memory, parses it as a workbook and
        // returns the requested sheet converted to CSV lines.
        // NOTE(review): slice(lineFrom, lineTo + 1) makes lineTo INCLUSIVE here,
        // unlike _readLines where the upper bound is exclusive — confirm which
        // semantics callers expect.
        this._readExcelLines = (stream, sheetName, lineFrom, lineTo) => __awaiter(this, void 0, void 0, function* () {
            var _a, stream_1, stream_1_1;
            var _b, e_3, _c, _d;
            (0, Affirm_1.default)(sheetName, `Invalid sheetname`);
            const chunks = [];
            try {
                for (_a = true, stream_1 = __asyncValues(stream); stream_1_1 = yield stream_1.next(), _b = stream_1_1.done, !_b; _a = true) {
                    _d = stream_1_1.value;
                    _a = false;
                    const chunk = _d;
                    chunks.push(chunk);
                }
            }
            catch (e_3_1) { e_3 = { error: e_3_1 }; }
            finally {
                try {
                    if (!_a && !_b && (_c = stream_1.return)) yield _c.call(stream_1);
                }
                finally { if (e_3) throw e_3.error; }
            }
            const buffer = Buffer.concat(chunks);
            const excel = xlsx_1.default.read(buffer, { type: 'buffer' });
            (0, Affirm_1.default)(excel.SheetNames.includes(sheetName), `The sheet "${sheetName}" doesn't exist in the excel (available: ${excel.SheetNames.join(', ')})`);
            const sheet = excel.Sheets[sheetName];
            const csv = xlsx_1.default.utils.sheet_to_csv(sheet);
            const lines = csv.split('\n');
            if (Algo_1.default.hasVal(lineFrom) && Algo_1.default.hasVal(lineTo))
                return lines.slice(lineFrom, lineTo + 1);
            else
                return lines;
        });
        // Buffers the entire stream, converts the XML payload to JSON via
        // XMLParser, and returns one JSON string per top-level item (lineTo
        // inclusive when a range is given, matching _readExcelLines).
        this._readXmlLines = (stream, lineFrom, lineTo) => __awaiter(this, void 0, void 0, function* () {
            var _a, stream_2, stream_2_1;
            var _b, e_4, _c, _d;
            const chunks = [];
            try {
                for (_a = true, stream_2 = __asyncValues(stream); stream_2_1 = yield stream_2.next(), _b = stream_2_1.done, !_b; _a = true) {
                    _d = stream_2_1.value;
                    _a = false;
                    const chunk = _d;
                    chunks.push(chunk);
                }
            }
            catch (e_4_1) { e_4 = { error: e_4_1 }; }
            finally {
                try {
                    if (!_a && !_b && (_c = stream_2.return)) yield _c.call(stream_2);
                }
                finally { if (e_4) throw e_4.error; }
            }
            const buffer = Buffer.concat(chunks);
            const jsonData = XMLParser_1.default.xmlToJson(buffer);
            // Convert JSON data to string lines. This might need adjustment based on XML structure.
            let lines = Array.isArray(jsonData) ? jsonData.map(item => JSON.stringify(item)) : [JSON.stringify(jsonData)];
            if (Algo_1.default.hasVal(lineFrom) && Algo_1.default.hasVal(lineTo)) {
                lines = lines.slice(lineFrom, lineTo + 1);
            }
            return lines;
        });
        // Fetches a single object and dispatches to the reader matching its
        // file type. `index` > 0 marks a non-first file of a pattern match so
        // its header row can be stripped (see comment before the slice below).
        this._get = (request, index) => __awaiter(this, void 0, void 0, function* () {
            const { fileKey, fileType, options } = request;
            const bucket = this._bucketName;
            let lineFrom, lineTo, sheetName, hasHeaderRow;
            if (options) {
                lineFrom = options.lineFrom;
                lineTo = options.lineTo;
                sheetName = options.sheetName;
                hasHeaderRow = options.hasHeaderRow;
            }
            const response = yield this._client.send(new client_s3_1.GetObjectCommand({
                Bucket: bucket,
                Key: fileKey
            }));
            (0, Affirm_1.default)(response.Body, 'Failed to fetch object from S3');
            const stream = response.Body;
            let lines = [];
            switch (fileType) {
                case 'CSV':
                case 'JSON':
                case 'JSONL':
                case 'TXT':
                    if (Algo_1.default.hasVal(lineFrom) && Algo_1.default.hasVal(lineTo))
                        lines = yield this._readLines(stream, lineFrom, lineTo);
                    else
                        lines = yield this._readLines(stream);
                    break;
                case 'XLS':
                case 'XLSX':
                    if (Algo_1.default.hasVal(lineFrom) && Algo_1.default.hasVal(lineTo))
                        lines = yield this._readExcelLines(stream, sheetName, lineFrom, lineTo);
                    else
                        lines = yield this._readExcelLines(stream, sheetName);
                    break;
                case 'XML':
                    if (Algo_1.default.hasVal(lineFrom) && Algo_1.default.hasVal(lineTo))
                        lines = yield this._readXmlLines(stream, lineFrom, lineTo);
                    else
                        lines = yield this._readXmlLines(stream);
                    break;
            }
            // If this is not the first file read in a pattern match AND the file type has an header,
            // then I need to remove the header from the resulting lines or the header will be duplicated
            if (index > 0 && ParseHelper_1.default.shouldHaveHeader(fileType, hasHeaderRow)) {
                lines = lines.slice(1);
            }
            return lines;
        });
        // Single ListObjectsV2 page: derives an S3 Prefix from the literal part
        // of the pattern before the first '%', then filters the returned keys
        // through Helper.matchPattern. Returns { files, nextContinuationToken }.
        // NOTE(review): MaxKeys defaults to 10000, but S3 caps a single page at
        // 1000; pagination in listFiles() compensates — confirm the default is intended.
        this._listFiles = (fileKeyPattern, maxKeys, continuationToken) => __awaiter(this, void 0, void 0, function* () {
            var _a;
            (0, Affirm_1.default)(this._client, 'S3 client not yet initialized, call "connect()" first');
            // Convert SQL-like pattern to prefix and pattern parts for filtering
            let prefix = '';
            if (fileKeyPattern) {
                if (fileKeyPattern.includes('%')) {
                    const parts = fileKeyPattern.split('%').filter(part => part.length > 0);
                    // If pattern starts with text before first %, use it as prefix for S3 optimization
                    if (!fileKeyPattern.startsWith('%') && parts[0]) {
                        prefix = parts[0];
                    }
                }
                else {
                    // No wildcard, use the entire pattern as prefix
                    prefix = fileKeyPattern;
                }
            }
            const listParams = {
                Bucket: this._bucketName,
                Prefix: prefix || undefined,
                MaxKeys: maxKeys || 10000,
                ContinuationToken: continuationToken
            };
            try {
                const response = yield this._client.send(new client_s3_1.ListObjectsV2Command(listParams));
                const files = ((_a = response.Contents) === null || _a === void 0 ? void 0 : _a.map(obj => obj.Key).filter(key => key !== undefined)) || [];
                const matchingFiles = Helper_1.default.matchPattern(fileKeyPattern, files);
                return {
                    files: matchingFiles,
                    nextContinuationToken: response.NextContinuationToken
                };
            }
            catch (error) {
                throw new Error(`Failed to list files in bucket "${this._bucketName}": ${error.message}`);
            }
        });
        // Lists all keys matching the pattern, following continuation tokens
        // across pages; truncates the result to maxKeys when provided.
        this.listFiles = (fileKeyPattern, maxKeys) => __awaiter(this, void 0, void 0, function* () {
            const allFiles = [];
            let continuationToken = undefined;
            do {
                const result = yield this._listFiles(fileKeyPattern, maxKeys, continuationToken);
                allFiles.push(...result.files);
                continuationToken = result.nextContinuationToken;
                // If maxKeys is specified and we've reached the limit, break
                if (maxKeys && allFiles.length >= maxKeys) {
                    return allFiles.slice(0, maxKeys);
                }
            } while (continuationToken);
            return allFiles;
        });
        // Downloads one object fully into memory and returns it as a Buffer.
        this.downloadFile = (fileKey) => __awaiter(this, void 0, void 0, function* () {
            (0, Affirm_1.default)(this._client, 'S3 client not yet initialized, call "init()" first');
            (0, Affirm_1.default)(fileKey, 'Invalid file key');
            const response = yield this._client.send(new client_s3_1.GetObjectCommand({
                Bucket: this._bucketName,
                Key: fileKey
            }));
            (0, Affirm_1.default)(response.Body, 'Failed to fetch object from S3');
            const content = yield response.Body.transformToByteArray();
            return Buffer.from(content);
        });
        // Deletes one object from the configured bucket.
        this.deleteFile = (fileKey) => __awaiter(this, void 0, void 0, function* () {
            (0, Affirm_1.default)(this._client, 'S3 client not yet initialized, call "init()" first');
            (0, Affirm_1.default)(fileKey, 'Invalid file key');
            yield this._client.send(new client_s3_1.DeleteObjectCommand({
                Bucket: this._bucketName,
                Key: fileKey
            }));
        });
        // Server-side copy from this driver's bucket to the destination bucket/key.
        // NOTE(review): CopySource is built by plain interpolation without URL
        // encoding; keys containing special characters may fail — verify against
        // the SDK's CopyObject requirements.
        this.copyFile = (sourceFileKey, destinationBucket, destinationFileKey) => __awaiter(this, void 0, void 0, function* () {
            (0, Affirm_1.default)(this._client, 'S3 client not yet initialized, call "init()" first');
            (0, Affirm_1.default)(sourceFileKey, 'Invalid source file key');
            (0, Affirm_1.default)(destinationBucket, 'Invalid destination bucket');
            (0, Affirm_1.default)(destinationFileKey, 'Invalid destination file key');
            yield this._client.send(new client_s3_1.CopyObjectCommand({
                CopySource: `${this._bucketName}/${sourceFileKey}`,
                Bucket: destinationBucket,
                Key: destinationFileKey
            }));
        });
        // Stages the producer's file(s) into local scope storage — one local
        // file per matched key, streamed sequentially — and returns their local
        // URIs as { files: [{ fullUri }] }.
        this.ready = (request) => __awaiter(this, void 0, void 0, function* () {
            (0, Affirm_1.default)(request, 'Invalid producer');
            (0, Affirm_1.default)(this._client, 'S3 client not yet initialized, call "init()" first');
            const { producer, scope } = request;
            const { fileKey } = producer.settings;
            (0, Affirm_1.default)(fileKey, 'Invalid file key');
            // Streams one object straight to disk, creating the target directory first.
            const streamToFile = (s3Key, localPath) => __awaiter(this, void 0, void 0, function* () {
                const command = new client_s3_1.GetObjectCommand({
                    Bucket: this._bucketName,
                    Key: s3Key
                });
                const response = yield this._client.send(command);
                (0, Affirm_1.default)(response.Body, `Failed to fetch object "${s3Key}" from S3`);
                // Ensure the directory for the file exists
                const fileDir = path_1.default.dirname(localPath);
                if (!fs_1.default.existsSync(fileDir)) {
                    fs_1.default.mkdirSync(fileDir, { recursive: true });
                }
                const writeStream = fs_1.default.createWriteStream(localPath);
                yield (0, promises_1.pipeline)(response.Body, writeStream);
            });
            if (fileKey.includes('%')) {
                const allFileKeys = yield this.listFiles(fileKey);
                Affirm_1.default.hasItems(allFileKeys, `The file key pattern "${fileKey}" doesn't have any matches in bucket "${this._bucketName}".`);
                // Stream each file to local temp storage sequentially to avoid overwhelming the connection
                const allFilePaths = [];
                for (const s3Key of allFileKeys) {
                    const localPath = ExecutorScope_1.default.getProducerPath(scope, producer, s3Key);
                    ExecutorScope_1.default.ensurePath(localPath);
                    yield streamToFile(s3Key, localPath);
                    allFilePaths.push(localPath);
                }
                return { files: allFilePaths.map(x => ({ fullUri: x })) };
            }
            else {
                const localPath = ExecutorScope_1.default.getProducerPath(scope, producer, fileKey);
                ExecutorScope_1.default.ensurePath(localPath);
                yield streamToFile(fileKey, localPath);
                return { files: [{ fullUri: localPath }] };
            }
        });
    }
}
// CommonJS default export of the driver class.
exports.default = S3SourceDriver;