// @forzalabs/remora
// Version: (unspecified)
// A powerful CLI tool for seamless data translation.
// 249 lines (248 loc) • 12 kB
// JavaScript (compiled from TypeScript)
;
// TypeScript emit helper: drives a generator-based coroutine to completion and
// exposes the result as a Promise — the down-leveled form of `async function`.
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    // Coerce a yielded plain value into the Promise implementation P so it can be chained.
    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
    return new (P || (P = Promise))(function (resolve, reject) {
        // Resume the generator with the awaited value; a throw inside the body rejects the outer Promise.
        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
        // Re-throw an awaited rejection into the generator so user try/catch can handle it.
        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
        // Either settle the outer Promise (done) or await the next yielded value.
        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
        // Instantiate the generator and kick off the first step.
        step((generator = generator.apply(thisArg, _arguments || [])).next());
    });
};
// TypeScript emit helper: adapts any (a)sync iterable to the async-iterator
// protocol — the down-leveled form of `for await (... of o)`.
var __asyncValues = (this && this.__asyncValues) || function (o) {
    if (!Symbol.asyncIterator) throw new TypeError("Symbol.asyncIterator is not defined.");
    var m = o[Symbol.asyncIterator], i;
    // Native async iterator if present; otherwise wrap the sync iterator so each
    // next/throw/return resolves through a Promise.
    return m ? m.call(o) : (o = typeof __values === "function" ? __values(o) : o[Symbol.iterator](), i = {}, verb("next"), verb("throw"), verb("return"), i[Symbol.asyncIterator] = function () { return this; }, i);
    // Forward the named iterator method, promisifying its result.
    function verb(n) { i[n] = o[n] && function (v) { return new Promise(function (resolve, reject) { v = o[n](v), settle(resolve, reject, v.done, v.value); }); }; }
    // Resolve the step's value (which may itself be a Promise) into { value, done }.
    function settle(resolve, reject, d, v) { Promise.resolve(v).then(function(v) { resolve({ value: v, done: d }); }, reject); }
};
// TypeScript emit helper: normalizes a CommonJS export for consumption as an
// ES-module default import. Modules flagged __esModule pass through untouched;
// anything else is wrapped as the "default" member.
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
const stream_1 = require("stream");
const readline_1 = require("readline");
const promises_1 = require("stream/promises");
const fs_1 = require("fs");
const path_1 = __importDefault(require("path"));
const Logger_1 = __importDefault(require("../helper/Logger"));
const Affirm_1 = __importDefault(require("../core/Affirm"));
const XLSParser_1 = __importDefault(require("../engines/parsing/XLSParser"));
const XMLParser_1 = __importDefault(require("../engines/parsing/XMLParser"));
const Constants_1 = __importDefault(require("../Constants"));
const DriverHelper = {
// Streams one source file into the unified dataset file, validating its header
// against the dataset's canonical header (for CSV / headered TXT) and
// normalizing each data line. JSON/JSONL records are flattened to delimited
// rows using the key order of the dataset's sample record. Returns the number
// of data lines written (header excluded).
appendToUnifiedFile: (options) => __awaiter(void 0, void 0, void 0, function* () {
    (0, Affirm_1.default)(options, 'Invalid options');
    const { append, destinationPath, fileKey, headerLine, stream, fileType, hasHeaderRow, delimiter, sourceFilename } = options;
    (0, Affirm_1.default)(headerLine, `Invalid header line`);
    // For JSON/JSONL the "header line" is a sample record: its keys fix the column order.
    const keys = (fileType === 'JSON' || fileType === 'JSONL')
        ? Object.keys(JSON.parse(headerLine))
        : [];
    // Only flat files with a real header row are compared against the main header.
    const shouldValidateHeader = fileType === 'CSV' || (fileType === 'TXT' && hasHeaderRow === true);
    // When sourceFilename is set, the headerLine includes $source_filename at the end.
    // For validation, we need to compare against the original header without this suffix.
    const originalHeaderLine = sourceFilename
        ? headerLine.slice(0, headerLine.lastIndexOf(delimiter))
        : headerLine;
    let isFirstLine = true;
    // Non-validating file types start out "validated" so no line is ever checked.
    let hasValidatedHeader = shouldValidateHeader ? false : true;
    let leftoverData = ''; // partial line carried across chunk boundaries
    let globalIndex = 0;   // absolute line index over the whole stream
    let lineCount = 0;     // data lines actually emitted
    const headerValidationTransform = new stream_1.Transform({
        transform(chunk, encoding, callback) {
            const chunkStr = leftoverData + chunk.toString();
            const lines = chunkStr.split('\n');
            // Keep the last line as leftover if it doesn't end with newline
            leftoverData = lines.pop() || '';
            const filteredLines = [];
            for (let i = 0; i < lines.length; i++) {
                const line = lines[i];
                // Header validation for first line
                if (!hasValidatedHeader && isFirstLine && i === 0) {
                    if (shouldValidateHeader && originalHeaderLine && originalHeaderLine.trim() !== '' && line.trim() !== originalHeaderLine.trim()) {
                        // Mismatched header aborts the whole pipeline with a descriptive error.
                        const msg = `Error creating unified dataset: file "${fileKey}" has a different header line than the other files in this dataset\n\t-${fileKey}: ${line}\n\t-main: ${originalHeaderLine}`;
                        Logger_1.default.log(msg);
                        return callback(new Error(msg));
                    }
                    hasValidatedHeader = true;
                    isFirstLine = false;
                }
                // Apply your filtering logic here
                if (shouldIncludeLine(line, globalIndex)) {
                    filteredLines.push(processLine(line));
                }
                globalIndex++;
            }
            // Output filtered lines
            if (filteredLines.length > 0) {
                const output = filteredLines.join('\n') + '\n';
                callback(null, Buffer.from(output));
            }
            else {
                callback(null, null); // No data to output
            }
        },
        flush(callback) {
            // Process any remaining data (a final line without a trailing newline).
            // NOTE(review): this path uses lineIndex -1, so a stream consisting of
            // only a header with no trailing newline would be emitted as data —
            // confirm upstream always newline-terminates headers.
            if (leftoverData.trim()) {
                if (shouldIncludeLine(leftoverData, -1)) {
                    callback(null, Buffer.from(processLine(leftoverData) + '\n'));
                }
                else {
                    callback(null, null);
                }
            }
            else {
                callback(null, null);
            }
            globalIndex++;
        }
    });
    // Helper function to determine if a line should be included
    const shouldIncludeLine = (line, lineIndex) => {
        // For flat files (csv, txt) ignore the first line of the header (I already saved that line)
        if (lineIndex === 0 && shouldValidateHeader)
            return false;
        // Skip empty lines
        if (line.trim() === '')
            return false;
        return true;
    };
    // Normalizes one raw line to its output form and counts it.
    const processLine = (line) => {
        lineCount++;
        let processedLine;
        switch (fileType) {
            case 'JSON':
            case 'JSONL': {
                try {
                    // Re-serialize the record as a delimited row in canonical key order.
                    const parsed = JSON.parse(line);
                    processedLine = keys.map(k => parsed[k]).join(delimiter);
                }
                catch (error) {
                    Logger_1.default.log(`Failed parsing in JSON line -> file: ${fileKey}; index: ${globalIndex}; line: ${line}; err: ${error === null || error === void 0 ? void 0 : error.name}`);
                    throw error;
                }
                break;
            }
            default:
                // CSV/TXT lines pass through unchanged.
                processedLine = line;
        }
        // If sourceFilename is provided, append it to each line
        if (sourceFilename) {
            processedLine = processedLine + delimiter + sourceFilename;
        }
        return processedLine;
    };
    // Append vs. overwrite is decided by the caller; pipeline awaits full flush.
    const writeOptions = append ? { flags: 'a' } : {};
    const writeStream = (0, fs_1.createWriteStream)(destinationPath, writeOptions);
    yield (0, promises_1.pipeline)(stream, headerValidationTransform, writeStream);
    return lineCount;
}),
appendObjectsToUnifiedFile: (options) => __awaiter(void 0, void 0, void 0, function* () {
(0, Affirm_1.default)(options, 'Invalid options');
const { append, destinationPath, objects, delimiter } = options;
const writeOptions = append ? { flags: 'a' } : {};
const writeStream = (0, fs_1.createWriteStream)(destinationPath, writeOptions);
let lineCount = 0;
const keys = Object.keys(objects[0]);
for (const obj of objects) {
const serialized = keys.map(k => obj[k]).join(delimiter) + '\n';
writeStream.write(serialized);
lineCount++;
}
writeStream.close();
return lineCount;
}),
// Reads at most `lineCount` lines from the file at `filePath` and returns them
// as an array of strings, closing the stream early once the quota is met.
// Note: the counter is checked after the first push, so lineCount <= 0 still
// yields one line.
quickReadFile: (filePath, lineCount) => __awaiter(void 0, void 0, void 0, function* () {
    var _a, e_1, _b, _c;
    const fileStream = (0, fs_1.createReadStream)(filePath);
    // crlfDelay: Infinity treats \r\n as a single line break.
    const rl = (0, readline_1.createInterface)({ input: fileStream, crlfDelay: Infinity });
    const lines = [];
    let counter = 0;
    // Down-leveled `for await (const line of rl)`: _a holds the iterator's done
    // flag, _d whether the current value was consumed (drives cleanup below).
    try {
        for (var _d = true, rl_1 = __asyncValues(rl), rl_1_1; rl_1_1 = yield rl_1.next(), _a = rl_1_1.done, !_a; _d = true) {
            _c = rl_1_1.value;
            _d = false;
            const line = _c;
            lines.push(line);
            counter++;
            if (counter >= lineCount) {
                break;
            }
        }
    }
    catch (e_1_1) { e_1 = { error: e_1_1 }; }
    finally {
        try {
            // On early break, invoke the iterator's return() so readline cleans up.
            if (!_d && !_a && (_b = rl_1.return)) yield _b.call(rl_1);
        }
        finally { if (e_1) throw e_1.error; }
    }
    // Explicitly release the readline interface and the underlying fd.
    rl.close();
    fileStream.close();
    return lines;
}),
setHeaderFromFile: (fileKey, file, filePath, dataset) => __awaiter(void 0, void 0, void 0, function* () {
(0, Affirm_1.default)(filePath, 'Invalid path');
(0, Affirm_1.default)(fileKey, 'Invalid fileKey');
(0, Affirm_1.default)(file, 'Invalid File');
let headerLine;
switch (file.fileType) {
case 'XLS':
case 'XLSX':
headerLine = yield XLSParser_1.default.getHeaderXls(path_1.default.join(filePath, fileKey), file.sheetName);
if (file.includeSourceFilename) {
headerLine = headerLine + dataset.getDelimiter() + Constants_1.default.SOURCE_FILENAME_COLUMN;
}
dataset.setFirstLine(headerLine);
break;
case 'XML':
// using a differnt logic for encoded type xls and xlsx
headerLine = (yield XMLParser_1.default.readXmlLines(path_1.default.join(filePath, fileKey)))[0];
dataset.setFirstLine(headerLine);
break;
case 'CSV':
case 'JSON':
case 'JSONL':
case 'TXT':
// Get header line from the first file
headerLine = (yield DriverHelper.quickReadFile(path_1.default.join(filePath, fileKey), 1))[0];
// If including source filename, append a placeholder column name to the header
if (file.includeSourceFilename) {
headerLine = headerLine + dataset.getDelimiter() + Constants_1.default.SOURCE_FILENAME_COLUMN;
}
dataset.setFirstLine(headerLine);
break;
default:
throw new Error(`the fileType "${file.fileType}" is not implemented yet`);
}
}),
quickReadStream: (stream, lineCount) => __awaiter(void 0, void 0, void 0, function* () {
var _a, e_2, _b, _c;
const rl = (0, readline_1.createInterface)({ input: stream, crlfDelay: Infinity });
const lines = [];
let counter = 0;
try {
for (var _d = true, rl_2 = __asyncValues(rl), rl_2_1; rl_2_1 = yield rl_2.next(), _a = rl_2_1.done, !_a; _d = true) {
_c = rl_2_1.value;
_d = false;
const line = _c;
lines.push(line);
counter++;
if (counter >= lineCount) {
break;
}
}
}
catch (e_2_1) { e_2 = { error: e_2_1 }; }
finally {
try {
if (!_d && !_a && (_b = rl_2.return)) yield _b.call(rl_2);
}
finally { if (e_2) throw e_2.error; }
}
rl.close();
return lines;
})
};
exports.default = DriverHelper;