UNPKG

@forzalabs/remora

Version:

A powerful CLI tool for seamless data translation.

249 lines (248 loc) 12 kB
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });

const { Transform } = require("stream");
const { createInterface } = require("readline");
const { pipeline } = require("stream/promises");
const { createWriteStream, createReadStream } = require("fs");

// ESM-interop shim: unwrap `.default` only for modules that were themselves
// compiled from ES modules (same semantics as tsc's __importDefault helper).
const importDefault = (mod) => ((mod && mod.__esModule) ? mod : { "default": mod });

const path = importDefault(require("path")).default;
const Logger = importDefault(require("../helper/Logger")).default;
const Affirm = importDefault(require("../core/Affirm")).default;
const XLSParser = importDefault(require("../engines/parsing/XLSParser")).default;
const XMLParser = importDefault(require("../engines/parsing/XMLParser")).default;
const Constants = importDefault(require("../Constants")).default;

const DriverHelper = {
    /**
     * Streams a source file into the unified dataset file: validates the header
     * (CSV, or TXT with a header row), drops blank lines, converts JSON/JSONL
     * records to delimited rows (column order fixed by the header record's keys),
     * and optionally appends the source filename as a trailing column.
     *
     * @param {object} options
     * @param {boolean} options.append - append to the destination instead of truncating
     * @param {string} options.destinationPath - unified file to write
     * @param {string} options.fileKey - source file name (used in error messages)
     * @param {string} options.headerLine - dataset header (for JSON/JSONL: a sample record)
     * @param {import('stream').Readable} options.stream - raw source file content
     * @param {string} options.fileType - 'CSV' | 'TXT' | 'JSON' | 'JSONL' | ...
     * @param {boolean} [options.hasHeaderRow] - TXT only: first line is a header
     * @param {string} options.delimiter - output column delimiter
     * @param {string} [options.sourceFilename] - when set, appended to every row
     * @returns {Promise<number>} number of data lines written
     * @throws when the header differs from the dataset header, or a JSON line fails to parse
     */
    appendToUnifiedFile: async (options) => {
        Affirm(options, 'Invalid options');
        const { append, destinationPath, fileKey, headerLine, stream, fileType, hasHeaderRow, delimiter, sourceFilename } = options;
        Affirm(headerLine, `Invalid header line`);
        // For JSON/JSONL the "header" is a sample record: its keys fix the column order.
        const keys = (fileType === 'JSON' || fileType === 'JSONL') ? Object.keys(JSON.parse(headerLine)) : [];
        const shouldValidateHeader = fileType === 'CSV' || (fileType === 'TXT' && hasHeaderRow === true);
        // When sourceFilename is set, headerLine carries the $source_filename column
        // at the end; strip it so validation compares against the original header.
        const originalHeaderLine = sourceFilename ? headerLine.slice(0, headerLine.lastIndexOf(delimiter)) : headerLine;
        let isFirstLine = true;
        let hasValidatedHeader = !shouldValidateHeader;
        let leftoverData = '';
        let globalIndex = 0;
        let lineCount = 0;
        // A line is emitted unless it is the (already saved) header line or blank.
        const shouldIncludeLine = (line, lineIndex) => {
            if (lineIndex === 0 && shouldValidateHeader) return false;
            return line.trim() !== '';
        };
        // Normalizes one input line to one delimited output row.
        const processLine = (line) => {
            lineCount++;
            let processedLine;
            switch (fileType) {
                case 'JSON':
                case 'JSONL': {
                    try {
                        const parsed = JSON.parse(line);
                        processedLine = keys.map((k) => parsed[k]).join(delimiter);
                    }
                    catch (error) {
                        Logger.log(`Failed parsing in JSON line -> file: ${fileKey}; index: ${globalIndex}; line: ${line}; err: ${error?.name}`);
                        throw error;
                    }
                    break;
                }
                default:
                    processedLine = line;
            }
            // If sourceFilename is provided, append it to each line.
            return sourceFilename ? processedLine + delimiter + sourceFilename : processedLine;
        };
        const headerValidationTransform = new Transform({
            transform(chunk, encoding, callback) {
                const chunkStr = leftoverData + chunk.toString();
                const lines = chunkStr.split('\n');
                // The last fragment may be an incomplete line; hold it for the next chunk.
                leftoverData = lines.pop() || '';
                const filteredLines = [];
                for (let i = 0; i < lines.length; i++) {
                    const line = lines[i];
                    // Validate the very first complete line against the dataset header.
                    if (!hasValidatedHeader && isFirstLine && i === 0) {
                        if (shouldValidateHeader && originalHeaderLine && originalHeaderLine.trim() !== '' && line.trim() !== originalHeaderLine.trim()) {
                            const msg = `Error creating unified dataset: file "${fileKey}" has a different header line than the other files in this dataset\n\t-${fileKey}: ${line}\n\t-main: ${originalHeaderLine}`;
                            Logger.log(msg);
                            return callback(new Error(msg));
                        }
                        hasValidatedHeader = true;
                        isFirstLine = false;
                    }
                    if (shouldIncludeLine(line, globalIndex)) {
                        filteredLines.push(processLine(line));
                    }
                    globalIndex++;
                }
                if (filteredLines.length > 0) {
                    callback(null, Buffer.from(filteredLines.join('\n') + '\n'));
                }
                else {
                    callback(null, null); // nothing to emit for this chunk
                }
            },
            flush(callback) {
                // Emit the trailing line when the input did not end with a newline.
                if (leftoverData.trim() !== '' && shouldIncludeLine(leftoverData, -1)) {
                    callback(null, Buffer.from(processLine(leftoverData) + '\n'));
                }
                else {
                    callback(null, null);
                }
                globalIndex++;
            }
        });
        const writeStream = createWriteStream(destinationPath, append ? { flags: 'a' } : {});
        await pipeline(stream, headerValidationTransform, writeStream);
        return lineCount;
    },
    /**
     * Serializes `options.objects` as delimited rows (column order taken from the
     * first object's keys) and writes them to `options.destinationPath`.
     * Resolves only after the data has been flushed to disk.
     *
     * @param {object} options
     * @param {boolean} options.append - append instead of truncating
     * @param {string} options.destinationPath - file to write
     * @param {object[]} options.objects - records to serialize
     * @param {string} options.delimiter - output column delimiter
     * @returns {Promise<number>} number of rows written
     */
    appendObjectsToUnifiedFile: async (options) => {
        Affirm(options, 'Invalid options');
        const { append, destinationPath, objects, delimiter } = options;
        const writeStream = createWriteStream(destinationPath, append ? { flags: 'a' } : {});
        let lineCount = 0;
        // Fix: the original crashed on an empty array (Object.keys(objects[0]))
        // and returned before buffered data reached the disk (close() without
        // awaiting 'finish'), so callers could read a partially-written file.
        if (Array.isArray(objects) && objects.length > 0) {
            const keys = Object.keys(objects[0]);
            for (const obj of objects) {
                writeStream.write(keys.map((k) => obj[k]).join(delimiter) + '\n');
                lineCount++;
            }
        }
        await new Promise((resolve, reject) => {
            writeStream.once('error', reject);
            writeStream.end(resolve);
        });
        return lineCount;
    },
    /**
     * Reads at most `lineCount` lines from the start of `filePath`.
     *
     * @param {string} filePath - file to read
     * @param {number} lineCount - maximum number of lines to return
     * @returns {Promise<string[]>} the first lines of the file
     */
    quickReadFile: async (filePath, lineCount) => {
        const fileStream = createReadStream(filePath);
        const rl = createInterface({ input: fileStream, crlfDelay: Infinity });
        const lines = [];
        try {
            for await (const line of rl) {
                lines.push(line);
                if (lines.length >= lineCount) {
                    break;
                }
            }
        }
        finally {
            // Always release the readline interface and the underlying fd,
            // even when the read loop throws.
            rl.close();
            fileStream.close();
        }
        return lines;
    },
    /**
     * Reads the header line of `fileKey` according to its file type and stores
     * it on `dataset` via `dataset.setFirstLine`. When `file.includeSourceFilename`
     * is set, the $source_filename column name is appended (all types except XML).
     *
     * @param {string} fileKey - file name within `filePath`
     * @param {object} file - file descriptor (fileType, sheetName, includeSourceFilename)
     * @param {string} filePath - directory containing the file
     * @param {object} dataset - target dataset (getDelimiter / setFirstLine)
     * @throws for file types that are not implemented yet
     */
    setHeaderFromFile: async (fileKey, file, filePath, dataset) => {
        Affirm(filePath, 'Invalid path');
        Affirm(fileKey, 'Invalid fileKey');
        Affirm(file, 'Invalid File');
        let headerLine;
        switch (file.fileType) {
            case 'XLS':
            case 'XLSX':
                headerLine = await XLSParser.getHeaderXls(path.join(filePath, fileKey), file.sheetName);
                if (file.includeSourceFilename) {
                    headerLine = headerLine + dataset.getDelimiter() + Constants.SOURCE_FILENAME_COLUMN;
                }
                dataset.setFirstLine(headerLine);
                break;
            case 'XML':
                // XML uses a different reader than the encoded XLS/XLSX types.
                headerLine = (await XMLParser.readXmlLines(path.join(filePath, fileKey)))[0];
                dataset.setFirstLine(headerLine);
                break;
            case 'CSV':
            case 'JSON':
            case 'JSONL':
            case 'TXT':
                // Get the header line from the first line of the file.
                headerLine = (await DriverHelper.quickReadFile(path.join(filePath, fileKey), 1))[0];
                // If including source filename, append a placeholder column name to the header.
                if (file.includeSourceFilename) {
                    headerLine = headerLine + dataset.getDelimiter() + Constants.SOURCE_FILENAME_COLUMN;
                }
                dataset.setFirstLine(headerLine);
                break;
            default:
                throw new Error(`the fileType "${file.fileType}" is not implemented yet`);
        }
    },
    /**
     * Reads at most `lineCount` lines from an already-open readable stream.
     *
     * @param {import('stream').Readable} stream - stream to read from
     * @param {number} lineCount - maximum number of lines to return
     * @returns {Promise<string[]>} the first lines of the stream
     */
    quickReadStream: async (stream, lineCount) => {
        const rl = createInterface({ input: stream, crlfDelay: Infinity });
        const lines = [];
        try {
            for await (const line of rl) {
                lines.push(line);
                if (lines.length >= lineCount) {
                    break;
                }
            }
        }
        finally {
            rl.close();
        }
        return lines;
    }
};
exports.default = DriverHelper;