UNPKG

@forzalabs/remora

Version:

A powerful CLI tool for seamless data translation.

498 lines (497 loc) 25.2 kB
"use strict";
// TypeScript-emitted helper that drives a generator as an async function.
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
    return new (P || (P = Promise))(function (resolve, reject) {
        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
        step((generator = generator.apply(thisArg, _arguments || [])).next());
    });
};
// TypeScript-emitted helper that wraps a CommonJS module so `.default` is always available.
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
const Affirm_1 = __importDefault(require("../../core/Affirm"));
const ProducerEngine_1 = __importDefault(require("../producer/ProducerEngine"));
const Environment_1 = __importDefault(require("../Environment"));
const path_1 = __importDefault(require("path"));
const promises_1 = __importDefault(require("fs/promises"));
const dayjs_1 = __importDefault(require("dayjs"));
const customParseFormat_1 = __importDefault(require("dayjs/plugin/customParseFormat"));
dayjs_1.default.extend(customParseFormat_1.default);

// Matches plain integer or decimal strings, with an optional leading minus sign.
const NUMERIC_STRING = /^-?\d+(?:\.\d+)?$/;

// A capital "T" or an "h:m" pair is taken as evidence of a time component.
const TIME_COMPONENT = /T|\d+:\d+/;

// Date layouts tried (strictly) by inferType, in this order.
const DATE_FORMATS = [
    'YYYY-MM-DD',
    'YYYY/MM/DD',
    'DD/MM/YYYY',
    'MM/DD/YYYY',
    'YYYYMMDD',
    'DD-MMM-YYYY',
    'YYYY-MM-DD HH:mm',
    'YYYY-MM-DD HH:mm:ss',
    'YYYY-MM-DDTHH:mm',
    'YYYY-MM-DDTHH:mmZ',
    'YYYY-MM-DDTHH:mm:ss',
    'YYYY-MM-DDTHH:mm:ssZ',
    'YYYY-MM-DDTHH:mm:ss.SSSZ'
];

// Field-name patterns per identifier category, checked in this order.
// Built once at module load; none of the patterns carries the `g` flag, so
// sharing them between calls is stateless.
const PHI_PATTERN_CATEGORIES = [
    {
        // Rule 1: Names
        category: 'Names',
        patterns: [
            /\b(first|last|middle|full|given|family|sur|maiden|nick|display)[\s_-]?name\b/,
            /\b(fname|lname|mname|fullname|givenname|familyname|surname)\b/,
            /\b(patient|person|individual|customer|client|user)[\s_-]?name\b/,
            /\bname\b/
        ]
    },
    {
        // Rule 2: Geographic subdivisions (excluding state level)
        category: 'Geographic Information',
        patterns: [
            /\b(address|addr|street|st|avenue|ave|road|rd|lane|ln|drive|dr|blvd|boulevard)\b/,
            /\b(city|town|village|municipality)\b/,
            /\b(county|parish|borough)\b/,
            /\b(zip|postal|zipcode|postalcode)\b/,
            /\b(precinct|district|ward)\b/,
            /\b(geocode|coordinates|coord|latitude|longitude|lat|lng)\b/,
            /\b(location|place|residence|home)\b/
        ]
    },
    {
        // Rule 3: Dates related to individuals
        category: 'Dates',
        patterns: [
            /\b(birth|born|dob|birthdate|date[\s_-]?of[\s_-]?birth)\b/,
            /\b(admission|admit|admitdate|admission[\s_-]?date)\b/,
            /\b(discharge|discharged|dischargedate|discharge[\s_-]?date)\b/,
            /\b(death|died|dod|date[\s_-]?of[\s_-]?death|deceased)\b/,
            /\b(age|years[\s_-]?old|yrs[\s_-]?old)\b/,
            /\b(visit|appointment|appt|service)[\s_-]?date\b/,
            /\b(created|updated|modified|last[\s_-]?seen)[\s_-]?date\b/
        ]
    },
    {
        // Rule 4: Phone numbers
        category: 'Phone Numbers',
        patterns: [
            /\b(phone|tel|telephone|mobile|cell|cellular)\b/,
            /\b(home|work|office|emergency)[\s_-]?phone\b/,
            /\b(contact|phone)[\s_-]?number\b/
        ]
    },
    {
        // Rule 5: Fax numbers
        category: 'Fax Numbers',
        patterns: [
            /\b(fax|facsimile)\b/,
            /\bfax[\s_-]?number\b/
        ]
    },
    {
        // Rule 6: Email addresses
        category: 'Email Addresses',
        patterns: [
            /\b(email|e[\s_-]?mail|mail)\b/,
            /\b(email|mail)[\s_-]?address\b/,
            /\b(contact|personal|work)[\s_-]?email\b/
        ]
    },
    {
        // Rule 7: Social Security numbers
        category: 'Social Security Numbers',
        patterns: [
            /\b(ssn|social[\s_-]?security)\b/,
            /\bsocial[\s_-]?security[\s_-]?number\b/,
            /\btax[\s_-]?id\b/
        ]
    },
    {
        // Rule 8: Medical record numbers
        category: 'Medical Record Numbers',
        patterns: [
            /\b(mrn|medical[\s_-]?record)\b/,
            /\bmedical[\s_-]?record[\s_-]?number\b/,
            /\b(patient|chart)[\s_-]?number\b/,
            /\b(patient|medical)[\s_-]?id\b/
        ]
    },
    {
        // Rule 9: Health plan beneficiary numbers
        category: 'Health Plan Beneficiary Numbers',
        patterns: [
            /\b(member|subscriber|beneficiary)[\s_-]?id\b/,
            /\b(insurance|health[\s_-]?plan)[\s_-]?number\b/,
            /\b(policy|plan)[\s_-]?number\b/,
            /\b(medicaid|medicare)[\s_-]?number\b/
        ]
    },
    {
        // Rule 10: Account numbers
        category: 'Account Numbers',
        patterns: [
            /\b(account|acct)[\s_-]?number\b/,
            /\b(account|acct)[\s_-]?id\b/,
            /\b(billing|financial)[\s_-]?account\b/
        ]
    },
    {
        // Rule 11: Certificate/license numbers
        category: 'Certificate/License Numbers',
        patterns: [
            /\b(license|licence|cert|certificate)[\s_-]?number\b/,
            /\b(driver|drivers)[\s_-]?license\b/,
            /\b(professional|medical)[\s_-]?license\b/,
            /\b(permit|registration)[\s_-]?number\b/
        ]
    },
    {
        // Rule 12: Vehicle identifiers
        category: 'Vehicle Identifiers',
        patterns: [
            /\b(vehicle|car|auto)[\s_-]?id\b/,
            /\b(license[\s_-]?plate|plate[\s_-]?number)\b/,
            /\b(vin|vehicle[\s_-]?identification)\b/,
            /\bserial[\s_-]?number\b/
        ]
    },
    {
        // Rule 13: Device identifiers
        category: 'Device Identifiers',
        patterns: [
            /\b(device|equipment)[\s_-]?id\b/,
            /\b(serial|model)[\s_-]?number\b/,
            /\b(imei|mac[\s_-]?address|uuid)\b/
        ]
    },
    {
        // Rule 14: URLs
        category: 'URLs',
        patterns: [
            /\b(url|web[\s_-]?address|website)\b/,
            /\b(link|hyperlink)\b/,
            /\b(homepage|web[\s_-]?page)\b/
        ]
    },
    {
        // Rule 15: IP addresses
        category: 'IP Addresses',
        patterns: [
            /\b(ip|ip[\s_-]?address)\b/,
            /\b(internet[\s_-]?protocol)\b/,
            /\b(network[\s_-]?address)\b/
        ]
    },
    {
        // Rule 16: Biometric identifiers
        category: 'Biometric Identifiers',
        patterns: [
            /\b(biometric|fingerprint|voiceprint)\b/,
            /\b(finger|thumb)[\s_-]?print\b/,
            /\b(voice|speech)[\s_-]?recognition\b/,
            /\b(retina|iris)[\s_-]?scan\b/
        ]
    },
    {
        // Rule 17: Photographic images
        category: 'Photographic Images',
        patterns: [
            /\b(photo|photograph|image|picture)\b/,
            /\b(face|facial)[\s_-]?image\b/,
            /\b(avatar|profile[\s_-]?picture)\b/
        ]
    },
    {
        // Rule 18: Other unique identifiers
        category: 'Unique Identifiers',
        patterns: [
            /\b(unique|universal)[\s_-]?id\b/,
            /\b(id|identifier|tracking)[\s_-]?code\b/,
            /\b(reference|ref)[\s_-]?number\b/,
            /\b(token|key|hash)\b/,
            /\b(guid|uuid)\b/
        ]
    }
];

/**
 * Returns the first category whose patterns match `candidate`, or null.
 * Categories and patterns are visited in declaration order.
 */
const findPhiCategory = (candidate) => {
    for (const { patterns, category } of PHI_PATTERN_CATEGORIES) {
        if (patterns.some(pattern => pattern.test(candidate)))
            return category;
    }
    return null;
};

/**
 * Splits a field name into space-separated lowercase tokens:
 * camelCase humps are separated and runs of `_` / `-` become a single space.
 * Needed because `_` is a regex word character, so `\b` never fires inside
 * a snake_case name such as `patient_dob`.
 */
const tokenizeFieldName = (name) => name
    .replace(/([a-z0-9])([A-Z])/g, '$1 $2')
    .replace(/[_-]+/g, ' ')
    .toLowerCase();

/**
 * Renders one cell of a delimited file. Null/undefined become an empty cell;
 * cells containing the delimiter, a double quote or a line break are wrapped
 * in double quotes with embedded quotes doubled.
 */
const escapeDelimitedValue = (raw, delim) => {
    const text = raw === null || raw === undefined ? '' : String(raw);
    const needsQuoting = text.includes(delim)
        || text.includes('"')
        || text.includes('\n')
        || text.includes('\r');
    return needsQuoting ? `"${text.replace(/"/g, '""')}"` : text;
};

/**
 * Developer tooling: producer discovery (type inference plus PHI/PII
 * field-name classification) and mock data generation for local file sources.
 * All members are arrow-function properties assigned in the constructor.
 */
class DeveloperEngineClass {
    constructor() {
        /**
         * Reads sample data for `producer`, infers a dimension per field and
         * writes the resulting producer definition to
         * `<cwd>/remora/producers/<producer.name>.json`.
         *
         * @param producer Producer definition; must be truthy.
         * @returns Promise of `{ producer, fields }` where `producer` is the
         *          mapped definition that was written and `fields` the
         *          inferred `{ name, type }` list.
         * @throws Whatever Affirm raises when `producer` or the sample data
         *         is falsy, and any error from the file write.
         */
        this.discover = (producer) => __awaiter(this, void 0, void 0, function* () {
            (0, Affirm_1.default)(producer, 'Invalid producer');
            const sampleData = yield ProducerEngine_1.default.readSampleData(producer, 10, true);
            (0, Affirm_1.default)(sampleData, 'Discover process failed: no result found');
            const typeDefinitions = this.extractFieldTypes(sampleData);
            const version = producer._version;
            const mappedProducer = {
                name: producer.name,
                description: producer.description,
                source: producer.source,
                settings: Object.assign({}, producer.settings),
                dimensions: typeDefinitions.map(field => {
                    const classification = this.extractFieldClassification(field);
                    return {
                        name: field.name,
                        type: this.mapFieldTypeToProducerType(field.type),
                        description: `Auto-mapped field: ${field.name}`,
                        classification: classification && classification.isPHI ? ['PHI'] : undefined
                    };
                }),
                measures: [],
                // Default to version 1 only when the producer carries no version at all.
                _version: version !== null && version !== undefined ? version : 1
            };
            mappedProducer['$schema'] = producer['$schema'];
            // Save the mapped producer to file
            const producerPath = path_1.default.join(process.cwd(), 'remora', 'producers', `${producer.name}.json`);
            yield promises_1.default.writeFile(producerPath, JSON.stringify(mappedProducer, null, 4), 'utf-8');
            return { producer: mappedProducer, fields: typeDefinitions };
        });

        /**
         * Maps an inferred field type to a producer dimension type.
         * 'date' and 'datetime' both become 'datetime'; anything other than
         * 'number' falls back to 'string'.
         * NOTE(review): 'boolean' also falls back to 'string' here, while
         * inferDimensionType keeps it as 'boolean' - confirm which is intended.
         */
        this.mapFieldTypeToProducerType = (fieldType) => {
            switch (fieldType) {
                case 'number':
                    return 'number';
                case 'string':
                    return 'string';
                case 'date':
                case 'datetime':
                    return 'datetime';
                default:
                    return 'string';
            }
        };

        /**
         * Infers the most likely type from a single JS value.
         *
         * @returns One of 'number' | 'boolean' | 'date' | 'datetime' |
         *          'string' | 'array' | 'object'. null and undefined are
         *          reported as 'string'.
         */
        this.inferType = (value) => {
            if (value === null || value === undefined)
                return 'string';
            if (Array.isArray(value))
                return 'array';
            if (typeof value === 'boolean')
                return 'boolean';
            if (typeof value === 'number')
                return 'number';
            if (typeof value === 'string') {
                const trimmed = value.trim();
                // Booleans in their common string representations
                const lower = trimmed.toLowerCase();
                if (lower === 'true' || lower === 'false')
                    return 'boolean';
                // Numeric strings. This check runs before any date handling, so
                // all-digit inputs (epoch timestamps, compact YYYYMMDD dates) are
                // reported as 'number'.
                if (NUMERIC_STRING.test(trimmed))
                    return 'number';
                // Dates in one of the known layouts (strict parsing)
                for (const fmt of DATE_FORMATS) {
                    if ((0, dayjs_1.default)(trimmed, fmt, true).isValid())
                        return TIME_COMPONENT.test(trimmed) ? 'datetime' : 'date';
                }
                // Anything else dayjs can parse, as long as it contains a YYYY-MM-DD part
                const loose = (0, dayjs_1.default)(trimmed);
                if (loose.isValid() && /\d{4}-\d{2}-\d{2}/.test(trimmed))
                    return TIME_COMPONENT.test(trimmed) ? 'datetime' : 'date';
                return 'string';
            }
            if (typeof value === 'object') {
                // Valid Date instances count as datetime; every other object is 'object'
                if (value instanceof Date && !isNaN(value.getTime()))
                    return 'datetime';
                return 'object';
            }
            // Fallback for bigint, symbol, function -> string
            return 'string';
        };

        /**
         * Infers a dimension type for `value`: like inferType, but 'array' and
         * 'object' collapse to 'string' and 'date' collapses to 'datetime'.
         */
        this.inferDimensionType = (value) => {
            const type = this.inferType(value);
            switch (type) {
                case 'boolean':
                    return 'boolean';
                case 'number':
                    return 'number';
                case 'date':
                case 'datetime':
                    return 'datetime';
                case 'array':
                case 'object':
                case 'string':
                default:
                    return 'string';
            }
        };

        /**
         * Builds a `{ name, type }` entry for every key of the first record's
         * `_value` object. When the first record holds null/undefined for a
         * key, the first non-null value for that key in the following records
         * is used for inference instead, so a gap in the first row does not
         * force the field to 'string'.
         *
         * @param records Array of records exposing their data under `_value`.
         * @returns Inferred fields; an empty array for missing/empty input.
         */
        this.extractFieldTypes = (records) => {
            if (!records || records.length === 0)
                return [];
            const firstValues = records[0]._value;
            return Object.keys(firstValues).map(key => {
                let representative = firstValues[key];
                for (let i = 1; i < records.length && (representative === null || representative === undefined); i++) {
                    const rowValues = records[i] ? records[i]._value : undefined;
                    if (rowValues)
                        representative = rowValues[key];
                }
                return { name: key, type: this.inferType(representative) };
            });
        };

        /**
         * Classifies a field as PHI/PII purely from its name.
         *
         * The lowercased name is tested first; if nothing matches, a tokenised
         * form (camelCase split, `_` and `-` turned into spaces) is tested as
         * well, so that keywords embedded in names like `patient_dob` or
         * `userEmail` are recognised.
         *
         * @param field `{ name, type }` as produced by extractFieldTypes.
         * @returns `{ isPHI, category, fieldName, fieldType, reason }`;
         *          `category` is null when nothing matched.
         */
        this.extractFieldClassification = (field) => {
            (0, Affirm_1.default)(field, 'Invalid field');
            const { name, type } = field;
            const fieldNameLower = name.toLowerCase();
            // Raw name first: names that matched before keep their category.
            const category = findPhiCategory(fieldNameLower) || findPhiCategory(tokenizeFieldName(name));
            if (category) {
                return {
                    isPHI: true,
                    category,
                    fieldName: name,
                    fieldType: type,
                    reason: `Field name matches pattern for ${category}`
                };
            }
            return {
                isPHI: false,
                category: null,
                fieldName: name,
                fieldType: type,
                reason: 'No PHI/PII patterns detected'
            };
        };

        /**
         * Generates `records` mock rows for a producer backed by a 'local'
         * source and writes them to the producer's file.
         *
         * The target path is the source's `authentication.path` (or the
         * current working directory when unset) joined with `fileKey`, whose
         * first `%` is replaced by 'mock'. Missing directories are created.
         *
         * @param producer Producer with `source`, `dimensions` and
         *                 `settings.{fileKey, fileType, delimiter}`.
         * @param records  Number of rows to generate; must be > 0.
         * @returns Promise of `{ filePath, recordCount }`.
         * @throws Whatever Affirm raises on a failed precondition, an Error
         *         for unsupported file types, and any file-system error.
         */
        this.createMockData = (producer, records) => __awaiter(this, void 0, void 0, function* () {
            (0, Affirm_1.default)(producer, 'Invalid producer');
            (0, Affirm_1.default)(records > 0, 'Record count must be greater than 0');
            const source = Environment_1.default.getSource(producer.source);
            (0, Affirm_1.default)(source, `No source found for producer "${producer.name}" with name "${producer.source}"`);
            (0, Affirm_1.default)(source.engine === 'local', `Mock data generation only supports local file-based producers. Source engine "${source.engine}" is not supported.`);
            const { fileKey, fileType, delimiter } = producer.settings;
            (0, Affirm_1.default)(fileKey, 'Producer must have a fileKey setting for mock data generation');
            (0, Affirm_1.default)(fileType, 'Producer must have a fileType setting for mock data generation');
            // Generate mock records
            const mockRecords = this.generateMockRecords(producer.dimensions, records);
            // Get the file path; a source without an authentication block falls
            // back to the working directory, just like one without a path.
            const basePath = (source.authentication && source.authentication.path) || process.cwd();
            const filePath = path_1.default.join(basePath, fileKey.replace('%', 'mock'));
            // Ensure directory exists
            yield promises_1.default.mkdir(path_1.default.dirname(filePath), { recursive: true });
            // Write to file based on type
            const content = this.formatMockData(mockRecords, fileType, delimiter);
            yield promises_1.default.writeFile(filePath, content, 'utf-8');
            return { filePath, recordCount: records };
        });

        /**
         * Builds `count` records with one generated value per dimension.
         * Dimensions flagged with `sourceFilename` are left out.
         */
        this.generateMockRecords = (dimensions, count) => {
            const records = [];
            for (let i = 0; i < count; i++) {
                const record = {};
                for (const dimension of dimensions) {
                    // Skip sourceFilename dimensions as they are auto-populated
                    if (dimension.sourceFilename)
                        continue;
                    record[dimension.name] = this.generateMockValue(dimension, i);
                }
                records.push(record);
            }
            return records;
        };

        /**
         * Produces one mock value for `dimension`. The lowercased dimension
         * name is checked against keyword groups in a fixed order; the first
         * group with a hit decides the value. When no group matches, the value
         * is generated from the dimension type alone.
         *
         * NOTE(review): keywords are matched as substrings, so short ones such
         * as 'id' or 'age' also hit names like 'middle_name' or 'message' -
         * confirm whether that is acceptable for mock data.
         *
         * @param dimension `{ name, type }` of the dimension.
         * @param index     Zero-based record index, used for sequential values.
         */
        this.generateMockValue = (dimension, index) => {
            const { name, type } = dimension;
            const nameLower = name.toLowerCase();
            // Generate contextual mock data based on field name patterns
            if (this.matchesPattern(nameLower, ['id', 'identifier', 'key', 'pk'])) {
                return `${index + 1}`;
            }
            if (this.matchesPattern(nameLower, ['first_name', 'firstname', 'fname', 'given_name'])) {
                return this.pickRandom(['John', 'Jane', 'Michael', 'Sarah', 'David', 'Emily', 'Robert', 'Lisa', 'James', 'Mary']);
            }
            if (this.matchesPattern(nameLower, ['last_name', 'lastname', 'lname', 'surname', 'family_name'])) {
                return this.pickRandom(['Smith', 'Johnson', 'Williams', 'Brown', 'Jones', 'Garcia', 'Miller', 'Davis', 'Martinez', 'Wilson']);
            }
            if (this.matchesPattern(nameLower, ['name', 'full_name', 'fullname'])) {
                const firstNames = ['John', 'Jane', 'Michael', 'Sarah', 'David', 'Emily'];
                const lastNames = ['Smith', 'Johnson', 'Williams', 'Brown', 'Jones', 'Garcia'];
                return `${this.pickRandom(firstNames)} ${this.pickRandom(lastNames)}`;
            }
            if (this.matchesPattern(nameLower, ['email', 'mail'])) {
                return `user${index + 1}@example.com`;
            }
            if (this.matchesPattern(nameLower, ['phone', 'telephone', 'mobile', 'cell'])) {
                return `555-${String(Math.floor(Math.random() * 900) + 100).padStart(3, '0')}-${String(Math.floor(Math.random() * 9000) + 1000).padStart(4, '0')}`;
            }
            if (this.matchesPattern(nameLower, ['address', 'street', 'addr'])) {
                const streets = ['Main St', 'Oak Ave', 'Elm Dr', 'Pine Rd', 'Maple Ln', 'Cedar Blvd'];
                return `${Math.floor(Math.random() * 9999) + 1} ${this.pickRandom(streets)}`;
            }
            if (this.matchesPattern(nameLower, ['city', 'town'])) {
                return this.pickRandom(['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix', 'Philadelphia', 'San Antonio', 'San Diego']);
            }
            if (this.matchesPattern(nameLower, ['state', 'province'])) {
                return this.pickRandom(['CA', 'TX', 'FL', 'NY', 'PA', 'IL', 'OH', 'GA', 'NC', 'MI']);
            }
            if (this.matchesPattern(nameLower, ['zip', 'postal', 'zipcode'])) {
                return String(Math.floor(Math.random() * 90000) + 10000);
            }
            if (this.matchesPattern(nameLower, ['country'])) {
                return this.pickRandom(['USA', 'Canada', 'UK', 'Germany', 'France', 'Australia']);
            }
            if (this.matchesPattern(nameLower, ['age', 'years'])) {
                return Math.floor(Math.random() * 80) + 18;
            }
            if (this.matchesPattern(nameLower, ['sex', 'gender'])) {
                return this.pickRandom(['M', 'F', 'Male', 'Female']);
            }
            if (this.matchesPattern(nameLower, ['birth', 'dob', 'birthdate'])) {
                const year = Math.floor(Math.random() * 60) + 1940;
                const month = String(Math.floor(Math.random() * 12) + 1).padStart(2, '0');
                // Days are capped at 28 so every month yields a valid date
                const day = String(Math.floor(Math.random() * 28) + 1).padStart(2, '0');
                return `${year}-${month}-${day}`;
            }
            if (this.matchesPattern(nameLower, ['date', 'created', 'updated', 'timestamp'])) {
                const year = Math.floor(Math.random() * 5) + 2020;
                const month = String(Math.floor(Math.random() * 12) + 1).padStart(2, '0');
                const day = String(Math.floor(Math.random() * 28) + 1).padStart(2, '0');
                return `${year}-${month}-${day}`;
            }
            if (this.matchesPattern(nameLower, ['amount', 'price', 'cost', 'total', 'balance'])) {
                return (Math.random() * 1000).toFixed(2);
            }
            if (this.matchesPattern(nameLower, ['quantity', 'count', 'qty'])) {
                return Math.floor(Math.random() * 100) + 1;
            }
            if (this.matchesPattern(nameLower, ['status'])) {
                return this.pickRandom(['active', 'inactive', 'pending', 'completed', 'cancelled']);
            }
            if (this.matchesPattern(nameLower, ['type', 'category'])) {
                return this.pickRandom(['TypeA', 'TypeB', 'TypeC', 'TypeD']);
            }
            if (this.matchesPattern(nameLower, ['description', 'desc', 'notes', 'comment'])) {
                return `Sample description for record ${index + 1}`;
            }
            // Fall back to type-based generation
            return this.generateValueByType(type, index);
        };

        /** True when `fieldName` contains at least one of `patterns` as a substring. */
        this.matchesPattern = (fieldName, patterns) => {
            return patterns.some(pattern => fieldName.includes(pattern));
        };

        /** Returns a random element of `arr` (undefined for an empty array). */
        this.pickRandom = (arr) => {
            return arr[Math.floor(Math.random() * arr.length)];
        };

        /**
         * Generates a mock value from the dimension type alone:
         * 'number' -> random integer in [0, 999], 'boolean' -> random boolean,
         * 'datetime' -> random 'YYYY-MM-DD' string with a year in 2020-2024,
         * 'string' and anything else -> `value_<index + 1>`.
         */
        this.generateValueByType = (type, index) => {
            switch (type) {
                case 'number':
                    return Math.floor(Math.random() * 1000);
                case 'boolean':
                    return Math.random() > 0.5;
                case 'datetime': {
                    const year = Math.floor(Math.random() * 5) + 2020;
                    const month = String(Math.floor(Math.random() * 12) + 1).padStart(2, '0');
                    const day = String(Math.floor(Math.random() * 28) + 1).padStart(2, '0');
                    return `${year}-${month}-${day}`;
                }
                case 'string':
                default:
                    return `value_${index + 1}`;
            }
        };

        /**
         * Serialises mock records for the given file type.
         *
         * 'JSON' -> pretty-printed array, 'JSONL' -> one JSON object per line,
         * 'CSV' / 'TXT' -> delimited text with a header row taken from the keys
         * of the first record (empty string when there are no records).
         * Header and data cells are quoted when they contain the delimiter, a
         * double quote or a line break.
         *
         * @param records   Array of plain objects.
         * @param fileType  'JSON' | 'JSONL' | 'CSV' | 'TXT'.
         * @param delimiter Cell separator for CSV/TXT; defaults to ','.
         * @throws Error for any other file type.
         */
        this.formatMockData = (records, fileType, delimiter) => {
            switch (fileType) {
                case 'JSON':
                    return JSON.stringify(records, null, 2);
                case 'JSONL':
                    return records.map(r => JSON.stringify(r)).join('\n');
                case 'CSV':
                case 'TXT': {
                    const delim = delimiter || ',';
                    if (records.length === 0)
                        return '';
                    const headers = Object.keys(records[0]);
                    const headerLine = headers.map(h => escapeDelimitedValue(h, delim)).join(delim);
                    const dataLines = records.map(record => headers.map(h => escapeDelimitedValue(record[h], delim)).join(delim));
                    return [headerLine, ...dataLines].join('\n');
                }
                default:
                    throw new Error(`Unsupported file type for mock data generation: ${fileType}`);
            }
        };
    }
}
const DeveloperEngine = new DeveloperEngineClass();
exports.default = DeveloperEngine;