// @forzalabs/remora
// Version:
// A powerful CLI tool for seamless data translation.
// 498 lines (497 loc) • 25.2 kB
// JavaScript
;
/**
 * Compiler-emitted helper that drives a generator function as if it were an
 * async function: each yielded value is awaited and its outcome is fed back
 * into the generator via `next` / `throw`.
 *
 * @param {*} thisArg - `this` value the generator function is invoked with.
 * @param {Array|undefined} _arguments - Arguments passed to the generator function.
 * @param {Function|undefined} P - Promise constructor to use; defaults to the global `Promise`.
 * @param {Function} generator - Generator function to run.
 * @returns {Promise} Resolves with the generator's return value, or rejects with any uncaught error.
 */
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    // Wrap anything that is not already a P instance so `.then` is always available.
    const toPromise = (value) => {
        if (value instanceof P) {
            return value;
        }
        return new P((resolve) => { resolve(value); });
    };
    return new (P || (P = Promise))((resolve, reject) => {
        // Either finish with the generator's return value or wait for the yielded one.
        const advance = (result) => {
            if (result.done) {
                resolve(result.value);
            }
            else {
                toPromise(result.value).then(onFulfilled, onRejected);
            }
        };
        const onFulfilled = (value) => {
            try {
                advance(generator.next(value));
            }
            catch (err) {
                reject(err);
            }
        };
        const onRejected = (reason) => {
            try {
                advance(generator["throw"](reason));
            }
            catch (err) {
                reject(err);
            }
        };
        // Instantiate the generator and kick off the first step.
        advance((generator = generator.apply(thisArg, _arguments || [])).next());
    });
};
/**
 * Compiler-emitted interop helper: returns `mod` untouched when it is flagged
 * as an ES module, otherwise wraps it so the value is reachable as `.default`.
 *
 * @param {*} mod - The value returned by `require`.
 * @returns {*} `mod` itself, or `{ default: mod }`.
 */
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
const Affirm_1 = __importDefault(require("../../core/Affirm"));
const ProducerEngine_1 = __importDefault(require("../producer/ProducerEngine"));
const Environment_1 = __importDefault(require("../Environment"));
const path_1 = __importDefault(require("path"));
const promises_1 = __importDefault(require("fs/promises"));
const dayjs_1 = __importDefault(require("dayjs"));
const customParseFormat_1 = __importDefault(require("dayjs/plugin/customParseFormat"));
dayjs_1.default.extend(customParseFormat_1.default);
// Date/time layouts tried (strictly, in this order) when classifying string values.
const DATE_FORMATS = [
    'YYYY-MM-DD',
    'YYYY/MM/DD',
    'DD/MM/YYYY',
    'MM/DD/YYYY',
    'YYYYMMDD',
    'DD-MMM-YYYY',
    'YYYY-MM-DD HH:mm',
    'YYYY-MM-DD HH:mm:ss',
    'YYYY-MM-DDTHH:mm',
    'YYYY-MM-DDTHH:mmZ',
    'YYYY-MM-DDTHH:mm:ss',
    'YYYY-MM-DDTHH:mm:ssZ',
    'YYYY-MM-DDTHH:mm:ss.SSSZ'
];
// Field-name patterns used to flag a field as PHI/PII. Categories are checked
// in this order and the first matching pattern decides the reported category.
// Built once at module load instead of on every classification call.
const PHI_PATTERN_CATEGORIES = [
    {
        // Rule 1: Names
        category: 'Names',
        patterns: [
            /\b(first|last|middle|full|given|family|sur|maiden|nick|display)[\s_-]?name\b/,
            /\b(fname|lname|mname|fullname|givenname|familyname|surname)\b/,
            /\b(patient|person|individual|customer|client|user)[\s_-]?name\b/,
            /\bname\b/
        ]
    },
    {
        // Rule 2: Geographic subdivisions (excluding state level)
        category: 'Geographic Information',
        patterns: [
            /\b(address|addr|street|st|avenue|ave|road|rd|lane|ln|drive|dr|blvd|boulevard)\b/,
            /\b(city|town|village|municipality)\b/,
            /\b(county|parish|borough)\b/,
            /\b(zip|postal|zipcode|postalcode)\b/,
            /\b(precinct|district|ward)\b/,
            /\b(geocode|coordinates|coord|latitude|longitude|lat|lng)\b/,
            /\b(location|place|residence|home)\b/
        ]
    },
    {
        // Rule 3: Dates related to individuals
        category: 'Dates',
        patterns: [
            /\b(birth|born|dob|birthdate|date[\s_-]?of[\s_-]?birth)\b/,
            /\b(admission|admit|admitdate|admission[\s_-]?date)\b/,
            /\b(discharge|discharged|dischargedate|discharge[\s_-]?date)\b/,
            /\b(death|died|dod|date[\s_-]?of[\s_-]?death|deceased)\b/,
            /\b(age|years[\s_-]?old|yrs[\s_-]?old)\b/,
            /\b(visit|appointment|appt|service)[\s_-]?date\b/,
            /\b(created|updated|modified|last[\s_-]?seen)[\s_-]?date\b/
        ]
    },
    {
        // Rule 4: Phone numbers
        category: 'Phone Numbers',
        patterns: [
            /\b(phone|tel|telephone|mobile|cell|cellular)\b/,
            /\b(home|work|office|emergency)[\s_-]?phone\b/,
            /\b(contact|phone)[\s_-]?number\b/
        ]
    },
    {
        // Rule 5: Fax numbers
        category: 'Fax Numbers',
        patterns: [
            /\b(fax|facsimile)\b/,
            /\bfax[\s_-]?number\b/
        ]
    },
    {
        // Rule 6: Email addresses
        category: 'Email Addresses',
        patterns: [
            /\b(email|e[\s_-]?mail|mail)\b/,
            /\b(email|mail)[\s_-]?address\b/,
            /\b(contact|personal|work)[\s_-]?email\b/
        ]
    },
    {
        // Rule 7: Social Security numbers
        category: 'Social Security Numbers',
        patterns: [
            /\b(ssn|social[\s_-]?security)\b/,
            /\bsocial[\s_-]?security[\s_-]?number\b/,
            /\btax[\s_-]?id\b/
        ]
    },
    {
        // Rule 8: Medical record numbers
        category: 'Medical Record Numbers',
        patterns: [
            /\b(mrn|medical[\s_-]?record)\b/,
            /\bmedical[\s_-]?record[\s_-]?number\b/,
            /\b(patient|chart)[\s_-]?number\b/,
            /\b(patient|medical)[\s_-]?id\b/
        ]
    },
    {
        // Rule 9: Health plan beneficiary numbers
        category: 'Health Plan Beneficiary Numbers',
        patterns: [
            /\b(member|subscriber|beneficiary)[\s_-]?id\b/,
            /\b(insurance|health[\s_-]?plan)[\s_-]?number\b/,
            /\b(policy|plan)[\s_-]?number\b/,
            /\b(medicaid|medicare)[\s_-]?number\b/
        ]
    },
    {
        // Rule 10: Account numbers
        category: 'Account Numbers',
        patterns: [
            /\b(account|acct)[\s_-]?number\b/,
            /\b(account|acct)[\s_-]?id\b/,
            /\b(billing|financial)[\s_-]?account\b/
        ]
    },
    {
        // Rule 11: Certificate/license numbers
        category: 'Certificate/License Numbers',
        patterns: [
            /\b(license|licence|cert|certificate)[\s_-]?number\b/,
            /\b(driver|drivers)[\s_-]?license\b/,
            /\b(professional|medical)[\s_-]?license\b/,
            /\b(permit|registration)[\s_-]?number\b/
        ]
    },
    {
        // Rule 12: Vehicle identifiers
        category: 'Vehicle Identifiers',
        patterns: [
            /\b(vehicle|car|auto)[\s_-]?id\b/,
            /\b(license[\s_-]?plate|plate[\s_-]?number)\b/,
            /\b(vin|vehicle[\s_-]?identification)\b/,
            /\bserial[\s_-]?number\b/
        ]
    },
    {
        // Rule 13: Device identifiers
        category: 'Device Identifiers',
        patterns: [
            /\b(device|equipment)[\s_-]?id\b/,
            /\b(serial|model)[\s_-]?number\b/,
            /\b(imei|mac[\s_-]?address|uuid)\b/
        ]
    },
    {
        // Rule 14: URLs
        category: 'URLs',
        patterns: [
            /\b(url|web[\s_-]?address|website)\b/,
            /\b(link|hyperlink)\b/,
            /\b(homepage|web[\s_-]?page)\b/
        ]
    },
    {
        // Rule 15: IP addresses
        category: 'IP Addresses',
        patterns: [
            /\b(ip|ip[\s_-]?address)\b/,
            /\b(internet[\s_-]?protocol)\b/,
            /\b(network[\s_-]?address)\b/
        ]
    },
    {
        // Rule 16: Biometric identifiers
        category: 'Biometric Identifiers',
        patterns: [
            /\b(biometric|fingerprint|voiceprint)\b/,
            /\b(finger|thumb)[\s_-]?print\b/,
            /\b(voice|speech)[\s_-]?recognition\b/,
            /\b(retina|iris)[\s_-]?scan\b/
        ]
    },
    {
        // Rule 17: Photographic images
        category: 'Photographic Images',
        patterns: [
            /\b(photo|photograph|image|picture)\b/,
            /\b(face|facial)[\s_-]?image\b/,
            /\b(avatar|profile[\s_-]?picture)\b/
        ]
    },
    {
        // Rule 18: Other unique identifiers
        category: 'Unique Identifiers',
        patterns: [
            /\b(unique|universal)[\s_-]?id\b/,
            /\b(id|identifier|tracking)[\s_-]?code\b/,
            /\b(reference|ref)[\s_-]?number\b/,
            /\b(token|key|hash)\b/,
            /\b(guid|uuid)\b/
        ]
    }
];
// Rewrites a field name as lowercase, space-separated words, e.g.
// "patient_ssn" -> "patient ssn", "userEmail" -> "user email",
// "patientSSNNumber" -> "patient ssn number", "address1" -> "address 1".
// Needed because `\b` in the PHI patterns treats "_" and digits as word
// characters and because lowercasing erases camelCase boundaries.
const toSpacedWords = (fieldName) => fieldName
    .replace(/([A-Z]+)([A-Z][a-z])/g, '$1 $2')
    .replace(/([a-z0-9])([A-Z])/g, '$1 $2')
    .replace(/([A-Za-z])(\d)/g, '$1 $2')
    .replace(/[\W_]+/g, ' ')
    .trim()
    .toLowerCase();
// True when a sample value carries no type information (null, undefined or blank string).
const isUninformative = (value) => value === null || value === undefined || (typeof value === 'string' && value.trim() === '');
// Returns a random "YYYY-MM-DD" string whose year lies in [firstYear, firstYear + yearSpan).
// The day is capped at 28 so every month/day combination is a real calendar date.
const randomDateString = (firstYear, yearSpan) => {
    const year = Math.floor(Math.random() * yearSpan) + firstYear;
    const month = String(Math.floor(Math.random() * 12) + 1).padStart(2, '0');
    const day = String(Math.floor(Math.random() * 28) + 1).padStart(2, '0');
    return `${year}-${month}-${day}`;
};
/**
 * Developer tooling around producers: field discovery from sample data,
 * name-based PHI/PII flagging, and mock data file generation.
 *
 * Every method is an arrow function assigned in the constructor, so each one
 * stays bound to the instance even when passed around as a bare callback.
 */
class DeveloperEngineClass {
    constructor() {
        /**
         * Reads sample records for `producer`, infers one dimension per field of
         * the sample, and writes the resulting producer definition to
         * `<cwd>/remora/producers/<producer.name>.json`.
         *
         * @param {object} producer - Producer definition (name, description, source, settings, ...).
         * @returns {Promise<{producer: object, fields: Array<{name: string, type: string}>}>}
         * @throws When `producer` or the sample data is falsy (via Affirm), or on I/O failure.
         */
        this.discover = (producer) => __awaiter(this, void 0, void 0, function* () {
            (0, Affirm_1.default)(producer, 'Invalid producer');
            const sampleData = yield ProducerEngine_1.default.readSampleData(producer, 10, true);
            (0, Affirm_1.default)(sampleData, 'Discover process failed: no result found');
            const typeDefinitions = this.extractFieldTypes(sampleData);
            const dimensions = typeDefinitions.map(field => {
                const classification = this.extractFieldClassification(field);
                return {
                    name: field.name,
                    type: this.mapFieldTypeToProducerType(field.type),
                    description: `Auto-mapped field: ${field.name}`,
                    classification: classification && classification.isPHI ? ['PHI'] : undefined
                };
            });
            const version = producer._version;
            const mappedProducer = {
                name: producer.name,
                description: producer.description,
                source: producer.source,
                settings: Object.assign({}, producer.settings),
                dimensions,
                measures: [],
                _version: version !== null && version !== undefined ? version : 1
            };
            mappedProducer['$schema'] = producer['$schema'];
            // Save the mapped producer to file. The target folder is created first
            // so a fresh project without remora/producers does not fail with ENOENT.
            const producerPath = path_1.default.join(process.cwd(), 'remora', 'producers', `${producer.name}.json`);
            yield promises_1.default.mkdir(path_1.default.dirname(producerPath), { recursive: true });
            yield promises_1.default.writeFile(producerPath, JSON.stringify(mappedProducer, null, 4), 'utf-8');
            return { producer: mappedProducer, fields: typeDefinitions };
        });
        /**
         * Maps an inferred field type onto the type written into a producer dimension.
         * 'date' and 'datetime' both become 'datetime'; anything other than
         * 'number' becomes 'string'.
         *
         * NOTE(review): 'boolean' falls through to 'string' here, whereas
         * inferDimensionType keeps it as 'boolean' - confirm which is intended.
         *
         * @param {string} fieldType - A value returned by inferType.
         * @returns {'number'|'string'|'datetime'}
         */
        this.mapFieldTypeToProducerType = (fieldType) => {
            switch (fieldType) {
                case 'number':
                    return 'number';
                case 'date':
                case 'datetime':
                    return 'datetime';
                case 'string':
                default:
                    return 'string';
            }
        };
        /**
         * Infers the most likely type from a single JS value.
         *
         * null / undefined are reported as 'string'. String values are trimmed and
         * then classified as 'boolean' ("true"/"false", any case), 'number'
         * (plain integer or decimal), 'date' / 'datetime' (one of DATE_FORMATS or
         * an ISO-like date), or 'string'. All-digit strings - including 10- and
         * 13-digit epoch-looking values - are 'number', because the numeric check
         * runs before any date handling.
         *
         * @param {*} value
         * @returns {'number'|'boolean'|'date'|'datetime'|'string'|'array'|'object'}
         */
        this.inferType = (value) => {
            if (value === null || value === undefined)
                return 'string';
            if (Array.isArray(value))
                return 'array';
            switch (typeof value) {
                case 'boolean':
                    return 'boolean';
                case 'number':
                    return 'number';
                case 'object':
                    // Only valid Date instances count as datetime
                    return value instanceof Date && !isNaN(value.getTime()) ? 'datetime' : 'object';
                case 'string':
                    break;
                default:
                    // bigint, symbol, function -> string
                    return 'string';
            }
            const trimmed = value.trim();
            const lower = trimmed.toLowerCase();
            // Booleans written as text
            if (lower === 'true' || lower === 'false')
                return 'boolean';
            // Numbers (numeric strings)
            if (/^-?\d+(?:\.\d+)?$/.test(trimmed))
                return 'number';
            // Every supported date layout contains digits, so skip the parsing work otherwise
            if (!/\d/.test(trimmed))
                return 'string';
            // A literal "T" or an "HH:mm"-like pair hints at a time component
            const temporalType = /T|\d+:\d+/.test(trimmed) ? 'datetime' : 'date';
            // Dates with common formats (strict parsing)
            for (const fmt of DATE_FORMATS) {
                if ((0, dayjs_1.default)(trimmed, fmt, true).isValid())
                    return temporalType;
            }
            // ISO 8601 without specifying format
            const iso = (0, dayjs_1.default)(trimmed);
            if (iso.isValid() && /\d{4}-\d{2}-\d{2}/.test(trimmed))
                return temporalType;
            return 'string';
        };
        /**
         * Infers a type for `value` and collapses it to a dimension type:
         * arrays and objects become 'string', 'date' becomes 'datetime'.
         *
         * @param {*} value
         * @returns {'string'|'boolean'|'datetime'|'number'}
         */
        this.inferDimensionType = (value) => {
            const type = this.inferType(value);
            switch (type) {
                case 'boolean':
                    return 'boolean';
                case 'number':
                    return 'number';
                case 'date':
                case 'datetime':
                    return 'datetime';
                case 'array':
                case 'object':
                case 'string':
                default:
                    return 'string';
            }
        };
        /**
         * Builds one `{ name, type }` entry per key of the first record's `_value`.
         * When the first record's value is null, undefined or blank, the first
         * informative value for that key among the other records is used instead,
         * so a single empty cell does not force the field to 'string'.
         *
         * @param {Array<{_value: object}>} records - Sample records.
         * @returns {Array<{name: string, type: string}>} Empty when there are no records.
         */
        this.extractFieldTypes = (records) => {
            if (!records || records.length === 0)
                return [];
            return Object.entries(records[0]._value).map(([key, firstValue]) => {
                let representative = firstValue;
                if (isUninformative(firstValue)) {
                    for (const record of records) {
                        const candidate = record && record._value ? record._value[key] : undefined;
                        if (!isUninformative(candidate)) {
                            representative = candidate;
                            break;
                        }
                    }
                }
                return { name: key, type: this.inferType(representative) };
            });
        };
        /**
         * Flags a field as PHI/PII based solely on its name.
         *
         * The lowercased name is checked first against PHI_PATTERN_CATEGORIES.
         * If nothing matches, a word-split form of the name is checked as well, so
         * snake_case, kebab-case and camelCase names (e.g. "patient_ssn",
         * "userEmail") are recognised even though `\b` does not treat "_" as a
         * word boundary.
         *
         * @param {{name: string, type: string}} field
         * @returns {{isPHI: boolean, category: (string|null), fieldName: string, fieldType: string, reason: string}}
         * @throws When `field` is falsy (via Affirm).
         */
        this.extractFieldClassification = (field) => {
            (0, Affirm_1.default)(field, 'Invalid field');
            const { name, type } = field;
            const fieldNameLower = name.toLowerCase();
            const spacedName = toSpacedWords(name);
            // The raw lowercased name goes first so its category takes precedence.
            const candidates = spacedName && spacedName !== fieldNameLower
                ? [fieldNameLower, spacedName]
                : [fieldNameLower];
            for (const candidate of candidates) {
                for (const { patterns, category } of PHI_PATTERN_CATEGORIES) {
                    if (patterns.some(pattern => pattern.test(candidate))) {
                        return {
                            isPHI: true,
                            category,
                            fieldName: name,
                            fieldType: type,
                            reason: `Field name matches pattern for ${category}`
                        };
                    }
                }
            }
            return {
                isPHI: false,
                category: null,
                fieldName: name,
                fieldType: type,
                reason: 'No PHI/PII patterns detected'
            };
        };
        /**
         * Generates `records` mock rows for a local, file-based producer and writes
         * them to the producer's file, with every "%" in `fileKey` replaced by "mock".
         *
         * @param {object} producer - Producer with `source`, `dimensions` and `settings` ({ fileKey, fileType, delimiter }).
         * @param {number} records - Number of rows to generate; must be greater than 0.
         * @returns {Promise<{filePath: string, recordCount: number}>}
         * @throws When validation fails (via Affirm), the file type is unsupported, or on I/O failure.
         */
        this.createMockData = (producer, records) => __awaiter(this, void 0, void 0, function* () {
            (0, Affirm_1.default)(producer, 'Invalid producer');
            (0, Affirm_1.default)(records > 0, 'Record count must be greater than 0');
            const source = Environment_1.default.getSource(producer.source);
            (0, Affirm_1.default)(source, `No source found for producer "${producer.name}" with name "${producer.source}"`);
            (0, Affirm_1.default)(source.engine === 'local', `Mock data generation only supports local file-based producers. Source engine "${source.engine}" is not supported.`);
            const { fileKey, fileType, delimiter } = producer.settings;
            (0, Affirm_1.default)(fileKey, 'Producer must have a fileKey setting for mock data generation');
            (0, Affirm_1.default)(fileType, 'Producer must have a fileType setting for mock data generation');
            // Generate mock records
            const mockRecords = this.generateMockRecords(producer.dimensions, records);
            // Resolve the target path; fall back to the working directory when the
            // source declares no authentication block or no path.
            const basePath = (source.authentication && source.authentication.path) || process.cwd();
            // Replace every "%" so no wildcard is left behind in the file name.
            const filePath = path_1.default.join(basePath, fileKey.replace(/%/g, 'mock'));
            // Ensure directory exists
            yield promises_1.default.mkdir(path_1.default.dirname(filePath), { recursive: true });
            // Write to file based on type
            const content = this.formatMockData(mockRecords, fileType, delimiter);
            yield promises_1.default.writeFile(filePath, content, 'utf-8');
            return { filePath, recordCount: records };
        });
        /**
         * Builds `count` mock records with one property per dimension.
         * Dimensions with a truthy `sourceFilename` are skipped.
         *
         * @param {Array<object>} dimensions
         * @param {number} count
         * @returns {Array<object>}
         */
        this.generateMockRecords = (dimensions, count) => {
            const records = [];
            for (let i = 0; i < count; i++) {
                const record = {};
                for (const dimension of dimensions) {
                    // Skip sourceFilename dimensions as they are auto-populated
                    if (dimension.sourceFilename)
                        continue;
                    record[dimension.name] = this.generateMockValue(dimension, i);
                }
                records.push(record);
            }
            return records;
        };
        /**
         * Produces a mock value for one dimension. The lowercased field name is
         * matched against keyword lists in a fixed order and the first hit decides
         * the kind of value; with no hit the value is generated from the
         * dimension's type.
         *
         * @param {{name: string, type: string}} dimension
         * @param {number} index - Zero-based record index, used for sequential values.
         * @returns {string|number|boolean}
         */
        this.generateMockValue = (dimension, index) => {
            const { name, type } = dimension;
            const nameLower = name.toLowerCase();
            const matches = (keywords) => this.matchesPattern(nameLower, keywords);
            // Generate contextual mock data based on field name patterns
            if (matches(['id', 'identifier', 'key', 'pk'])) {
                return `${index + 1}`;
            }
            if (matches(['first_name', 'firstname', 'fname', 'given_name'])) {
                return this.pickRandom(['John', 'Jane', 'Michael', 'Sarah', 'David', 'Emily', 'Robert', 'Lisa', 'James', 'Mary']);
            }
            if (matches(['last_name', 'lastname', 'lname', 'surname', 'family_name'])) {
                return this.pickRandom(['Smith', 'Johnson', 'Williams', 'Brown', 'Jones', 'Garcia', 'Miller', 'Davis', 'Martinez', 'Wilson']);
            }
            if (matches(['name', 'full_name', 'fullname'])) {
                const firstNames = ['John', 'Jane', 'Michael', 'Sarah', 'David', 'Emily'];
                const lastNames = ['Smith', 'Johnson', 'Williams', 'Brown', 'Jones', 'Garcia'];
                return `${this.pickRandom(firstNames)} ${this.pickRandom(lastNames)}`;
            }
            if (matches(['email', 'mail'])) {
                return `user${index + 1}@example.com`;
            }
            if (matches(['phone', 'telephone', 'mobile', 'cell'])) {
                // 555-XXX-XXXX with a 3-digit (100-999) and a 4-digit (1000-9999) part
                const prefix = Math.floor(Math.random() * 900) + 100;
                const line = Math.floor(Math.random() * 9000) + 1000;
                return `555-${prefix}-${line}`;
            }
            if (matches(['address', 'street', 'addr'])) {
                const streets = ['Main St', 'Oak Ave', 'Elm Dr', 'Pine Rd', 'Maple Ln', 'Cedar Blvd'];
                return `${Math.floor(Math.random() * 9999) + 1} ${this.pickRandom(streets)}`;
            }
            if (matches(['city', 'town'])) {
                return this.pickRandom(['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix', 'Philadelphia', 'San Antonio', 'San Diego']);
            }
            if (matches(['state', 'province'])) {
                return this.pickRandom(['CA', 'TX', 'FL', 'NY', 'PA', 'IL', 'OH', 'GA', 'NC', 'MI']);
            }
            if (matches(['zip', 'postal', 'zipcode'])) {
                return String(Math.floor(Math.random() * 90000) + 10000);
            }
            if (matches(['country'])) {
                return this.pickRandom(['USA', 'Canada', 'UK', 'Germany', 'France', 'Australia']);
            }
            if (matches(['age', 'years'])) {
                return Math.floor(Math.random() * 80) + 18;
            }
            if (matches(['sex', 'gender'])) {
                return this.pickRandom(['M', 'F', 'Male', 'Female']);
            }
            if (matches(['birth', 'dob', 'birthdate'])) {
                // Birth years 1940-1999
                return randomDateString(1940, 60);
            }
            if (matches(['date', 'created', 'updated', 'timestamp'])) {
                // Years 2020-2024
                return randomDateString(2020, 5);
            }
            if (matches(['amount', 'price', 'cost', 'total', 'balance'])) {
                return (Math.random() * 1000).toFixed(2);
            }
            if (matches(['quantity', 'count', 'qty'])) {
                return Math.floor(Math.random() * 100) + 1;
            }
            if (matches(['status'])) {
                return this.pickRandom(['active', 'inactive', 'pending', 'completed', 'cancelled']);
            }
            if (matches(['type', 'category'])) {
                return this.pickRandom(['TypeA', 'TypeB', 'TypeC', 'TypeD']);
            }
            if (matches(['description', 'desc', 'notes', 'comment'])) {
                return `Sample description for record ${index + 1}`;
            }
            // Fall back to type-based generation
            return this.generateValueByType(type, index);
        };
        /**
         * Returns true when `fieldName` contains any of `patterns` as a substring.
         *
         * @param {string} fieldName
         * @param {string[]} patterns
         * @returns {boolean}
         */
        this.matchesPattern = (fieldName, patterns) => {
            return patterns.some(pattern => fieldName.includes(pattern));
        };
        /**
         * Returns a pseudo-randomly chosen element of `arr` (undefined for an empty array).
         *
         * @param {Array} arr
         * @returns {*}
         */
        this.pickRandom = (arr) => {
            return arr[Math.floor(Math.random() * arr.length)];
        };
        /**
         * Generates a mock value from the dimension type alone. Unknown types are
         * treated like 'string'.
         *
         * @param {string} type - 'string' | 'number' | 'boolean' | 'datetime' | other
         * @param {number} index - Zero-based record index.
         * @returns {string|number|boolean}
         */
        this.generateValueByType = (type, index) => {
            switch (type) {
                case 'number':
                    return Math.floor(Math.random() * 1000);
                case 'boolean':
                    return Math.random() > 0.5;
                case 'datetime':
                    return randomDateString(2020, 5);
                case 'string':
                default:
                    return `value_${index + 1}`;
            }
        };
        /**
         * Serialises mock records for the given file type.
         *
         * JSON: pretty-printed array. JSONL: one JSON document per line.
         * CSV / TXT: a header line built from the first record's keys followed by
         * one line per record. Any cell - header or value - that contains the
         * delimiter, a double quote, or a CR/LF is wrapped in double quotes with
         * embedded quotes doubled.
         *
         * @param {Array<object>} records
         * @param {string} fileType - 'JSON' | 'JSONL' | 'CSV' | 'TXT'
         * @param {string} [delimiter] - Column separator for CSV/TXT; defaults to ','.
         * @returns {string}
         * @throws {Error} For any other file type.
         */
        this.formatMockData = (records, fileType, delimiter) => {
            switch (fileType) {
                case 'JSON':
                    return JSON.stringify(records, null, 2);
                case 'JSONL':
                    return records.map(r => JSON.stringify(r)).join('\n');
                case 'CSV':
                case 'TXT': {
                    if (records.length === 0)
                        return '';
                    const delim = delimiter || ',';
                    // Quote a cell whenever leaving it bare would break the row/column structure
                    const escapeCell = (raw) => {
                        const text = raw === null || raw === undefined ? '' : String(raw);
                        const needsQuoting = text.includes(delim)
                            || text.includes('"')
                            || text.includes('\n')
                            || text.includes('\r');
                        return needsQuoting ? `"${text.replace(/"/g, '""')}"` : text;
                    };
                    const headers = Object.keys(records[0]);
                    const lines = [headers.map(escapeCell).join(delim)];
                    for (const record of records) {
                        lines.push(headers.map(h => escapeCell(record[h])).join(delim));
                    }
                    return lines.join('\n');
                }
                default:
                    throw new Error(`Unsupported file type for mock data generation: ${fileType}`);
            }
        };
    }
}
// Single shared instance of DeveloperEngineClass, exposed as the module's default export.
const DeveloperEngine = new DeveloperEngineClass();
exports.default = DeveloperEngine;