UNPKG

@forzalabs/remora

Version:

A powerful CLI tool for seamless data translation.

233 lines (232 loc) 13.5 kB
"use strict";
// TypeScript-emitted helper that drives a generator as an async function.
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
    return new (P || (P = Promise))(function (resolve, reject) {
        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
        step((generator = generator.apply(thisArg, _arguments || [])).next());
    });
};
// TypeScript-emitted helper that wraps a CommonJS export so `.default` is always available.
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
const Affirm_1 = __importDefault(require("../../core/Affirm"));
const Environment_1 = __importDefault(require("../Environment"));
const Dataset_1 = __importDefault(require("../dataset/Dataset"));
const DatasetRecord_1 = __importDefault(require("../dataset/DatasetRecord"));
const ConsumerManager_1 = __importDefault(require("../consumer/ConsumerManager"));

/**
 * Converts a raw record value into the string used as a join key.
 *
 * @param {*} value - Value returned by a record's `getValue`.
 * @returns {string|undefined} `value.toString()`, or `undefined` when the value is `null`/`undefined`.
 */
const toJoinKey = (value) => (value === null || value === undefined ? undefined : value.toString());

/**
 * Combines the datasets produced for a consumer's producers into one dataset,
 * either by appending them (union) or by joining them on a field pair taken
 * from each producer's `joins[].sql` condition.
 */
class JoinEngineClass {
    constructor() {
        /**
         * Ensures `fieldName` is a dimension or a measure of the named producer.
         *
         * @param {string} fieldName - Field to look for.
         * @param {string} producerName - Name passed to `Environment.getProducer`.
         * @throws {Error} When the producer is unknown or does not declare the field.
         */
        this.validateFieldInProducer = (fieldName, producerName) => {
            const producer = Environment_1.default.getProducer(producerName);
            if (!producer) {
                throw new Error(`Producer ${producerName} not found`);
            }
            // `measures` is optional on a producer; treat a missing list as empty.
            const measures = producer.measures === null || producer.measures === undefined ? [] : producer.measures;
            const hasDimension = producer.dimensions.some(d => d.name === fieldName);
            const hasMeasure = measures.some(m => m.name === fieldName);
            if (!hasDimension && !hasMeasure) {
                const availableFields = producer.dimensions.map(d => d.name).concat(measures.map(m => m.name));
                throw new Error(`Field '${fieldName}' not found in producer '${producerName}'. Available fields: ${availableFields.join(', ')}`);
            }
        };

        /**
         * Ensures `fieldName` is one of the dimensions of the consumer's output shape.
         *
         * @param {string} fieldName - Field to look for.
         * @param {object} consumerShape - Object exposing `name` and a `dimensions` array of `{ name }`.
         * @throws {Error} When no dimension carries that name.
         */
        this.validateFieldInConsumer = (fieldName, consumerShape) => {
            const hasField = consumerShape.dimensions.some(x => x.name === fieldName);
            if (!hasField)
                throw new Error(`Field '${fieldName}' not found in consumer '${consumerShape.name}'. Your join condition must be on fields that are present in the consumer.`);
        };

        /**
         * Parses a join condition such as `${P.id} = ${orders.user_id}`.
         *
         * Exactly two `${producer.field}` placeholders are required. The alias `P`
         * is replaced by `producer.name`. Both fields are validated against their
         * producers before returning.
         *
         * @param {string} sql - The join condition text.
         * @param {object} producer - Producer that owns the join; only `name` is read.
         * @returns {{leftProducer: string, leftField: string, rightProducer: string, rightField: string}}
         * @throws {Error} When the condition is malformed or references an unknown producer/field.
         */
        this.parseJoinCondition = (sql, producer) => {
            const regex = /\${([^}]+)}/g;
            const matches = Array.from(sql.matchAll(regex));
            if (matches.length !== 2)
                throw new Error(`Invalid join condition: ${sql}. Expected format: \${P.field} = \${producer.field}`);
            const [left, right] = matches.map(m => m[1]);
            // Only the first two dot-separated segments are used; anything after a second dot is ignored.
            const [leftProducer, leftField] = left.split('.');
            const [rightProducer, rightField] = right.split('.');
            if (!leftField || !rightField)
                throw new Error(`Invalid join condition: ${sql}. Both sides must specify a field name after the dot.`);
            const resolveProducerName = (name) => (name === 'P' ? producer.name : name);
            const actualLeftProducer = resolveProducerName(leftProducer);
            const actualRightProducer = resolveProducerName(rightProducer);
            this.validateFieldInProducer(leftField, actualLeftProducer);
            this.validateFieldInProducer(rightField, actualRightProducer);
            return {
                leftProducer: actualLeftProducer,
                leftField: leftField,
                rightProducer: actualRightProducer,
                rightField: rightField
            };
        };

        /**
         * Returns the dataset produced by the given producer.
         *
         * @param {string} producerName - Value matched against each entry's `producerKey`.
         * @param {Array<object>} producedData - Entries of `{ producerKey, dataset }`.
         * @returns {object} The matching entry's `dataset`.
         * @throws {Error} When no entry matches.
         */
        this.findProducerData = (producerName, producedData) => {
            const data = producedData.find(pd => pd.producerKey === producerName);
            if (!data)
                throw new Error(`No data found for producer: ${producerName}`);
            return data.dataset;
        };

        /**
         * Streams a dataset and groups its records by the string form of `key`.
         * Records whose key value is `null` or `undefined` are skipped.
         *
         * @param {object} dataset - Dataset exposing `streamBatches(callback)`.
         * @param {string} key - Field name passed to each record's `getValue`.
         * @returns {Promise<Map<string, Array<object>>>} Key value to the records carrying it, in stream order.
         */
        this.createLookupMap = (dataset, key) => __awaiter(this, void 0, void 0, function* () {
            const map = new Map();
            yield dataset.streamBatches((batch) => __awaiter(this, void 0, void 0, function* () {
                for (const record of batch) {
                    const keyValue = toJoinKey(record.getValue(key));
                    if (keyValue === undefined)
                        continue;
                    const existing = map.get(keyValue);
                    if (existing) {
                        existing.push(record);
                    }
                    else {
                        map.set(keyValue, [record]);
                    }
                }
            }));
            return map;
        });

        /**
         * Builds the dataset for a consumer from the data of its producers.
         *
         * - One producer: that producer's dataset is returned untouched.
         * - Any producer flagged `union`: delegates to `union`.
         * - Otherwise: every `joins` entry of every producer is executed in
         *   declaration order and its rows are appended to a new result dataset.
         *
         * @param {object} consumer - Consumer definition; `name` and `producers` are read.
         * @param {Array<object>} producedData - Entries of `{ producerKey, dataset }`.
         * @returns {Promise<object>} The resulting dataset.
         * @throws {Error} When the consumer has no producers, a referenced producer or
         *   its data is missing, or a join condition is invalid.
         */
        this.join = (consumer, producedData) => __awaiter(this, void 0, void 0, function* () {
            (0, Affirm_1.default)(consumer, 'Invalid consumer');
            (0, Affirm_1.default)(producedData, 'Invalid produced data');
            // Fail with a clear message instead of a TypeError on `producers[0].name`.
            if (!consumer.producers || consumer.producers.length === 0)
                throw new Error(`Consumer "${consumer.name}" has no producers to join`);
            if (consumer.producers.length === 1)
                return this.findProducerData(consumer.producers[0].name, producedData);
            if (consumer.producers.some(x => x.union))
                return yield this.union(consumer, producedData);
            const consumerShape = ConsumerManager_1.default.getOutputShape(consumer);
            const consumerColumns = ConsumerManager_1.default.compile(consumer);
            // New dataset that receives the joined rows.
            const resultDataset = new Dataset_1.default({
                name: `joined_${consumer.name}`,
                file: { fileKey: 'temp', fileType: 'CSV' },
                baseProducer: Environment_1.default.getProducer(consumer.producers[0].name),
                executionId: producedData[0].dataset.getExecutionId()
            });
            // One result dimension per compiled consumer column, in column order.
            const resultDimensions = consumerColumns.map((col, index) => {
                const dimension = col.dimension;
                const declaredType = dimension === null || dimension === undefined ? undefined : dimension.type;
                return {
                    name: col.consumerAlias || col.consumerKey,
                    key: col.consumerAlias || col.consumerKey,
                    index,
                    type: declaredType !== null && declaredType !== undefined ? declaredType : 'string',
                    hidden: null
                };
            });
            // Replace the dimensions in place on the array the dataset hands out.
            resultDataset.getDimensions().length = 0;
            resultDataset.getDimensions().push(...resultDimensions);
            // NOTE(review): every join appends to the same result dataset, so with
            // several joins the rows of each join are concatenated rather than
            // chained — confirm this is intended.
            for (const producer of consumer.producers) {
                if (!producer.joins)
                    continue;
                for (const join of producer.joins) {
                    const otherProducer = consumer.producers.find(p => p.name === join.otherName);
                    if (!otherProducer) {
                        throw new Error(`Producer ${join.otherName} not found`);
                    }
                    const condition = this.parseJoinCondition(join.sql, producer);
                    this.validateFieldInConsumer(condition.leftField, consumerShape);
                    this.validateFieldInConsumer(condition.rightField, consumerShape);
                    const leftDataset = this.findProducerData(condition.leftProducer, producedData);
                    const rightDataset = this.findProducerData(condition.rightProducer, producedData);
                    // The right side is fully indexed in memory (presumably the smaller dataset — verify against callers).
                    const rightLookup = yield this.createLookupMap(rightDataset, condition.rightField);
                    yield this.performStreamingJoin(leftDataset, rightLookup, condition, join.relationship, consumerColumns, resultDataset);
                }
            }
            return resultDataset;
        });

        /**
         * Appends every other produced dataset onto the first one.
         * All datasets must expose the same dimension names in the same order.
         *
         * @param {object} consumer - Consumer definition; only `name` is read (for the error message).
         * @param {Array<object>} producedData - Entries of `{ producerKey, dataset }`; the first is the target.
         * @returns {Promise<object>} The first dataset, after the others were appended to it.
         * @throws {Error} When a dataset's dimensions differ from the first dataset's.
         */
        this.union = (consumer, producedData) => __awaiter(this, void 0, void 0, function* () {
            const getDimensionsKey = (ds) => ds.getDimensions().map(x => x.name.trim()).join(';').trim();
            const mainDataset = producedData[0].dataset;
            const mainDimKey = getDimensionsKey(mainDataset);
            const otherProducedData = producedData.slice(1);
            for (const prodData of otherProducedData) {
                const prodDimKey = getDimensionsKey(prodData.dataset);
                if (mainDimKey !== prodDimKey)
                    throw new Error(`On consumer "${consumer.name}", can't union the dataset "${prodData.dataset.name}" (producer: ${prodData.producerKey}) because the dimensions are different from the main dataset "${mainDataset.name}" (producer: ${producedData[0].producerKey}). "${mainDimKey}" != "${prodDimKey}"`);
                yield prodData.dataset.streamBatches((batch) => __awaiter(this, void 0, void 0, function* () {
                    yield mainDataset.append(batch);
                }));
            }
            return mainDataset;
        });

        /**
         * Streams the left dataset, looks up matching right records by join key and
         * appends the merged rows to `resultDataset` in batches.
         *
         * Left records whose join value is `null`/`undefined` are dropped. Left records
         * without a match are kept (right-side columns empty) unless `relationship`
         * is `'one-to-many'`.
         *
         * @param {object} leftDataset - Dataset exposing `getBatchSize()` and `streamBatches(callback)`.
         * @param {Map<string, Array<object>>} rightLookup - Output of `createLookupMap` for the right side.
         * @param {object} condition - Output of `parseJoinCondition`.
         * @param {string} relationship - The join's `relationship` value.
         * @param {Array<object>} consumerColumns - Compiled consumer columns.
         * @param {object} resultDataset - Dataset receiving the rows via `append`.
         * @returns {Promise<void>}
         */
        this.performStreamingJoin = (leftDataset, rightLookup, condition, relationship, consumerColumns, resultDataset) => __awaiter(this, void 0, void 0, function* () {
            const joinedRecords = [];
            const batchSize = leftDataset.getBatchSize();
            yield leftDataset.streamBatches((leftBatch) => __awaiter(this, void 0, void 0, function* () {
                for (const leftRecord of leftBatch) {
                    const leftValue = toJoinKey(leftRecord.getValue(condition.leftField));
                    if (leftValue === undefined)
                        continue;
                    const rightRecords = rightLookup.get(leftValue) || [];
                    if (rightRecords.length === 0) {
                        // No match: one-to-one and many-to-one keep the left row anyway.
                        if (relationship !== 'one-to-many') {
                            const mergedRecord = this.createMergedRecord(leftRecord, null, condition, consumerColumns, resultDataset);
                            if (mergedRecord) {
                                joinedRecords.push(mergedRecord);
                            }
                        }
                    }
                    else {
                        // One output row per matching right record.
                        for (const rightRecord of rightRecords) {
                            const mergedRecord = this.createMergedRecord(leftRecord, rightRecord, condition, consumerColumns, resultDataset);
                            if (mergedRecord) {
                                joinedRecords.push(mergedRecord);
                            }
                        }
                    }
                    // Flush after every left record — matched or not — so a long run of
                    // unmatched rows cannot grow the buffer past the batch size.
                    if (joinedRecords.length >= batchSize) {
                        yield resultDataset.append(joinedRecords);
                        joinedRecords.length = 0;
                    }
                }
            }));
            // Write whatever is left in the buffer.
            if (joinedRecords.length > 0) {
                yield resultDataset.append(joinedRecords);
            }
        });

        /**
         * Builds one output record from a left record and an optional right record.
         *
         * Each consumer column is filled from the record of the producer that owns it.
         * Columns owned by neither side of `condition`, and right-side columns when
         * `rightRecord` is null, are emitted as empty strings.
         *
         * @param {object} leftRecord - Record from the left dataset.
         * @param {object|null} rightRecord - Matching right record, or `null` when there is none.
         * @param {object} condition - Output of `parseJoinCondition`; `leftProducer`/`rightProducer` are read.
         * @param {Array<object>} consumerColumns - Compiled consumer columns.
         * @param {object} resultDataset - Supplies the dimensions and delimiter of the new record.
         * @returns {object} A new `DatasetRecord` built from the delimiter-joined values.
         */
        this.createMergedRecord = (leftRecord, rightRecord, condition, consumerColumns, resultDataset) => {
            // A Map (not a plain object) so dimension names such as "constructor"
            // can never resolve to inherited Object.prototype members.
            const mergedValues = new Map();
            for (const column of consumerColumns) {
                const fieldName = column.consumerAlias || column.consumerKey;
                const sourceFieldName = column.nameInProducer || fieldName;
                if (column.owner === condition.leftProducer) {
                    mergedValues.set(fieldName, leftRecord.getValue(sourceFieldName));
                }
                else if (column.owner === condition.rightProducer) {
                    mergedValues.set(fieldName, rightRecord ? rightRecord.getValue(sourceFieldName) : null);
                }
            }
            const dimensions = resultDataset.getDimensions();
            const delimiter = resultDataset.getDelimiter();
            // Only null/undefined become empty; falsy-but-real values (0, false) are preserved.
            const values = dimensions.map(dim => {
                const value = mergedValues.get(dim.name);
                return value === null || value === undefined ? '' : value;
            });
            // NOTE(review): values are not escaped, so a value containing the delimiter
            // would shift columns — confirm how DatasetRecord parses the string.
            const recordString = values.join(delimiter);
            return new DatasetRecord_1.default(recordString, dimensions, delimiter);
        };
    }
}
const JoinEngine = new JoinEngineClass();
exports.default = JoinEngine;