@forzalabs/remora
Version:
A powerful CLI tool for seamless data translation.
233 lines (232 loc) • 13.5 kB
JavaScript
;
/**
 * Async helper in the shape emitted by the TypeScript compiler: runs a
 * generator function to completion, treating every `yield` like an `await`.
 *
 * @param thisArg    `this` value the generator function is applied with.
 * @param _arguments Arguments passed to the generator function (defaults to none).
 * @param P          Promise constructor to use; falls back to the global `Promise`.
 * @param generator  Generator function whose yielded values are awaited.
 * @returns A promise settling with the generator's return value, or rejecting
 *          with any error that escapes it.
 */
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    if (!P) {
        P = Promise;
    }
    // Wrap a yielded value in a P promise unless it already is one.
    const toPromise = (value) => {
        if (value instanceof P) {
            return value;
        }
        return new P((resolve) => {
            resolve(value);
        });
    };
    return new P((resolve, reject) => {
        // Either finish with the generator's return value or wait on what it yielded.
        const advance = (result) => {
            if (result.done) {
                resolve(result.value);
                return;
            }
            toPromise(result.value).then(onFulfilled, onRejected);
        };
        // Feed a resolved value back into the generator.
        const onFulfilled = (value) => {
            try {
                advance(generator.next(value));
            }
            catch (e) {
                reject(e);
            }
        };
        // Throw a rejection reason into the generator so its try/catch can see it.
        const onRejected = (value) => {
            try {
                advance(generator["throw"](value));
            }
            catch (e) {
                reject(e);
            }
        };
        generator = generator.apply(thisArg, _arguments || []);
        advance(generator.next());
    });
};
/**
 * CommonJS/ES-module interop helper: returns `mod` unchanged when it is flagged
 * with `__esModule`, otherwise wraps it as `{ default: mod }`.
 */
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
const Affirm_1 = __importDefault(require("../../core/Affirm"));
const Environment_1 = __importDefault(require("../Environment"));
const Dataset_1 = __importDefault(require("../dataset/Dataset"));
const DatasetRecord_1 = __importDefault(require("../dataset/DatasetRecord"));
const ConsumerManager_1 = __importDefault(require("../consumer/ConsumerManager"));
/**
 * Merges the datasets produced for a consumer's producers into a single
 * dataset, either through key-based joins or through a union.
 *
 * Every operation is an arrow function assigned in the constructor, so each
 * one stays bound to the instance even when it is passed around on its own.
 */
class JoinEngineClass {
    constructor() {
        /**
         * Ensures `fieldName` is a dimension or a measure of the named producer.
         *
         * @param fieldName    Field referenced by a join condition.
         * @param producerName Producer name, resolved through Environment.getProducer.
         * @throws {Error} When the producer is unknown or does not expose the field.
         */
        this.validateFieldInProducer = (fieldName, producerName) => {
            const producer = Environment_1.default.getProducer(producerName);
            if (!producer) {
                throw new Error(`Producer ${producerName} not found`);
            }
            // Measures are optional on a producer; dimensions are always read.
            const measures = producer.measures == null ? [] : producer.measures;
            const availableFields = producer.dimensions.map(d => d.name).concat(measures.map(m => m.name));
            if (!availableFields.includes(fieldName)) {
                throw new Error(`Field '${fieldName}' not found in producer '${producerName}'. Available fields: ${availableFields.join(', ')}`);
            }
        };
        /**
         * Ensures `fieldName` is one of the dimensions of the consumer's output shape.
         *
         * @throws {Error} When the consumer shape has no dimension with that name.
         */
        this.validateFieldInConsumer = (fieldName, consumerShape) => {
            const hasField = consumerShape.dimensions.some(x => x.name === fieldName);
            if (!hasField)
                throw new Error(`Field '${fieldName}' not found in consumer '${consumerShape.name}'. Your join condition must be on fields that are present in the consumer.`);
        };
        /**
         * Parses a join condition such as `${P.id} = ${orders.user_id}`.
         *
         * The alias `P` stands for `producer.name`. Whitespace inside the
         * placeholders is ignored. Only the text before and after the first
         * dot of each placeholder is used.
         *
         * @param sql      Join condition containing exactly two `${producer.field}` placeholders.
         * @param producer Producer that owns the join; its `name` replaces `P`.
         * @returns `{ leftProducer, leftField, rightProducer, rightField }`.
         * @throws {Error} When the condition is malformed or a field fails producer validation.
         */
        this.parseJoinCondition = (sql, producer) => {
            const regex = /\${([^}]+)}/g;
            const matches = Array.from(sql.matchAll(regex));
            if (matches.length !== 2)
                throw new Error(`Invalid join condition: ${sql}. Expected format: \${P.field} = \${producer.field}`);
            // Trim so that "${ P.id }" is treated the same as "${P.id}".
            const [left, right] = matches.map(m => m[1].trim());
            const [leftProducer, leftField] = left.split('.').map(part => part.trim());
            const [rightProducer, rightField] = right.split('.').map(part => part.trim());
            if (!leftField || !rightField)
                throw new Error(`Invalid join condition: ${sql}. Both sides must specify a field name after the dot.`);
            const actualLeftProducer = leftProducer === 'P' ? producer.name : leftProducer;
            const actualRightProducer = rightProducer === 'P' ? producer.name : rightProducer;
            // Both sides must exist in their respective producers.
            this.validateFieldInProducer(leftField, actualLeftProducer);
            this.validateFieldInProducer(rightField, actualRightProducer);
            return {
                leftProducer: actualLeftProducer,
                leftField: leftField,
                rightProducer: actualRightProducer,
                rightField: rightField
            };
        };
        /**
         * Returns the dataset of the produced-data entry whose `producerKey`
         * equals `producerName`.
         *
         * @throws {Error} When no entry matches.
         */
        this.findProducerData = (producerName, producedData) => {
            const data = producedData.find(pd => pd.producerKey === producerName);
            if (!data)
                throw new Error(`No data found for producer: ${producerName}`);
            return data.dataset;
        };
        /**
         * Streams `dataset` and groups its records by the stringified value of `key`.
         *
         * Records whose key value is null or undefined are left out.
         *
         * @returns Promise of a Map from key string to the array of records sharing it.
         */
        this.createLookupMap = (dataset, key) => __awaiter(this, void 0, void 0, function* () {
            const map = new Map();
            yield dataset.streamBatches((batch) => __awaiter(this, void 0, void 0, function* () {
                for (const record of batch) {
                    const rawKey = record.getValue(key);
                    if (rawKey == null)
                        continue;
                    const keyValue = rawKey.toString();
                    const existing = map.get(keyValue);
                    if (existing) {
                        existing.push(record);
                    }
                    else {
                        map.set(keyValue, [record]);
                    }
                }
            }));
            return map;
        });
        /**
         * Builds the consumer's dataset out of the data of its producers.
         *
         * - One producer: that producer's dataset is returned as-is.
         * - Any producer flagged `union`: delegates to `union`.
         * - Otherwise: runs every declared join and appends the merged rows to
         *   a new `joined_<consumer name>` dataset.
         *
         * @param consumer     Consumer definition (`name`, `producers`, joins).
         * @param producedData Entries of shape `{ producerKey, dataset }`.
         * @returns Promise of the resulting dataset.
         * @throws {Error} When the consumer has no producers, no data was produced,
         *                 a join references an unknown producer, or validation fails.
         */
        this.join = (consumer, producedData) => __awaiter(this, void 0, void 0, function* () {
            (0, Affirm_1.default)(consumer, 'Invalid consumer');
            (0, Affirm_1.default)(producedData, 'Invalid produced data');
            const producers = consumer.producers;
            // Fail with a readable message instead of a TypeError on producers[0].
            if (!Array.isArray(producers) || producers.length === 0)
                throw new Error(`Consumer "${consumer.name}" has no producers to join`);
            if (producers.length === 1)
                return this.findProducerData(producers[0].name, producedData);
            if (producedData.length === 0)
                throw new Error(`No produced data available for consumer "${consumer.name}"`);
            if (producers.some(x => x.union))
                return yield this.union(consumer, producedData);
            const consumerShape = ConsumerManager_1.default.getOutputShape(consumer);
            const consumerColumns = ConsumerManager_1.default.compile(consumer);
            // Target dataset that receives the joined rows.
            const resultDataset = new Dataset_1.default({
                name: `joined_${consumer.name}`,
                file: {
                    fileKey: 'temp',
                    fileType: 'CSV'
                },
                baseProducer: Environment_1.default.getProducer(producers[0].name),
                executionId: producedData[0].dataset.getExecutionId()
            });
            // One result dimension per consumer column; type falls back to 'string'.
            const resultDimensions = consumerColumns.map((col, index) => {
                const columnName = col.consumerAlias || col.consumerKey;
                const declaredType = col.dimension == null ? undefined : col.dimension.type;
                return {
                    name: columnName,
                    key: columnName,
                    index,
                    type: declaredType == null ? 'string' : declaredType,
                    hidden: null
                };
            });
            // Replace the dataset's dimensions in place with the consumer's columns.
            resultDataset.getDimensions().length = 0;
            resultDataset.getDimensions().push(...resultDimensions);
            // Joins are processed one after another, in declaration order.
            for (const producer of producers) {
                if (!producer.joins)
                    continue;
                for (const join of producer.joins) {
                    const otherProducer = producers.find(p => p.name === join.otherName);
                    if (!otherProducer) {
                        throw new Error(`Producer ${join.otherName} not found`);
                    }
                    const condition = this.parseJoinCondition(join.sql, producer);
                    this.validateFieldInConsumer(condition.leftField, consumerShape);
                    this.validateFieldInConsumer(condition.rightField, consumerShape);
                    const leftDataset = this.findProducerData(condition.leftProducer, producedData);
                    const rightDataset = this.findProducerData(condition.rightProducer, producedData);
                    // The right side is fully indexed in memory (presumably the
                    // smaller dataset - nothing here enforces that).
                    const rightLookup = yield this.createLookupMap(rightDataset, condition.rightField);
                    yield this.performStreamingJoin(leftDataset, rightLookup, condition, join.relationship, consumerColumns, resultDataset);
                }
            }
            return resultDataset;
        });
        /**
         * Appends every other produced dataset to the first one.
         *
         * All datasets must expose the same dimension names in the same order.
         * The first dataset is mutated and returned.
         *
         * @throws {Error} When there is no produced data or the dimensions differ.
         */
        this.union = (consumer, producedData) => __awaiter(this, void 0, void 0, function* () {
            if (!producedData || producedData.length === 0)
                throw new Error(`No produced data available for consumer "${consumer.name}"`);
            const getDimensionsKey = (ds) => ds.getDimensions().map(x => x.name.trim()).join(';').trim();
            const mainDataset = producedData[0].dataset;
            const mainDimKey = getDimensionsKey(mainDataset);
            const otherProducedData = producedData.slice(1);
            for (const prodData of otherProducedData) {
                const prodDimKey = getDimensionsKey(prodData.dataset);
                if (mainDimKey !== prodDimKey)
                    throw new Error(`On consumer "${consumer.name}", can't union the dataset "${prodData.dataset.name}" (producer: ${prodData.producerKey}) because the dimensions are different from the main dataset "${mainDataset.name}" (producer: ${producedData[0].producerKey}). "${mainDimKey}" != "${prodDimKey}"`);
                yield prodData.dataset.streamBatches((batch) => __awaiter(this, void 0, void 0, function* () {
                    yield mainDataset.append(batch);
                }));
            }
            return mainDataset;
        });
        /**
         * Streams the left dataset, pairs each record with its matches from
         * `rightLookup`, and appends the merged records to `resultDataset`.
         *
         * Left records with a null/undefined join value are skipped. A left
         * record without matches is emitted once with empty right-hand columns,
         * unless `relationship` is 'one-to-many', in which case it is dropped.
         * The buffer is flushed whenever it reaches the left dataset's batch
         * size, whether or not the rows matched.
         */
        this.performStreamingJoin = (leftDataset, rightLookup, condition, relationship, consumerColumns, resultDataset) => __awaiter(this, void 0, void 0, function* () {
            const joinedRecords = [];
            const batchSize = leftDataset.getBatchSize();
            const keepUnmatched = relationship !== 'one-to-many';
            yield leftDataset.streamBatches((leftBatch) => __awaiter(this, void 0, void 0, function* () {
                for (const leftRecord of leftBatch) {
                    const rawLeft = leftRecord.getValue(condition.leftField);
                    if (rawLeft == null)
                        continue;
                    const matches = rightLookup.get(rawLeft.toString()) || [];
                    // An unmatched row is paired with `null` so it shares the merge path below.
                    let partners = matches;
                    if (matches.length === 0) {
                        partners = keepUnmatched ? [null] : [];
                    }
                    for (const rightRecord of partners) {
                        const mergedRecord = this.createMergedRecord(leftRecord, rightRecord, condition, consumerColumns, resultDataset);
                        if (mergedRecord) {
                            joinedRecords.push(mergedRecord);
                        }
                    }
                    // Checked for every row so that long runs of unmatched rows
                    // cannot grow the buffer without bound.
                    if (joinedRecords.length > 0 && joinedRecords.length >= batchSize) {
                        yield resultDataset.append(joinedRecords);
                        joinedRecords.length = 0;
                    }
                }
            }));
            // Flush whatever is left below the batch size.
            if (joinedRecords.length > 0) {
                yield resultDataset.append(joinedRecords);
            }
        });
        /**
         * Builds one output record from a left record and an optional right record.
         *
         * Each consumer column takes its value from the side named by
         * `column.owner`; columns owned by neither side stay empty. Missing
         * values (null, undefined, NaN) are written as empty strings, while
         * other falsy values such as `0` and `false` are kept.
         *
         * @param rightRecord Matching right record, or null when there is no match.
         * @returns A DatasetRecord built from the delimiter-joined values.
         */
        this.createMergedRecord = (leftRecord, rightRecord, condition, consumerColumns, resultDataset) => {
            // A Map keeps column names such as "toString" from colliding with
            // properties inherited by plain objects.
            const mergedValues = new Map();
            for (const column of consumerColumns) {
                const fieldName = column.consumerAlias || column.consumerKey;
                const sourceFieldName = column.nameInProducer || fieldName;
                if (column.owner === condition.leftProducer) {
                    mergedValues.set(fieldName, leftRecord.getValue(sourceFieldName));
                }
                else if (column.owner === condition.rightProducer) {
                    mergedValues.set(fieldName, rightRecord ? rightRecord.getValue(sourceFieldName) : null);
                }
            }
            const dimensions = resultDataset.getDimensions();
            const delimiter = resultDataset.getDelimiter();
            const values = dimensions.map(dim => {
                const value = mergedValues.get(dim.name);
                const isMissing = value == null || (typeof value === 'number' && Number.isNaN(value));
                return isMissing ? '' : value;
            });
            const recordString = values.join(delimiter);
            return new DatasetRecord_1.default(recordString, dimensions, delimiter);
        };
    }
}
// Single shared JoinEngineClass instance, exposed as this module's default export.
const JoinEngine = new JoinEngineClass();
exports.default = JoinEngine;