UNPKG

@forzalabs/remora

Version:

A powerful CLI tool for seamless data translation.

286 lines (265 loc) 11.7 kB
"use strict";
// TypeScript-emitted helper: drives a generator function as a Promise-returning
// async function (each `yield` inside the generator behaves like `await`).
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
    return new (P || (P = Promise))(function (resolve, reject) {
        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
        step((generator = generator.apply(thisArg, _arguments || [])).next());
    });
};
// TypeScript-emitted helper: wraps a CommonJS module so `.default` is always available.
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
const openai_1 = __importDefault(require("openai"));
const zod_1 = require("openai/helpers/zod");
const zod_2 = require("zod");

/**
 * System prompt used by `LLM#inferProducers`.
 * Placeholders: {{input data spec}}, {{output data spec}}, {{file name}}, {{sources}}.
 */
const baseProducersSystemPrompt = `
# TASK
You are an agent tasked with creating the mapping between an INPUT DATA SPEC and one or more OUTPUT DATA SPEC.
The mapping between the two is made by creating PRODUCERS and CONSUMERS.
You are tasked with creating the PRODUCER(S) that will then be used.

# PRODUCERS
A producer maps directly to a dataset and exposes it's dimensions.

## FIELDS
- classification: make your best guess if the field falls under any of these regulations

# RULES
- Add only the required fields to comply with the OUTPUT DATA SPEC
- Add fields that you think are important
- The name of the producer must be the same as the name of the dataset.
- Avoid creating multiple providers with similar data.
- Try to create the least number of providers
- Awlays include this exact property as the first -> "$schema": "https://raw.githubusercontent.com/ForzaLabs/remora-public/refs/heads/main/json_schemas/producer-schema.json"
- Based on the producer select the source that makes the most sense to connect otherwise leave the string "<source_name>"

# FORMAT
The INPUT DATA SPEC will be the first 10 rows from the dataset that the producer needs to map to.
The OUTPUT DATA SPEC is a JSON of the desired output that the consumer needs to return.
The result must be returned as a JSON object.

# INPUT DATA SPEC
{{input data spec}}

# OUTPUT DATA SPEC
{{output data spec}}

# File name
{{file name}}

# SOURCES
{{sources}}
`;

/**
 * System prompt used by `LLM#inferConsumers`.
 * Placeholders: {{producers}}, {{output data spec}}.
 */
const baseConsumersSystemPrompt = `
# TASK
You are an agent tasked with creating the mapping between a list of PRODUCERS and one or more OUTPUT DATA SPEC.
You are going to receive a list of PRODUCERS that expose some dimensions, and you will create the CONSUMERS that comply with the provided OUTPUT DATA SPEC.

# CONSUMERS
A consumer takes the data from one or more producers and changes it's shape to transform it into the required output schema.

## FIELDS
- fields.from: used to distinct between the producers imported by the consumer. The value is the name of the producer.

# RULES
- If a field is not needed, do not add it e.g.
- Only import a producer once
- Awlays include this exact property as the first -> "$schema": "https://raw.githubusercontent.com/ForzaLabs/remora-public/refs/heads/main/json_schemas/consumer-schema.json",
- Use "API" as the only valid output format.
- The "from" must contain only the name of the producer

# FORMAT
The INPUT DATA SPEC will be the first 10 rows from the dataset that the producer needs to map to.
The OUTPUT DATA SPEC is a JSON of the desired output that the consumer needs to return.
The result must be returned as a JSON object.

# PRODUCERS
{{producers}}

# OUTPUT DATA SPEC
{{output data spec}}

# EXAMPLES
producers:
"""
{
    "name": "claim",
    "dimensions": [
        { "name": "id", "type": "string", "pk": true },
        { "name": "amount", "type": "number" },
        { "name": "provider", "type": "string" },
        { "name": "date", "type": "datetime" }
    ]
}
"""

output data spec:
"""
{
    "name": "claim",
    "fields": [
        { "name": "id", "type": "string", "pk": true },
        { "name": "amount", "type": "number" },
        { "name": "provider", "type": "string" },
        { "name": "date", "type": "datetime" }
    ]
}
"""

resulting consumer:
"""
{
    "name": "claims",
    "fields": [
        { "key": "id" },
        { "key": "amount" },
        { "key": "provider" },
        { "key": "date", "alias": "Creation date" }
    ],
    "outputs": [
        { "format": "API" }
    ],
    "producers": [
        { "name": "claim" }
    ]
}
"""
`;

/**
 * System prompt for a QA pass over generated consumers.
 * Placeholder: {{consumers}}.
 * NOTE(review): nothing in this file references this constant — confirm whether
 * the QA step is still planned before removing it.
 */
const baseQASystemPrompt = `
# TASK
You are an agent tasked with ensuring that the CONSUMER(S) created follow the guidelines given.
You are going to receive a list of CONSUMERS and you need to return in the correct JSON format the same CONSUMERS with the needed updates to ensure that they follow all the rules.

# CONSUMER DEFINITION
A consumer takes the data from one or more producers and changes it's shape to transform it into the required output schema.

## FIELDS
- fields.from: used to distinct between the producers imported by the consumer. The value is the name of the producer.

# RULES
- If a field is not needed, do not add it e.g.
- Only import a producer once
- Awlays include this exact property as the first -> "$schema": "https://raw.githubusercontent.com/ForzaLabs/remora-public/refs/heads/main/json_schemas/consumer-schema.json",
- Use "API" as the only valid output format.
- The "from" must contain only the name of the producer

# CONSUMERS
{{consumers}}
`;

// Matches a `{{placeholder name}}` token; group 1 is the name between the braces.
const PLACEHOLDER_PATTERN = /\{\{([^{}]+)\}\}/g;

/**
 * Fills the `{{name}}` placeholders of a prompt template in a single pass.
 *
 * A function replacer is used on purpose: with a string replacement,
 * `String.prototype.replace` interprets `$&`, `$$`, `` $` `` and `$'` inside the
 * injected data as special patterns and corrupts it. Doing all substitutions in
 * one pass also means injected text is never rescanned, so data that happens to
 * contain a `{{...}}` token cannot capture a later substitution.
 *
 * @param {string} template - Prompt text containing `{{name}}` tokens.
 * @param {Object<string, *>} values - Replacement per placeholder name; values are stringified.
 * @returns {string} The template with every known placeholder replaced. Tokens
 *   whose name is not an own key of `values` are left untouched.
 */
function renderPrompt(template, values) {
    return template.replace(PLACEHOLDER_PATTERN, (token, name) => (Object.prototype.hasOwnProperty.call(values, name) ? String(values[name]) : token));
}

/**
 * Renders each item as one `- <json>` line.
 *
 * @param {Array<*>} items - Values to serialise with `JSON.stringify`.
 * @returns {string} Newline-separated bullet list (empty string for an empty array).
 */
function toBulletList(items) {
    return items.map(x => `- ${JSON.stringify(x)}`).join('\n');
}

/**
 * Thin wrapper around the OpenAI client that asks the `gpt-4o` model to draft
 * producer and consumer definitions, using structured (zod-described) output.
 * The API key is read from `process.env.OPENAI_API_KEY` when the instance is created.
 */
class LLM {
    constructor() {
        /**
         * Asks the model to propose producers for a dataset.
         *
         * @param {Array<*>} input - Sample of the dataset (the prompt describes it as the first 10 rows).
         * @param {Array<*>} outputs - Desired output specs.
         * @param {string} fileName - Name of the file the sample comes from.
         * @param {Array<*>} sources - Sources the model may link the producer to.
         * @returns {Promise<*>} The `parsed` payload of the first completion choice
         *   (shaped by the `producers` schema below).
         */
        this.inferProducers = (input, outputs, fileName, sources) => __awaiter(this, void 0, void 0, function* () {
            const systemPrompt = renderPrompt(baseProducersSystemPrompt, {
                'input data spec': toBulletList(input),
                'output data spec': toBulletList(outputs),
                'file name': fileName,
                'sources': toBulletList(sources)
            });
            const res = yield this._client.beta.chat.completions.parse({
                model: 'gpt-4o',
                messages: [
                    { role: 'system', content: systemPrompt }
                ],
                response_format: (0, zod_1.zodResponseFormat)(zod_2.z.object({
                    producers: zod_2.z.array(zod_2.z.object({
                        $schema: zod_2.z.string().describe('The schema of the producer. This should always be the same.'),
                        name: zod_2.z.string(),
                        description: zod_2.z.string(),
                        source: zod_2.z.string().describe('The name of the source linked to this producer.'),
                        settings: zod_2.z.object({
                            fileKey: zod_2.z.string().describe('The name of the file'),
                            fileType: zod_2.z.string().describe('The file extension (CSV | JSONL | JSON)')
                        }),
                        dimensions: zod_2.z.array(zod_2.z.object({
                            name: zod_2.z.string(),
                            // alias: z.string().optional(),
                            description: zod_2.z.string().optional(),
                            type: zod_2.z.enum(['string', 'number', 'datetime']),
                            pk: zod_2.z.boolean().optional(),
                            classification: zod_2.z.array(zod_2.z.enum(['PHI', 'PII', 'GDPR'])).optional()
                        }))
                    }))
                }), 'environment')
            });
            const msg = res.choices[0].message;
            return msg.parsed;
        });
        /**
         * Asks the model to propose consumers that reshape the given producers
         * into the desired outputs.
         *
         * @param {Array<*>} producers - Producer definitions to build on.
         * @param {Array<*>} outputs - Desired output specs.
         * @returns {Promise<*>} The `parsed` payload of the first completion choice
         *   (shaped by the `consumers` schema below), after clearing empty `grouping` entries.
         */
        this.inferConsumers = (producers, outputs) => __awaiter(this, void 0, void 0, function* () {
            const systemPrompt = renderPrompt(baseConsumersSystemPrompt, {
                'producers': toBulletList(producers),
                'output data spec': toBulletList(outputs)
            });
            const item = {
                model: 'gpt-4o',
                messages: [
                    { role: 'system', content: systemPrompt }
                ],
                response_format: (0, zod_1.zodResponseFormat)(zod_2.z.object({
                    consumers: zod_2.z.array(zod_2.z.object({
                        $schema: zod_2.z.string().describe('The schema of the consumer. This should always be the same.'),
                        name: zod_2.z.string(),
                        description: zod_2.z.string(),
                        producers: zod_2.z.array(zod_2.z.object({
                            name: zod_2.z.string().describe('References one of the producers. Must be unique, there can\'t be two entry with the same name.'),
                            joins: zod_2.z.array(zod_2.z.object({
                                otherName: zod_2.z.string(),
                                relationship: zod_2.z.enum(['one-to-one', 'one-to-many', 'many-to-one']),
                                sql: zod_2.z.string()
                            })).optional().describe('Which other producer to join this one with. Omit if empty.')
                        })),
                        fields: zod_2.z.array(zod_2.z.object({
                            key: zod_2.z.string(),
                            from: zod_2.z.string().optional(),
                            alias: zod_2.z.string().optional()
                            // grouping: z.object({
                            //     groupingKey: z.string(),
                            //     subFields: z.array(z.lazy(() => z.object({
                            //         key: z.string(),
                            //         from: z.string().optional()
                            //     })))
                            // }).optional()
                        })),
                        outputs: zod_2.z.array(zod_2.z.object({
                            format: zod_2.z.enum(['SQL', 'API', 'CSV', 'PARQUET', 'JSON'])
                        }))
                    }))
                }), 'environment')
            };
            const res = yield this._client.beta.chat.completions.parse(item);
            const msg = res.choices[0].message;
            const finalDraft = msg.parsed;
            // Do some manual adjustments cause some things still don't work...
            // Drops a `grouping` entry whose `groupingKey` is missing or empty.
            // NOTE(review): `grouping` is currently commented out of the schema above,
            // so this presumably never triggers — confirm before removing.
            if (finalDraft && finalDraft.consumers) {
                for (const cons of finalDraft.consumers) {
                    for (const field of cons.fields) {
                        if (field.grouping) {
                            if (!field.grouping.groupingKey || field.grouping.groupingKey.length === 0)
                                field.grouping = undefined;
                        }
                    }
                }
            }
            return finalDraft;
        });
        this._client = new openai_1.default({
            apiKey: process.env.OPENAI_API_KEY
        });
    }
}
exports.default = LLM;