UNPKG

@maximai/maxim-js

Version:

Maxim AI JS SDK. Visit https://getmaxim.ai for more info.

791 lines (790 loc) • 59.4 kB

JavaScript

"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.createTestRunBuilder = void 0; const dataset_1 = require("../apis/dataset"); const evaluator_1 = require("../apis/evaluator"); const testRun_1 = require("../apis/testRun"); const dataset_2 = require("../models/dataset"); const csvParser_1 = require("../utils/csvParser"); const semaphore_1 = require("../utils/semaphore"); const defaultLogger_1 = require("./defaultLogger"); const dataset_3 = require("../dataset/dataset"); const utils_1 = require("../utils/utils"); const runUtils_1 = require("./runUtils"); const sanitizationUtils_1 = require("./sanitizationUtils"); const utils_2 = require("./utils"); const uuid_1 = require("uuid"); /** * Creates a new TestRunBuilder with the given configuration. * @param config The configuration for the TestRunBuilder. * @returns A TestRunBuilder with the given configuration. */ const createTestRunBuilder = (config) => ({ withDataStructure: (dataStructure) => { (0, dataset_3.sanitizeDataStructure)(dataStructure); return (0, exports.createTestRunBuilder)({ ...config, dataStructure }); }, withData: (data) => { (0, sanitizationUtils_1.sanitizeData)(data, config.dataStructure); return (0, exports.createTestRunBuilder)({ ...config, data }); }, withEvaluators: (...evaluators) => { (0, sanitizationUtils_1.sanitizeEvaluators)(evaluators); return (0, exports.createTestRunBuilder)({ ...config, evaluators: [...evaluators] }); }, withHumanEvaluationConfig: (humanEvaluationConfig) => { const emailRegex = /^(?!\.)(?!.*\.\.)([A-Z0-9_'+\-\.]*)[A-Z0-9_+-]@([A-Z0-9][A-Z0-9\-]*\.)+[A-Z]{2,}$/i; humanEvaluationConfig.emails.forEach((email) => { if (!emailRegex.test(email)) { throw new Error(`Invalid email address: ${email}`); } }); return (0, exports.createTestRunBuilder)({ ...config, humanEvaluationConfig }); }, withPromptVersionId: (id, contextToEvaluate) => (0, exports.createTestRunBuilder)({ ...config, promptVersion: { id, contextToEvaluate } }), withPromptChainVersionId: (id, contextToEvaluate) => (0, exports.createTestRunBuilder)({ ...config, promptChainVersion: { id, contextToEvaluate } }), withWorkflowId: (id, contextToEvaluate) => (0, exports.createTestRunBuilder)({ ...config, workflow: { id, contextToEvaluate } }), withSimulationConfig: (simulationConfig) => (0, exports.createTestRunBuilder)({ ...config, simulationConfig }), yieldsOutput: (outputFunction) => (0, exports.createTestRunBuilder)({ ...config, outputFunction }), yieldsOutputWithTracing: (outputFunctionWithTracing, maximLogger, disableDefaultTraceCreation) => (0, exports.createTestRunBuilder)({ ...config, outputFunctionWithTracing, maximLogger, disableDefaultTraceCreation: disableDefaultTraceCreation !== null && disableDefaultTraceCreation !== void 0 ? disableDefaultTraceCreation : false }), withLogger: (logger) => (0, exports.createTestRunBuilder)({ ...config, logger }), getConfig: () => config, withConcurrency: (concurrency) => (0, exports.createTestRunBuilder)({ ...config, concurrency }), withTags: (tags) => (0, exports.createTestRunBuilder)({ ...config, tags }), run: async (timeoutInMinutes = 15) => { var _a, _b; let errors = []; const logger = (_a = config.logger) !== null && _a !== void 0 ? _a : new defaultLogger_1.DefaultLogger(); // ===== Sanitization ===== logger.info("Running sanitization checks..."); if (!config.name) { errors.push("Name is required to run a test."); } if (!config.workspaceId) { errors.push("Workspace Id is required to run a test."); } if (!config.outputFunction && !config.outputFunctionWithTracing && !config.promptVersion && !config.promptChainVersion && !config.workflow) { errors.push("Output function or prompt version id, prompt chain version id, or workflow id is required to run a test. You can use either yieldsOutput, yieldsOutputWithTracing, withPromptVersionId, withPromptChainVersionId or withWorkflowId to set them respectively."); } const hasOutputFunction = !!config.outputFunction; const hasPromptVersion = !!config.promptVersion; const hasPromptChainVersion = !!config.promptChainVersion; const hasWorkflow = !!config.workflow; const hasOutputFunctionWithTracing = !!config.outputFunctionWithTracing; const outputSourceCount = (hasOutputFunction ? 1 : 0) + (hasOutputFunctionWithTracing ? 1 : 0) + (hasPromptVersion ? 1 : 0) + (hasPromptChainVersion ? 1 : 0) + (hasWorkflow ? 1 : 0); // Simulation + yieldsOutput: local-execution mode (no prompt/workflow ID required) if (config.simulationConfig && hasOutputFunction) { if (hasPromptChainVersion) { errors.push("Simulation config with yieldsOutput cannot use withPromptChainVersionId. Use withPromptVersionId or withWorkflowId."); } if (hasPromptVersion && hasWorkflow) { errors.push("Simulation config with yieldsOutput cannot use both withPromptVersionId and withWorkflowId. Set exactly one."); } } else if (outputSourceCount !== 1) { errors.push("Exactly one of outputFunction, promptVersionId, promptChainVersionId, or workflowId must be set."); } if (!config.data) { errors.push("Data or dataset id is required to run a test."); } if (config.simulationConfig) { if (config.outputFunctionWithTracing) { errors.push("Simulation config cannot be used with yieldsOutputWithTracing. Use yieldsOutput instead."); } if (config.promptChainVersion) { errors.push("Simulation config cannot be used with withPromptChainVersionId. Use withWorkflowId, withPromptVersionId, or yieldsOutput instead."); } if (!config.workflow && !config.promptVersion && !config.outputFunction) { errors.push("Simulation config requires either withWorkflowId, withPromptVersionId, or yieldsOutput to be set."); } if (config.simulationConfig.responseFields && config.simulationConfig.responseFields.length > 0 && !config.workflow) { errors.push("responseFields in simulationConfig can only be used with withWorkflowId, not with withPromptVersionId, yieldsOutput or yieldsOutputWithTracing."); } } if (errors.length > 0) { throw new Error(`Missing required configuration for test run ${config.name ? ` "${config.name}"` : ""}:\n\t${errors.join(", \n\t")}`, { cause: JSON.stringify({ config }, null, 2), }); } (0, dataset_3.sanitizeDataStructure)(config.dataStructure); (0, sanitizationUtils_1.sanitizeData)(config.data, config.dataStructure); (0, sanitizationUtils_1.sanitizeEvaluators)(config.evaluators); const APIEvaluatorService = new evaluator_1.MaximEvaluatorAPI(config.baseUrl, config.apiKey, config.isDebug); const platformEvaluatorsConfig = await Promise.all(config.evaluators .filter((e) => typeof e === "string" || (typeof e === "object" && !("evaluationFunction" in e))) .map(async (e) => { const evaluatorName = typeof e === "string" ? e : e.name; const evaluatorConfig = await APIEvaluatorService.fetchPlatformEvaluator(evaluatorName, config.workspaceId); return evaluatorConfig; })); if (platformEvaluatorsConfig.some((e) => e.type === "Human")) { if (!config.humanEvaluationConfig) { throw new Error("Human evaluator found in evaluators, but no human evaluation config was provided."); } } // ===== Extracting Variables ===== const dataStructure = config.dataStructure; const concurrency = (_b = config.concurrency) !== null && _b !== void 0 ? _b : 10; const name = config.name; const workspaceId = config.workspaceId; const data = config.data; const testConfigId = config.testConfigId; const evaluators = config.evaluators; const humanEvaluationConfig = config.humanEvaluationConfig; const outputFunction = config.outputFunction; const outputFunctionWithTracing = config.outputFunctionWithTracing; const promptVersion = config.promptVersion; const promptChainVersion = config.promptChainVersion; const workflow = config.workflow; const tags = config.tags; const failedEntryIndices = []; let internalMaximLogger = undefined; const localEvaluatorNameToIdAndPassFailCriteriaMap = (0, utils_2.getLocalEvaluatorNameToIdAndPassFailCriteriaMap)(evaluators.filter((e) => typeof e !== "string" && "evaluationFunction" in e)); const APITestRunService = new testRun_1.MaximTestRunAPI(config.baseUrl, config.apiKey, config.isDebug); // ===== Common Processor ===== async function processEntry(testRun, index, mappingKeys, getRow, datasetId) { var _a, _b, _c, _d, _e, _f, _g; // 1. fetch row const row = await getRow(index); // if row is not found, return if (!row) { throw new Error(`No row found at index ${index}`); } const input = mappingKeys.input ? (row.data[mappingKeys.input] ? String(row.data[mappingKeys.input]) : undefined) : undefined; const expectedOutput = mappingKeys.expectedOutput ? row.data[mappingKeys.expectedOutput] ? String(row.data[mappingKeys.expectedOutput]) : undefined : undefined; let contextToEvaluate = (mappingKeys.contextToEvaluate ? row.data[mappingKeys.contextToEvaluate] === null ? undefined : row.data[mappingKeys.contextToEvaluate] : undefined); const scenario = mappingKeys.scenario ? row.data[mappingKeys.scenario] ? String(row.data[mappingKeys.scenario]) : undefined : undefined; const expectedSteps = mappingKeys.expectedSteps ? row.data[mappingKeys.expectedSteps] ? String(row.data[mappingKeys.expectedSteps]) : undefined : undefined; // 2. get the output if (config.simulationConfig && outputFunction) { let contextToEvaluateForSimulation = (_a = contextToEvaluate !== null && contextToEvaluate !== void 0 ? contextToEvaluate : promptVersion === null || promptVersion === void 0 ? void 0 : promptVersion.contextToEvaluate) !== null && _a !== void 0 ? _a : workflow === null || workflow === void 0 ? void 0 : workflow.contextToEvaluate; // Build the simulation closure const outputFunctionToExecute = (0, runUtils_1.simulationYieldsOutputFunctionClosure)(testRun.id, workspaceId, config.simulationConfig, outputFunction, APITestRunService, row.id, input, scenario, expectedSteps, contextToEvaluateForSimulation, timeoutInMinutes, logger); // Execute the simulation const output = await (0, runUtils_1.runOutputFunction)(outputFunctionToExecute, row.data); if (output.retrievedContextToEvaluate) { if (contextToEvaluateForSimulation) { logger.info(`Detected retrieved context returned from output function for row ${index + 1} that had contextToEvaluate set from the dataset.\nOverriding the contextToEvaluate from dataset with the retrieved context`); } contextToEvaluateForSimulation = output.retrievedContextToEvaluate; } // 3. run evaluations let localEvaluationResults = undefined; const localEvaluators = evaluators.filter((e) => typeof e !== "string" && "evaluationFunction" in e); if (localEvaluators.length > 0) { localEvaluationResults = await (0, runUtils_1.runLocalEvaluations)(localEvaluators, row.data, output, contextToEvaluateForSimulation); } // 4. Build output for push // Find all platform evaluators with variableMapping const platformEvaluatorsWithMangler = evaluators.filter((e) => typeof e !== "string" && !("evaluationFunction" in e) && "variableMapping" in e && typeof e.variableMapping === "object"); let evaluatorOutputOverrides; evaluatorOutputOverrides = {}; for (const platformEval of platformEvaluatorsWithMangler) { if (!platformEval.variableMapping) continue; const mappingKeysList = Object.keys(platformEval.variableMapping); if (mappingKeysList.length > 0) { const evalConfig = platformEvaluatorsConfig.find((c) => c.name === platformEval.name); if (!evalConfig) continue; const mappingResult = {}; // Resolve persona with priority: dataset column > simulation config let datasetPersona; for (const [key, value] of Object.entries(row.data)) { if (key.toLowerCase() === "persona" && value != null) { const personaStr = String(value).trim(); if (personaStr) { datasetPersona = personaStr; break; } } } let simconfigPersona; if (((_b = config.simulationConfig) === null || _b === void 0 ? void 0 : _b.persona) && !datasetPersona) { if (typeof config.simulationConfig.persona === "string") { simconfigPersona = config.simulationConfig.persona; } else { const val = row.data[config.simulationConfig.persona.payload]; simconfigPersona = val != null ? String(val).trim() || undefined : undefined; } } const persona = (_c = datasetPersona !== null && datasetPersona !== void 0 ? datasetPersona : simconfigPersona) !== null && _c !== void 0 ? _c : ""; const runObj = { input, output: output.data, retrieval: contextToEvaluateForSimulation, toolCalls: [], scenario, persona, messages: output.messages, ...output, }; for (const key of mappingKeysList) { const mappingFn = platformEval.variableMapping[key]; if (!mappingFn) continue; try { const version = workflow ? { id: workflow.id, type: "workflow", } : promptVersion ? { id: promptVersion.id, type: "prompt", } : undefined; mappingResult[key] = mappingFn(runObj, row.data, version); } catch (e) { logger.error(`Error in variable mapping for key "${key}": ${e instanceof Error ? e.message : String(e)}`); } } evaluatorOutputOverrides[evalConfig.id] = mappingResult; } } // Simulation: cost/usage in simulationMeta, no runConfig try { await APITestRunService.pushTestRunEntry({ testRun: { ...testRun, datasetId, datasetEntryId: row.id }, entry: { input, output: output.data, meta: { sdkVariables: evaluatorOutputOverrides && Object.keys(evaluatorOutputOverrides).length > 0 ? Object.entries(evaluatorOutputOverrides).reduce((acc, [id, val]) => { acc[id] = { type: dataset_2.VariableType.JSON, payload: JSON.stringify(val), }; return acc; }, {}) : undefined, }, expectedOutput, contextToEvaluate: contextToEvaluateForSimulation, scenario, expectedSteps, dataEntry: row.data, localEvaluationResults: localEvaluationResults ? localEvaluationResults.map((result) => ({ ...result, id: localEvaluatorNameToIdAndPassFailCriteriaMap.get(result.name).id, })) : undefined, simulationMeta: output.simulationMeta, }, localSimulation: true, }); } catch (pushError) { const testRunEntryId = (_d = output === null || output === void 0 ? void 0 : output.simulationMeta) === null || _d === void 0 ? void 0 : _d.testRunEntryId; if (testRunEntryId) { try { await APITestRunService.updateSimulationStatus(testRunEntryId, "FAILED"); } catch (cleanupError) { const msg = `Failed to mark simulation as failed after push error (testRunEntryId: ${testRunEntryId}): ${cleanupError instanceof Error ? cleanupError.message : String(cleanupError)}`; "error" in logger && typeof logger.error === "function" ? logger.error(msg) : logger.info(msg); } } throw pushError; } // 5. log the test run entry with local evaluation results logger.processed(`Ran test run entry ${index + 1}`, { datasetEntry: row.data, output, evaluationResults: localEvaluationResults, }); return; } // Make sure if its local workflow or remote workflow else if (outputFunction || outputFunctionWithTracing || evaluators.filter((e) => typeof e !== "string").length > 0 || (config.simulationConfig && (workflow || promptVersion))) { // Make sure if its local workflow or remote workflow let outputFunctionToExecute; let outputFunctionWithTracingToExecute; if (outputFunction) { outputFunctionToExecute = outputFunction; } else if (outputFunctionWithTracing) { outputFunctionWithTracingToExecute = outputFunctionWithTracing; } else { if (workflow) { if (config.simulationConfig) { const contextToEvaluateForSimulation = contextToEvaluate !== null && contextToEvaluate !== void 0 ? contextToEvaluate : workflow.contextToEvaluate; outputFunctionToExecute = (0, runUtils_1.simulationWorkflowIdOutputFunctionClosure)(testRun.id, workflow.id, workspaceId, scenario, APITestRunService, config.simulationConfig, contextToEvaluateForSimulation, row.id, input, expectedSteps, timeoutInMinutes); } else { outputFunctionToExecute = (0, runUtils_1.workflowIdOutputFunctionClosure)(workflow.id, APITestRunService, workflow.contextToEvaluate); } } else if (promptVersion) { if (config.simulationConfig) { // Use contextToEvaluate from row data, or fallback to promptVersion.contextToEvaluate const contextToEvaluateForSimulation = contextToEvaluate !== null && contextToEvaluate !== void 0 ? contextToEvaluate : promptVersion.contextToEvaluate; outputFunctionToExecute = (0, runUtils_1.simulationPromptVersionIdOutputFunctionClosure)(testRun.id, promptVersion.id, workspaceId, scenario, APITestRunService, config.simulationConfig, contextToEvaluateForSimulation, row.id, input, expectedSteps, timeoutInMinutes); } else { outputFunctionToExecute = (0, runUtils_1.promptVersionIdOutputFunctionClosure)(promptVersion.id, input !== null && input !== void 0 ? input : "", APITestRunService, promptVersion.contextToEvaluate); } } else if (promptChainVersion) { outputFunctionToExecute = (0, runUtils_1.promptChainVersionIdOutputFunctionClosure)(promptChainVersion.id, input !== null && input !== void 0 ? input : "", APITestRunService, promptChainVersion.contextToEvaluate); } else { throw new Error("Found no output function to execute, please make sure you have either `yieldsOutput`, `yieldsOutputWithTracing`, `withPromptVersionId`, `withPromptChainVersionId` or `withWorkflowId` set."); } } // When using simulation endpoints, the backend creates the entry via the simulation API — // do NOT create one separately (matches Python: create_test_run_entry only when simulation_config is None) let testRunEntry; if (!config.simulationConfig) { testRunEntry = await APITestRunService.createTestRunEntry({ testRun: { ...testRun, datasetId, datasetEntryId: row.id }, }); } const traceId = (0, uuid_1.v4)(); if (!config.disableDefaultTraceCreation && internalMaximLogger && testRunEntry) { try { const testRunEntryTrace = internalMaximLogger.trace({ id: traceId, name: `Test Run Entry ${index + 1}`, }); testRunEntryTrace.addTag("testRunEntryId", testRunEntry.id); testRunEntryTrace.addTag("testRunId", testRun.id); } catch (e) { logger.error(`Error creating trace for test run entry ${index + 1}: ${e instanceof Error ? e.message : String(e)}`); } } const output = outputFunctionWithTracingToExecute !== undefined ? await (0, runUtils_1.runOutputFunctionWithTracing)(outputFunctionWithTracingToExecute, row.data, traceId) : await (0, runUtils_1.runOutputFunction)(outputFunctionToExecute, row.data); if (output.retrievedContextToEvaluate) { if (contextToEvaluate) { logger.info(`Detected retrieved context returned from output function for row ${index + 1} that had contextToEvaluate set from the dataset.\nOverriding the contextToEvaluate from dataset with the retrieved context`); } contextToEvaluate = output.retrievedContextToEvaluate; } // 3. run evaluations let localEvaluationResults = undefined; const localEvaluators = evaluators.filter((e) => typeof e !== "string" && "evaluationFunction" in e); if (localEvaluators.length > 0) { localEvaluationResults = await (0, runUtils_1.runLocalEvaluations)(localEvaluators, row.data, output, contextToEvaluate); } // 4. push the test run entry // Use the first evaluator's mangled output if available // 4. Build output for push // Find all platform evaluators with variableMapping const platformEvaluatorsWithMangler = evaluators.filter((e) => typeof e !== "string" && !("evaluationFunction" in e) && "variableMapping" in e && typeof e.variableMapping === "object"); let evaluatorOutputOverrides; evaluatorOutputOverrides = {}; for (const platformEval of platformEvaluatorsWithMangler) { if (!platformEval.variableMapping) continue; const mappingKeysList = Object.keys(platformEval.variableMapping); if (mappingKeysList.length > 0) { const evalConfig = platformEvaluatorsConfig.find((c) => c.name === platformEval.name); if (!evalConfig) continue; const mappingResult = {}; // Resolve persona with priority: dataset column > simulation config let datasetPersona; for (const [key, value] of Object.entries(row.data)) { if (key.toLowerCase() === "persona" && value != null) { const personaStr = String(value).trim(); if (personaStr) { datasetPersona = personaStr; break; } } } let simconfigPersona; if ((_e = config.simulationConfig) === null || _e === void 0 ? void 0 : _e.persona) { if (typeof config.simulationConfig.persona === "string") { simconfigPersona = config.simulationConfig.persona; } else { const val = row.data[config.simulationConfig.persona.payload]; simconfigPersona = val != null ? String(val).trim() || undefined : undefined; } } const persona = (_f = datasetPersona !== null && datasetPersona !== void 0 ? datasetPersona : simconfigPersona) !== null && _f !== void 0 ? _f : ""; const runObj = { input, output: output.data, retrieval: contextToEvaluate, toolCalls: [], scenario, persona, messages: output.messages, ...output, }; for (const key of mappingKeysList) { const mappingFn = platformEval.variableMapping[key]; if (!mappingFn) continue; try { const version = workflow ? { id: workflow.id, type: "workflow", } : promptVersion ? { id: promptVersion.id, type: "prompt", } : promptChainVersion ? { id: promptChainVersion.id, type: "promptChain", } : undefined; mappingResult[key] = mappingFn(runObj, row.data, version); } catch (e) { logger.error(`Error in variable mapping for key "${key}": ${e instanceof Error ? e.message : String(e)}`); } } evaluatorOutputOverrides[evalConfig.id] = mappingResult; } } // For simulation endpoint entries, the backend already ran full simulation + evaluation. // Only push if we have local eval results to send; skip otherwise to avoid double evaluation. const isSimulationEndpointEntry = !!config.simulationConfig && !outputFunction && !outputFunctionWithTracing; const hasLocalEvalResults = localEvaluationResults && localEvaluationResults.length > 0; const shouldPush = !isSimulationEndpointEntry || hasLocalEvalResults; if (shouldPush) { // For simulation endpoint entries with local results: filter evalConfig to only // local evaluators so the V4 push fast path marks COMPLETE without re-queuing // for platform evals (which the simulation worker already ran). const pushTestRun = { ...testRun, datasetId, datasetEntryId: row.id }; if (isSimulationEndpointEntry) { const ec = pushTestRun["evalConfig"]; if (ec && Array.isArray(ec["evals"])) { pushTestRun["evalConfig"] = { ...ec, evals: ec["evals"].filter((e) => e["type"] === "Local"), }; } } try { await APITestRunService.pushTestRunEntry({ testRun: pushTestRun, runConfig: output.meta ? { cost: output.meta.cost, usage: output.meta.usage ? "completionTokens" in output.meta.usage ? { completion_tokens: output.meta.usage.completionTokens, prompt_tokens: output.meta.usage.promptTokens, total_tokens: output.meta.usage.totalTokens, latency: output.meta.usage.latency, } : { latency: output.meta.usage.latency, } : undefined, } : undefined, entry: { id: testRunEntry === null || testRunEntry === void 0 ? void 0 : testRunEntry.id, input, output: output.data, meta: { sdkVariables: evaluatorOutputOverrides && Object.keys(evaluatorOutputOverrides).length > 0 ? Object.entries(evaluatorOutputOverrides).reduce((acc, [id, val]) => { acc[id] = { type: dataset_2.VariableType.JSON, payload: JSON.stringify(val), }; return acc; }, {}) : undefined, connectedTraceId: internalMaximLogger ? traceId : undefined, }, expectedOutput, contextToEvaluate, scenario, expectedSteps, dataEntry: row.data, localEvaluationResults: localEvaluationResults ? localEvaluationResults.map((result) => ({ ...result, id: localEvaluatorNameToIdAndPassFailCriteriaMap.get(result.name).id, })) : undefined, simulationMeta: output.simulationMeta, }, }); } catch (pushError) { const testRunEntryId = (_g = output === null || output === void 0 ? void 0 : output.simulationMeta) === null || _g === void 0 ? void 0 : _g.testRunEntryId; if (testRunEntryId) { try { await APITestRunService.updateSimulationStatus(testRunEntryId, "FAILED"); } catch (cleanupError) { const msg = `Failed to mark simulation as failed after push error (testRunEntryId: ${testRunEntryId}): ${cleanupError instanceof Error ? cleanupError.message : String(cleanupError)}`; "error" in logger && typeof logger.error === "function" ? logger.error(msg) : logger.info(msg); } } throw pushError; } } // 5. log the test run entry with local evaluation results logger.processed(`Ran test run entry ${index + 1}`, { datasetEntry: row.data, output, evaluationResults: localEvaluationResults, }); return; } // Else we will be just pushing back the dataset entry from the SDK side await APITestRunService.pushTestRunEntry({ testRun: { ...testRun, datasetId, datasetEntryId: row.id }, entry: { input, expectedOutput, contextToEvaluate: (workflow === null || workflow === void 0 ? void 0 : workflow.contextToEvaluate) ? workflow.contextToEvaluate : (promptVersion === null || promptVersion === void 0 ? void 0 : promptVersion.contextToEvaluate) ? promptVersion.contextToEvaluate : (promptChainVersion === null || promptChainVersion === void 0 ? void 0 : promptChainVersion.contextToEvaluate) ? promptChainVersion.contextToEvaluate : typeof mappingKeys.contextToEvaluate === "string" ? mappingKeys.contextToEvaluate : undefined, scenario, expectedSteps, dataEntry: row.data, }, }); logger.processed(`Ran test run entry ${index + 1}`, { datasetEntry: row.data, }); } // ===== Test Run Starts ===== try { logger.info(`Creating test run "${name}"...`); // ===== Create Test Run ===== // create eval config (needed for local evals) const evalConfig = [ ...platformEvaluatorsConfig, ...Array.from(localEvaluatorNameToIdAndPassFailCriteriaMap.entries()).map(([name, value]) => ({ id: value.id, name, type: "Local", builtin: false, reversed: undefined, config: { passFailCriteria: { entryLevel: { value: typeof value.passFailCriteria.onEachEntry.value === "boolean" ? value.passFailCriteria.onEachEntry.value ? "Yes" : "No" : value.passFailCriteria.onEachEntry.value, operator: value.passFailCriteria.onEachEntry.scoreShouldBe, name: "score", }, runLevel: { value: value.passFailCriteria.forTestrunOverall.value, operator: value.passFailCriteria.forTestrunOverall.overallShouldBe, name: value.passFailCriteria.forTestrunOverall.for === "average" ? "meanScore" : "queriesPassed", }, }, }, })), ]; if (config.maximLogger) { internalMaximLogger = config.maximLogger; } const tagsEnrichedWithRepoId = config.maximLogger ? [...(tags !== null && tags !== void 0 ? tags : []), `repoId:${config.maximLogger.id}`] : tags; const hasLocalEvaluators = evaluators.filter((e) => typeof e !== "string" && "evaluationFunction" in e).length > 0; const requiresLocalRun = hasLocalEvaluators || !!config.outputFunction || !!config.outputFunctionWithTracing; const testRun = await APITestRunService.createTestRun(name, workspaceId, "SINGLE", evalConfig, requiresLocalRun, workflow === null || workflow === void 0 ? void 0 : workflow.id, promptVersion === null || promptVersion === void 0 ? void 0 : promptVersion.id, promptChainVersion === null || promptChainVersion === void 0 ? void 0 : promptChainVersion.id, humanEvaluationConfig, tagsEnrichedWithRepoId, config.simulationConfig, internalMaximLogger === null || internalMaximLogger === void 0 ? void 0 : internalMaximLogger.id); try { // ===== Create Semaphore ===== const semaphore = semaphore_1.Semaphore.get(`${workspaceId}:${name}:${testRun.id}`, concurrency); if (data) { if (dataStructure) { const inputKey = (0, utils_1.getAllKeysByValue)(dataStructure, "INPUT")[0]; const expectedOutputKey = (0, utils_1.getAllKeysByValue)(dataStructure, "EXPECTED_OUTPUT")[0]; const contextToEvaluateKey = (0, utils_1.getAllKeysByValue)(dataStructure, "CONTEXT_TO_EVALUATE")[0]; const scenarioKey = (0, utils_1.getAllKeysByValue)(dataStructure, "SCENARIO")[0]; const expectedStepsKey = (0, utils_1.getAllKeysByValue)(dataStructure, "EXPECTED_STEPS")[0]; if (typeof data === "string") { const APIDatasetService = new dataset_1.MaximDatasetAPI(config.baseUrl, config.apiKey, config.isDebug); logger.info(`Fetching dataset "${data}" from platform...`); const platformDataStructure = await APIDatasetService.getDatasetDatastructure(data); (0, dataset_3.validateDataStructure)(dataStructure, platformDataStructure); await APITestRunService.attachDatasetToTestRun(testRun.id, data); // ===== Platform Dataset Processor ===== async function processDatasetEntry(index, datasetId) { try { // 1. acquire semaphore await semaphore.acquire(); // 2. process the entry await processEntry(testRun, index, { input: inputKey, expectedOutput: expectedOutputKey, contextToEvaluate: contextToEvaluateKey, scenario: scenarioKey, expectedSteps: expectedStepsKey, }, async (index) => { return (await APIDatasetService.getDatasetRow(datasetId, index)); }, datasetId); } catch (err) { // 3. handle error (if any) logger.error((0, utils_2.buildErrorMessage)(new Error(`Error while running data entry at index [${index}]`, { cause: err, }))); failedEntryIndices.push(index); } finally { // 4. release semaphore semaphore.release(); } } // 1. get length of dataset const totalRows = await APIDatasetService.getDatasetTotalRows(data); // 2. process each row in parallel const dataEntryPromises = []; for (let i = 0; i < totalRows; i++) { dataEntryPromises.push(processDatasetEntry(i, data)); } // 3. wait for all promises to resolve await Promise.all(dataEntryPromises); } else if (data instanceof csvParser_1.CSVFile) { const columnStructure = {}; Object.keys(dataStructure).forEach((key, index) => { columnStructure[key] = index; }); const csv = await csvParser_1.CSVFile.restructure(data, columnStructure); // ===== CSV Dataset Processor ===== async function processCSVEntry(index) { try { // 1. acquire semaphore await semaphore.acquire(); // 2. process the entry await processEntry(testRun, index, { input: inputKey, expectedOutput: expectedOutputKey, contextToEvaluate: contextToEvaluateKey, scenario: scenarioKey, expectedSteps: expectedStepsKey, }, async (index) => { return { data: (await csv.getRow(index)) }; }); } catch (err) { // 3. handle error (if any) logger.error((0, utils_2.buildErrorMessage)(new Error(`Error while running data entry at index [${index}]`, { cause: err, }))); failedEntryIndices.push(index); } finally { // 4. release semaphore semaphore.release(); } } // 1. get length of dataset const totalRows = await csv.getRowCount(); // 2. process each row in parallel const dataEntryPromises = []; for (let i = 0; i < totalRows; i++) { dataEntryPromises.push(processCSVEntry(i)); } // 3. wait for all promises to resolve await Promise.all(dataEntryPromises); } else if (Array.isArray(data)) { // ===== Manual Array Dataset Processor ===== async function processDataEntry(index, getRow) { try { // 1. acquire semaphore await semaphore.acquire(); // 2. process the entry await processEntry(testRun, index, { input: inputKey, expectedOutput: expectedOutputKey, contextToEvaluate: contextToEvaluateKey, scenario: scenarioKey, expectedSteps: expectedStepsKey, }, getRow); } catch (err) { // 3. handle error (if any) logger.error((0, utils_2.buildErrorMessage)(new Error(`Error while running data entry at index [${index}]`, { cause: err, }))); failedEntryIndices.push(index); } finally { // 4. release semaphore semaphore.release(); } } // 1. get length of dataset const totalRows = data.length; // 2. process each row in parallel const dataEntryPromises = []; for (let i = 0; i < totalRows; i++) { dataEntryPromises.push(processDataEntry(i, (index) => ({ data: data[index], }))); } // 3. wait for all promises to resolve await Promise.all(dataEntryPromises); } else if (typeof data === "function") { // ===== Manual Function Dataset Processor ===== async function processDataEntry(mainIndex, index, getRow) { try { // 1. acquire semaphore await semaphore.acquire(); // 2. process the entry await processEntry(testRun, index, { input: inputKey, expectedOutput: expectedOutputKey, contextToEvaluate: contextToEvaluateKey, scenario: scenarioKey, expectedSteps: expectedStepsKey, }, getRow); } catch (err) { // 3. handle error (if any) logger.error((0, utils_2.buildErrorMessage)(new Error(`Error while running data entry at index [${mainIndex}]`, { cause: err, }))); failedEntryIndices.push(mainIndex); } finally {