@maximai/maxim-js
Version:
Maxim AI JS SDK. Visit https://getmaxim.ai for more info.
791 lines (790 loc) • 59.4 kB
JavaScript
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.createTestRunBuilder = void 0;
const dataset_1 = require("../apis/dataset");
const evaluator_1 = require("../apis/evaluator");
const testRun_1 = require("../apis/testRun");
const dataset_2 = require("../models/dataset");
const csvParser_1 = require("../utils/csvParser");
const semaphore_1 = require("../utils/semaphore");
const defaultLogger_1 = require("./defaultLogger");
const dataset_3 = require("../dataset/dataset");
const utils_1 = require("../utils/utils");
const runUtils_1 = require("./runUtils");
const sanitizationUtils_1 = require("./sanitizationUtils");
const utils_2 = require("./utils");
const uuid_1 = require("uuid");
/**
* Creates a new TestRunBuilder with the given configuration.
* @param config The configuration for the TestRunBuilder.
* @returns A TestRunBuilder with the given configuration.
*/
const createTestRunBuilder = (config) => ({
withDataStructure: (dataStructure) => {
(0, dataset_3.sanitizeDataStructure)(dataStructure);
return (0, exports.createTestRunBuilder)({ ...config, dataStructure });
},
withData: (data) => {
(0, sanitizationUtils_1.sanitizeData)(data, config.dataStructure);
return (0, exports.createTestRunBuilder)({ ...config, data });
},
withEvaluators: (...evaluators) => {
(0, sanitizationUtils_1.sanitizeEvaluators)(evaluators);
return (0, exports.createTestRunBuilder)({ ...config, evaluators: [...evaluators] });
},
withHumanEvaluationConfig: (humanEvaluationConfig) => {
const emailRegex = /^(?!\.)(?!.*\.\.)([A-Z0-9_'+\-\.]*)[A-Z0-9_+-]@([A-Z0-9][A-Z0-9\-]*\.)+[A-Z]{2,}$/i;
humanEvaluationConfig.emails.forEach((email) => {
if (!emailRegex.test(email)) {
throw new Error(`Invalid email address: ${email}`);
}
});
return (0, exports.createTestRunBuilder)({ ...config, humanEvaluationConfig });
},
withPromptVersionId: (id, contextToEvaluate) => (0, exports.createTestRunBuilder)({ ...config, promptVersion: { id, contextToEvaluate } }),
withPromptChainVersionId: (id, contextToEvaluate) => (0, exports.createTestRunBuilder)({ ...config, promptChainVersion: { id, contextToEvaluate } }),
withWorkflowId: (id, contextToEvaluate) => (0, exports.createTestRunBuilder)({ ...config, workflow: { id, contextToEvaluate } }),
withSimulationConfig: (simulationConfig) => (0, exports.createTestRunBuilder)({ ...config, simulationConfig }),
yieldsOutput: (outputFunction) => (0, exports.createTestRunBuilder)({ ...config, outputFunction }),
yieldsOutputWithTracing: (outputFunctionWithTracing, maximLogger, disableDefaultTraceCreation) => (0, exports.createTestRunBuilder)({ ...config, outputFunctionWithTracing, maximLogger, disableDefaultTraceCreation: disableDefaultTraceCreation !== null && disableDefaultTraceCreation !== void 0 ? disableDefaultTraceCreation : false }),
withLogger: (logger) => (0, exports.createTestRunBuilder)({ ...config, logger }),
getConfig: () => config,
withConcurrency: (concurrency) => (0, exports.createTestRunBuilder)({ ...config, concurrency }),
withTags: (tags) => (0, exports.createTestRunBuilder)({ ...config, tags }),
run: async (timeoutInMinutes = 15) => {
var _a, _b;
let errors = [];
const logger = (_a = config.logger) !== null && _a !== void 0 ? _a : new defaultLogger_1.DefaultLogger();
// ===== Sanitization =====
logger.info("Running sanitization checks...");
if (!config.name) {
errors.push("Name is required to run a test.");
}
if (!config.workspaceId) {
errors.push("Workspace Id is required to run a test.");
}
if (!config.outputFunction &&
!config.outputFunctionWithTracing &&
!config.promptVersion &&
!config.promptChainVersion &&
!config.workflow) {
errors.push("Output function or prompt version id, prompt chain version id, or workflow id is required to run a test. You can use either yieldsOutput, yieldsOutputWithTracing, withPromptVersionId, withPromptChainVersionId or withWorkflowId to set them respectively.");
}
const hasOutputFunction = !!config.outputFunction;
const hasPromptVersion = !!config.promptVersion;
const hasPromptChainVersion = !!config.promptChainVersion;
const hasWorkflow = !!config.workflow;
const hasOutputFunctionWithTracing = !!config.outputFunctionWithTracing;
const outputSourceCount = (hasOutputFunction ? 1 : 0) + (hasOutputFunctionWithTracing ? 1 : 0) + (hasPromptVersion ? 1 : 0) + (hasPromptChainVersion ? 1 : 0) + (hasWorkflow ? 1 : 0);
// Simulation + yieldsOutput: local-execution mode (no prompt/workflow ID required)
if (config.simulationConfig && hasOutputFunction) {
if (hasPromptChainVersion) {
errors.push("Simulation config with yieldsOutput cannot use withPromptChainVersionId. Use withPromptVersionId or withWorkflowId.");
}
if (hasPromptVersion && hasWorkflow) {
errors.push("Simulation config with yieldsOutput cannot use both withPromptVersionId and withWorkflowId. Set exactly one.");
}
}
else if (outputSourceCount !== 1) {
errors.push("Exactly one of outputFunction, promptVersionId, promptChainVersionId, or workflowId must be set.");
}
if (!config.data) {
errors.push("Data or dataset id is required to run a test.");
}
if (config.simulationConfig) {
if (config.outputFunctionWithTracing) {
errors.push("Simulation config cannot be used with yieldsOutputWithTracing. Use yieldsOutput instead.");
}
if (config.promptChainVersion) {
errors.push("Simulation config cannot be used with withPromptChainVersionId. Use withWorkflowId, withPromptVersionId, or yieldsOutput instead.");
}
if (!config.workflow && !config.promptVersion && !config.outputFunction) {
errors.push("Simulation config requires either withWorkflowId, withPromptVersionId, or yieldsOutput to be set.");
}
if (config.simulationConfig.responseFields && config.simulationConfig.responseFields.length > 0 && !config.workflow) {
errors.push("responseFields in simulationConfig can only be used with withWorkflowId, not with withPromptVersionId, yieldsOutput or yieldsOutputWithTracing.");
}
}
if (errors.length > 0) {
    // The serialized config is attached as the error's `cause`, and errors are commonly
    // printed or logged together with their cause. Redact the API key so a misconfigured
    // run never writes the secret out, and fall back to a placeholder when the config
    // cannot be serialized (JSON.stringify throws on e.g. circular structures or BigInt),
    // so the validation message below is never replaced by a serialization TypeError.
    let serializedConfig;
    try {
        serializedConfig = JSON.stringify({ config }, (key, value) => (key === "apiKey" && typeof value === "string" ? "[REDACTED]" : value), 2);
    }
    catch (serializationError) {
        serializedConfig = `[config could not be serialized: ${serializationError instanceof Error ? serializationError.message : String(serializationError)}]`;
    }
    throw new Error(`Missing required configuration for test run ${config.name ? ` "${config.name}"` : ""}:\n\t${errors.join(", \n\t")}`, {
        cause: serializedConfig,
    });
}
(0, dataset_3.sanitizeDataStructure)(config.dataStructure);
(0, sanitizationUtils_1.sanitizeData)(config.data, config.dataStructure);
(0, sanitizationUtils_1.sanitizeEvaluators)(config.evaluators);
const APIEvaluatorService = new evaluator_1.MaximEvaluatorAPI(config.baseUrl, config.apiKey, config.isDebug);
const platformEvaluatorsConfig = await Promise.all(config.evaluators
.filter((e) => typeof e === "string" || (typeof e === "object" && !("evaluationFunction" in e)))
.map(async (e) => {
const evaluatorName = typeof e === "string" ? e : e.name;
const evaluatorConfig = await APIEvaluatorService.fetchPlatformEvaluator(evaluatorName, config.workspaceId);
return evaluatorConfig;
}));
if (platformEvaluatorsConfig.some((e) => e.type === "Human")) {
if (!config.humanEvaluationConfig) {
throw new Error("Human evaluator found in evaluators, but no human evaluation config was provided.");
}
}
// ===== Extracting Variables =====
const dataStructure = config.dataStructure;
const concurrency = (_b = config.concurrency) !== null && _b !== void 0 ? _b : 10;
const name = config.name;
const workspaceId = config.workspaceId;
const data = config.data;
const testConfigId = config.testConfigId;
const evaluators = config.evaluators;
const humanEvaluationConfig = config.humanEvaluationConfig;
const outputFunction = config.outputFunction;
const outputFunctionWithTracing = config.outputFunctionWithTracing;
const promptVersion = config.promptVersion;
const promptChainVersion = config.promptChainVersion;
const workflow = config.workflow;
const tags = config.tags;
const failedEntryIndices = [];
let internalMaximLogger = undefined;
const localEvaluatorNameToIdAndPassFailCriteriaMap = (0, utils_2.getLocalEvaluatorNameToIdAndPassFailCriteriaMap)(evaluators.filter((e) => typeof e !== "string" && "evaluationFunction" in e));
const APITestRunService = new testRun_1.MaximTestRunAPI(config.baseUrl, config.apiKey, config.isDebug);
// ===== Common Processor =====
/**
 * Processes a single dataset row for the given test run: reads the mapped columns,
 * obtains an output when an output source is configured, runs local evaluators and
 * pushes the resulting entry through APITestRunService.
 *
 * Three mutually exclusive paths are checked in this order:
 *  1. simulationConfig + outputFunction: runs the closure built by
 *     simulationYieldsOutputFunctionClosure and pushes with `localSimulation: true`.
 *  2. outputFunction, outputFunctionWithTracing, any non-string evaluator, or
 *     simulationConfig with a workflow / prompt version: runs the selected output
 *     function and pushes the entry unless `shouldPush` evaluates to false.
 *  3. Otherwise: pushes only the dataset entry fields (no output is produced here).
 *
 * @param testRun Test run object; `testRun.id` is read and the object is spread into push payloads.
 * @param index Zero-based row index; passed to getRow and shown as `index + 1` in log messages.
 * @param mappingKeys Column names for input, expectedOutput, contextToEvaluate, scenario and expectedSteps; each may be undefined.
 * @param getRow Returns (possibly asynchronously) the row for an index; `row.data` and `row.id` are read.
 * @param datasetId Optional dataset id attached to the pushed entries.
 * @throws Error when getRow yields a falsy row, when path 2 finds no output source, or when an
 *         awaited call rejects (push errors are rethrown after trying to mark the simulation as FAILED).
 */
async function processEntry(testRun, index, mappingKeys, getRow, datasetId) {
// Scratch variables used by the down-levelled nullish / optional-access expressions below.
var _a, _b, _c, _d, _e, _f, _g;
// 1. fetch row
const row = await getRow(index);
// if row is not found, return
if (!row) {
throw new Error(`No row found at index ${index}`);
}
// Mapped cells are stringified; a missing mapping key or a falsy cell value yields undefined.
const input = mappingKeys.input ? (row.data[mappingKeys.input] ? String(row.data[mappingKeys.input]) : undefined) : undefined;
const expectedOutput = mappingKeys.expectedOutput
? row.data[mappingKeys.expectedOutput]
? String(row.data[mappingKeys.expectedOutput])
: undefined
: undefined;
// contextToEvaluate is kept as-is (not stringified); only null is normalized to undefined.
let contextToEvaluate = (mappingKeys.contextToEvaluate
? row.data[mappingKeys.contextToEvaluate] === null
? undefined
: row.data[mappingKeys.contextToEvaluate]
: undefined);
const scenario = mappingKeys.scenario
? row.data[mappingKeys.scenario]
? String(row.data[mappingKeys.scenario])
: undefined
: undefined;
const expectedSteps = mappingKeys.expectedSteps
? row.data[mappingKeys.expectedSteps]
? String(row.data[mappingKeys.expectedSteps])
: undefined
: undefined;
// 2. get the output
// Path 1: simulation driven by the user-supplied output function.
if (config.simulationConfig && outputFunction) {
// Precedence: row value, then promptVersion.contextToEvaluate, then workflow.contextToEvaluate.
let contextToEvaluateForSimulation = (_a = contextToEvaluate !== null && contextToEvaluate !== void 0 ? contextToEvaluate : promptVersion === null || promptVersion === void 0 ? void 0 : promptVersion.contextToEvaluate) !== null && _a !== void 0 ? _a : workflow === null || workflow === void 0 ? void 0 : workflow.contextToEvaluate;
// Build the simulation closure
const outputFunctionToExecute = (0, runUtils_1.simulationYieldsOutputFunctionClosure)(testRun.id, workspaceId, config.simulationConfig, outputFunction, APITestRunService, row.id, input, scenario, expectedSteps, contextToEvaluateForSimulation, timeoutInMinutes, logger);
// Execute the simulation
const output = await (0, runUtils_1.runOutputFunction)(outputFunctionToExecute, row.data);
// Context returned by the output function replaces the one resolved above.
if (output.retrievedContextToEvaluate) {
if (contextToEvaluateForSimulation) {
logger.info(`Detected retrieved context returned from output function for row ${index + 1} that had contextToEvaluate set from the dataset.\nOverriding the contextToEvaluate from dataset with the retrieved context`);
}
contextToEvaluateForSimulation = output.retrievedContextToEvaluate;
}
// 3. run evaluations
let localEvaluationResults = undefined;
const localEvaluators = evaluators.filter((e) => typeof e !== "string" && "evaluationFunction" in e);
if (localEvaluators.length > 0) {
localEvaluationResults = await (0, runUtils_1.runLocalEvaluations)(localEvaluators, row.data, output, contextToEvaluateForSimulation);
}
// 4. Build output for push
// Find all platform evaluators with variableMapping
const platformEvaluatorsWithMangler = evaluators.filter((e) => typeof e !== "string" && !("evaluationFunction" in e) && "variableMapping" in e && typeof e.variableMapping === "object");
// Collected mapping results, keyed by the platform evaluator's id.
let evaluatorOutputOverrides;
evaluatorOutputOverrides = {};
for (const platformEval of platformEvaluatorsWithMangler) {
if (!platformEval.variableMapping)
continue;
const mappingKeysList = Object.keys(platformEval.variableMapping);
if (mappingKeysList.length > 0) {
// Match against the platform evaluator configs fetched earlier; skip when none has this name.
const evalConfig = platformEvaluatorsConfig.find((c) => c.name === platformEval.name);
if (!evalConfig)
continue;
const mappingResult = {};
// Resolve persona with priority: dataset column > simulation config
// The dataset column is matched case-insensitively on the key "persona"; blank values are ignored.
let datasetPersona;
for (const [key, value] of Object.entries(row.data)) {
if (key.toLowerCase() === "persona" && value != null) {
const personaStr = String(value).trim();
if (personaStr) {
datasetPersona = personaStr;
break;
}
}
}
let simconfigPersona;
if (((_b = config.simulationConfig) === null || _b === void 0 ? void 0 : _b.persona) && !datasetPersona) {
if (typeof config.simulationConfig.persona === "string") {
simconfigPersona = config.simulationConfig.persona;
}
else {
// Non-string persona: `persona.payload` is used as a column name into the row.
const val = row.data[config.simulationConfig.persona.payload];
simconfigPersona = val != null ? String(val).trim() || undefined : undefined;
}
}
const persona = (_c = datasetPersona !== null && datasetPersona !== void 0 ? datasetPersona : simconfigPersona) !== null && _c !== void 0 ? _c : "";
// `...output` is spread last, so same-named keys on `output` override the fields listed before it.
const runObj = {
input,
output: output.data,
retrieval: contextToEvaluateForSimulation,
toolCalls: [],
scenario,
persona,
messages: output.messages,
...output,
};
for (const key of mappingKeysList) {
const mappingFn = platformEval.variableMapping[key];
if (!mappingFn)
continue;
// A throwing mapping function is logged and its key is left out of the result.
try {
const version = workflow
? {
id: workflow.id,
type: "workflow",
}
: promptVersion
? {
id: promptVersion.id,
type: "prompt",
}
: undefined;
mappingResult[key] = mappingFn(runObj, row.data, version);
}
catch (e) {
logger.error(`Error in variable mapping for key "${key}": ${e instanceof Error ? e.message : String(e)}`);
}
}
evaluatorOutputOverrides[evalConfig.id] = mappingResult;
}
}
// Simulation: cost/usage in simulationMeta, no runConfig
try {
await APITestRunService.pushTestRunEntry({
testRun: { ...testRun, datasetId, datasetEntryId: row.id },
entry: {
input,
output: output.data,
meta: {
// sdkVariables is only sent when at least one override exists; each value is JSON-serialized.
sdkVariables: evaluatorOutputOverrides && Object.keys(evaluatorOutputOverrides).length > 0
? Object.entries(evaluatorOutputOverrides).reduce((acc, [id, val]) => {
acc[id] = {
type: dataset_2.VariableType.JSON,
payload: JSON.stringify(val),
};
return acc;
}, {})
: undefined,
},
expectedOutput,
contextToEvaluate: contextToEvaluateForSimulation,
scenario,
expectedSteps,
dataEntry: row.data,
// NOTE(review): `.get(result.name)` is dereferenced without a check; this assumes every
// local result name has an entry in the map - TODO confirm against runLocalEvaluations.
localEvaluationResults: localEvaluationResults
? localEvaluationResults.map((result) => ({
...result,
id: localEvaluatorNameToIdAndPassFailCriteriaMap.get(result.name).id,
}))
: undefined,
simulationMeta: output.simulationMeta,
},
localSimulation: true,
});
}
catch (pushError) {
// Best effort: mark the simulation entry as FAILED (when its id is known), then rethrow the push error.
const testRunEntryId = (_d = output === null || output === void 0 ? void 0 : output.simulationMeta) === null || _d === void 0 ? void 0 : _d.testRunEntryId;
if (testRunEntryId) {
try {
await APITestRunService.updateSimulationStatus(testRunEntryId, "FAILED");
}
catch (cleanupError) {
const msg = `Failed to mark simulation as failed after push error (testRunEntryId: ${testRunEntryId}): ${cleanupError instanceof Error ? cleanupError.message : String(cleanupError)}`;
// Falls back to logger.info when the logger has no callable `error`.
"error" in logger && typeof logger.error === "function" ? logger.error(msg) : logger.info(msg);
}
}
throw pushError;
}
// 5. log the test run entry with local evaluation results
logger.processed(`Ran test run entry ${index + 1}`, {
datasetEntry: row.data,
output,
evaluationResults: localEvaluationResults,
});
return;
} // Path 2: the SDK obtains the output itself before pushing the entry
else if (outputFunction || outputFunctionWithTracing || evaluators.filter((e) => typeof e !== "string").length > 0 || (config.simulationConfig && (workflow || promptVersion))) {
// Select the function that will produce the output for this row
let outputFunctionToExecute;
let outputFunctionWithTracingToExecute;
if (outputFunction) {
outputFunctionToExecute = outputFunction;
}
else if (outputFunctionWithTracing) {
outputFunctionWithTracingToExecute = outputFunctionWithTracing;
}
else {
if (workflow) {
if (config.simulationConfig) {
// Use contextToEvaluate from row data, or fall back to workflow.contextToEvaluate
const contextToEvaluateForSimulation = contextToEvaluate !== null && contextToEvaluate !== void 0 ? contextToEvaluate : workflow.contextToEvaluate;
outputFunctionToExecute = (0, runUtils_1.simulationWorkflowIdOutputFunctionClosure)(testRun.id, workflow.id, workspaceId, scenario, APITestRunService, config.simulationConfig, contextToEvaluateForSimulation, row.id, input, expectedSteps, timeoutInMinutes);
}
else {
outputFunctionToExecute = (0, runUtils_1.workflowIdOutputFunctionClosure)(workflow.id, APITestRunService, workflow.contextToEvaluate);
}
}
else if (promptVersion) {
if (config.simulationConfig) {
// Use contextToEvaluate from row data, or fallback to promptVersion.contextToEvaluate
const contextToEvaluateForSimulation = contextToEvaluate !== null && contextToEvaluate !== void 0 ? contextToEvaluate : promptVersion.contextToEvaluate;
outputFunctionToExecute = (0, runUtils_1.simulationPromptVersionIdOutputFunctionClosure)(testRun.id, promptVersion.id, workspaceId, scenario, APITestRunService, config.simulationConfig, contextToEvaluateForSimulation, row.id, input, expectedSteps, timeoutInMinutes);
}
else {
outputFunctionToExecute = (0, runUtils_1.promptVersionIdOutputFunctionClosure)(promptVersion.id, input !== null && input !== void 0 ? input : "", APITestRunService, promptVersion.contextToEvaluate);
}
}
else if (promptChainVersion) {
outputFunctionToExecute = (0, runUtils_1.promptChainVersionIdOutputFunctionClosure)(promptChainVersion.id, input !== null && input !== void 0 ? input : "", APITestRunService, promptChainVersion.contextToEvaluate);
}
else {
throw new Error("Found no output function to execute, please make sure you have either `yieldsOutput`, `yieldsOutputWithTracing`, `withPromptVersionId`, `withPromptChainVersionId` or `withWorkflowId` set.");
}
}
// When using simulation endpoints, the backend creates the entry via the simulation API —
// do NOT create one separately (matches Python: create_test_run_entry only when simulation_config is None)
let testRunEntry;
if (!config.simulationConfig) {
testRunEntry = await APITestRunService.createTestRunEntry({
testRun: { ...testRun, datasetId, datasetEntryId: row.id },
});
}
// A trace id is generated for every entry, whether or not a default trace is created below.
const traceId = (0, uuid_1.v4)();
if (!config.disableDefaultTraceCreation && internalMaximLogger && testRunEntry) {
// Trace creation is best effort: failures are logged and processing continues.
try {
const testRunEntryTrace = internalMaximLogger.trace({
id: traceId,
name: `Test Run Entry ${index + 1}`,
});
testRunEntryTrace.addTag("testRunEntryId", testRunEntry.id);
testRunEntryTrace.addTag("testRunId", testRun.id);
}
catch (e) {
logger.error(`Error creating trace for test run entry ${index + 1}: ${e instanceof Error ? e.message : String(e)}`);
}
}
// The tracing variant additionally receives the trace id.
const output = outputFunctionWithTracingToExecute !== undefined
? await (0, runUtils_1.runOutputFunctionWithTracing)(outputFunctionWithTracingToExecute, row.data, traceId)
: await (0, runUtils_1.runOutputFunction)(outputFunctionToExecute, row.data);
// Context returned by the output function replaces the one read from the dataset row.
if (output.retrievedContextToEvaluate) {
if (contextToEvaluate) {
logger.info(`Detected retrieved context returned from output function for row ${index + 1} that had contextToEvaluate set from the dataset.\nOverriding the contextToEvaluate from dataset with the retrieved context`);
}
contextToEvaluate = output.retrievedContextToEvaluate;
}
// 3. run evaluations
let localEvaluationResults = undefined;
const localEvaluators = evaluators.filter((e) => typeof e !== "string" && "evaluationFunction" in e);
if (localEvaluators.length > 0) {
localEvaluationResults = await (0, runUtils_1.runLocalEvaluations)(localEvaluators, row.data, output, contextToEvaluate);
}
// 4. Build the evaluator variable overrides that accompany the push
// Find all platform evaluators with variableMapping
const platformEvaluatorsWithMangler = evaluators.filter((e) => typeof e !== "string" && !("evaluationFunction" in e) && "variableMapping" in e && typeof e.variableMapping === "object");
// Collected mapping results, keyed by the platform evaluator's id.
let evaluatorOutputOverrides;
evaluatorOutputOverrides = {};
for (const platformEval of platformEvaluatorsWithMangler) {
if (!platformEval.variableMapping)
continue;
const mappingKeysList = Object.keys(platformEval.variableMapping);
if (mappingKeysList.length > 0) {
// Match against the platform evaluator configs fetched earlier; skip when none has this name.
const evalConfig = platformEvaluatorsConfig.find((c) => c.name === platformEval.name);
if (!evalConfig)
continue;
const mappingResult = {};
// Resolve persona with priority: dataset column > simulation config
let datasetPersona;
for (const [key, value] of Object.entries(row.data)) {
if (key.toLowerCase() === "persona" && value != null) {
const personaStr = String(value).trim();
if (personaStr) {
datasetPersona = personaStr;
break;
}
}
}
// Unlike path 1 there is no `!datasetPersona` guard here; the dataset persona still wins
// because it is checked first when `persona` is computed below.
let simconfigPersona;
if ((_e = config.simulationConfig) === null || _e === void 0 ? void 0 : _e.persona) {
if (typeof config.simulationConfig.persona === "string") {
simconfigPersona = config.simulationConfig.persona;
}
else {
const val = row.data[config.simulationConfig.persona.payload];
simconfigPersona = val != null ? String(val).trim() || undefined : undefined;
}
}
const persona = (_f = datasetPersona !== null && datasetPersona !== void 0 ? datasetPersona : simconfigPersona) !== null && _f !== void 0 ? _f : "";
// `...output` is spread last, so same-named keys on `output` override the fields listed before it.
const runObj = {
input,
output: output.data,
retrieval: contextToEvaluate,
toolCalls: [],
scenario,
persona,
messages: output.messages,
...output,
};
for (const key of mappingKeysList) {
const mappingFn = platformEval.variableMapping[key];
if (!mappingFn)
continue;
// A throwing mapping function is logged and its key is left out of the result.
try {
const version = workflow
? {
id: workflow.id,
type: "workflow",
}
: promptVersion
? {
id: promptVersion.id,
type: "prompt",
}
: promptChainVersion
? {
id: promptChainVersion.id,
type: "promptChain",
}
: undefined;
mappingResult[key] = mappingFn(runObj, row.data, version);
}
catch (e) {
logger.error(`Error in variable mapping for key "${key}": ${e instanceof Error ? e.message : String(e)}`);
}
}
evaluatorOutputOverrides[evalConfig.id] = mappingResult;
}
}
// For simulation endpoint entries, the backend already ran full simulation + evaluation.
// Only push if we have local eval results to send; skip otherwise to avoid double evaluation.
const isSimulationEndpointEntry = !!config.simulationConfig && !outputFunction && !outputFunctionWithTracing;
const hasLocalEvalResults = localEvaluationResults && localEvaluationResults.length > 0;
const shouldPush = !isSimulationEndpointEntry || hasLocalEvalResults;
if (shouldPush) {
// For simulation endpoint entries with local results: filter evalConfig to only
// local evaluators so the V4 push fast path marks COMPLETE without re-queuing
// for platform evals (which the simulation worker already ran).
const pushTestRun = { ...testRun, datasetId, datasetEntryId: row.id };
if (isSimulationEndpointEntry) {
const ec = pushTestRun["evalConfig"];
if (ec && Array.isArray(ec["evals"])) {
pushTestRun["evalConfig"] = {
...ec,
evals: ec["evals"].filter((e) => e["type"] === "Local"),
};
}
}
try {
await APITestRunService.pushTestRunEntry({
testRun: pushTestRun,
// runConfig is only sent when the output carries `meta`; token counts are renamed to
// snake_case when `completionTokens` is present, otherwise only latency is forwarded.
runConfig: output.meta
? {
cost: output.meta.cost,
usage: output.meta.usage
? "completionTokens" in output.meta.usage
? {
completion_tokens: output.meta.usage.completionTokens,
prompt_tokens: output.meta.usage.promptTokens,
total_tokens: output.meta.usage.totalTokens,
latency: output.meta.usage.latency,
}
: {
latency: output.meta.usage.latency,
}
: undefined,
}
: undefined,
entry: {
// Undefined when no entry was created above (i.e. when simulationConfig is set).
id: testRunEntry === null || testRunEntry === void 0 ? void 0 : testRunEntry.id,
input,
output: output.data,
meta: {
// sdkVariables is only sent when at least one override exists; each value is JSON-serialized.
sdkVariables: evaluatorOutputOverrides && Object.keys(evaluatorOutputOverrides).length > 0
? Object.entries(evaluatorOutputOverrides).reduce((acc, [id, val]) => {
acc[id] = {
type: dataset_2.VariableType.JSON,
payload: JSON.stringify(val),
};
return acc;
}, {})
: undefined,
connectedTraceId: internalMaximLogger ? traceId : undefined,
},
expectedOutput,
contextToEvaluate,
scenario,
expectedSteps,
dataEntry: row.data,
// NOTE(review): `.get(result.name)` is dereferenced without a check; this assumes every
// local result name has an entry in the map - TODO confirm against runLocalEvaluations.
localEvaluationResults: localEvaluationResults
? localEvaluationResults.map((result) => ({
...result,
id: localEvaluatorNameToIdAndPassFailCriteriaMap.get(result.name).id,
}))
: undefined,
simulationMeta: output.simulationMeta,
},
});
}
catch (pushError) {
// Best effort: mark the simulation entry as FAILED (when its id is known), then rethrow the push error.
const testRunEntryId = (_g = output === null || output === void 0 ? void 0 : output.simulationMeta) === null || _g === void 0 ? void 0 : _g.testRunEntryId;
if (testRunEntryId) {
try {
await APITestRunService.updateSimulationStatus(testRunEntryId, "FAILED");
}
catch (cleanupError) {
const msg = `Failed to mark simulation as failed after push error (testRunEntryId: ${testRunEntryId}): ${cleanupError instanceof Error ? cleanupError.message : String(cleanupError)}`;
// Falls back to logger.info when the logger has no callable `error`.
"error" in logger && typeof logger.error === "function" ? logger.error(msg) : logger.info(msg);
}
}
throw pushError;
}
}
// 5. log the test run entry with local evaluation results
logger.processed(`Ran test run entry ${index + 1}`, {
datasetEntry: row.data,
output,
evaluationResults: localEvaluationResults,
});
return;
}
// Path 3: else we will be just pushing back the dataset entry from the SDK side
await APITestRunService.pushTestRunEntry({
testRun: { ...testRun, datasetId, datasetEntryId: row.id },
entry: {
input,
expectedOutput,
// Here contextToEvaluate comes from the workflow / prompt version / prompt chain version config,
// or else is the mapped column NAME (mappingKeys.contextToEvaluate), not the row's cell value.
contextToEvaluate: (workflow === null || workflow === void 0 ? void 0 : workflow.contextToEvaluate)
? workflow.contextToEvaluate
: (promptVersion === null || promptVersion === void 0 ? void 0 : promptVersion.contextToEvaluate)
? promptVersion.contextToEvaluate
: (promptChainVersion === null || promptChainVersion === void 0 ? void 0 : promptChainVersion.contextToEvaluate)
? promptChainVersion.contextToEvaluate
: typeof mappingKeys.contextToEvaluate === "string"
? mappingKeys.contextToEvaluate
: undefined,
scenario,
expectedSteps,
dataEntry: row.data,
},
});
logger.processed(`Ran test run entry ${index + 1}`, {
datasetEntry: row.data,
});
}
// ===== Test Run Starts =====
try {
logger.info(`Creating test run "${name}"...`);
// ===== Create Test Run =====
// create eval config (needed for local evals)
const evalConfig = [
...platformEvaluatorsConfig,
...Array.from(localEvaluatorNameToIdAndPassFailCriteriaMap.entries()).map(([name, value]) => ({
id: value.id,
name,
type: "Local",
builtin: false,
reversed: undefined,
config: {
passFailCriteria: {
entryLevel: {
value: typeof value.passFailCriteria.onEachEntry.value === "boolean"
? value.passFailCriteria.onEachEntry.value
? "Yes"
: "No"
: value.passFailCriteria.onEachEntry.value,
operator: value.passFailCriteria.onEachEntry.scoreShouldBe,
name: "score",
},
runLevel: {
value: value.passFailCriteria.forTestrunOverall.value,
operator: value.passFailCriteria.forTestrunOverall.overallShouldBe,
name: value.passFailCriteria.forTestrunOverall.for === "average" ? "meanScore" : "queriesPassed",
},
},
},
})),
];
if (config.maximLogger) {
internalMaximLogger = config.maximLogger;
}
const tagsEnrichedWithRepoId = config.maximLogger ? [...(tags !== null && tags !== void 0 ? tags : []), `repoId:${config.maximLogger.id}`] : tags;
const hasLocalEvaluators = evaluators.filter((e) => typeof e !== "string" && "evaluationFunction" in e).length > 0;
const requiresLocalRun = hasLocalEvaluators || !!config.outputFunction || !!config.outputFunctionWithTracing;
const testRun = await APITestRunService.createTestRun(name, workspaceId, "SINGLE", evalConfig, requiresLocalRun, workflow === null || workflow === void 0 ? void 0 : workflow.id, promptVersion === null || promptVersion === void 0 ? void 0 : promptVersion.id, promptChainVersion === null || promptChainVersion === void 0 ? void 0 : promptChainVersion.id, humanEvaluationConfig, tagsEnrichedWithRepoId, config.simulationConfig, internalMaximLogger === null || internalMaximLogger === void 0 ? void 0 : internalMaximLogger.id);
try {
// ===== Create Semaphore =====
const semaphore = semaphore_1.Semaphore.get(`${workspaceId}:${name}:${testRun.id}`, concurrency);
if (data) {
if (dataStructure) {
const inputKey = (0, utils_1.getAllKeysByValue)(dataStructure, "INPUT")[0];
const expectedOutputKey = (0, utils_1.getAllKeysByValue)(dataStructure, "EXPECTED_OUTPUT")[0];
const contextToEvaluateKey = (0, utils_1.getAllKeysByValue)(dataStructure, "CONTEXT_TO_EVALUATE")[0];
const scenarioKey = (0, utils_1.getAllKeysByValue)(dataStructure, "SCENARIO")[0];
const expectedStepsKey = (0, utils_1.getAllKeysByValue)(dataStructure, "EXPECTED_STEPS")[0];
if (typeof data === "string") {
const APIDatasetService = new dataset_1.MaximDatasetAPI(config.baseUrl, config.apiKey, config.isDebug);
logger.info(`Fetching dataset "${data}" from platform...`);
const platformDataStructure = await APIDatasetService.getDatasetDatastructure(data);
(0, dataset_3.validateDataStructure)(dataStructure, platformDataStructure);
await APITestRunService.attachDatasetToTestRun(testRun.id, data);
// ===== Platform Dataset Processor =====
/**
 * Processes one row of a platform dataset while holding a semaphore slot.
 * Failures are logged and the row index is recorded in failedEntryIndices;
 * nothing is rethrown, and the semaphore is released in every case.
 * @param index Row index within the dataset.
 * @param datasetId Id of the platform dataset the row is fetched from.
 */
async function processDatasetEntry(index, datasetId) {
    try {
        // Wait for a free slot before doing any work for this row.
        await semaphore.acquire();
        const columnKeys = {
            input: inputKey,
            expectedOutput: expectedOutputKey,
            contextToEvaluate: contextToEvaluateKey,
            scenario: scenarioKey,
            expectedSteps: expectedStepsKey,
        };
        // Rows are fetched on demand from the dataset API.
        const fetchRow = async (rowIndex) => await APIDatasetService.getDatasetRow(datasetId, rowIndex);
        await processEntry(testRun, index, columnKeys, fetchRow, datasetId);
    }
    catch (failure) {
        // Keep the original failure as the cause and remember which row failed.
        const wrapped = new Error(`Error while running data entry at index [${index}]`, {
            cause: failure,
        });
        logger.error((0, utils_2.buildErrorMessage)(wrapped));
        failedEntryIndices.push(index);
    }
    finally {
        // Hand the slot back whether the row succeeded or failed.
        semaphore.release();
    }
}
// 1. get length of dataset
const totalRows = await APIDatasetService.getDatasetTotalRows(data);
// 2. process each row in parallel
const dataEntryPromises = [];
for (let i = 0; i < totalRows; i++) {
dataEntryPromises.push(processDatasetEntry(i, data));
}
// 3. wait for all promises to resolve
await Promise.all(dataEntryPromises);
}
else if (data instanceof csvParser_1.CSVFile) {
const columnStructure = {};
Object.keys(dataStructure).forEach((key, index) => {
columnStructure[key] = index;
});
const csv = await csvParser_1.CSVFile.restructure(data, columnStructure);
// ===== CSV Dataset Processor =====
/**
 * Processes one row of the restructured CSV while holding a semaphore slot.
 * Failures are logged and the row index is recorded in failedEntryIndices;
 * nothing is rethrown, and the semaphore is released in every case.
 * @param index Row index within the CSV.
 */
async function processCSVEntry(index) {
    try {
        // Wait for a free slot before doing any work for this row.
        await semaphore.acquire();
        const columnKeys = {
            input: inputKey,
            expectedOutput: expectedOutputKey,
            contextToEvaluate: contextToEvaluateKey,
            scenario: scenarioKey,
            expectedSteps: expectedStepsKey,
        };
        // CSV rows carry no id, so each one is wrapped as `{ data }` only.
        const fetchRow = async (rowIndex) => ({ data: await csv.getRow(rowIndex) });
        await processEntry(testRun, index, columnKeys, fetchRow);
    }
    catch (failure) {
        // Keep the original failure as the cause and remember which row failed.
        const wrapped = new Error(`Error while running data entry at index [${index}]`, {
            cause: failure,
        });
        logger.error((0, utils_2.buildErrorMessage)(wrapped));
        failedEntryIndices.push(index);
    }
    finally {
        // Hand the slot back whether the row succeeded or failed.
        semaphore.release();
    }
}
// 1. get length of dataset
const totalRows = await csv.getRowCount();
// 2. process each row in parallel
const dataEntryPromises = [];
for (let i = 0; i < totalRows; i++) {
dataEntryPromises.push(processCSVEntry(i));
}
// 3. wait for all promises to resolve
await Promise.all(dataEntryPromises);
}
else if (Array.isArray(data)) {
// ===== Manual Array Dataset Processor =====
/**
 * Processes one element of an in-memory data array while holding a semaphore slot.
 * Failures are logged and the index is recorded in failedEntryIndices;
 * nothing is rethrown, and the semaphore is released in every case.
 * @param index Position of the entry in the data array.
 * @param getRow Row accessor forwarded unchanged to processEntry.
 */
async function processDataEntry(index, getRow) {
    try {
        // Wait for a free slot before doing any work for this row.
        await semaphore.acquire();
        const columnKeys = {
            input: inputKey,
            expectedOutput: expectedOutputKey,
            contextToEvaluate: contextToEvaluateKey,
            scenario: scenarioKey,
            expectedSteps: expectedStepsKey,
        };
        await processEntry(testRun, index, columnKeys, getRow);
    }
    catch (failure) {
        // Keep the original failure as the cause and remember which row failed.
        const wrapped = new Error(`Error while running data entry at index [${index}]`, {
            cause: failure,
        });
        logger.error((0, utils_2.buildErrorMessage)(wrapped));
        failedEntryIndices.push(index);
    }
    finally {
        // Hand the slot back whether the row succeeded or failed.
        semaphore.release();
    }
}
// 1. get length of dataset
const totalRows = data.length;
// 2. process each row in parallel
const dataEntryPromises = [];
for (let i = 0; i < totalRows; i++) {
dataEntryPromises.push(processDataEntry(i, (index) => ({
data: data[index],
})));
}
// 3. wait for all promises to resolve
await Promise.all(dataEntryPromises);
}
else if (typeof data === "function") {
// ===== Manual Function Dataset Processor =====
async function processDataEntry(mainIndex, index, getRow) {
try {
// 1. acquire semaphore
await semaphore.acquire();
// 2. process the entry
await processEntry(testRun, index, {
input: inputKey,
expectedOutput: expectedOutputKey,
contextToEvaluate: contextToEvaluateKey,
scenario: scenarioKey,
expectedSteps: expectedStepsKey,
}, getRow);
}
catch (err) {
// 3. handle error (if any)
logger.error((0, utils_2.buildErrorMessage)(new Error(`Error while running data entry at index [${mainIndex}]`, {
cause: err,
})));
failedEntryIndices.push(mainIndex);
}
finally {