@arizeai/phoenix-client

import {
  MimeType,
  OpenInferenceSpanKind,
  SemanticConventions,
} from "@arizeai/openinference-semantic-conventions";
import {
  attachGlobalTracerProvider,
  createNoOpProvider,
  objectAsAttributes,
  register,
  SpanStatusCode,
} from "@arizeai/phoenix-otel";
import { queue } from "async";
import invariant from "tiny-invariant";
import { createClient } from "../client.js";
import { getDataset } from "../datasets/getDataset.js";
import { createLogger } from "../logger.js";
import { ensureString } from "../utils/ensureString.js";
import { pluralize } from "../utils/pluralize.js";
import { promisifyResult } from "../utils/promisifyResult.js";
import { toObjectHeaders } from "../utils/toObjectHeaders.js";
import {
  getDatasetExperimentsUrl,
  getDatasetUrl,
  getExperimentUrl,
} from "../utils/urlUtils.js";
import { getExperimentInfo } from "./getExperimentInfo.js";
import { getExperimentEvaluators } from "./helpers/index.js";
import {
  logEvalSummary,
  logLinks,
  logTaskSummary,
  PROGRESS_PREFIX,
} from "./logging.js";
import { cleanupOwnedTracerProvider } from "./tracing.js";
/**
 * Validate that the repetitions parameter is a positive integer.
 */
function isValidRepetitionParam(repetitions) {
  return Number.isInteger(repetitions) && repetitions > 0;
}
/**
 * Runs an experiment using a given dataset of examples.
 *
 * An experiment is a user-defined task that runs on each example in a dataset. The results from
 * each experiment can be evaluated using any number of evaluators to measure the behavior of the
 * task. The experiment and evaluation results are stored in the Phoenix database for comparison
 * and analysis.
 *
 * A `task` is either a sync or async function that returns a JSON serializable
 * output. If the `task` is a function of one argument then that argument will be bound to the
 * `input` field of the dataset example. Alternatively, the `task` can be a function of any
 * combination of specific argument names that will be bound to special values:
 *
 * - `input`: The input field of the dataset example
 * - `expected`: The expected or reference output of the dataset example
 * - `reference`: An alias for `expected`
 * - `metadata`: Metadata associated with the dataset example
 * - `example`: The dataset `Example` object with all associated fields
 *
 * @example
 * ```ts
 * import { asEvaluator, runExperiment } from "@arizeai/phoenix-client/experiments";
 *
 * const experiment = await runExperiment({
 *   dataset: "my-dataset",
 *   task: async (example) => example.input,
 *   evaluators: [
 *     asEvaluator({ name: "my-evaluator", kind: "CODE", evaluate: async (params) => params.output }),
 *   ],
 * });
 * ```
 */
export async function runExperiment({
  experimentName,
  experimentDescription,
  experimentMetadata = {},
  client: _client,
  dataset: datasetSelector,
  task,
  evaluators,
  logger = createLogger(),
  record = true,
  concurrency = 5,
  dryRun = false,
  setGlobalTracerProvider = true,
  repetitions = 1,
  useBatchSpanProcessor = true,
  diagLogLevel,
}) {
  // Validation
  invariant(isValidRepetitionParam(repetitions), "repetitions must be an integer greater than 0");
  let taskProvider;
  let taskGlobalRegistration = null;
  const isDryRun = typeof dryRun === "number" || dryRun === true;
  const client = _client ?? createClient();
  const dataset = await getDataset({
    dataset: datasetSelector,
    client,
  });
  invariant(dataset, `Dataset not found`);
  invariant(dataset.examples.length > 0, `Dataset has no examples`);
  const nExamples = typeof dryRun === "number"
    ? Math.min(dryRun, dataset.examples.length)
    : dataset.examples.length;
  let projectName = `${dataset.name}-exp-${new Date().toISOString()}`;
  // initialize the tracer into scope
  let taskTracer;
  let experiment;
  if (isDryRun) {
    const now = new Date().toISOString();
    const totalExamples = nExamples;
    experiment = {
      id: localId(),
      datasetId: dataset.id,
      datasetVersionId: dataset.versionId,
      // @todo: the dataset should return splits in response body
      datasetSplits: datasetSelector?.splits ?? [],
      projectName,
      metadata: experimentMetadata,
      repetitions,
      createdAt: now,
      updatedAt: now,
      exampleCount: totalExamples,
      successfulRunCount: 0,
      failedRunCount: 0,
      missingRunCount: totalExamples * repetitions,
    };
    taskTracer = createNoOpProvider().getTracer("no-op");
  }
  else {
    const experimentResponse = await client
      .POST("/v1/datasets/{dataset_id}/experiments", {
        params: {
          path: {
            dataset_id: dataset.id,
          },
        },
        body: {
          name: experimentName,
          description: experimentDescription,
          metadata: experimentMetadata,
          project_name: projectName,
          repetitions,
          // @todo: the dataset should return splits in response body
          ...(datasetSelector?.splits ? { splits: datasetSelector.splits } : {}),
          ...(dataset?.versionId ? { version_id: dataset.versionId } : {}),
        },
      })
      .then((res) => res.data?.data);
    invariant(experimentResponse, `Failed to create experiment`);
    projectName = experimentResponse.project_name ?? projectName;
    experiment = {
      id: experimentResponse.id,
      datasetId: experimentResponse.dataset_id,
      datasetVersionId: experimentResponse.dataset_version_id,
      // @todo: the dataset should return splits in response body
      datasetSplits: datasetSelector?.splits ?? [],
      projectName,
      repetitions: experimentResponse.repetitions,
      metadata: experimentResponse.metadata || {},
      createdAt: experimentResponse.created_at,
      updatedAt: experimentResponse.updated_at,
      exampleCount: experimentResponse.example_count,
      successfulRunCount: experimentResponse.successful_run_count,
      failedRunCount: experimentResponse.failed_run_count,
      missingRunCount: experimentResponse.missing_run_count,
    };
    // Initialize the tracer, now that we have a project name
    const baseUrl = client.config.baseUrl;
    invariant(baseUrl, "Phoenix base URL not found. Please set PHOENIX_HOST or set baseUrl on the client.");
    taskProvider = register({
      projectName,
      url: baseUrl,
      headers: client.config.headers
        ? toObjectHeaders(client.config.headers)
        : undefined,
      batch: useBatchSpanProcessor,
      diagLogLevel,
      global: false,
    });
    taskGlobalRegistration = setGlobalTracerProvider
      ? attachGlobalTracerProvider(taskProvider)
      : null;
    taskTracer = taskProvider.getTracer(projectName);
  }
  try {
    if (!record) {
      logger.info(`Running experiment in readonly mode. Results will not be recorded.`);
    }
    const links = [];
    if (!isDryRun && client.config.baseUrl) {
      links.push({
        label: "Dataset",
        url: getDatasetUrl({
          baseUrl: client.config.baseUrl,
          datasetId: dataset.id,
        }),
      });
      links.push({
        label: "Experiments",
        url: getDatasetExperimentsUrl({
          baseUrl: client.config.baseUrl,
          datasetId: dataset.id,
        }),
      });
      links.push({
        label: "Experiment",
        url: getExperimentUrl({
          baseUrl: client.config.baseUrl,
          datasetId: dataset.id,
          experimentId: experiment.id,
        }),
      });
    }
    const evCount = evaluators?.length ?? 0;
    logger.info(`${PROGRESS_PREFIX.start}Experiment ${experimentName || "<unnamed>"} (dataset ${dataset.name}, ${nExamples} ${pluralize("example", nExamples)}, ${evCount} ${pluralize("evaluator", evCount)})`);
    const runs = {};
    await runTaskWithExamples({
      client,
      experimentId: experiment.id,
      task,
      dataset,
      logger,
      onComplete: (run) => {
        runs[run.id] = run;
      },
      concurrency,
      isDryRun,
      nExamples,
      tracer: taskTracer,
      repetitions,
    });
    const taskRuns = Object.values(runs);
    const taskErrors = taskRuns.filter((run) => run.error != null).length;
    const taskTotal = nExamples * repetitions;
    const taskOkStr = taskErrors > 0
      ? `${taskRuns.length - taskErrors}/${taskTotal} ok (${taskErrors} failed)`
      : `${taskRuns.length}/${taskTotal} ok`;
    logger.info(`${PROGRESS_PREFIX.completed}Tasks ${taskOkStr}`);
    const ranExperiment = {
      ...experiment,
      runs,
    };
    await cleanupOwnedTracerProvider({
      provider: taskProvider,
      globalRegistration: taskGlobalRegistration,
    });
    taskProvider = undefined;
    taskGlobalRegistration = null;
    if (evaluators && evaluators.length > 0) {
      const evNames = getExperimentEvaluators(evaluators)
        .map((evaluator) => evaluator.name)
        .join(", ");
      logger.info(`${PROGRESS_PREFIX.start}Evaluations (${evNames})`);
    }
    const { evaluationRuns } = await evaluateExperiment({
      experiment: ranExperiment,
      evaluators: evaluators ?? [],
      client,
      logger,
      concurrency,
      dryRun,
      diagLogLevel,
      useBatchSpanProcessor,
      setGlobalTracerProvider,
    });
    ranExperiment.evaluationRuns = evaluationRuns;
    // Refresh experiment info from server to get updated counts (non-dry-run only)
    if (!isDryRun) {
      const updatedExperiment = await getExperimentInfo({
        client,
        experimentId: experiment.id,
      });
      // Update the experiment info with the latest from the server
      Object.assign(ranExperiment, updatedExperiment);
    }
    logTaskSummary(logger, {
      nExamples,
      repetitions,
      nRuns: taskRuns.length,
      nErrors: taskErrors,
    });
    if (ranExperiment.evaluationRuns && ranExperiment.evaluationRuns.length > 0) {
      logEvalSummary(logger, ranExperiment.evaluationRuns);
    }
    logLinks(logger, links);
    return ranExperiment;
  }
  finally {
    // Safety net: on error paths the happy-path cleanup above is skipped,
    // so ensure the task provider is always cleaned up. On the happy path
    // taskProvider is already undefined (no-op).
    await cleanupOwnedTracerProvider({
      provider: taskProvider,
      globalRegistration: taskGlobalRegistration,
    });
  }
}
/**
 * Run a task against n examples in a dataset.
 */
function runTaskWithExamples({
  client,
  experimentId,
  task,
  dataset,
  onComplete,
  logger,
  concurrency = 5,
  isDryRun,
  nExamples,
  tracer,
  repetitions = 1,
}) {
  // Validate the input
  invariant(isValidRepetitionParam(repetitions), "repetitions must be an integer greater than 0");
  logger.debug(`${PROGRESS_PREFIX.start}Tasks (${nExamples} ${pluralize("example", nExamples)} × ${repetitions} ${pluralize("repetition", repetitions)})`);
  const run = async ({ example, repetitionNumber }) => {
    return tracer.startActiveSpan(`Task: ${task.name}`, async (span) => {
      logger.debug(`${PROGRESS_PREFIX.progress}Task on example ${example.id}`);
      const traceId = span.spanContext().traceId;
      const thisRun = {
        id: localId(), // initialized with local id, will be replaced with server-assigned id when dry run is false
        traceId,
        experimentId,
        datasetExampleId: example.id,
        startTime: new Date(),
        endTime: new Date(), // will get replaced with actual end time
        output: null,
        error: null,
      };
      try {
        const taskOutput = await promisifyResult(task(example));
        thisRun.output = taskOutput;
      }
      catch (error) {
        thisRun.error = error instanceof Error ? error.message : "Unknown error";
        span.setStatus({ code: SpanStatusCode.ERROR });
      }
      thisRun.endTime = new Date();
      if (!isDryRun) {
        // Log the run to the server
        const res = await client.POST("/v1/experiments/{experiment_id}/runs", {
          params: {
            path: {
              experiment_id: experimentId,
            },
          },
          body: {
            dataset_example_id: example.id,
            output: thisRun.output,
            repetition_number: repetitionNumber,
            start_time: thisRun.startTime.toISOString(),
            end_time: thisRun.endTime.toISOString(),
            trace_id: thisRun.traceId,
            error: thisRun.error,
          },
        });
        // replace the local run id with the server-assigned id
        thisRun.id = res.data?.data.id ?? thisRun.id;
        const inputMimeType = typeof example.input === "string" ? MimeType.TEXT : MimeType.JSON;
        const outputMimeType = typeof thisRun.output === "string" ? MimeType.TEXT : MimeType.JSON;
        span.setStatus({ code: SpanStatusCode.OK });
        span.setAttributes({
          [SemanticConventions.OPENINFERENCE_SPAN_KIND]: OpenInferenceSpanKind.CHAIN,
          [SemanticConventions.INPUT_MIME_TYPE]: inputMimeType,
          [SemanticConventions.INPUT_VALUE]: ensureString(example.input),
          [SemanticConventions.OUTPUT_MIME_TYPE]: outputMimeType,
          [SemanticConventions.OUTPUT_VALUE]: ensureString(thisRun.output),
        });
      }
      span?.end();
      onComplete(thisRun);
      return thisRun;
    });
  };
  const q = queue(run, concurrency);
  const examplesToUse = dataset.examples.slice(0, nExamples);
  examplesToUse
    .flatMap((example) => Array.from({ length: repetitions }, (_, index) => ({
      example,
      repetitionNumber: index + 1, // Repetitions start at 1
    })))
    .forEach((exampleWithRepetition) => q.push(exampleWithRepetition, (err) => {
      if (err) {
        logger.error(`Error running task "${task.name}" on example "${exampleWithRepetition.example.id}" repetition ${exampleWithRepetition.repetitionNumber}: ${err}`);
      }
    }));
  return q.drain();
}
/**
 * Evaluate an experiment.
 *
 * @experimental This feature is not complete, and will change in the future.
 */
export async function evaluateExperiment({
  experiment,
  evaluators,
  client: _client,
  logger = createLogger(),
  concurrency = 5,
  dryRun = false,
  setGlobalTracerProvider = true,
  useBatchSpanProcessor = true,
  tracerProvider: paramsTracerProvider,
  diagLogLevel,
}) {
  const isDryRun = typeof dryRun === "number" || dryRun === true;
  const client = _client ?? createClient();
  const baseUrl = client.config.baseUrl;
  invariant(baseUrl, "Phoenix base URL not found. Please set PHOENIX_HOST or set baseUrl on the client.");
  let provider;
  let globalRegistration = null;
  const ownsProvider = !paramsTracerProvider;
  // Always allow changing of tracer providers
  if (paramsTracerProvider) {
    provider = paramsTracerProvider;
  }
  else if (!isDryRun) {
    provider = register({
      projectName: "evaluators",
      url: baseUrl,
      headers: client.config.headers
        ? toObjectHeaders(client.config.headers)
        : undefined,
      batch: useBatchSpanProcessor,
      diagLogLevel,
      global: false,
    });
    globalRegistration = setGlobalTracerProvider
      ? attachGlobalTracerProvider(provider)
      : null;
  }
  else {
    provider = createNoOpProvider();
  }
  const tracer = isDryRun ? provider.getTracer("no-op") : provider.getTracer("evaluators");
  try {
    const nRuns = typeof dryRun === "number"
      ? Math.min(dryRun, Object.keys(experiment.runs).length)
      : Object.keys(experiment.runs).length;
    const dataset = await getDataset({
      dataset: {
        datasetId: experiment.datasetId,
        versionId: experiment.datasetVersionId,
        splits: experiment.datasetSplits,
      },
      client,
    });
    invariant(dataset, `Dataset "${experiment.datasetId}" not found`);
    invariant(dataset.examples.length > 0, `Dataset "${experiment.datasetId}" has no examples`);
    invariant(experiment.runs, `Experiment "${experiment.id}" has no runs`);
    const runsToEvaluate = Object.values(experiment.runs).slice(0, nRuns);
    if (evaluators?.length === 0) {
      return {
        ...experiment,
        evaluationRuns: [],
      };
    }
    const evaluationRuns = {};
    const examplesById = {};
    for (const example of dataset.examples) {
      examplesById[example.id] = example;
    }
    const onEvaluationComplete = (run) => {
      evaluationRuns[run.id] = run;
    };
    // Run evaluators against all runs
    // Flat list of evaluator + run tuples
    const normalizedEvaluators = getExperimentEvaluators(evaluators);
    const evaluatorsAndRuns = normalizedEvaluators.flatMap((evaluator) => runsToEvaluate.map((run) => ({
      evaluator,
      run,
    })));
    const evaluatorsQueue = queue(async (evaluatorAndRun) => {
      return tracer.startActiveSpan(`Evaluation: ${evaluatorAndRun.evaluator.name}`, async (span) => {
        const evalResult = await runEvaluator({
          evaluator: evaluatorAndRun.evaluator,
          run: evaluatorAndRun.run,
          exampleCache: examplesById,
          onComplete: onEvaluationComplete,
          logger,
        });
        span.setAttributes({
          [SemanticConventions.OPENINFERENCE_SPAN_KIND]: OpenInferenceSpanKind.EVALUATOR,
          [SemanticConventions.INPUT_MIME_TYPE]: MimeType.JSON,
          [SemanticConventions.INPUT_VALUE]: ensureString({
            input: examplesById[evaluatorAndRun.run.datasetExampleId]?.input,
            output: evaluatorAndRun.run.output,
            expected: examplesById[evaluatorAndRun.run.datasetExampleId]?.output,
            metadata: examplesById[evaluatorAndRun.run.datasetExampleId]?.metadata,
          }),
          [SemanticConventions.OUTPUT_MIME_TYPE]: MimeType.JSON,
          [SemanticConventions.OUTPUT_VALUE]: ensureString(evalResult.result),
        });
        if (evalResult.error) {
          span.setStatus({
            code: SpanStatusCode.ERROR,
            message: evalResult.error,
          });
        }
        else {
          span.setStatus({ code: SpanStatusCode.OK });
        }
        if (evalResult.result) {
          span.setAttributes(objectAsAttributes(evalResult.result));
        }
        evalResult.traceId = span.spanContext().traceId;
        if (!isDryRun) {
          // Log the evaluation to the server
          // We log this without awaiting (best effort)
          client.POST("/v1/experiment_evaluations", {
            body: {
              experiment_run_id: evaluatorAndRun.run.id,
              name: evaluatorAndRun.evaluator.name,
              annotator_kind: evaluatorAndRun.evaluator.kind,
              start_time: evalResult.startTime.toISOString(),
              end_time: evalResult.endTime.toISOString(),
              result: {
                ...evalResult.result,
              },
              error: evalResult.error,
              trace_id: evalResult.traceId,
            },
          });
        }
        span.end();
        return evalResult;
      });
    }, concurrency);
    if (!evaluatorsAndRuns.length) {
      logger.warn(`No evaluators to run`);
      return {
        ...experiment,
        evaluationRuns: [],
      };
    }
    evaluatorsAndRuns.forEach((evaluatorAndRun) => evaluatorsQueue.push(evaluatorAndRun, (err) => {
      if (err) {
        logger.error(`Error running evaluator "${evaluatorAndRun.evaluator.name}" on run "${evaluatorAndRun.run.id}": ${err}`);
      }
    }));
    await evaluatorsQueue.drain();
    const evalTotal = Object.values(evaluationRuns).length;
    const evalErrors = Object.values(evaluationRuns).filter((ev) => ev.error != null).length;
    const evalExpected = runsToEvaluate.length * normalizedEvaluators.length;
    const evalOkStr = evalErrors > 0
      ? `${evalTotal - evalErrors}/${evalExpected} ok (${evalErrors} failed)`
      : `${evalTotal}/${evalExpected} ok`;
    logger.info(`${PROGRESS_PREFIX.completed}Evaluations ${evalOkStr}`);
    return {
      ...experiment,
      evaluationRuns: Object.values(evaluationRuns),
    };
  }
  finally {
    if (ownsProvider) {
      await cleanupOwnedTracerProvider({
        provider,
        globalRegistration,
      });
    }
  }
}
/**
 * Run an evaluator against a run.
 *
 * @experimental This feature is not complete, and will change in the future.
 */
async function runEvaluator({ evaluator, run, exampleCache, onComplete, logger }) {
  const example = exampleCache[run.datasetExampleId];
  invariant(example, `Example "${run.datasetExampleId}" not found`);
  const evaluate = async () => {
    logger.debug(`${PROGRESS_PREFIX.progress}Eval ${evaluator.name} on run ${run.id}`);
    const thisEval = {
      id: localId(),
      traceId: null,
      experimentRunId: run.id,
      startTime: new Date(),
      endTime: new Date(), // will get replaced with actual end time
      name: evaluator.name,
      result: null,
      error: null,
      annotatorKind: evaluator.kind,
    };
    try {
      const result = await evaluator.evaluate({
        input: example.input,
        output: run.output ?? null,
        expected: example.output,
        metadata: example?.metadata,
      });
      thisEval.result = result;
    }
    catch (error) {
      thisEval.error = error instanceof Error ? error.message : "Unknown error";
      logger.error(`Evaluator "${evaluator.name}" on run "${run.id}" failed: ${thisEval.error}`);
    }
    thisEval.endTime = new Date();
    onComplete(thisEval);
    return thisEval;
  };
  return evaluate();
}
/**
 * Wrap an evaluator function in an object with a name property.
 *
 * @experimental This feature is not complete, and will change in the future.
 *
 * @param params - The parameters for creating the evaluator
 * @param params.name - The name of the evaluator.
 * @param params.kind - The kind of evaluator (e.g., "CODE", "LLM")
 * @param params.evaluate - The evaluator function.
 * @returns The evaluator object.
 * @deprecated use asExperimentEvaluator instead
 */
export function asEvaluator({ name, kind, evaluate }) {
  return {
    name,
    kind,
    evaluate,
  };
}
let _localIdIndex = 1000;
/**
 * Generate a local id.
 *
 * @returns A semi-unique id.
 */
function localId() {
  _localIdIndex++;
  return `local_${_localIdIndex}`;
}
//# sourceMappingURL=runExperiment.js.map
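/*
 * Usage sketch (illustrative only, not part of this module). It wires the
 * exported entry points above together: run a task over a dataset with an
 * evaluator, then re-score the returned experiment with evaluateExperiment.
 * The dataset name "my-dataset" and the `{ score }` result shape are
 * assumptions for illustration; a reachable Phoenix instance holding the
 * dataset is assumed, since the dataset is fetched from the server even in
 * dry runs.
 *
 *   import {
 *     asEvaluator,
 *     evaluateExperiment,
 *     runExperiment,
 *   } from "@arizeai/phoenix-client/experiments";
 *
 *   // asEvaluator is deprecated in favor of asExperimentEvaluator (see JSDoc above).
 *   const matchesExpected = asEvaluator({
 *     name: "matches-expected",
 *     kind: "CODE",
 *     // evaluate receives { input, output, expected, metadata }, per runEvaluator above.
 *     evaluate: async ({ output, expected }) => ({
 *       score: output === expected ? 1 : 0, // assumed result shape
 *     }),
 *   });
 *
 *   // A numeric dryRun caps the number of examples and skips writes to the
 *   // server; repetitions runs the task that many times per example.
 *   const experiment = await runExperiment({
 *     dataset: "my-dataset",
 *     task: async (example) => example.input,
 *     evaluators: [matchesExpected],
 *     repetitions: 2,
 *     dryRun: 3,
 *   });
 *
 *   // Evaluators can also be applied after the fact to a completed experiment.
 *   const { evaluationRuns } = await evaluateExperiment({
 *     experiment,
 *     evaluators: [matchesExpected],
 *     dryRun: true,
 *   });
 */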