@arizeai/phoenix-client
A client for the Phoenix API
import { type DiagLogLevel, NodeTracerProvider } from "@arizeai/phoenix-otel";
import { type PhoenixClient } from "../client.js";
import { AnnotatorKind } from "../types/annotations.js";
import { ClientFn } from "../types/core.js";
import { DatasetSelector } from "../types/datasets.js";
import type { Evaluator, ExperimentEvaluatorLike, ExperimentTask, RanExperiment } from "../types/experiments.js";
import { type Logger } from "../types/logger.js";
/**
* Parameters for running an experiment.
*
* @experimental This feature is not complete, and will change in the future.
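*
* @example
* A sketch of a params object (values are illustrative; assumes a dataset
* named "my-dataset" already exists in Phoenix):
* ```ts
* const params: RunExperimentParams = {
*   dataset: "my-dataset",
*   task: async (example) => example.input,
*   experimentMetadata: { modelName: "my-model" },
* };
* ```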
*/
export type RunExperimentParams = ClientFn & {
/**
* An optional name for the experiment.
* Defaults to the dataset name + a timestamp
*/
experimentName?: string;
/**
* The description of the experiment
*/
experimentDescription?: string;
/**
* Experiment metadata
* E.g. modelName
*/
experimentMetadata?: Record<string, unknown>;
/**
* The dataset to run the experiment on
*/
dataset: DatasetSelector;
/**
* The task to run
*/
task: ExperimentTask;
/**
* The evaluators to use
*/
evaluators?: ExperimentEvaluatorLike[];
/**
* The logger to use
*/
logger?: Logger;
/**
* Whether to record the experiment results
*/
record?: boolean;
/**
* The number of dataset examples to run in parallel
*/
concurrency?: number;
/**
* Whether or not to run the experiment as a dry run. If a number is provided, n examples will be run.
* @default false
*/
dryRun?: number | boolean;
/**
* Whether to set the global tracer provider when running the task.
* If set to false, a locally scoped tracer will be created but will not be registered.
* This may cause certain spans to not be picked up by Phoenix, notably from libraries like the AI SDK that leverage the global tracer.
* @default true
*/
setGlobalTracerProvider?: boolean;
/**
* Number of times to repeat each dataset example
* @default 1
*/
repetitions?: number;
/**
* Whether to use batching for the span processor
*/
useBatchSpanProcessor?: boolean;
/**
* Log level to set for the default DiagConsoleLogger when tracing.
* Omit to disable default diag logging, or to bring your own.
*/
diagLogLevel?: DiagLogLevel;
};
/**
* Runs an experiment using a given dataset of examples.
*
* An experiment is a user-defined task that runs on each example in a dataset. The results from
* each experiment can be evaluated using any number of evaluators to measure the behavior of the
* task. The experiment and evaluation results are stored in the Phoenix database for comparison
* and analysis.
*
* A `task` is either a sync or async function that returns a JSON serializable
* output. If the `task` is a function of one argument, then that argument will be bound to the
* `input` field of the dataset example. Alternatively, the `task` can be a function of any
* combination of specific argument names that will be bound to special values:
*
* - `input`: The input field of the dataset example
* - `expected`: The expected or reference output of the dataset example
* - `reference`: An alias for `expected`
* - `metadata`: Metadata associated with the dataset example
* - `example`: The dataset `Example` object with all associated fields
*
* @example
* ```ts
* import { asEvaluator, runExperiment } from "@arizeai/phoenix-client/experiments";
*
* const experiment = await runExperiment({
* dataset: "my-dataset",
* task: async (example) => example.input,
* evaluators: [
* asEvaluator({ name: "my-evaluator", kind: "CODE", evaluate: async (params) => params.output }),
* ],
* });
* ```
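*
* @example
* A fuller sketch with the optional knobs (values are illustrative; assumes a
* dataset named "my-dataset" already exists in Phoenix):
* ```ts
* const experiment = await runExperiment({
*   experimentName: "baseline",
*   dataset: "my-dataset",
*   task: async (example) => example.input,
*   evaluators: [
*     asEvaluator({ name: "my-evaluator", kind: "CODE", evaluate: async (params) => params.output }),
*   ],
*   concurrency: 5, // run 5 dataset examples in parallel
*   repetitions: 2, // run each example twice
*   dryRun: 3, // run only the first 3 examples as a dry run
* });
* ```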
*/
export declare function runExperiment({ experimentName, experimentDescription, experimentMetadata, client: _client, dataset: datasetSelector, task, evaluators, logger, record, concurrency, dryRun, setGlobalTracerProvider, repetitions, useBatchSpanProcessor, diagLogLevel, }: RunExperimentParams): Promise<RanExperiment>;
/**
* Evaluate an experiment.
*
* @experimental This feature is not complete, and will change in the future.
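*
* @example
* A minimal sketch, assuming `experiment` is a `RanExperiment` returned by a
* prior `runExperiment` call (the evaluator body is illustrative):
* ```ts
* import { asEvaluator, evaluateExperiment } from "@arizeai/phoenix-client/experiments";
*
* const evaluated = await evaluateExperiment({
*   experiment,
*   evaluators: [
*     asEvaluator({ name: "my-evaluator", kind: "CODE", evaluate: async (params) => params.output }),
*   ],
* });
* ```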
*/
export declare function evaluateExperiment({ experiment, evaluators, client: _client, logger, concurrency, dryRun, setGlobalTracerProvider, useBatchSpanProcessor, tracerProvider: paramsTracerProvider, diagLogLevel, }: {
/**
* The experiment to evaluate
*/
experiment: RanExperiment;
/** The evaluators to use */
evaluators: ExperimentEvaluatorLike[];
/** The client to use */
client?: PhoenixClient;
/** The logger to use */
logger?: Logger;
/** The number of evaluators to run in parallel */
concurrency?: number;
/**
* Whether to run the evaluation as a dry run.
* If a number is provided, the evaluation will be run for the first n runs.
* @default false
*/
dryRun?: boolean | number;
/**
* Whether to set the global tracer provider when running the evaluators
* @default true
*/
setGlobalTracerProvider?: boolean;
/**
* Whether to use batching for the span processor.
* @default true
*/
useBatchSpanProcessor?: boolean;
/**
* The tracer provider to use. If set, the other tracing parameters will be ignored and the passed tracer provider will be used.
* Intended as a pass-through from runExperiment
*/
tracerProvider?: NodeTracerProvider | null;
/**
* Log level to set for the default DiagConsoleLogger when tracing.
* Omit to disable default diag logging, or to bring your own.
*/
diagLogLevel?: DiagLogLevel;
}): Promise<RanExperiment>;
/**
* Wrap an evaluator function in an object with a name property.
*
* @experimental This feature is not complete, and will change in the future.
*
* @param params - The parameters for creating the evaluator
* @param params.name - The name of the evaluator.
* @param params.kind - The kind of evaluator (e.g., "CODE", "LLM")
* @param params.evaluate - The evaluator function.
* @returns The evaluator object.
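*
* @example
* A minimal sketch (the evaluate body is illustrative; it echoes the task
* output back as the evaluation result):
* ```ts
* const evaluator = asEvaluator({
*   name: "my-evaluator",
*   kind: "CODE",
*   evaluate: async (params) => params.output,
* });
* ```
*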
* @deprecated use asExperimentEvaluator instead
*/
export declare function asEvaluator({ name, kind, evaluate, }: {
name: string;
kind: AnnotatorKind;
evaluate: Evaluator["evaluate"];
}): Evaluator;
//# sourceMappingURL=runExperiment.d.ts.map