@arizeai/phoenix-client
A client for the Phoenix API
import { type DiagLogLevel, NodeTracerProvider } from "@arizeai/phoenix-otel";
import { type PhoenixClient } from "../client.js";
import { AnnotatorKind } from "../types/annotations.js";
import { ClientFn } from "../types/core.js";
import { DatasetSelector } from "../types/datasets.js";
import type { Evaluator, ExperimentEvaluatorLike, ExperimentTask, RanExperiment } from "../types/experiments.js";
import { type Logger } from "../types/logger.js";
/**
* Parameters for running an experiment.
*
* @experimental This feature is not complete, and will change in the future.
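*
* @example
* A sketch of a params object (values are illustrative; assumes a dataset
* named "my-dataset" already exists in Phoenix):
* ```ts
* const params: RunExperimentParams = {
*   dataset: "my-dataset",
*   task: async (example) => example.input,
*   experimentMetadata: { modelName: "my-model" },
* };
* ```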
*/
export type RunExperimentParams = ClientFn & {
/**
* An optional name for the experiment.
* Defaults to the dataset name + a timestamp
*/
experimentName?: string;
/**
* The description of the experiment
*/
experimentDescription?: string;
/**
* Experiment metadata
* E.g. modelName
*/
experimentMetadata?: Record<string, unknown>;
/**
* The dataset to run the experiment on
*/
dataset: DatasetSelector;
/**
* The task to run
*/
task: ExperimentTask;
/**
* The evaluators to use
*/
evaluators?: ExperimentEvaluatorLike[];
/**
* The logger to use
*/
logger?: Logger;
/**
* Whether to record the experiment results
*/
record?: boolean;
/**
* The number of dataset examples to run in parallel
*/
concurrency?: number;
/**
* Whether or not to run the experiment as a dry run. If a number is provided, n examples will be run.
* @default false
*/
dryRun?: number | boolean;
/**
* Whether to set the global tracer provider when running the task.
* If set to false, a locally scoped tracer will be created but will not be registered.
* This may cause certain spans to not be picked up by Phoenix, notably from libraries like the AI SDK that leverage the global tracer.
* @default true
*/
setGlobalTracerProvider?: boolean;
/**
* Number of times to repeat each dataset example
* @default 1
*/
repetitions?: number;
/**
* Whether to use batching for the span processor
*/
useBatchSpanProcessor?: boolean;
/**
* Log level to set for the default DiagConsoleLogger when tracing.
* Omit to disable default diag logging, or to bring your own.
*/
diagLogLevel?: DiagLogLevel;
};
/**
* Runs an experiment using a given dataset of examples.
*
* An experiment is a user-defined task that runs on each example in a dataset. The results from
* each experiment can be evaluated using any number of evaluators to measure the behavior of the
* task. The experiment and evaluation results are stored in the Phoenix database for comparison
* and analysis.
*
* A `task` is either a sync or async function that returns a JSON serializable
* output. If the `task` is a function of one argument, then that argument will be bound to the
* `input` field of the dataset example. Alternatively, the `task` can be a function of any
* combination of specific argument names that will be bound to special values:
*
* - `input`: The input field of the dataset example
* - `expected`: The expected or reference output of the dataset example
* - `reference`: An alias for `expected`
* - `metadata`: Metadata associated with the dataset example
* - `example`: The dataset `Example` object with all associated fields
*
* @example
* ```ts
* import { asEvaluator, runExperiment } from "@arizeai/phoenix-client/experiments";
*
* const experiment = await runExperiment({
* dataset: "my-dataset",
* task: async (example) => example.input,
* evaluators: [
* asEvaluator({ name: "my-evaluator", kind: "CODE", evaluate: async (params) => params.output }),
* ],
* });
* ```
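*
* @example
* A fuller sketch with the optional knobs (values are illustrative; assumes a
* dataset named "my-dataset" already exists in Phoenix):
* ```ts
* const experiment = await runExperiment({
*   experimentName: "baseline",
*   dataset: "my-dataset",
*   task: async (example) => example.input,
*   evaluators: [
*     asEvaluator({ name: "my-evaluator", kind: "CODE", evaluate: async (params) => params.output }),
*   ],
*   concurrency: 5, // run 5 dataset examples in parallel
*   repetitions: 2, // run each example twice
*   dryRun: 3, // run only the first 3 examples as a dry run
* });
* ```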
*/
export declare function runExperiment({ experimentName, experimentDescription, experimentMetadata, client: _client, dataset: datasetSelector, task, evaluators, logger, record, concurrency, dryRun, setGlobalTracerProvider, repetitions, useBatchSpanProcessor, diagLogLevel, }: RunExperimentParams): Promise<RanExperiment>;
/**
* Evaluate an experiment.
*
* @experimental This feature is not complete, and will change in the future.
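*
* @example
* A minimal sketch, assuming `experiment` is a `RanExperiment` returned by a
* prior `runExperiment` call (the evaluator body is illustrative):
* ```ts
* import { asEvaluator, evaluateExperiment } from "@arizeai/phoenix-client/experiments";
*
* const evaluated = await evaluateExperiment({
*   experiment,
*   evaluators: [
*     asEvaluator({ name: "my-evaluator", kind: "CODE", evaluate: async (params) => params.output }),
*   ],
* });
* ```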
*/
export declare function evaluateExperiment({ experiment, evaluators, client: _client, logger, concurrency, dryRun, setGlobalTracerProvider, useBatchSpanProcessor, tracerProvider: paramsTracerProvider, diagLogLevel, }: {
/**
* The experiment to evaluate
*/
experiment: RanExperiment;
/** The evaluators to use */
evaluators: ExperimentEvaluatorLike[];
/** The client to use */
client?: PhoenixClient;
/** The logger to use */
logger?: Logger;
/** The number of evaluators to run in parallel */
concurrency?: number;
/**
* Whether to run the evaluation as a dry run.
* If a number is provided, the evaluation will be run for the first n runs.
* @default false
*/
dryRun?: boolean | number;
/**
* Whether to set the global tracer provider when running the evaluators
* @default true
*/
setGlobalTracerProvider?: boolean;
/**
* Whether to use batching for the span processor.
* @default true
*/
useBatchSpanProcessor?: boolean;
/**
* The tracer provider to use. If set, the other tracing parameters will be ignored and the passed tracer provider will be used.
* Intended as a pass-through from runExperiment
*/
tracerProvider?: NodeTracerProvider | null;
/**
* Log level to set for the default DiagConsoleLogger when tracing.
* Omit to disable default diag logging, or to bring your own.
*/
diagLogLevel?: DiagLogLevel;
}): Promise<RanExperiment>;
/**
* Wrap an evaluator function in an object with a name property.
*
* @experimental This feature is not complete, and will change in the future.
*
* @param params - The parameters for creating the evaluator
* @param params.name - The name of the evaluator.
* @param params.kind - The kind of evaluator (e.g., "CODE", "LLM")
* @param params.evaluate - The evaluator function.
* @returns The evaluator object.
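*
* @example
* A minimal sketch (the evaluate body is illustrative; it echoes the task
* output back as the evaluation result):
* ```ts
* const evaluator = asEvaluator({
*   name: "my-evaluator",
*   kind: "CODE",
*   evaluate: async (params) => params.output,
* });
* ```
*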
* @deprecated use asExperimentEvaluator instead
*/
export declare function asEvaluator({ name, kind, evaluate, }: {
name: string;
kind: AnnotatorKind;
evaluate: Evaluator["evaluate"];
}): Evaluator;
//# sourceMappingURL=runExperiment.d.ts.map