@arizeai/phoenix-client
Version:
A client for the Phoenix API
524 lines • 25.3 kB
JavaScript
"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.runExperiment = runExperiment;
exports.evaluateExperiment = evaluateExperiment;
exports.asEvaluator = asEvaluator;
const openinference_semantic_conventions_1 = require("@arizeai/openinference-semantic-conventions");
const phoenix_otel_1 = require("@arizeai/phoenix-otel");
const client_1 = require("../client");
const getDataset_1 = require("../datasets/getDataset");
const ensureString_1 = require("../utils/ensureString");
const pluralize_1 = require("../utils/pluralize");
const promisifyResult_1 = require("../utils/promisifyResult");
const toObjectHeaders_1 = require("../utils/toObjectHeaders");
const urlUtils_1 = require("../utils/urlUtils");
const getExperimentInfo_1 = require("./getExperimentInfo");
const helpers_1 = require("./helpers");
const assert_1 = __importDefault(require("assert"));
const async_1 = require("async");
const tiny_invariant_1 = __importDefault(require("tiny-invariant"));
/**
* Validate that `repetitions` is a positive integer
*/
function isValidRepetitionParam(repetitions) {
return Number.isInteger(repetitions) && repetitions > 0;
}
/**
* Runs an experiment using a given dataset of examples.
*
* An experiment is a user-defined task that runs on each example in a dataset. The results from
* each experiment can be evaluated using any number of evaluators to measure the behavior of the
* task. The experiment and evaluation results are stored in the Phoenix database for comparison
* and analysis.
*
* A `task` is either a sync or async function that returns a JSON serializable
* output. If the `task` is a function of one argument then that argument will be bound to the
* `input` field of the dataset example. Alternatively, the `task` can be a function of any
* combination of specific argument names that will be bound to special values:
*
* - `input`: The input field of the dataset example
* - `expected`: The expected or reference output of the dataset example
* - `reference`: An alias for `expected`
* - `metadata`: Metadata associated with the dataset example
* - `example`: The dataset `Example` object with all associated fields
*
* @example
* ```ts
* import { asEvaluator, runExperiment } from "@arizeai/phoenix-client/experiments";
*
* const experiment = await runExperiment({
* dataset: "my-dataset",
* task: async (example) => example.input,
* evaluators: [
* asEvaluator({ name: "my-evaluator", kind: "CODE", evaluate: async (params) => params.output }),
* ],
* });
* ```
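*
* @example
* A second, illustrative sketch (not from the original docs): a scored evaluator
* plus a dry run. The dataset name is a placeholder, and the `{ score }` result
* shape is an assumption about the evaluation result type.
* ```ts
* import { asEvaluator, runExperiment } from "@arizeai/phoenix-client/experiments";
*
* const matchesExpected = asEvaluator({
*   name: "matches-expected",
*   kind: "CODE",
*   // evaluate receives { input, output, expected, metadata } for each run
*   evaluate: async ({ output, expected }) => ({
*     score: JSON.stringify(output) === JSON.stringify(expected) ? 1 : 0,
*   }),
* });
*
* // dryRun: 3 runs the task on at most 3 examples without recording results
* const experiment = await runExperiment({
*   dataset: "my-dataset",
*   task: async (example) => example.input,
*   evaluators: [matchesExpected],
*   dryRun: 3,
*   concurrency: 2,
* });
* ```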
*/
async function runExperiment({ experimentName, experimentDescription, experimentMetadata = {}, client: _client, dataset: datasetSelector, task, evaluators, logger = console, record = true, concurrency = 5, dryRun = false, setGlobalTracerProvider = true, repetitions = 1, useBatchSpanProcessor = true, diagLogLevel, }) {
var _a, _b, _c, _d, _e;
// Validation
(0, assert_1.default)(isValidRepetitionParam(repetitions), "repetitions must be an integer greater than 0");
let provider;
const isDryRun = typeof dryRun === "number" || dryRun === true;
const client = _client !== null && _client !== void 0 ? _client : (0, client_1.createClient)();
const dataset = await (0, getDataset_1.getDataset)({
dataset: datasetSelector,
client,
});
(0, tiny_invariant_1.default)(dataset, `Dataset not found`);
(0, tiny_invariant_1.default)(dataset.examples.length > 0, `Dataset has no examples`);
const nExamples = typeof dryRun === "number"
? Math.min(dryRun, dataset.examples.length)
: dataset.examples.length;
let projectName = `${dataset.name}-exp-${new Date().toISOString()}`;
// initialize the tracer into scope
let taskTracer;
let experiment;
if (isDryRun) {
const now = new Date().toISOString();
const totalExamples = nExamples;
experiment = {
id: localId(),
datasetId: dataset.id,
datasetVersionId: dataset.versionId,
// @todo: the dataset should return splits in response body
datasetSplits: (_a = datasetSelector === null || datasetSelector === void 0 ? void 0 : datasetSelector.splits) !== null && _a !== void 0 ? _a : [],
projectName,
metadata: experimentMetadata,
repetitions,
createdAt: now,
updatedAt: now,
exampleCount: totalExamples,
successfulRunCount: 0,
failedRunCount: 0,
missingRunCount: totalExamples * repetitions,
};
taskTracer = (0, phoenix_otel_1.createNoOpProvider)().getTracer("no-op");
}
else {
const experimentResponse = await client
.POST("/v1/datasets/{dataset_id}/experiments", {
params: {
path: {
dataset_id: dataset.id,
},
},
body: Object.assign(Object.assign({ name: experimentName, description: experimentDescription, metadata: experimentMetadata, project_name: projectName, repetitions }, ((datasetSelector === null || datasetSelector === void 0 ? void 0 : datasetSelector.splits)
? { splits: datasetSelector.splits }
: {})), ((dataset === null || dataset === void 0 ? void 0 : dataset.versionId) ? { version_id: dataset.versionId } : {})),
})
.then((res) => { var _a; return (_a = res.data) === null || _a === void 0 ? void 0 : _a.data; });
(0, tiny_invariant_1.default)(experimentResponse, `Failed to create experiment`);
projectName = (_b = experimentResponse.project_name) !== null && _b !== void 0 ? _b : projectName;
experiment = {
id: experimentResponse.id,
datasetId: experimentResponse.dataset_id,
datasetVersionId: experimentResponse.dataset_version_id,
// @todo: the dataset should return splits in response body
datasetSplits: (_c = datasetSelector === null || datasetSelector === void 0 ? void 0 : datasetSelector.splits) !== null && _c !== void 0 ? _c : [],
projectName,
repetitions: experimentResponse.repetitions,
metadata: experimentResponse.metadata || {},
createdAt: experimentResponse.created_at,
updatedAt: experimentResponse.updated_at,
exampleCount: experimentResponse.example_count,
successfulRunCount: experimentResponse.successful_run_count,
failedRunCount: experimentResponse.failed_run_count,
missingRunCount: experimentResponse.missing_run_count,
};
// Initialize the tracer, now that we have a project name
const baseUrl = client.config.baseUrl;
(0, tiny_invariant_1.default)(baseUrl, "Phoenix base URL not found. Please set PHOENIX_HOST or set baseUrl on the client.");
provider = (0, phoenix_otel_1.register)({
projectName,
url: baseUrl,
headers: client.config.headers
? (0, toObjectHeaders_1.toObjectHeaders)(client.config.headers)
: undefined,
batch: useBatchSpanProcessor,
diagLogLevel,
global: setGlobalTracerProvider,
});
taskTracer = provider.getTracer(projectName);
}
if (!record) {
logger.info(`🔧 Running experiment in readonly mode. Results will not be recorded.`);
}
if (!isDryRun && client.config.baseUrl) {
const datasetUrl = (0, urlUtils_1.getDatasetUrl)({
baseUrl: client.config.baseUrl,
datasetId: dataset.id,
});
const datasetExperimentsUrl = (0, urlUtils_1.getDatasetExperimentsUrl)({
baseUrl: client.config.baseUrl,
datasetId: dataset.id,
});
const experimentUrl = (0, urlUtils_1.getExperimentUrl)({
baseUrl: client.config.baseUrl,
datasetId: dataset.id,
experimentId: experiment.id,
});
logger.info(`🔗 View dataset: ${datasetUrl}`);
logger.info(`📺 View dataset experiments: ${datasetExperimentsUrl}`);
logger.info(`🔗 View this experiment: ${experimentUrl}`);
}
logger.info(`🧪 Starting experiment "${experimentName || `<unnamed>`}" on dataset "${dataset.id}" with task "${task.name}" and ${(_d = evaluators === null || evaluators === void 0 ? void 0 : evaluators.length) !== null && _d !== void 0 ? _d : 0} ${(0, pluralize_1.pluralize)("evaluator", (_e = evaluators === null || evaluators === void 0 ? void 0 : evaluators.length) !== null && _e !== void 0 ? _e : 0)} and ${concurrency} concurrent runs`);
const runs = {};
await runTaskWithExamples({
client,
experimentId: experiment.id,
task,
dataset,
logger,
onComplete: (run) => {
runs[run.id] = run;
},
concurrency,
isDryRun,
nExamples,
tracer: taskTracer,
repetitions,
});
logger.info(`✅ Task runs completed`);
const ranExperiment = Object.assign(Object.assign({}, experiment), { runs });
const { evaluationRuns } = await evaluateExperiment({
experiment: ranExperiment,
evaluators: evaluators !== null && evaluators !== void 0 ? evaluators : [],
client,
logger,
concurrency,
dryRun,
tracerProvider: provider,
diagLogLevel,
useBatchSpanProcessor,
});
ranExperiment.evaluationRuns = evaluationRuns;
logger.info(`✅ Experiment ${experiment.id} completed`);
// Refresh experiment info from server to get updated counts (non-dry-run only)
if (!isDryRun) {
const updatedExperiment = await (0, getExperimentInfo_1.getExperimentInfo)({
client,
experimentId: experiment.id,
});
// Update the experiment info with the latest from the server
Object.assign(ranExperiment, updatedExperiment);
}
if (!isDryRun && client.config.baseUrl) {
const experimentUrl = (0, urlUtils_1.getExperimentUrl)({
baseUrl: client.config.baseUrl,
datasetId: dataset.id,
experimentId: experiment.id,
});
logger.info(`🔗 View results: ${experimentUrl}`);
}
return ranExperiment;
}
/**
* Run a task against n examples in a dataset.
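*
* Each of the first `nExamples` examples is enqueued `repetitions` times
* (repetition numbers start at 1) and processed by an async queue with the
* given `concurrency`; each completed run is reported through `onComplete`.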
*/
function runTaskWithExamples({ client, experimentId, task, dataset, onComplete, logger, concurrency = 5, isDryRun, nExamples, tracer, repetitions = 1, }) {
// Validate the input
(0, assert_1.default)(isValidRepetitionParam(repetitions), "repetitions must be an integer greater than 0");
logger.info(`🔧 Running task "${task.name}" on dataset "${dataset.id}"`);
const run = async ({ example, repetitionNumber, }) => {
return tracer.startActiveSpan(`Task: ${task.name}`, async (span) => {
var _a, _b;
logger.info(`🔧 Running task "${task.name}" on example "${example.id}" of dataset "${dataset.id}"`);
const traceId = span.spanContext().traceId;
const thisRun = {
id: localId(), // initialized with local id, will be replaced with server-assigned id when dry run is false
traceId,
experimentId,
datasetExampleId: example.id,
startTime: new Date(),
endTime: new Date(), // will get replaced with actual end time
output: null,
error: null,
};
try {
const taskOutput = await (0, promisifyResult_1.promisifyResult)(task(example));
thisRun.output = taskOutput;
}
catch (error) {
thisRun.error =
error instanceof Error ? error.message : "Unknown error";
span.setStatus({ code: phoenix_otel_1.SpanStatusCode.ERROR });
}
thisRun.endTime = new Date();
if (!isDryRun) {
// Log the run to the server
const res = await client.POST("/v1/experiments/{experiment_id}/runs", {
params: {
path: {
experiment_id: experimentId,
},
},
body: {
dataset_example_id: example.id,
output: thisRun.output,
repetition_number: repetitionNumber,
start_time: thisRun.startTime.toISOString(),
end_time: thisRun.endTime.toISOString(),
trace_id: thisRun.traceId,
error: thisRun.error,
},
});
// replace the local run id with the server-assigned id
thisRun.id = (_b = (_a = res.data) === null || _a === void 0 ? void 0 : _a.data.id) !== null && _b !== void 0 ? _b : thisRun.id;
const inputMimeType = typeof example.input === "string" ? openinference_semantic_conventions_1.MimeType.TEXT : openinference_semantic_conventions_1.MimeType.JSON;
const outputMimeType = typeof thisRun.output === "string" ? openinference_semantic_conventions_1.MimeType.TEXT : openinference_semantic_conventions_1.MimeType.JSON;
span.setStatus({ code: phoenix_otel_1.SpanStatusCode.OK });
span.setAttributes({
[openinference_semantic_conventions_1.SemanticConventions.OPENINFERENCE_SPAN_KIND]: openinference_semantic_conventions_1.OpenInferenceSpanKind.CHAIN,
[openinference_semantic_conventions_1.SemanticConventions.INPUT_MIME_TYPE]: inputMimeType,
[openinference_semantic_conventions_1.SemanticConventions.INPUT_VALUE]: (0, ensureString_1.ensureString)(example.input),
[openinference_semantic_conventions_1.SemanticConventions.OUTPUT_MIME_TYPE]: outputMimeType,
[openinference_semantic_conventions_1.SemanticConventions.OUTPUT_VALUE]: (0, ensureString_1.ensureString)(thisRun.output),
});
}
span === null || span === void 0 ? void 0 : span.end();
onComplete(thisRun);
return thisRun;
});
};
const q = (0, async_1.queue)(run, concurrency);
const examplesToUse = dataset.examples.slice(0, nExamples);
examplesToUse
.flatMap((example) => Array.from({ length: repetitions }, (_, index) => ({
example,
repetitionNumber: index + 1, // Repetitions start at 1
})))
.forEach((exampleWithRepetition) => q.push(exampleWithRepetition, (err) => {
if (err) {
logger.error(`Error running task "${task.name}" on example "${exampleWithRepetition.example.id}" repetition ${exampleWithRepetition.repetitionNumber}: ${err}`);
}
}));
return q.drain();
}
/**
* Evaluate an experiment.
*
* @experimental This feature is not complete, and will change in the future.
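*
* @example
* A minimal sketch, assuming you already hold the experiment object returned by
* `runExperiment` (it carries the task runs); names are placeholders and the
* `{ score }` result shape is an assumption about the evaluation result type.
* ```ts
* import { asEvaluator, evaluateExperiment, runExperiment } from "@arizeai/phoenix-client/experiments";
*
* const experiment = await runExperiment({
*   dataset: "my-dataset",
*   task: async (example) => example.input,
* });
*
* // Attach evaluators after the task runs have completed
* const evaluated = await evaluateExperiment({
*   experiment,
*   evaluators: [
*     asEvaluator({
*       name: "non-empty-output",
*       kind: "CODE",
*       evaluate: async ({ output }) => ({ score: output ? 1 : 0 }),
*     }),
*   ],
* });
* ```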
*/
async function evaluateExperiment({ experiment, evaluators, client: _client, logger = console, concurrency = 5, dryRun = false, setGlobalTracerProvider = true, useBatchSpanProcessor = true, tracerProvider: paramsTracerProvider, diagLogLevel, }) {
var _a, _b;
const isDryRun = typeof dryRun === "number" || dryRun === true;
const client = _client !== null && _client !== void 0 ? _client : (0, client_1.createClient)();
const baseUrl = client.config.baseUrl;
(0, tiny_invariant_1.default)(baseUrl, "Phoenix base URL not found. Please set PHOENIX_HOST or set baseUrl on the client.");
let provider;
// Always allow changing of tracer providers
if (paramsTracerProvider) {
provider = paramsTracerProvider;
}
else if (!isDryRun) {
provider = (0, phoenix_otel_1.register)({
projectName: "evaluators",
url: baseUrl,
headers: client.config.headers
? (0, toObjectHeaders_1.toObjectHeaders)(client.config.headers)
: undefined,
batch: useBatchSpanProcessor,
diagLogLevel,
global: setGlobalTracerProvider,
});
}
else {
provider = (0, phoenix_otel_1.createNoOpProvider)();
}
const tracer = isDryRun
? provider.getTracer("no-op")
: provider.getTracer("evaluators");
const nRuns = typeof dryRun === "number"
? Math.min(dryRun, Object.keys(experiment.runs).length)
: Object.keys(experiment.runs).length;
const dataset = await (0, getDataset_1.getDataset)({
dataset: {
datasetId: experiment.datasetId,
versionId: experiment.datasetVersionId,
splits: experiment.datasetSplits,
},
client,
});
(0, tiny_invariant_1.default)(dataset, `Dataset "${experiment.datasetId}" not found`);
(0, tiny_invariant_1.default)(dataset.examples.length > 0, `Dataset "${experiment.datasetId}" has no examples`);
(0, tiny_invariant_1.default)(experiment.runs, `Experiment "${experiment.id}" has no runs`);
const runsToEvaluate = Object.values(experiment.runs).slice(0, nRuns);
if ((evaluators === null || evaluators === void 0 ? void 0 : evaluators.length) === 0) {
return Object.assign(Object.assign({}, experiment), { evaluationRuns: [] });
}
logger.info(`🔧 Evaluating experiment "${experiment.id}" with ${(_a = evaluators === null || evaluators === void 0 ? void 0 : evaluators.length) !== null && _a !== void 0 ? _a : 0} ${(0, pluralize_1.pluralize)("evaluator", (_b = evaluators === null || evaluators === void 0 ? void 0 : evaluators.length) !== null && _b !== void 0 ? _b : 0)}`);
if (!isDryRun && client.config.baseUrl) {
const experimentUrl = (0, urlUtils_1.getExperimentUrl)({
baseUrl: client.config.baseUrl,
datasetId: experiment.datasetId,
experimentId: experiment.id,
});
logger.info(`🔗 View experiment evaluation: ${experimentUrl}`);
}
const evaluationRuns = {};
const examplesById = {};
for (const example of dataset.examples) {
examplesById[example.id] = example;
}
const onEvaluationComplete = (run) => {
evaluationRuns[run.id] = run;
};
// Run evaluators against all runs
// Flat list of evaluator + run tuples
const normalizedEvaluators = (0, helpers_1.getExperimentEvaluators)(evaluators);
const evaluatorsAndRuns = normalizedEvaluators.flatMap((evaluator) => runsToEvaluate.map((run) => ({
evaluator,
run,
})));
const evaluatorsQueue = (0, async_1.queue)(async (evaluatorAndRun) => {
return tracer.startActiveSpan(`Evaluation: ${evaluatorAndRun.evaluator.name}`, async (span) => {
var _a, _b, _c;
const evalResult = await runEvaluator({
evaluator: evaluatorAndRun.evaluator,
run: evaluatorAndRun.run,
exampleCache: examplesById,
onComplete: onEvaluationComplete,
logger,
});
span.setAttributes({
[openinference_semantic_conventions_1.SemanticConventions.OPENINFERENCE_SPAN_KIND]: openinference_semantic_conventions_1.OpenInferenceSpanKind.EVALUATOR,
[openinference_semantic_conventions_1.SemanticConventions.INPUT_MIME_TYPE]: openinference_semantic_conventions_1.MimeType.JSON,
[openinference_semantic_conventions_1.SemanticConventions.INPUT_VALUE]: (0, ensureString_1.ensureString)({
input: (_a = examplesById[evaluatorAndRun.run.datasetExampleId]) === null || _a === void 0 ? void 0 : _a.input,
output: evaluatorAndRun.run.output,
expected: (_b = examplesById[evaluatorAndRun.run.datasetExampleId]) === null || _b === void 0 ? void 0 : _b.output,
metadata: (_c = examplesById[evaluatorAndRun.run.datasetExampleId]) === null || _c === void 0 ? void 0 : _c.metadata,
}),
[openinference_semantic_conventions_1.SemanticConventions.OUTPUT_MIME_TYPE]: openinference_semantic_conventions_1.MimeType.JSON,
[openinference_semantic_conventions_1.SemanticConventions.OUTPUT_VALUE]: (0, ensureString_1.ensureString)(evalResult.result),
});
if (evalResult.error) {
span.setStatus({
code: phoenix_otel_1.SpanStatusCode.ERROR,
message: evalResult.error,
});
}
else {
span.setStatus({ code: phoenix_otel_1.SpanStatusCode.OK });
}
if (evalResult.result) {
span.setAttributes((0, phoenix_otel_1.objectAsAttributes)(evalResult.result));
}
evalResult.traceId = span.spanContext().traceId;
if (!isDryRun) {
// Log the evaluation to the server
// We log this without awaiting (i.e. best effort)
client.POST("/v1/experiment_evaluations", {
body: {
experiment_run_id: evaluatorAndRun.run.id,
name: evaluatorAndRun.evaluator.name,
annotator_kind: evaluatorAndRun.evaluator.kind,
start_time: evalResult.startTime.toISOString(),
end_time: evalResult.endTime.toISOString(),
result: Object.assign({}, evalResult.result),
error: evalResult.error,
trace_id: evalResult.traceId,
},
});
}
span.end();
return evalResult;
});
}, concurrency);
if (!evaluatorsAndRuns.length) {
logger.info(`❌ No evaluators to run`);
return Object.assign(Object.assign({}, experiment), { evaluationRuns: [] });
}
evaluatorsAndRuns.forEach((evaluatorAndRun) => evaluatorsQueue.push(evaluatorAndRun, (err) => {
if (err) {
logger.error(`❌ Error running evaluator "${evaluatorAndRun.evaluator.name}" on run "${evaluatorAndRun.run.id}": ${err}`);
}
}));
await evaluatorsQueue.drain();
logger.info(`✅ Evaluation runs completed`);
if (provider) {
await provider.shutdown();
// Make sure it's not set globally anymore
if (setGlobalTracerProvider) {
phoenix_otel_1.trace.disable();
}
}
return Object.assign(Object.assign({}, experiment), { evaluationRuns: Object.values(evaluationRuns) });
}
/**
* Run an evaluator against a run.
*
* @experimental This feature is not complete, and will change in the future.
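*
* The evaluator's `evaluate` callback is invoked with
* `{ input, output, expected, metadata }`, where `expected` is the example's
* reference output and `output` is the run's task output (or null).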
*/
async function runEvaluator({ evaluator, run, exampleCache, onComplete, logger, }) {
const example = exampleCache[run.datasetExampleId];
(0, tiny_invariant_1.default)(example, `Example "${run.datasetExampleId}" not found`);
const evaluate = async () => {
var _a;
logger.info(`🔧 Evaluating run "${run.id}" with evaluator "${evaluator.name}"`);
const thisEval = {
id: localId(),
traceId: null,
experimentRunId: run.id,
startTime: new Date(),
endTime: new Date(), // will get replaced with actual end time
name: evaluator.name,
result: null,
error: null,
annotatorKind: evaluator.kind,
};
try {
const result = await evaluator.evaluate({
input: example.input,
output: (_a = run.output) !== null && _a !== void 0 ? _a : null,
expected: example.output,
metadata: example === null || example === void 0 ? void 0 : example.metadata,
});
thisEval.result = result;
logger.info(`✅ Evaluator "${evaluator.name}" on run "${run.id}" completed`);
}
catch (error) {
thisEval.error = error instanceof Error ? error.message : "Unknown error";
logger.error(`❌ Evaluator "${evaluator.name}" on run "${run.id}" failed: ${thisEval.error}`);
}
thisEval.endTime = new Date();
onComplete(thisEval);
return thisEval;
};
return evaluate();
}
/**
* Wrap an evaluator function in an object with name and kind properties.
*
* @experimental This feature is not complete, and will change in the future.
*
* @param params - The parameters for creating the evaluator
* @param params.name - The name of the evaluator.
* @param params.kind - The kind of evaluator (e.g., "CODE", "LLM")
* @param params.evaluate - The evaluator function.
* @returns The evaluator object.
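*
* @example
* A brief sketch; the `{ score }` result shape is illustrative:
* ```ts
* const hasOutput = asEvaluator({
*   name: "has-output",
*   kind: "CODE",
*   evaluate: async ({ output }) => ({ score: output == null ? 0 : 1 }),
* });
* ```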
* @deprecated use asExperimentEvaluator instead
*/
function asEvaluator({ name, kind, evaluate, }) {
return {
name,
kind,
evaluate,
};
}
let _localIdIndex = 1000;
/**
* Generate a local id.
*
* @returns A semi-unique id.
*/
function localId() {
_localIdIndex++;
return `local_${_localIdIndex}`;
}
//# sourceMappingURL=runExperiment.js.map