UNPKG

@arizeai/phoenix-client

Version:

A client for the Phoenix API

524 lines • 25.3 kB

JavaScript

"use strict"; var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; Object.defineProperty(exports, "__esModule", { value: true }); exports.runExperiment = runExperiment; exports.evaluateExperiment = evaluateExperiment; exports.asEvaluator = asEvaluator; const openinference_semantic_conventions_1 = require("@arizeai/openinference-semantic-conventions"); const phoenix_otel_1 = require("@arizeai/phoenix-otel"); const client_1 = require("../client"); const getDataset_1 = require("../datasets/getDataset"); const ensureString_1 = require("../utils/ensureString"); const pluralize_1 = require("../utils/pluralize"); const promisifyResult_1 = require("../utils/promisifyResult"); const toObjectHeaders_1 = require("../utils/toObjectHeaders"); const urlUtils_1 = require("../utils/urlUtils"); const getExperimentInfo_1 = require("./getExperimentInfo"); const helpers_1 = require("./helpers"); const assert_1 = __importDefault(require("assert")); const async_1 = require("async"); const tiny_invariant_1 = __importDefault(require("tiny-invariant")); /** * Validate that a repetition is valid */ function isValidRepetitionParam(repetitions) { return Number.isInteger(repetitions) && repetitions > 0; } /** * Runs an experiment using a given set of dataset of examples. * * An experiment is a user-defined task that runs on each example in a dataset. The results from * each experiment can be evaluated using any number of evaluators to measure the behavior of the * task. The experiment and evaluation results are stored in the Phoenix database for comparison * and analysis. * * A `task` is either a sync or async function that returns a JSON serializable * output. If the `task` is a function of one argument then that argument will be bound to the * `input` field of the dataset example. Alternatively, the `task` can be a function of any * combination of specific argument names that will be bound to special values: * * - `input`: The input field of the dataset example * - `expected`: The expected or reference output of the dataset example * - `reference`: An alias for `expected` * - `metadata`: Metadata associated with the dataset example * - `example`: The dataset `Example` object with all associated fields * * @example * ```ts * import { asEvaluator, runExperiment } from "@phoenix/client/experiments"; * * const experiment = await runExperiment({ * dataset: "my-dataset", * task: async (example) => example.input, * evaluators: [ * asEvaluator({ name: "my-evaluator", kind: "CODE", evaluate: async (params) => params.output }), * ], * }); * ``` */ async function runExperiment({ experimentName, experimentDescription, experimentMetadata = {}, client: _client, dataset: datasetSelector, task, evaluators, logger = console, record = true, concurrency = 5, dryRun = false, setGlobalTracerProvider = true, repetitions = 1, useBatchSpanProcessor = true, diagLogLevel, }) { var _a, _b, _c, _d, _e; // Validation (0, assert_1.default)(isValidRepetitionParam(repetitions), "repetitions must be an integer greater than 0"); let provider; const isDryRun = typeof dryRun === "number" || dryRun === true; const client = _client !== null && _client !== void 0 ? _client : (0, client_1.createClient)(); const dataset = await (0, getDataset_1.getDataset)({ dataset: datasetSelector, client, }); (0, tiny_invariant_1.default)(dataset, `Dataset not found`); (0, tiny_invariant_1.default)(dataset.examples.length > 0, `Dataset has no examples`); const nExamples = typeof dryRun === "number" ? Math.min(dryRun, dataset.examples.length) : dataset.examples.length; let projectName = `${dataset.name}-exp-${new Date().toISOString()}`; // initialize the tracer into scope let taskTracer; let experiment; if (isDryRun) { const now = new Date().toISOString(); const totalExamples = nExamples; experiment = { id: localId(), datasetId: dataset.id, datasetVersionId: dataset.versionId, // @todo: the dataset should return splits in response body datasetSplits: (_a = datasetSelector === null || datasetSelector === void 0 ? void 0 : datasetSelector.splits) !== null && _a !== void 0 ? _a : [], projectName, metadata: experimentMetadata, repetitions, createdAt: now, updatedAt: now, exampleCount: totalExamples, successfulRunCount: 0, failedRunCount: 0, missingRunCount: totalExamples * repetitions, }; taskTracer = (0, phoenix_otel_1.createNoOpProvider)().getTracer("no-op"); } else { const experimentResponse = await client .POST("/v1/datasets/{dataset_id}/experiments", { params: { path: { dataset_id: dataset.id, }, }, body: Object.assign(Object.assign({ name: experimentName, description: experimentDescription, metadata: experimentMetadata, project_name: projectName, repetitions }, ((datasetSelector === null || datasetSelector === void 0 ? void 0 : datasetSelector.splits) ? { splits: datasetSelector.splits } : {})), ((dataset === null || dataset === void 0 ? void 0 : dataset.versionId) ? { version_id: dataset.versionId } : {})), }) .then((res) => { var _a; return (_a = res.data) === null || _a === void 0 ? void 0 : _a.data; }); (0, tiny_invariant_1.default)(experimentResponse, `Failed to create experiment`); projectName = (_b = experimentResponse.project_name) !== null && _b !== void 0 ? _b : projectName; experiment = { id: experimentResponse.id, datasetId: experimentResponse.dataset_id, datasetVersionId: experimentResponse.dataset_version_id, // @todo: the dataset should return splits in response body datasetSplits: (_c = datasetSelector === null || datasetSelector === void 0 ? void 0 : datasetSelector.splits) !== null && _c !== void 0 ? _c : [], projectName, repetitions: experimentResponse.repetitions, metadata: experimentResponse.metadata || {}, createdAt: experimentResponse.created_at, updatedAt: experimentResponse.updated_at, exampleCount: experimentResponse.example_count, successfulRunCount: experimentResponse.successful_run_count, failedRunCount: experimentResponse.failed_run_count, missingRunCount: experimentResponse.missing_run_count, }; // Initialize the tracer, now that we have a project name const baseUrl = client.config.baseUrl; (0, tiny_invariant_1.default)(baseUrl, "Phoenix base URL not found. Please set PHOENIX_HOST or set baseUrl on the client."); provider = (0, phoenix_otel_1.register)({ projectName, url: baseUrl, headers: client.config.headers ? (0, toObjectHeaders_1.toObjectHeaders)(client.config.headers) : undefined, batch: useBatchSpanProcessor, diagLogLevel, global: setGlobalTracerProvider, }); taskTracer = provider.getTracer(projectName); } if (!record) { logger.info(`🔧 Running experiment in readonly mode. Results will not be recorded.`); } if (!isDryRun && client.config.baseUrl) { const datasetUrl = (0, urlUtils_1.getDatasetUrl)({ baseUrl: client.config.baseUrl, datasetId: dataset.id, }); const datasetExperimentsUrl = (0, urlUtils_1.getDatasetExperimentsUrl)({ baseUrl: client.config.baseUrl, datasetId: dataset.id, }); const experimentUrl = (0, urlUtils_1.getExperimentUrl)({ baseUrl: client.config.baseUrl, datasetId: dataset.id, experimentId: experiment.id, }); logger.info(`📊 View dataset: ${datasetUrl}`); logger.info(`📺 View dataset experiments: ${datasetExperimentsUrl}`); logger.info(`🔗 View this experiment: ${experimentUrl}`); } logger.info(`🧪 Starting experiment "${experimentName || `<unnamed>`}" on dataset "${dataset.id}" with task "${task.name}" and ${(_d = evaluators === null || evaluators === void 0 ? void 0 : evaluators.length) !== null && _d !== void 0 ? _d : 0} ${(0, pluralize_1.pluralize)("evaluator", (_e = evaluators === null || evaluators === void 0 ? void 0 : evaluators.length) !== null && _e !== void 0 ? _e : 0)} and ${concurrency} concurrent runs`); const runs = {}; await runTaskWithExamples({ client, experimentId: experiment.id, task, dataset, logger, onComplete: (run) => { runs[run.id] = run; }, concurrency, isDryRun, nExamples, tracer: taskTracer, repetitions, }); logger.info(`✅ Task runs completed`); const ranExperiment = Object.assign(Object.assign({}, experiment), { runs }); const { evaluationRuns } = await evaluateExperiment({ experiment: ranExperiment, evaluators: evaluators !== null && evaluators !== void 0 ? evaluators : [], client, logger, concurrency, dryRun, tracerProvider: provider, diagLogLevel, useBatchSpanProcessor, }); ranExperiment.evaluationRuns = evaluationRuns; logger.info(`✅ Experiment ${experiment.id} completed`); // Refresh experiment info from server to get updated counts (non-dry-run only) if (!isDryRun) { const updatedExperiment = await (0, getExperimentInfo_1.getExperimentInfo)({ client, experimentId: experiment.id, }); // Update the experiment info with the latest from the server Object.assign(ranExperiment, updatedExperiment); } if (!isDryRun && client.config.baseUrl) { const experimentUrl = (0, urlUtils_1.getExperimentUrl)({ baseUrl: client.config.baseUrl, datasetId: dataset.id, experimentId: experiment.id, }); logger.info(`🔍 View results: ${experimentUrl}`); } return ranExperiment; } /** * Run a task against n examples in a dataset. */ function runTaskWithExamples({ client, experimentId, task, dataset, onComplete, logger, concurrency = 5, isDryRun, nExamples, tracer, repetitions = 1, }) { // Validate the input (0, assert_1.default)(isValidRepetitionParam(repetitions), "repetitions must be an integer greater than 0"); logger.info(`🔧 Running task "${task.name}" on dataset "${dataset.id}"`); const run = async ({ example, repetitionNumber, }) => { return tracer.startActiveSpan(`Task: ${task.name}`, async (span) => { var _a, _b; logger.info(`🔧 Running task "${task.name}" on example "${example.id} of dataset "${dataset.id}"`); const traceId = span.spanContext().traceId; const thisRun = { id: localId(), // initialized with local id, will be replaced with server-assigned id when dry run is false traceId, experimentId, datasetExampleId: example.id, startTime: new Date(), endTime: new Date(), // will get replaced with actual end time output: null, error: null, }; try { const taskOutput = await (0, promisifyResult_1.promisifyResult)(task(example)); thisRun.output = taskOutput; } catch (error) { thisRun.error = error instanceof Error ? error.message : "Unknown error"; span.setStatus({ code: phoenix_otel_1.SpanStatusCode.ERROR }); } thisRun.endTime = new Date(); if (!isDryRun) { // Log the run to the server const res = await client.POST("/v1/experiments/{experiment_id}/runs", { params: { path: { experiment_id: experimentId, }, }, body: { dataset_example_id: example.id, output: thisRun.output, repetition_number: repetitionNumber, start_time: thisRun.startTime.toISOString(), end_time: thisRun.endTime.toISOString(), trace_id: thisRun.traceId, error: thisRun.error, }, }); // replace the local run id with the server-assigned id thisRun.id = (_b = (_a = res.data) === null || _a === void 0 ? void 0 : _a.data.id) !== null && _b !== void 0 ? _b : thisRun.id; const inputMimeType = typeof example.input === "string" ? openinference_semantic_conventions_1.MimeType.TEXT : openinference_semantic_conventions_1.MimeType.JSON; const outputMimeType = typeof thisRun.output === "string" ? openinference_semantic_conventions_1.MimeType.TEXT : openinference_semantic_conventions_1.MimeType.JSON; span.setStatus({ code: phoenix_otel_1.SpanStatusCode.OK }); span.setAttributes({ [openinference_semantic_conventions_1.SemanticConventions.OPENINFERENCE_SPAN_KIND]: openinference_semantic_conventions_1.OpenInferenceSpanKind.CHAIN, [openinference_semantic_conventions_1.SemanticConventions.INPUT_MIME_TYPE]: inputMimeType, [openinference_semantic_conventions_1.SemanticConventions.INPUT_VALUE]: (0, ensureString_1.ensureString)(example.input), [openinference_semantic_conventions_1.SemanticConventions.OUTPUT_MIME_TYPE]: outputMimeType, [openinference_semantic_conventions_1.SemanticConventions.OUTPUT_VALUE]: (0, ensureString_1.ensureString)(thisRun.output), }); } span === null || span === void 0 ? void 0 : span.end(); onComplete(thisRun); return thisRun; }); }; const q = (0, async_1.queue)(run, concurrency); const examplesToUse = dataset.examples.slice(0, nExamples); examplesToUse .flatMap((example) => Array.from({ length: repetitions }, (_, index) => ({ example, repetitionNumber: index + 1, // Repetitions start at 1 }))) .forEach((exampleWithRepetition) => q.push(exampleWithRepetition, (err) => { if (err) { logger.error(`Error running task "${task.name}" on example "${exampleWithRepetition.example.id}" repetition ${exampleWithRepetition.repetitionNumber}: ${err}`); } })); return q.drain(); } /** * Evaluate an experiment. * * @experimental This feature is not complete, and will change in the future. */ async function evaluateExperiment({ experiment, evaluators, client: _client, logger = console, concurrency = 5, dryRun = false, setGlobalTracerProvider = true, useBatchSpanProcessor = true, tracerProvider: paramsTracerProvider, diagLogLevel, }) { var _a, _b; const isDryRun = typeof dryRun === "number" || dryRun === true; const client = _client !== null && _client !== void 0 ? _client : (0, client_1.createClient)(); const baseUrl = client.config.baseUrl; (0, tiny_invariant_1.default)(baseUrl, "Phoenix base URL not found. Please set PHOENIX_HOST or set baseUrl on the client."); let provider; // Always allow changing of tracer providers if (paramsTracerProvider) { provider = paramsTracerProvider; } else if (!isDryRun) { provider = (0, phoenix_otel_1.register)({ projectName: "evaluators", url: baseUrl, headers: client.config.headers ? (0, toObjectHeaders_1.toObjectHeaders)(client.config.headers) : undefined, batch: useBatchSpanProcessor, diagLogLevel, global: setGlobalTracerProvider, }); } else { provider = (0, phoenix_otel_1.createNoOpProvider)(); } const tracer = isDryRun ? provider.getTracer("no-op") : provider.getTracer("evaluators"); const nRuns = typeof dryRun === "number" ? Math.min(dryRun, Object.keys(experiment.runs).length) : Object.keys(experiment.runs).length; const dataset = await (0, getDataset_1.getDataset)({ dataset: { datasetId: experiment.datasetId, versionId: experiment.datasetVersionId, splits: experiment.datasetSplits, }, client, }); (0, tiny_invariant_1.default)(dataset, `Dataset "${experiment.datasetId}" not found`); (0, tiny_invariant_1.default)(dataset.examples.length > 0, `Dataset "${experiment.datasetId}" has no examples`); (0, tiny_invariant_1.default)(experiment.runs, `Experiment "${experiment.id}" has no runs`); const runsToEvaluate = Object.values(experiment.runs).slice(0, nRuns); if ((evaluators === null || evaluators === void 0 ? void 0 : evaluators.length) === 0) { return Object.assign(Object.assign({}, experiment), { evaluationRuns: [] }); } logger.info(`🧠 Evaluating experiment "${experiment.id}" with ${(_a = evaluators === null || evaluators === void 0 ? void 0 : evaluators.length) !== null && _a !== void 0 ? _a : 0} ${(0, pluralize_1.pluralize)("evaluator", (_b = evaluators === null || evaluators === void 0 ? void 0 : evaluators.length) !== null && _b !== void 0 ? _b : 0)}`); if (!isDryRun && client.config.baseUrl) { const experimentUrl = (0, urlUtils_1.getExperimentUrl)({ baseUrl: client.config.baseUrl, datasetId: experiment.datasetId, experimentId: experiment.id, }); logger.info(`🔗 View experiment evaluation: ${experimentUrl}`); } const evaluationRuns = {}; const examplesById = {}; for (const example of dataset.examples) { examplesById[example.id] = example; } const onEvaluationComplete = (run) => { evaluationRuns[run.id] = run; }; // Run evaluators against all runs // Flat list of evaluator + run tuples const normalizedEvaluators = (0, helpers_1.getExperimentEvaluators)(evaluators); const evaluatorsAndRuns = normalizedEvaluators.flatMap((evaluator) => runsToEvaluate.map((run) => ({ evaluator, run, }))); const evaluatorsQueue = (0, async_1.queue)(async (evaluatorAndRun) => { return tracer.startActiveSpan(`Evaluation: ${evaluatorAndRun.evaluator.name}`, async (span) => { var _a, _b, _c; const evalResult = await runEvaluator({ evaluator: evaluatorAndRun.evaluator, run: evaluatorAndRun.run, exampleCache: examplesById, onComplete: onEvaluationComplete, logger, }); span.setAttributes({ [openinference_semantic_conventions_1.SemanticConventions.OPENINFERENCE_SPAN_KIND]: openinference_semantic_conventions_1.OpenInferenceSpanKind.EVALUATOR, [openinference_semantic_conventions_1.SemanticConventions.INPUT_MIME_TYPE]: openinference_semantic_conventions_1.MimeType.JSON, [openinference_semantic_conventions_1.SemanticConventions.INPUT_VALUE]: (0, ensureString_1.ensureString)({ input: (_a = examplesById[evaluatorAndRun.run.datasetExampleId]) === null || _a === void 0 ? void 0 : _a.input, output: evaluatorAndRun.run.output, expected: (_b = examplesById[evaluatorAndRun.run.datasetExampleId]) === null || _b === void 0 ? void 0 : _b.output, metadata: (_c = examplesById[evaluatorAndRun.run.datasetExampleId]) === null || _c === void 0 ? void 0 : _c.metadata, }), [openinference_semantic_conventions_1.SemanticConventions.OUTPUT_MIME_TYPE]: openinference_semantic_conventions_1.MimeType.JSON, [openinference_semantic_conventions_1.SemanticConventions.OUTPUT_VALUE]: (0, ensureString_1.ensureString)(evalResult.result), }); if (evalResult.error) { span.setStatus({ code: phoenix_otel_1.SpanStatusCode.ERROR, message: evalResult.error, }); } else { span.setStatus({ code: phoenix_otel_1.SpanStatusCode.OK }); } if (evalResult.result) { span.setAttributes((0, phoenix_otel_1.objectAsAttributes)(evalResult.result)); } evalResult.traceId = span.spanContext().traceId; if (!isDryRun) { // Log the evaluation to the server // We log this without awaiting (e.g. best effort) client.POST("/v1/experiment_evaluations", { body: { experiment_run_id: evaluatorAndRun.run.id, name: evaluatorAndRun.evaluator.name, annotator_kind: evaluatorAndRun.evaluator.kind, start_time: evalResult.startTime.toISOString(), end_time: evalResult.endTime.toISOString(), result: Object.assign({}, evalResult.result), error: evalResult.error, trace_id: evalResult.traceId, }, }); } span.end(); return evalResult; }); }, concurrency); if (!evaluatorsAndRuns.length) { logger.info(`⛔ No evaluators to run`); return Object.assign(Object.assign({}, experiment), { evaluationRuns: [] }); } evaluatorsAndRuns.forEach((evaluatorAndRun) => evaluatorsQueue.push(evaluatorAndRun, (err) => { if (err) { logger.error(`❌ Error running evaluator "${evaluatorAndRun.evaluator.name}" on run "${evaluatorAndRun.run.id}": ${err}`); } })); await evaluatorsQueue.drain(); logger.info(`✅ Evaluation runs completed`); if (provider) { await provider.shutdown(); // Make sure it's not set globally anymore if (setGlobalTracerProvider) { phoenix_otel_1.trace.disable(); } } return Object.assign(Object.assign({}, experiment), { evaluationRuns: Object.values(evaluationRuns) }); } /** * Run an evaluator against a run. * * @experimental This feature is not complete, and will change in the future. */ async function runEvaluator({ evaluator, run, exampleCache, onComplete, logger, }) { const example = exampleCache[run.datasetExampleId]; (0, tiny_invariant_1.default)(example, `Example "${run.datasetExampleId}" not found`); const evaluate = async () => { var _a; logger.info(`🧠 Evaluating run "${run.id}" with evaluator "${evaluator.name}"`); const thisEval = { id: localId(), traceId: null, experimentRunId: run.id, startTime: new Date(), endTime: new Date(), // will get replaced with actual end time name: evaluator.name, result: null, error: null, annotatorKind: evaluator.kind, }; try { const result = await evaluator.evaluate({ input: example.input, output: (_a = run.output) !== null && _a !== void 0 ? _a : null, expected: example.output, metadata: example === null || example === void 0 ? void 0 : example.metadata, }); thisEval.result = result; logger.info(`✅ Evaluator "${evaluator.name}" on run "${run.id}" completed`); } catch (error) { thisEval.error = error instanceof Error ? error.message : "Unknown error"; logger.error(`❌ Evaluator "${evaluator.name}" on run "${run.id}" failed: ${thisEval.error}`); } thisEval.endTime = new Date(); onComplete(thisEval); return thisEval; }; return evaluate(); } /** * Wrap an evaluator function in an object with a name property. * * @experimental This feature is not complete, and will change in the future. * * @param params - The parameters for creating the evaluator * @param params.name - The name of the evaluator. * @param params.kind - The kind of evaluator (e.g., "CODE", "LLM") * @param params.evaluate - The evaluator function. * @returns The evaluator object. * @deprecated use asExperimentEvaluator instead */ function asEvaluator({ name, kind, evaluate, }) { return { name, kind, evaluate, }; } let _localIdIndex = 1000; /** * Generate a local id. * * @returns A semi-unique id. */ function localId() { _localIdIndex++; return `local_${_localIdIndex}`; } //# sourceMappingURL=runExperiment.js.map