UNPKG

axiom

Version:

The Axiom AI SDK provides: an API to wrap your AI calls with observability instrumentation, offline evals, and online evals.

1,145 lines (1,137 loc) 43 kB
import {
  AxiomReporter,
  ensureInstrumentationInitialized,
  flush,
  resolveAxiomConnection,
  startActiveSpan,
  startSpan
} from "./chunk-SQJ53C2N.js";
import "./chunk-72PVEOMF.js";
import "./chunk-3VKWOZAQ.js";
import {
  dotNotationToNested,
  getConfigScope,
  getEvalContext,
  getGlobalFlagOverrides,
  setGlobalFlagOverrides,
  withEvalContext
} from "./chunk-63VQQCZB.js";
import { recordName } from "./chunk-FWPCBQBZ.js";
import { AxiomCLIError, errorToString } from "./chunk-ISSDOC43.js";
import "./chunk-PU64TWX4.js";
import "./chunk-MM5FFQJT.js";
import { Mean } from "./chunk-73F2PMAH.js";
import { Attr } from "./chunk-4TKUTT24.js";
import { __publicField } from "./chunk-KEXKKQVW.js";

// src/evals/eval.ts
import { afterAll, beforeAll, describe, inject, it } from "vitest";
import { context, SpanStatusCode, trace } from "@opentelemetry/api";
import { customAlphabet } from "nanoid";

// src/evals/git-info.ts
import { execSync } from "child_process";

/**
 * Reads the git user name and email via `git config --get`.
 *
 * @returns {{ name: string, email: string } | null} The trimmed values, or
 *   `null` when either git command throws.
 */
function getGitUserInfo() {
  try {
    const name = execSync("git config --get user.name").toString().trim();
    const email = execSync("git config --get user.email").toString().trim();
    return { name, email };
  } catch {
    return null;
  }
}

// src/utils/fetcher.ts
/**
 * Builds a `fetch` wrapper bound to a base URL and credentials.
 *
 * The returned function resolves `path` against `baseUrl` and always sets the
 * JSON content type and bearer authorization headers (these win over any
 * same-named caller headers); the org header is added only when `orgId` is
 * truthy.
 *
 * @param {{ baseUrl: string, token: string, orgId?: string }} connection
 * @returns {(path: string, options?: object) => Promise<Response>}
 */
var createFetcher = ({ baseUrl, token, orgId }) => {
  // `options` defaults to an empty object so that reading `options.headers`
  // does not throw when the caller omits it.
  return (path, options = {}) => fetch(new URL(path, baseUrl).toString(), {
    ...options,
    headers: {
      ...options.headers,
      "content-type": "application/json",
      authorization: `Bearer ${token}`,
      ...orgId ? { "X-AXIOM-ORG-ID": orgId } : {}
    }
  });
};

// src/util/traces.ts
/**
 * Looks up an attribute by key.
 *
 * The flat key is first checked on `obj.custom`; otherwise `accessKey` is
 * split on "." and walked as a nested path on `obj`.
 *
 * @param {unknown} obj - Attribute bag.
 * @param {string} accessKey - Dot-separated attribute key.
 * @returns {unknown} The value found, or `undefined`.
 */
function getCustomOrRegularAttribute(obj, accessKey) {
  if (typeof obj !== "object" || obj === null) {
    return void 0;
  }
  const keyParts = accessKey.split(".");
  const custom = obj.custom;
  if (custom && typeof custom === "object" && custom !== null && accessKey in custom) {
    return custom[accessKey];
  }
  let current = obj;
  for (const part of keyParts) {
    if (typeof current !== "object" || current === null) {
      return void 0;
    }
    current = current[part];
  }
  return current;
}

/**
 * Like `getCustomOrRegularAttribute`, but yields `undefined` unless the value
 * is a string.
 */
function getCustomOrRegularString(obj, key) {
  const value = getCustomOrRegularAttribute(obj, key);
  return typeof value === "string" ? value : void 0;
}

/**
 * Like `getCustomOrRegularAttribute`, but yields a number: numeric values are
 * returned as-is, strings are converted with `Number()` (NaN results become
 * `undefined`), and anything else yields `undefined`.
 */
function getCustomOrRegularNumber(obj, key) {
  const value = getCustomOrRegularAttribute(obj, key);
  if (typeof value === "number") {
    return value;
  }
  if (typeof value === "string") {
    const parsed = Number(value);
    return Number.isNaN(parsed) ? void 0 : parsed;
  }
  return void 0;
}

// src/evals/eval.service.ts
/**
 * Minimal client for the `/api/v3/evaluations` endpoints.
 */
var EvaluationApiClient = class {
  /**
   * @param {object} config - Passed to `resolveAxiomConnection`.
   * @param {string} [consoleUrl] - Passed to `resolveAxiomConnection`.
   */
  constructor(config, consoleUrl) {
    __publicField(this, "fetcher");
    const { consoleEndpointUrl, token, orgId } = resolveAxiomConnection(config, consoleUrl);
    this.fetcher = createFetcher({
      baseUrl: consoleEndpointUrl,
      token: token ?? "",
      orgId
    });
  }

  /**
   * POSTs a new evaluation record.
   *
   * @param {object} evaluation - Request body, serialized as JSON.
   * @returns {Promise<any>} The parsed JSON response.
   * @throws {AxiomCLIError} When the response status is not ok.
   */
  async createEvaluation(evaluation) {
    const resp = await this.fetcher(`/api/v3/evaluations`, {
      method: "POST",
      body: JSON.stringify(evaluation)
    });
    if (!resp.ok) {
      const text = await resp.text().catch(() => "");
      throw new AxiomCLIError(
        `Failed to create evaluation: ${resp.statusText}${text ? ` - ${text}` : ""}`
      );
    }
    return resp.json();
  }

  /**
   * PATCHes an existing evaluation identified by `evaluation.id`.
   *
   * @param {object} evaluation - Request body, serialized as JSON.
   * @returns {Promise<any>} The parsed JSON response.
   * @throws {AxiomCLIError} When the response status is not ok, or when the
   *   response body carries an `error` field.
   */
  async updateEvaluation(evaluation) {
    const resp = await this.fetcher(`/api/v3/evaluations/${evaluation.id}`, {
      method: "PATCH",
      body: JSON.stringify(evaluation)
    });
    if (!resp.ok) {
      const text = await resp.text().catch(() => "");
      throw new AxiomCLIError(
        `Failed to update evaluation: ${resp.statusText}${text ? ` - ${text}` : ""}`
      );
    }
    const body = await resp.json();
    if (body.error) {
      throw new AxiomCLIError(
        `Failed to update evaluation ${evaluation.id}: ${JSON.stringify(body.error)}`
      );
    }
    return body;
  }
};

/**
 * Queries the configured dataset for all spans whose `trace_id` equals
 * `evalId` and assembles them into an evaluation tree.
 *
 * @param {string} evalId - Trace id of the evaluation to load.
 * @param {object} config - Axiom config; `config.eval.edgeUrl` selects the
 *   query endpoint.
 * @returns {Promise<object | null>} The evaluation built by `buildSpanTree`,
 *   or `null` when the query returns no matches.
 * @throws {Error} When the query response status is not ok, or when an ok
 *   response cannot be parsed as JSON.
 */
var findEvaluationCases = async (evalId, config) => {
  const { dataset, edgeUrl, url, token, orgId } = resolveAxiomConnection(config);
  const apl = `['${dataset}'] | where column_ifexists('trace_id', '') == "${evalId}" | order by _time`;
  const headers = new Headers({
    Authorization: `Bearer ${token}`,
    "Content-Type": "application/json",
    ...orgId ? { "X-AXIOM-ORG-ID": orgId } : {}
  });
  const hasExplicitEdgeUrl = !!config.eval.edgeUrl;
  const queryBaseUrl = hasExplicitEdgeUrl ? edgeUrl : url;
  const queryPath = hasExplicitEdgeUrl ? "/v1/query/_apl?format=legacy" : "/v1/datasets/_apl?format=legacy";
  const resp = await fetch(`${queryBaseUrl}${queryPath}`, {
    headers,
    method: "POST",
    body: JSON.stringify({ apl })
  });
  // Parse defensively: an error response may not carry a JSON body, and the
  // status-based error below is more useful than a JSON SyntaxError.
  let payload;
  let parseError;
  try {
    payload = await resp.json();
  } catch (error) {
    parseError = error;
  }
  if (!resp.ok) {
    throw new Error(
      `Failed to query evaluation cases: ${payload?.message || resp?.statusText || "Unknown error"}`
    );
  }
  if (parseError) {
    throw parseError;
  }
  return payload?.matches?.length ? buildSpanTree(payload.matches) : null;
};

/**
 * Maps the root "eval" span onto an evaluation record with an empty `cases`
 * list. `tags` and `flagConfig` are JSON-parsed when stored as strings.
 *
 * @param {object} span - Query match with `data` and `_time`.
 * @returns {object} The evaluation record.
 */
var mapSpanToEval = (span) => {
  const flagConfigRaw = getCustomOrRegularAttribute(span.data.attributes, Attr.Eval.Config.Flags);
  const tagsRaw = getCustomOrRegularAttribute(span.data.attributes, Attr.Eval.Tags);
  const evaluation = {
    id: getCustomOrRegularString(span.data.attributes, Attr.Eval.ID),
    name: getCustomOrRegularString(span.data.attributes, Attr.Eval.Name),
    type: getCustomOrRegularString(span.data.attributes, Attr.Eval.Type),
    version: getCustomOrRegularString(span.data.attributes, Attr.Eval.Version),
    collection: {
      name: getCustomOrRegularString(span.data.attributes, Attr.Eval.Collection.Name),
      size: getCustomOrRegularNumber(span.data.attributes, Attr.Eval.Collection.Size)
    },
    baseline: {
      id: getCustomOrRegularString(span.data.attributes, Attr.Eval.Baseline.ID),
      name: getCustomOrRegularString(span.data.attributes, Attr.Eval.Baseline.Name)
    },
    duration: span.data.duration,
    status: span.data.status.code,
    traceId: span.data.trace_id,
    runAt: span._time,
    tags: tagsRaw ? typeof tagsRaw === "string" ? JSON.parse(tagsRaw) : tagsRaw : [],
    user: {
      name: getCustomOrRegularString(span.data.attributes, Attr.Eval.User.Name),
      email: getCustomOrRegularString(span.data.attributes, Attr.Eval.User.Email)
    },
    cases: [],
    flagConfig: flagConfigRaw ? typeof flagConfigRaw === "string" ? JSON.parse(flagConfigRaw) : flagConfigRaw : void 0
  };
  return evaluation;
};

/**
 * Maps a "case" span onto a case record.
 *
 * @param {object} item - Query match with `data` and `_time`.
 * @returns {object} The case record, including normalized `scores`.
 */
var mapSpanToCase = (item) => {
  const data = item.data;
  const rawDuration = data.duration;
  // Only a plain seconds value ("1.2345s") is reformatted to two decimals.
  // Other units that happen to end in "s" (e.g. "150ms") and non-string
  // values pass through unchanged instead of becoming "NaNs" or throwing.
  let duration = rawDuration ?? "-";
  if (typeof rawDuration === "string" && rawDuration.endsWith("s")) {
    const seconds = Number(rawDuration.slice(0, -1));
    if (!Number.isNaN(seconds)) {
      duration = `${seconds.toFixed(2)}s`;
    }
  }
  const scoresRaw = getCustomOrRegularAttribute(data.attributes, Attr.Eval.Case.Scores);
  const scoresParsed = scoresRaw ? typeof scoresRaw === "string" ? JSON.parse(scoresRaw) : scoresRaw : {};
  const scores = {};
  for (const [name, scoreData] of Object.entries(scoresParsed)) {
    const s = scoreData;
    scores[name] = {
      name,
      // Accept either `value` or `score` as the numeric field.
      value: s.value ?? s.score ?? 0,
      metadata: s.metadata ?? {}
    };
  }
  const caseData = {
    index: getCustomOrRegularNumber(data.attributes, Attr.Eval.Case.Index),
    input: getCustomOrRegularString(data.attributes, Attr.Eval.Case.Input),
    output: getCustomOrRegularString(data.attributes, Attr.Eval.Case.Output),
    expected: getCustomOrRegularString(data.attributes, Attr.Eval.Case.Expected),
    duration,
    status: data.status.code,
    scores,
    runAt: item._time,
    spanId: data.span_id,
    traceId: data.trace_id
  };
  return caseData;
};

/**
 * Assembles a flat list of spans into an evaluation record with its cases.
 *
 * The root is the span whose `gen_ai.operation.name` is "eval". Cases are
 * spans whose name starts with "case"; each case picks up its first child
 * "task" span (plus that task's first "chat" span) and any child
 * "eval.score" spans. Cases are sorted by `index`.
 *
 * @param {object[]} spans - Query matches.
 * @returns {object | null} The evaluation, or `null` when `spans` is empty or
 *   contains no "eval" span.
 */
var buildSpanTree = (spans) => {
  if (!spans.length) {
    return null;
  }
  // Spans without `gen_ai` attributes must not crash the lookup.
  const operationNameOf = (span) => span.data.attributes?.gen_ai?.operation?.name;
  const evalSpan = spans.find((span) => operationNameOf(span) === "eval");
  if (!evalSpan) {
    return null;
  }
  const rootSpan = mapSpanToEval(evalSpan);
  const caseSpans = spans.filter((span) => span.data.name.startsWith("case"));
  for (const caseSpan of caseSpans) {
    const caseData = mapSpanToCase(caseSpan);
    const taskSpans = spans.filter(
      (span) => span.data.name.startsWith("task") && span.data.parent_span_id === caseSpan.data.span_id
    );
    if (taskSpans.length > 0) {
      const taskSpan = taskSpans[0];
      const chatSpans = spans.filter(
        (span) => span.data.name.startsWith("chat") && span.data.parent_span_id === taskSpan.data.span_id
      );
      const chatData = chatSpans.map((chatSpan) => ({
        operation: getCustomOrRegularString(chatSpan.data.attributes, "operation") ?? "",
        capability: getCustomOrRegularString(chatSpan.data.attributes, "capability") ?? "",
        step: getCustomOrRegularString(chatSpan.data.attributes, "step") ?? "",
        request: {
          max_token: getCustomOrRegularString(chatSpan.data.attributes, "request.max_token") ?? "",
          model: getCustomOrRegularString(chatSpan.data.attributes, "request.model") ?? "",
          temperature: getCustomOrRegularNumber(chatSpan.data.attributes, "request.temperature") ?? 0
        },
        response: {
          finish_reasons: getCustomOrRegularString(chatSpan.data.attributes, "response.finish_reasons") ?? ""
        },
        usage: {
          input_tokens: getCustomOrRegularNumber(chatSpan.data.attributes, "usage.input_tokens") ?? 0,
          output_tokens: getCustomOrRegularNumber(chatSpan.data.attributes, "usage.output_tokens") ?? 0
        }
      }));
      const taskData = {
        name: taskSpan.data.name,
        output: getCustomOrRegularString(taskSpan.data.attributes, "output") || "",
        trial: getCustomOrRegularNumber(taskSpan.data.attributes, "trial") || 0,
        type: getCustomOrRegularString(taskSpan.data.attributes, "type") || "",
        error: getCustomOrRegularString(taskSpan.data.attributes, "error") || "",
        // Only the first chat span is kept; an empty placeholder otherwise.
        chat: chatData[0] || {
          operation: "",
          capability: "",
          step: "",
          request: { max_token: "", model: "", temperature: 0 },
          response: { finish_reasons: "" },
          usage: { input_tokens: 0, output_tokens: 0 }
        }
      };
      caseData.task = taskData;
    }
    const scoreSpans = spans.filter(
      (span) => operationNameOf(span) === "eval.score" && span.data.parent_span_id === caseSpan.data.span_id
    );
    if (scoreSpans.length > 0) {
      // Score spans, when present, replace the scores read from the case span.
      caseData.scores = {};
      scoreSpans.forEach((score) => {
        const name = getCustomOrRegularString(score.data.attributes, Attr.Eval.Score.Name) ?? "";
        const value = getCustomOrRegularNumber(score.data.attributes, Attr.Eval.Score.Value) ?? 0;
        const metadataRaw = getCustomOrRegularString(
          score.data.attributes,
          Attr.Eval.Score.Metadata
        );
        let metadata = {};
        try {
          metadata = metadataRaw ? JSON.parse(metadataRaw) : {};
        } catch {
          // Unparseable metadata is treated as empty.
        }
        caseData.scores[name] = {
          name,
          value,
          metadata: {
            error: score.data.attributes.error,
            ...metadata
          }
        };
      });
    }
    rootSpan.cases.push(caseData);
  }
  rootSpan.cases.sort((a, b) => a.index - b.index);
  return rootSpan;
};

// src/util/deep-equal.ts
/**
 * Structural equality for plain objects, arrays, Maps, Sets, Dates and
 * RegExps. Values with different prototypes are never equal.
 *
 * @param {unknown} data
 * @param {unknown} other
 * @returns {boolean}
 */
function deepEqual(data, other) {
  if (data === other) {
    return true;
  }
  if (Object.is(data, other)) {
    return true;
  }
  if (typeof data !== "object" || typeof other !== "object") {
    return false;
  }
  if (data === null || other === null) {
    return false;
  }
  if (Object.getPrototypeOf(data) !== Object.getPrototypeOf(other)) {
    return false;
  }
  if (Array.isArray(data)) {
    return isDeepEqualArrays(data, other);
  }
  if (data instanceof Map) {
    return isDeepEqualMaps(data, other);
  }
  if (data instanceof Set) {
    return isDeepEqualSets(data, other);
  }
  if (data instanceof Date) {
    return data.getTime() === other.getTime();
  }
  if (data instanceof RegExp) {
    return data.toString() === other.toString();
  }
  if (Object.keys(data).length !== Object.keys(other).length) {
    return false;
  }
  for (const [key, value] of Object.entries(data)) {
    if (!(key in other)) {
      return false;
    }
    if (!deepEqual(
      value,
      // @ts-expect-error [ts7053] - We already checked that `other` has `key`
      other[key]
    )) {
      return false;
    }
  }
  return true;
}

/** Element-wise `deepEqual` of two arrays of equal length. */
function isDeepEqualArrays(data, other) {
  if (data.length !== other.length) {
    return false;
  }
  for (const [index, item] of data.entries()) {
    if (!deepEqual(item, other[index])) {
      return false;
    }
  }
  return true;
}

/** Maps are equal when they have the same keys and `deepEqual` values. */
function isDeepEqualMaps(data, other) {
  if (data.size !== other.size) {
    return false;
  }
  for (const [key, value] of data.entries()) {
    if (!other.has(key)) {
      return false;
    }
    if (!deepEqual(value, other.get(key))) {
      return false;
    }
  }
  return true;
}

/**
 * Sets are equal when every item of `data` can be paired with a distinct
 * `deepEqual` item of `other`.
 */
function isDeepEqualSets(data, other) {
  if (data.size !== other.size) {
    return false;
  }
  const otherCopy = [...other];
  for (const dataItem of data) {
    let isFound = false;
    for (const [index, otherItem] of otherCopy.entries()) {
      if (deepEqual(dataItem, otherItem)) {
        isFound = true;
        // Remove the match so it cannot be paired twice.
        otherCopy.splice(index, 1);
        break;
      }
    }
    if (!isFound) {
      return false;
    }
  }
  return true;
}

// src/util/tryCatch.ts
/**
 * Normalizes a thrown value into an `Error`.
 *
 * Existing `Error` instances are reused (not copied). When `operationName` is
 * given, the error's `message` is prefixed in place.
 *
 * @param {unknown} rawError
 * @param {string} [operationName]
 * @returns {Error}
 */
function toError(rawError, operationName) {
  const processedError = rawError instanceof Error ? rawError : new Error(errorToString(rawError));
  if (operationName) {
    processedError.message = `Operation "${operationName}" failed: ${processedError.message}`;
  }
  return processedError;
}

/**
 * Runs `fn` synchronously and returns a `[result, error]` tuple.
 *
 * @returns {[any, null] | [null, Error]}
 */
function tryCatchSync(fn, operationName) {
  try {
    return [fn(), null];
  } catch (rawError) {
    return [null, toError(rawError, operationName)];
  }
}

/**
 * Awaits `fn` (a promise/value, or a function returning one) and resolves to
 * a `[result, error]` tuple instead of rejecting.
 *
 * @returns {Promise<[any, null] | [null, Error]>}
 */
async function tryCatchAsync(fn, operationName) {
  try {
    const result = typeof fn === "function" ? fn() : fn;
    return [await result, null];
  } catch (rawError) {
    return [null, toError(rawError, operationName)];
  }
}

/**
 * Tuple-returning wrapper that dispatches on its input: functions are called
 * (a returned Promise is handled via `tryCatchAsync`), promises are awaited
 * via `tryCatchAsync`, and any other value is returned as `[value, null]`.
 */
var tryCatch = ((fn, operationName) => {
  if (typeof fn === "function") {
    try {
      const result = fn();
      if (result instanceof Promise) {
        return tryCatchAsync(result, operationName);
      }
      return [result, null];
    } catch (rawError) {
      return [null, toError(rawError, operationName)];
    }
  }
  if (fn instanceof Promise) {
    return tryCatchAsync(fn, operationName);
  }
  return [fn, null];
});
tryCatch.sync = tryCatchSync;
tryCatch.async = tryCatchAsync;

// src/evals/eval.ts
// Generates 10-character ids from digits and upper-case letters.
var createVersionId = customAlphabet("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ", 10);
// Symbol key under which task failure details are attached to errors.
var RUN_TASK_FAILURE_DETAILS = Symbol.for("axiom.eval.runTaskFailureDetails");

/**
 * Adapts a `(suite) => Promise` handler to a hook callback that receives the
 * suite either as its second argument or, when that is nullish, as its first.
 *
 * NOTE(review): the empty destructuring pattern and the use of `arguments`
 * look deliberate (presumably for compatibility across vitest hook
 * signatures); keep the signature as-is unless that is confirmed otherwise.
 */
function withCompatibleSuiteHook(fn) {
  return async function({}, maybeSuite) {
    const suite = maybeSuite ?? arguments[0];
    await fn(suite);
  };
}

/**
 * Normalizes `error` with `toError` and attaches `details` to it under
 * `RUN_TASK_FAILURE_DETAILS`.
 */
function attachRunTaskFailureDetails(error, details) {
  const normalized = toError(error);
  normalized[RUN_TASK_FAILURE_DETAILS] = details;
  return normalized;
}

/**
 * Reads the details attached by `attachRunTaskFailureDetails`, if any.
 */
function getRunTaskFailureDetails(error) {
  if (typeof error !== "object" || error === null) {
    return void 0;
  }
  return error[RUN_TASK_FAILURE_DETAILS];
}

/**
 * Declares an evaluation: records the eval, capability, step and scorer names
 * and registers the suite. Registration failures are logged with
 * `console.error` rather than thrown.
 *
 * @param {string} name - Evaluation name.
 * @param {object} params - Evaluation options (`capability`, `step`,
 *   `scorers`, `data`, `task`, ...).
 */
function Eval(name, params) {
  recordName("eval", name);
  recordName("capability", params.capability);
  if (params.step) {
    recordName("step", params.step);
  }
  if (params.scorers) {
    for (const scorer of params.scorers) {
      const scorerName = getScorerName(scorer, "");
      recordName("scorer", scorerName);
    }
  }
  registerEval(name, params).catch(console.error);
}

/**
 * Snapshots the flags whose keys start with one of the `configFlags`
 * prefixes, merging defaults with global overrides (overrides win).
 *
 * @param {string[] | undefined} configFlags - Key prefixes in scope.
 * @returns {object} Nested flag object; empty when no prefixes are given.
 */
function captureFlagConfig(configFlags) {
  if (!configFlags || configFlags.length === 0) {
    return {};
  }
  const scope = getConfigScope();
  const allDefaults = scope?.getAllDefaultFlags?.() ?? {};
  const overrides = getGlobalFlagOverrides();
  const merged = { ...allDefaults, ...overrides };
  const filtered = {};
  for (const [key, value] of Object.entries(merged)) {
    const isInScope = configFlags.some((pattern) => key.startsWith(pattern));
    if (isInScope) {
      filtered[key] = value;
    }
  }
  return dotNotationToNested(filtered);
}

/** Returns `scorer.name`, or `fallback` when the name is empty. */
var getScorerName = (scorer, fallback = "unknown") => {
  return scorer.name || fallback;
};

/**
 * Registers a vitest `describe` suite for the evaluation.
 *
 * The suite creates a root "eval" span in `beforeAll`, runs one concurrent
 * test per collection item (each with `opts.trials` trials, task execution
 * and scoring), and finalizes the span and the evaluation record in
 * `afterAll`. Remote API calls are skipped in debug and list modes.
 *
 * @param {string} evalName - Evaluation name.
 * @param {object} opts - Evaluation options.
 * @returns {Promise<any>} Whatever `describe` returns.
 * @throws {AxiomCLIError} When no `axiomConfig` was injected.
 */
async function registerEval(evalName, opts) {
  const collectionPromise = typeof opts.data === "function" ? opts.data() : opts.data;
  const user = getGitUserInfo();
  const baselineId = inject("baseline");
  const isDebug = inject("debug");
  const isList = inject("list");
  const injectedOverrides = inject("overrides");
  const axiomConfig = inject("axiomConfig");
  const runId = inject("runId");
  const consoleUrl = inject("consoleUrl");
  if (!axiomConfig) {
    throw new AxiomCLIError("Axiom config not found");
  }
  const timeoutMs = opts.timeout ?? axiomConfig?.eval.timeoutMs;
  const instrumentationReady = ensureInstrumentationInitialized(axiomConfig, {
    enabled: !isDebug && !isList
  });
  const result = await describe(
    evalName,
    async () => {
      const collection = await collectionPromise;
      const evaluationApiClient = new EvaluationApiClient(axiomConfig, consoleUrl);
      const evalVersion = createVersionId();
      let evalId = "";
      let suiteStart;
      let suiteSpan;
      let suiteContext;
      let instrumentationError = void 0;
      let baseline = void 0;
      const allOutOfScopeFlags = [];
      let finalConfigSnapshot;

      // Suite setup: apply overrides, open the root span, create the remote
      // evaluation record and load the baseline (if any).
      const handleBeforeAll = async (suite) => {
        if (injectedOverrides && Object.keys(injectedOverrides).length > 0) {
          try {
            setGlobalFlagOverrides(injectedOverrides);
          } catch {
            // Best effort: a failure to apply overrides does not abort setup.
          }
        }
        suite.meta.evaluation = {
          id: evalId,
          name: evalName,
          version: evalVersion,
          runId,
          orgId: void 0,
          baseline: baseline ?? void 0,
          configFlags: opts.configFlags
        };
        const [, instrumentationInitError] = await tryCatchAsync(instrumentationReady);
        if (instrumentationInitError) {
          instrumentationError = instrumentationInitError;
        }
        suiteSpan = startSpan(`eval ${evalName}-${evalVersion}`, {
          attributes: {
            [Attr.GenAI.Operation.Name]: "eval",
            [Attr.Eval.Name]: evalName,
            [Attr.Eval.Version]: evalVersion,
            [Attr.Eval.Type]: "regression", // TODO: where to get experiment type value from?
            [Attr.Eval.Tags]: JSON.stringify(["offline"]),
            [Attr.Eval.Collection.ID]: "custom", // TODO: where to get collection split value from?
            [Attr.Eval.Collection.Name]: "custom", // TODO: where to get collection name from?
            [Attr.Eval.Collection.Size]: collection.length,
            // capability
            [Attr.Eval.Capability.Name]: opts.capability,
            [Attr.Eval.Step.Name]: opts.step ?? void 0,
            // metadata
            [Attr.Eval.Metadata]: JSON.stringify(opts.metadata),
            // run
            [Attr.Eval.Run.ID]: runId,
            // user info
            [Attr.Eval.User.Name]: user?.name,
            [Attr.Eval.User.Email]: user?.email
          }
        });
        // The trace id of the root span doubles as the evaluation id.
        evalId = suiteSpan.spanContext().traceId;
        suite.meta.evaluation.id = evalId;
        suiteSpan.setAttribute(Attr.Eval.ID, evalId);
        suiteContext = trace.setSpan(context.active(), suiteSpan);
        const flagConfig = captureFlagConfig(opts.configFlags);
        suite.meta.evaluation.flagConfig = flagConfig;
        const flagConfigJson = JSON.stringify(flagConfig);
        suiteSpan.setAttribute(Attr.Eval.Config.Flags, flagConfigJson);
        let createEvalResponse;
        if (!isDebug && !isList) {
          createEvalResponse = await evaluationApiClient.createEvaluation({
            id: evalId,
            name: evalName,
            capability: opts.capability,
            step: opts.step,
            dataset: axiomConfig.eval.dataset,
            version: evalVersion,
            baselineId: baselineId ?? void 0,
            runId,
            totalCases: collection.length,
            config: { overrides: injectedOverrides },
            configTimeoutMs: timeoutMs,
            metadata: opts.metadata,
            status: "running"
          });
        }
        const orgId = createEvalResponse?.data?.orgId;
        const resolvedBaselineId = createEvalResponse?.data?.baselineId;
        if (!isDebug && !isList && !!resolvedBaselineId) {
          const [baselineResult, baselineError] = await tryCatchAsync(
            () => findEvaluationCases(resolvedBaselineId, axiomConfig)
          );
          if (baselineError) {
            console.error(`Failed to load baseline: ${errorToString(baselineError)}`);
            instrumentationError = instrumentationError || baselineError;
          } else {
            baseline = baselineResult;
          }
        }
        if (baseline) {
          suiteSpan.setAttribute(Attr.Eval.Baseline.ID, baseline.id);
          suiteSpan.setAttribute(Attr.Eval.Baseline.Name, baseline.name);
          suiteSpan.setAttribute(Attr.Eval.Baseline.Version, baseline.version);
        }
        suite.meta.evaluation = {
          id: evalId,
          name: evalName,
          version: evalVersion,
          runId,
          orgId: orgId ?? void 0,
          baseline: baseline ?? void 0,
          configFlags: opts.configFlags,
          registrationStatus: instrumentationError ? { status: "failed", error: errorToString(instrumentationError) } : { status: "success" },
          trials: opts.trials
        };
        suiteStart = performance.now();
      };

      // Suite teardown: summarize out-of-scope flag accesses, end and flush
      // the root span, then mark the remote evaluation as completed.
      const handleAfterAll = async (suite) => {
        if (instrumentationError) {
          throw instrumentationError;
        }
        const flagSummary = /* @__PURE__ */ new Map();
        for (const flag of allOutOfScopeFlags) {
          if (flagSummary.has(flag.flagPath)) {
            const existing = flagSummary.get(flag.flagPath);
            existing.count++;
            existing.firstAccessedAt = Math.min(existing.firstAccessedAt, flag.accessedAt);
            existing.lastAccessedAt = Math.max(existing.lastAccessedAt, flag.accessedAt);
          } else {
            flagSummary.set(flag.flagPath, {
              flagPath: flag.flagPath,
              count: 1,
              firstAccessedAt: flag.accessedAt,
              lastAccessedAt: flag.accessedAt,
              stackTrace: flag.stackTrace
            });
          }
        }
        if (suite.meta.evaluation && suiteSpan) {
          suite.meta.evaluation.outOfScopeFlags = Array.from(flagSummary.entries()).map(
            ([_flagPath, stats]) => stats
          );
          const allDefaults = getConfigScope()?.getAllDefaultFlags();
          const pickedFlags = finalConfigSnapshot?.pickedFlags;
          const overrides = injectedOverrides ?? getGlobalFlagOverrides();
          suite.meta.evaluation.configEnd = {
            flags: allDefaults,
            pickedFlags,
            overrides
          };
        }
        suiteSpan?.setStatus({ code: SpanStatusCode.OK });
        suiteSpan?.end();
        const [, flushError] = await tryCatchAsync(flush);
        if (flushError) {
          if (suite.meta.evaluation) {
            suite.meta.evaluation.registrationStatus = {
              status: "failed",
              error: errorToString(flushError)
            };
          }
        }
        const durationMs = Math.round(performance.now() - suiteStart);
        const successCases = suite.tasks.filter(
          (task) => task.meta.case?.status === "success"
        ).length;
        const erroredCases = suite.tasks.filter(
          (task) => task.meta.case?.status === "fail" || task.meta.case?.status === "pending"
        ).length;
        if (!isDebug && !isList) {
          await evaluationApiClient.updateEvaluation({
            id: evalId,
            status: "completed",
            totalCases: collection.length,
            successCases,
            erroredCases,
            durationMs
          });
        }
      };

      beforeAll(withCompatibleSuiteHook(handleBeforeAll));
      afterAll(withCompatibleSuiteHook(handleAfterAll));

      await it.concurrent.for(
        collection.map((d, index) => ({ ...d, index }))
      )("case", async (data, { task }) => {
        const start = performance.now();
        if (!suiteContext) {
          throw new Error(
            "[Axiom AI] Suite context not initialized. This is likely a bug \u2013 instrumentation should complete before tests run."
          );
        }
        let outOfScopeFlags = [];
        await startActiveSpan(
          `case ${data.index}`,
          {
            attributes: {
              [Attr.GenAI.Operation.Name]: "eval.case",
              [Attr.Eval.ID]: evalId,
              [Attr.Eval.Name]: evalName,
              [Attr.Eval.Version]: evalVersion,
              [Attr.Eval.Case.Index]: data.index,
              [Attr.Eval.Case.Input]: typeof data.input === "string" ? data.input : JSON.stringify(data.input),
              [Attr.Eval.Case.Expected]: typeof data.expected === "string" ? data.expected : JSON.stringify(data.expected),
              [Attr.Eval.Case.Metadata]: data.metadata ? JSON.stringify(data.metadata) : void 0,
              // user info
              [Attr.Eval.User.Name]: user?.name,
              [Attr.Eval.User.Email]: user?.email
            }
          },
          async (caseSpan) => {
            const caseContext = trace.setSpan(context.active(), caseSpan);
            const trials = Math.max(1, opts.trials ?? 1);
            let intentionalTrialFailureError;
            let caseFinalConfigSnapshot;
            caseSpan.setAttribute(Attr.Eval.Case.Trials, trials);
            try {
              const perScorerTrials = {};
              const trialErrors = Array.from({ length: trials }, () => null);
              const trialFailures = [];
              let lastOutput;
              let successfulTaskDuration = 0;
              for (let trialIndex = 0; trialIndex < trials; trialIndex++) {
                try {
                  await startActiveSpan(
                    `trial ${trialIndex}`,
                    {
                      attributes: {
                        [Attr.GenAI.Operation.Name]: "eval.trial",
                        [Attr.Eval.Trial.Index]: trialIndex,
                        [Attr.Eval.ID]: evalId,
                        [Attr.Eval.Name]: evalName,
                        [Attr.Eval.Version]: evalVersion
                      }
                    },
                    async (trialSpan) => {
                      const trialContext = trace.setSpan(context.active(), trialSpan);
                      try {
                        const result2 = await runTask(
                          trialContext,
                          {
                            id: evalId,
                            version: evalVersion,
                            name: evalName
                          },
                          {
                            index: data.index,
                            input: data.input,
                            expected: data.expected,
                            scorers: opts.scorers,
                            task: opts.task,
                            metadata: opts.metadata,
                            configFlags: opts.configFlags,
                            capability: opts.capability,
                            step: opts.step
                          }
                        );
                        const { output: output2, duration } = result2;
                        lastOutput = output2;
                        successfulTaskDuration += duration;
                        outOfScopeFlags.push(...result2.outOfScopeFlags);
                        caseFinalConfigSnapshot = {
                          flags: result2.finalFlags || {},
                          pickedFlags: opts.configFlags,
                          overrides: result2.overrides
                        };
                        await Promise.all(
                          opts.scorers.map(async (scorer) => {
                            const scorerName = getScorerName(scorer);
                            return startActiveSpan(
                              `score ${scorerName}`,
                              {
                                attributes: {
                                  [Attr.GenAI.Operation.Name]: "eval.score",
                                  [Attr.Eval.Tags]: JSON.stringify(["offline"]),
                                  [Attr.Eval.ID]: evalId,
                                  [Attr.Eval.Name]: evalName,
                                  [Attr.Eval.Version]: evalVersion,
                                  [Attr.Eval.Trial.Index]: trialIndex
                                }
                              },
                              async (scorerSpan) => {
                                const scorerStart = performance.now();
                                try {
                                  const [result3, scorerError] = await tryCatchAsync(
                                    () => scorer({
                                      input: data.input,
                                      output: output2,
                                      expected: data.expected,
                                      trialIndex
                                    })
                                  );
                                  if (scorerError || !result3) {
                                    // A failed (or empty) scorer counts as a
                                    // score of 0 for this trial.
                                    const scorerDuration = Math.round(
                                      performance.now() - scorerStart
                                    );
                                    console.error(
                                      `ERROR: scorer ${scorerName} failed. Cause: `,
                                      scorerError
                                    );
                                    const msg = errorToString(scorerError);
                                    const metadata2 = {
                                      duration: scorerDuration,
                                      startedAt: scorerStart,
                                      error: msg
                                    };
                                    (perScorerTrials[scorerName] ?? (perScorerTrials[scorerName] = [])).push(0);
                                    scorerSpan.setAttributes({
                                      [Attr.Eval.Score.Name]: scorerName,
                                      [Attr.Eval.Score.Metadata]: JSON.stringify(metadata2)
                                    });
                                    scorerSpan.setStatus({
                                      code: SpanStatusCode.ERROR,
                                      message: msg
                                    });
                                    return;
                                  }
                                  const scoreDuration = Math.round(performance.now() - scorerStart);
                                  const scoreValue = result3.score;
                                  const metadata = Object.assign(
                                    {
                                      duration: scoreDuration,
                                      startedAt: scorerStart
                                    },
                                    result3.metadata
                                  );
                                  (perScorerTrials[scorerName] ?? (perScorerTrials[scorerName] = [])).push(scoreValue);
                                  const aggregation = scorer.aggregation ?? Mean();
                                  scorerSpan.setAttributes({
                                    [Attr.Eval.Score.Name]: scorerName,
                                    [Attr.Eval.Score.Value]: scoreValue,
                                    [Attr.Eval.Score.Metadata]: JSON.stringify(metadata),
                                    [Attr.Eval.Score.Aggregation]: aggregation.type
                                  });
                                  if (metadata.error) {
                                    const msg = errorToString(metadata.error);
                                    scorerSpan.setStatus({
                                      code: SpanStatusCode.ERROR,
                                      message: msg
                                    });
                                  }
                                } finally {
                                  scorerSpan.end();
                                }
                              },
                              trialContext
                            );
                          })
                        );
                      } catch (error) {
                        // Record the trial failure, score every scorer 0 for
                        // this trial, then rethrow so the trial span fails.
                        const taskFailureDetails = getRunTaskFailureDetails(error);
                        const failure = toError(error);
                        const msg = errorToString(failure);
                        const spanErrorMessage = failure.message || msg;
                        trialErrors[trialIndex] = msg;
                        trialFailures.push(failure);
                        trialSpan.setAttributes({
                          [Attr.Eval.Trial.Error]: spanErrorMessage
                        });
                        for (const scorer of opts.scorers) {
                          const scorerName = getScorerName(scorer);
                          (perScorerTrials[scorerName] ?? (perScorerTrials[scorerName] = [])).push(0);
                        }
                        if (taskFailureDetails) {
                          outOfScopeFlags.push(...taskFailureDetails.outOfScopeFlags);
                          caseFinalConfigSnapshot = {
                            flags: taskFailureDetails.finalFlags || {},
                            pickedFlags: opts.configFlags,
                            overrides: taskFailureDetails.overrides
                          };
                        }
                        throw failure;
                      }
                    },
                    caseContext
                  );
                } catch {
                  // Already recorded in `trialFailures`; continue with the
                  // remaining trials.
                }
              }
              const scores = {};
              for (const scorer of opts.scorers) {
                const scorerName = getScorerName(scorer);
                const trialsArr = perScorerTrials[scorerName] ?? [];
                const aggregation = scorer.aggregation ?? Mean();
                const aggregatedValue = trialsArr.length > 0 ? aggregation.aggregate(trialsArr) : 0;
                scores[scorerName] = {
                  name: scorerName,
                  score: aggregatedValue,
                  trials: trialsArr,
                  aggregation: aggregation.type,
                  threshold: aggregation.threshold,
                  metadata: {}
                };
              }
              const output = lastOutput;
              const failedTrials = trialFailures.length;
              const succeededTrials = trials - failedTrials;
              const trialSummary = {
                total: trials,
                succeeded: succeededTrials,
                failed: failedTrials
              };
              caseSpan.setAttribute(Attr.Eval.Case.Scores, JSON.stringify(scores ? scores : {}));
              if (output !== void 0) {
                caseSpan.setAttribute(
                  Attr.Eval.Case.Output,
                  typeof output === "string" ? output : JSON.stringify(output)
                );
              }
              task.meta.case = {
                index: data.index,
                name: evalName,
                expected: data.expected,
                input: data.input,
                output,
                metadata: data.metadata,
                scores,
                status: "success",
                errors: [],
                trialErrors,
                trialSummary,
                duration: successfulTaskDuration,
                startedAt: start,
                outOfScopeFlags,
                pickedFlags: opts.configFlags
              };
              if (failedTrials > 0) {
                const error = new Error(
                  `Eval case ${data.index} failed with ${failedTrials} trial error(s).`
                );
                // Remembered so the catch below can tell this deliberate
                // failure apart from unexpected errors.
                intentionalTrialFailureError = error;
                caseSpan.setStatus({
                  code: SpanStatusCode.ERROR,
                  message: error.message
                });
                task.meta.case.status = "fail";
                task.meta.case.errors = trialFailures;
                throw error;
              }
              allOutOfScopeFlags.push(...outOfScopeFlags);
            } catch (e) {
              console.log(e);
              const error = toError(e);
              if (e === intentionalTrialFailureError && task.meta.case) {
                task.meta.case.status = "fail";
                task.meta.case.errors = task.meta.case.errors?.length ? task.meta.case.errors : [error];
                allOutOfScopeFlags.push(...outOfScopeFlags);
                throw e;
              }
              // Unexpected failure: rebuild the case record from whatever the
              // eval context still holds.
              const ctx = getEvalContext();
              const ctxOutOfScopeFlags = ctx.outOfScopeFlags || [];
              if (ctxOutOfScopeFlags.length > 0) {
                outOfScopeFlags.push(...ctxOutOfScopeFlags);
              }
              const ctxFlags = ctx.flags || {};
              if (!caseFinalConfigSnapshot && Object.keys(ctxFlags).length > 0) {
                caseFinalConfigSnapshot = {
                  flags: ctxFlags,
                  pickedFlags: opts.configFlags,
                  overrides: ctx.overrides
                };
              }
              const failedScores = {};
              for (const scorer of opts.scorers) {
                const scorerName = getScorerName(scorer);
                failedScores[scorerName] = {
                  name: scorerName,
                  score: 0,
                  trials: [],
                  metadata: {
                    duration: 0,
                    startedAt: start,
                    error: error.message
                  }
                };
              }
              task.meta.case = {
                name: evalName,
                index: data.index,
                expected: data.expected,
                input: data.input,
                output: String(e),
                metadata: data.metadata,
                scores: failedScores,
                status: "fail",
                errors: [error],
                startedAt: start,
                duration: Math.round(performance.now() - start),
                outOfScopeFlags,
                pickedFlags: opts.configFlags
              };
              allOutOfScopeFlags.push(...outOfScopeFlags);
              throw e;
            } finally {
              try {
                // Classify each accessed flag against the defaults: a changed
                // value is "replaced", an unknown key is "introduced".
                const accessedFlags = caseFinalConfigSnapshot?.flags || {};
                const accessed = Object.keys(accessedFlags);
                const allDefaults = getConfigScope()?.getAllDefaultFlags?.() ?? {};
                const runtimeFlags = {};
                for (const key of accessed) {
                  const value = accessedFlags[key];
                  if (key in allDefaults) {
                    const replaced = !deepEqual(value, allDefaults[key]);
                    if (replaced) {
                      runtimeFlags[key] = { kind: "replaced", value, default: allDefaults[key] };
                    }
                  } else {
                    runtimeFlags[key] = { kind: "introduced", value };
                  }
                }
                if (!isDebug && Object.keys(runtimeFlags).length > 0) {
                  const serialized = JSON.stringify(runtimeFlags);
                  caseSpan.setAttribute("eval.case.config.runtime_flags", serialized);
                }
                if (task.meta.case) {
                  task.meta.case.runtimeFlags = runtimeFlags;
                }
              } catch {
                // Best effort: flag reporting must not mask the case result.
              }
              if (caseFinalConfigSnapshot) {
                finalConfigSnapshot = caseFinalConfigSnapshot;
              }
            }
          },
          suiteContext
        );
      });
    },
    timeoutMs
  );
  return result;
}

/**
 * Collapses streamed chunks into one result: "" for no chunks, the
 * concatenation when every chunk is a string, otherwise the last chunk.
 */
var joinArrayOfUnknownResults = (results) => {
  if (results.length === 0) {
    return "";
  }
  if (results.every((r) => typeof r === "string")) {
    return results.join("");
  }
  return results[results.length - 1];
};

/**
 * Runs `task` with `{ input, expected }`. When the result is an async
 * iterable, it is drained and joined via `joinArrayOfUnknownResults`;
 * otherwise the result is returned as-is.
 */
var executeTask = async (task, input, expected) => {
  const taskResultOrStream = await task({ input, expected });
  if (typeof taskResultOrStream === "object" && taskResultOrStream && Symbol.asyncIterator in taskResultOrStream) {
    const chunks = [];
    for await (const chunk of taskResultOrStream) {
      chunks.push(chunk);
    }
    return joinArrayOfUnknownResults(chunks);
  }
  return taskResultOrStream;
};

/**
 * Executes the task inside a "task" span and an eval context.
 *
 * @param {object} caseContext - Parent context for the span.
 * @param {{ id: string, name: string, version: string }} evaluation
 * @param {object} opts - Reads `task`, `input`, `expected`, `configFlags`.
 * @returns {Promise<{ output: any, duration: number, outOfScopeFlags: any[],
 *   finalFlags: object, overrides: any }>}
 * @throws {Error} The task's error, normalized and annotated with failure
 *   details (see `attachRunTaskFailureDetails`).
 */
var runTask = async (caseContext, evaluation, opts) => {
  // A function's `name` is "" (not nullish) when anonymous, so `||` is needed
  // for the fallback to apply.
  const taskName = opts.task.name || "anonymous";
  return startActiveSpan(
    `task`,
    {
      attributes: {
        [Attr.GenAI.Operation.Name]: "eval.task",
        [Attr.Eval.Task.Name]: taskName,
        [Attr.Eval.Task.Type]: "llm_completion", // TODO: How to determine task type?
        [Attr.Eval.ID]: evaluation.id,
        [Attr.Eval.Name]: evaluation.name,
        [Attr.Eval.Version]: evaluation.version
      }
    },
    async (taskSpan) => {
      const { output, duration, outOfScopeFlags, finalFlags, overrides } = await withEvalContext(
        { pickedFlags: opts.configFlags },
        async () => {
          const start = performance.now();
          try {
            const output2 = await executeTask(opts.task, opts.input, opts.expected);
            const duration2 = Math.round(performance.now() - start);
            taskSpan.setAttributes({
              [Attr.Eval.Task.Output]: typeof output2 === "string" ? output2 : JSON.stringify(output2)
            });
            const ctx = getEvalContext();
            const outOfScopeFlags2 = ctx.outOfScopeFlags || [];
            return {
              output: output2,
              duration: duration2,
              outOfScopeFlags: outOfScopeFlags2,
              finalFlags: ctx.flags || {},
              overrides: ctx.overrides
            };
          } catch (error) {
            // Capture the context state here; it is read by the trial's
            // error handler via `getRunTaskFailureDetails`.
            const ctx = getEvalContext();
            const duration2 = Math.round(performance.now() - start);
            throw attachRunTaskFailureDetails(error, {
              duration: duration2,
              outOfScopeFlags: ctx.outOfScopeFlags || [],
              finalFlags: ctx.flags || {},
              overrides: ctx.overrides
            });
          }
        }
      );
      return {
        output,
        duration,
        outOfScopeFlags,
        finalFlags,
        overrides
      };
    },
    caseContext
  );
};

export {
  AxiomReporter,
  Eval
};
//# sourceMappingURL=evals.js.map