axiom
Version:
Axiom AI SDK provides: an API to wrap your AI calls with observability instrumentation; offline evals; and online evals.
1,145 lines (1,137 loc) • 43 kB
JavaScript
import {
AxiomReporter,
ensureInstrumentationInitialized,
flush,
resolveAxiomConnection,
startActiveSpan,
startSpan
} from "./chunk-SQJ53C2N.js";
import "./chunk-72PVEOMF.js";
import "./chunk-3VKWOZAQ.js";
import {
dotNotationToNested,
getConfigScope,
getEvalContext,
getGlobalFlagOverrides,
setGlobalFlagOverrides,
withEvalContext
} from "./chunk-63VQQCZB.js";
import {
recordName
} from "./chunk-FWPCBQBZ.js";
import {
AxiomCLIError,
errorToString
} from "./chunk-ISSDOC43.js";
import "./chunk-PU64TWX4.js";
import "./chunk-MM5FFQJT.js";
import {
Mean
} from "./chunk-73F2PMAH.js";
import {
Attr
} from "./chunk-4TKUTT24.js";
import {
__publicField
} from "./chunk-KEXKKQVW.js";
// src/evals/eval.ts
import { afterAll, beforeAll, describe, inject, it } from "vitest";
import { context, SpanStatusCode, trace } from "@opentelemetry/api";
import { customAlphabet } from "nanoid";
// src/evals/git-info.ts
import { execSync } from "child_process";
/**
 * Reads the committer identity from the local git configuration.
 *
 * Best effort: any failure while invoking git (binary missing, key not set,
 * non-zero exit) yields `null` instead of an exception.
 *
 * @returns {{ name: string, email: string } | null} Trimmed `user.name` and
 *   `user.email`, or `null` when either lookup fails.
 */
function getGitUserInfo() {
  // Capture stdout only. stderr is discarded so that a missing git binary or a
  // shell error does not leak noise into the eval run's output.
  const readGitConfig = (key) => execSync(`git config --get ${key}`, {
    encoding: "utf8",
    stdio: ["ignore", "pipe", "ignore"]
  }).trim();
  try {
    const name = readGitConfig("user.name");
    const email = readGitConfig("user.email");
    return { name, email };
  } catch {
    return null;
  }
}
// src/utils/fetcher.ts
/**
 * Builds a `fetch` wrapper bound to an Axiom endpoint.
 *
 * @param {{ baseUrl: string, token: string, orgId?: string }} connection
 *   Base URL the request paths are resolved against, the bearer token, and an
 *   optional organisation id sent as `X-AXIOM-ORG-ID`.
 * @returns {(path: string, options?: object) => Promise<Response>} A function
 *   that resolves `path` against `baseUrl` and calls the global `fetch` with the
 *   caller's options plus JSON content-type and authorization headers.
 */
var createFetcher = ({
  baseUrl,
  token,
  orgId
}) => {
  return (path, options) => {
    // `options` is optional (e.g. a plain GET); treat a missing or null value as
    // an empty init object instead of throwing on `options.headers`.
    const init = options ?? {};
    return fetch(new URL(path, baseUrl).toString(), {
      ...init,
      headers: {
        // Caller headers go first so the fixed headers below always win.
        ...init.headers,
        "content-type": "application/json",
        authorization: `Bearer ${token}`,
        ...orgId ? { "X-AXIOM-ORG-ID": orgId } : {}
      }
    });
  };
};
// src/util/traces.ts
/**
 * Looks up an attribute by dotted key.
 *
 * The flat key is tried first on `obj.custom` (e.g. `custom["a.b"]`); if it is
 * not present there, the dotted path is walked through `obj` itself.
 *
 * @param obj - Attribute container; anything that is not a non-null object yields `undefined`.
 * @param accessKey - Dotted attribute key such as `"a.b.c"`.
 * @returns The resolved value, or `undefined` when the path cannot be followed.
 */
function getCustomOrRegularAttribute(obj, accessKey) {
  const isObjectLike = (candidate) => typeof candidate === "object" && candidate !== null;
  if (!isObjectLike(obj)) {
    return void 0;
  }
  const segments = accessKey.split(".");
  const { custom } = obj;
  if (isObjectLike(custom) && accessKey in custom) {
    return custom[accessKey];
  }
  let cursor = obj;
  for (let i = 0; i < segments.length; i += 1) {
    // Stop as soon as the path leads through a primitive or null.
    if (!isObjectLike(cursor)) {
      return void 0;
    }
    cursor = cursor[segments[i]];
  }
  return cursor;
}
/**
 * Resolves an attribute via `getCustomOrRegularAttribute` and keeps it only if
 * it is a string.
 *
 * @returns The string value, or `undefined` for any other type.
 */
function getCustomOrRegularString(obj, key) {
  const resolved = getCustomOrRegularAttribute(obj, key);
  if (typeof resolved !== "string") {
    return void 0;
  }
  return resolved;
}
/**
 * Resolves an attribute via `getCustomOrRegularAttribute` and returns it as a
 * number.
 *
 * Numbers are returned as-is; strings are converted with `Number(...)` and
 * dropped when the conversion yields NaN; every other type yields `undefined`.
 */
function getCustomOrRegularNumber(obj, key) {
  const resolved = getCustomOrRegularAttribute(obj, key);
  switch (typeof resolved) {
    case "number":
      return resolved;
    case "string": {
      const numeric = Number(resolved);
      return Number.isNaN(numeric) ? void 0 : numeric;
    }
    default:
      return void 0;
  }
}
// src/evals/eval.service.ts
/**
 * Thin client for the `/api/v3/evaluations` endpoints.
 *
 * Connection details come from `resolveAxiomConnection(config, consoleUrl)`;
 * requests go through a fetcher created by `createFetcher`.
 */
var EvaluationApiClient = class {
  constructor(config, consoleUrl) {
    __publicField(this, "fetcher");
    const connection = resolveAxiomConnection(config, consoleUrl);
    this.fetcher = createFetcher({
      baseUrl: connection.consoleEndpointUrl,
      // A missing token is sent as an empty bearer value.
      token: connection.token ?? "",
      orgId: connection.orgId
    });
  }
  /**
   * POSTs a new evaluation.
   * @returns The parsed JSON response body.
   * @throws AxiomCLIError when the response status is not ok.
   */
  async createEvaluation(evaluation) {
    const response = await this.fetcher(`/api/v3/evaluations`, {
      method: "POST",
      body: JSON.stringify(evaluation)
    });
    if (response.ok) {
      return response.json();
    }
    // The body is optional context; a failure to read it falls back to "".
    const detail = await response.text().catch(() => "");
    const suffix = detail ? ` - ${detail}` : "";
    throw new AxiomCLIError(`Failed to create evaluation: ${response.statusText}${suffix}`);
  }
  /**
   * PATCHes the evaluation identified by `evaluation.id`.
   * @returns The parsed JSON response body.
   * @throws AxiomCLIError when the response status is not ok, or when the
   *   parsed body carries a truthy `error` field.
   */
  async updateEvaluation(evaluation) {
    const response = await this.fetcher(`/api/v3/evaluations/${evaluation.id}`, {
      method: "PATCH",
      body: JSON.stringify(evaluation)
    });
    if (!response.ok) {
      const detail = await response.text().catch(() => "");
      const suffix = detail ? ` - ${detail}` : "";
      throw new AxiomCLIError(`Failed to update evaluation: ${response.statusText}${suffix}`);
    }
    const payload = await response.json();
    if (!payload.error) {
      return payload;
    }
    throw new AxiomCLIError(
      `Failed to update evaluation ${evaluation.id}: ${JSON.stringify(payload.error)}`
    );
  }
};
/**
 * Queries the eval dataset for every span of one evaluation trace and assembles
 * them into an evaluation tree.
 *
 * @param evalId - Trace id of the evaluation to load.
 * @param config - Axiom config; `config.eval.edgeUrl` selects the edge query
 *   endpoint when set, otherwise the regular dataset query endpoint is used.
 * @returns The tree built by `buildSpanTree`, or `null` when the query returned
 *   no matches.
 * @throws Error when the HTTP response is not ok (message taken from the JSON
 *   body's `message`, then the status text), or when a successful response body
 *   is not valid JSON.
 */
var findEvaluationCases = async (evalId, config) => {
  const { dataset, edgeUrl, url, token, orgId } = resolveAxiomConnection(config);
  const apl = `['${dataset}'] | where column_ifexists('trace_id', '') == "${evalId}" | order by _time`;
  const headers = new Headers({
    Authorization: `Bearer ${token}`,
    "Content-Type": "application/json",
    ...orgId ? { "X-AXIOM-ORG-ID": orgId } : {}
  });
  const hasExplicitEdgeUrl = !!config.eval.edgeUrl;
  const queryBaseUrl = hasExplicitEdgeUrl ? edgeUrl : url;
  const queryPath = hasExplicitEdgeUrl ? "/v1/query/_apl?format=legacy" : "/v1/datasets/_apl?format=legacy";
  const resp = await fetch(`${queryBaseUrl}${queryPath}`, {
    headers,
    method: "POST",
    body: JSON.stringify({ apl })
  });
  let payload;
  try {
    payload = await resp.json();
  } catch (parseError) {
    // An error response may carry a non-JSON body (e.g. a proxy HTML page).
    // Only a broken body on a *successful* response is a parse failure worth
    // surfacing; otherwise fall through to the HTTP error below.
    if (resp.ok) {
      throw parseError;
    }
    payload = void 0;
  }
  if (!resp.ok) {
    throw new Error(
      `Failed to query evaluation cases: ${payload?.message || resp?.statusText || "Unknown error"}`
    );
  }
  // A body without `matches` is treated the same as an empty result set.
  return payload?.matches?.length ? buildSpanTree(payload.matches) : null;
};
/**
 * Converts the root eval span into an evaluation record with an empty `cases`
 * list.
 *
 * `tags` and `flagConfig` may be stored either as JSON strings or as already
 * structured values; strings are parsed with `JSON.parse`.
 */
var mapSpanToEval = (span) => {
  const parseIfString = (raw) => typeof raw === "string" ? JSON.parse(raw) : raw;
  const attributes = span.data.attributes;
  const readString = (key) => getCustomOrRegularString(attributes, key);
  const flagConfigRaw = getCustomOrRegularAttribute(attributes, Attr.Eval.Config.Flags);
  const tagsRaw = getCustomOrRegularAttribute(attributes, Attr.Eval.Tags);
  return {
    id: readString(Attr.Eval.ID),
    name: readString(Attr.Eval.Name),
    type: readString(Attr.Eval.Type),
    version: readString(Attr.Eval.Version),
    collection: {
      name: readString(Attr.Eval.Collection.Name),
      size: getCustomOrRegularNumber(attributes, Attr.Eval.Collection.Size)
    },
    baseline: {
      id: readString(Attr.Eval.Baseline.ID),
      name: readString(Attr.Eval.Baseline.Name)
    },
    duration: span.data.duration,
    status: span.data.status.code,
    traceId: span.data.trace_id,
    runAt: span._time,
    // Missing tags default to an empty list.
    tags: tagsRaw ? parseIfString(tagsRaw) : [],
    user: {
      name: readString(Attr.Eval.User.Name),
      email: readString(Attr.Eval.User.Email)
    },
    cases: [],
    // Missing flag config stays undefined.
    flagConfig: flagConfigRaw ? parseIfString(flagConfigRaw) : void 0
  };
};
/**
 * Converts a case span row into a case record.
 *
 * Duration handling: a string made of a number followed by "s" (e.g. "1.2345s")
 * is reformatted to two decimals ("1.23s"). Any other value is passed through
 * unchanged (e.g. "250ms"), and a missing duration is reported as "-".
 *
 * Scores are read from the case's scores attribute, which may be a JSON string
 * or an object keyed by scorer name; each entry is normalised to
 * `{ name, value, metadata }`.
 */
var mapSpanToCase = (item) => {
  const data = item.data;
  const rawDuration = data.duration;
  let duration = "-";
  if (typeof rawDuration === "string" && rawDuration.endsWith("s")) {
    // Units such as "ms" or "µs" also end in "s"; only reformat when what
    // precedes the trailing "s" is actually numeric, otherwise keep the raw text
    // rather than producing "NaNs".
    const seconds = Number(rawDuration.slice(0, -1));
    duration = Number.isFinite(seconds) ? `${seconds.toFixed(2)}s` : rawDuration;
  } else if (rawDuration !== void 0 && rawDuration !== null) {
    duration = rawDuration;
  }
  const scoresRaw = getCustomOrRegularAttribute(data.attributes, Attr.Eval.Case.Scores);
  const scoresParsed = scoresRaw ? typeof scoresRaw === "string" ? JSON.parse(scoresRaw) : scoresRaw : {};
  const scores = {};
  for (const [name, scoreData] of Object.entries(scoresParsed)) {
    const s = scoreData;
    scores[name] = {
      name,
      // Accept either a `value` or a `score` field on the stored entry.
      value: s.value ?? s.score ?? 0,
      metadata: s.metadata ?? {}
    };
  }
  const caseData = {
    index: getCustomOrRegularNumber(data.attributes, Attr.Eval.Case.Index),
    input: getCustomOrRegularString(data.attributes, Attr.Eval.Case.Input),
    output: getCustomOrRegularString(data.attributes, Attr.Eval.Case.Output),
    expected: getCustomOrRegularString(data.attributes, Attr.Eval.Case.Expected),
    duration,
    status: data.status.code,
    scores,
    runAt: item._time,
    spanId: data.span_id,
    traceId: data.trace_id
  };
  return caseData;
};
/**
 * Assembles flat span rows of one trace into an evaluation record with its
 * cases.
 *
 * The root is the span whose `gen_ai.operation.name` attribute is "eval". Every
 * span whose name starts with "case" becomes a case; for each case the first
 * "task*" child span (and that task's first "chat*" child) and all "eval.score"
 * child spans are attached. Cases are sorted by `index`.
 *
 * @param spans - Query result rows (`{ _time, data: { name, span_id, parent_span_id, attributes, ... } }`).
 * @returns The evaluation record, or `null` when `spans` is empty or contains
 *   no eval span.
 */
var buildSpanTree = (spans) => {
  if (!spans.length) {
    return null;
  }
  // Spans in the same trace are not guaranteed to carry `gen_ai` attributes,
  // so read the operation name defensively instead of throwing on them.
  const operationNameOf = (span) => span.data.attributes?.gen_ai?.operation?.name;
  const evalSpan = spans.find((span) => operationNameOf(span) === "eval");
  if (!evalSpan) {
    return null;
  }
  const rootSpan = mapSpanToEval(evalSpan);
  const caseSpans = spans.filter((span) => span.data.name.startsWith("case"));
  for (const caseSpan of caseSpans) {
    const caseData = mapSpanToCase(caseSpan);
    const taskSpans = spans.filter(
      (span) => span.data.name.startsWith("task") && span.data.parent_span_id === caseSpan.data.span_id
    );
    if (taskSpans.length > 0) {
      // Only the first task span of a case is represented.
      const taskSpan = taskSpans[0];
      const chatSpans = spans.filter(
        (span) => span.data.name.startsWith("chat") && span.data.parent_span_id === taskSpan.data.span_id
      );
      const chatData = chatSpans.map((chatSpan) => ({
        operation: getCustomOrRegularString(chatSpan.data.attributes, "operation") ?? "",
        capability: getCustomOrRegularString(chatSpan.data.attributes, "capability") ?? "",
        step: getCustomOrRegularString(chatSpan.data.attributes, "step") ?? "",
        request: {
          max_token: getCustomOrRegularString(chatSpan.data.attributes, "request.max_token") ?? "",
          model: getCustomOrRegularString(chatSpan.data.attributes, "request.model") ?? "",
          temperature: getCustomOrRegularNumber(chatSpan.data.attributes, "request.temperature") ?? 0
        },
        response: {
          finish_reasons: getCustomOrRegularString(chatSpan.data.attributes, "response.finish_reasons") ?? ""
        },
        usage: {
          input_tokens: getCustomOrRegularNumber(chatSpan.data.attributes, "usage.input_tokens") ?? 0,
          output_tokens: getCustomOrRegularNumber(chatSpan.data.attributes, "usage.output_tokens") ?? 0
        }
      }));
      const taskData = {
        name: taskSpan.data.name,
        output: getCustomOrRegularString(taskSpan.data.attributes, "output") || "",
        trial: getCustomOrRegularNumber(taskSpan.data.attributes, "trial") || 0,
        type: getCustomOrRegularString(taskSpan.data.attributes, "type") || "",
        error: getCustomOrRegularString(taskSpan.data.attributes, "error") || "",
        // Only the first chat span is kept; an empty placeholder is used when
        // the task produced none.
        chat: chatData[0] || {
          operation: "",
          capability: "",
          step: "",
          request: { max_token: "", model: "", temperature: 0 },
          response: { finish_reasons: "" },
          usage: { input_tokens: 0, output_tokens: 0 }
        }
      };
      caseData.task = taskData;
    }
    const scoreSpans = spans.filter(
      (span) => operationNameOf(span) === "eval.score" && span.data.parent_span_id === caseSpan.data.span_id
    );
    if (scoreSpans.length > 0) {
      // Score spans, when present, replace the scores read from the case span.
      caseData.scores = {};
      scoreSpans.forEach((score) => {
        const name = getCustomOrRegularString(score.data.attributes, Attr.Eval.Score.Name) ?? "";
        const value = getCustomOrRegularNumber(score.data.attributes, Attr.Eval.Score.Value) ?? 0;
        const metadataRaw = getCustomOrRegularString(
          score.data.attributes,
          Attr.Eval.Score.Metadata
        );
        let metadata = {};
        try {
          metadata = metadataRaw ? JSON.parse(metadataRaw) : {};
        } catch {
          // Unparseable metadata is ignored; the score itself is still reported.
        }
        caseData.scores[name] = {
          name,
          value,
          metadata: {
            error: score.data.attributes.error,
            ...metadata
          }
        };
      });
    }
    rootSpan.cases.push(caseData);
  }
  rootSpan.cases.sort((a, b) => a.index - b.index);
  return rootSpan;
};
// src/util/deep-equal.ts
/**
 * Structural equality check.
 *
 * Primitives compare with `===` or `Object.is` (so NaN equals NaN and +0 equals
 * -0). Objects must share a prototype; arrays, Maps and Sets are delegated to
 * their helpers, Dates compare by timestamp, RegExps by string form, and all
 * other objects by their enumerable own keys, recursively.
 */
function deepEqual(data, other) {
  if (data === other || Object.is(data, other)) {
    return true;
  }
  const bothObjects = typeof data === "object" && typeof other === "object";
  if (!bothObjects || data === null || other === null) {
    return false;
  }
  if (Object.getPrototypeOf(data) !== Object.getPrototypeOf(other)) {
    return false;
  }
  if (Array.isArray(data)) return isDeepEqualArrays(data, other);
  if (data instanceof Map) return isDeepEqualMaps(data, other);
  if (data instanceof Set) return isDeepEqualSets(data, other);
  if (data instanceof Date) return data.getTime() === other.getTime();
  if (data instanceof RegExp) return data.toString() === other.toString();
  const ownKeys = Object.keys(data);
  if (ownKeys.length !== Object.keys(other).length) {
    return false;
  }
  // Same key count on both sides, so checking every key of `data` is sufficient.
  return ownKeys.every((key) => key in other && deepEqual(data[key], other[key]));
}
/**
 * Element-wise deep comparison of two arrays of equal length.
 */
function isDeepEqualArrays(data, other) {
  const length = data.length;
  if (length !== other.length) {
    return false;
  }
  for (let position = 0; position < length; position += 1) {
    if (!deepEqual(data[position], other[position])) {
      return false;
    }
  }
  return true;
}
/**
 * Deep comparison of two Maps: same size, same keys (by Map key identity), and
 * deeply equal values.
 */
function isDeepEqualMaps(data, other) {
  if (data.size !== other.size) {
    return false;
  }
  for (const [key, value] of data) {
    const isMatch = other.has(key) && deepEqual(value, other.get(key));
    if (!isMatch) {
      return false;
    }
  }
  return true;
}
/**
 * Deep comparison of two Sets. Each item of `data` must pair up with a distinct,
 * deeply equal item of `other`; matched items are removed from the candidate
 * list so they cannot be matched twice.
 */
function isDeepEqualSets(data, other) {
  if (data.size !== other.size) {
    return false;
  }
  const unmatched = Array.from(other);
  for (const candidate of data) {
    const position = unmatched.findIndex((remaining) => deepEqual(candidate, remaining));
    if (position === -1) {
      return false;
    }
    unmatched.splice(position, 1);
  }
  return true;
}
// src/util/tryCatch.ts
/**
 * Normalises a thrown value to an `Error`.
 *
 * An existing `Error` is reused (not copied); any other value is wrapped in a
 * new `Error` whose message comes from `errorToString`. When `operationName` is
 * given, the error's message is prefixed in place with the operation name.
 */
function toError(rawError, operationName) {
  let processedError = rawError;
  if (!(rawError instanceof Error)) {
    processedError = new Error(errorToString(rawError));
  }
  if (!operationName) {
    return processedError;
  }
  processedError.message = `Operation "${operationName}" failed: ${processedError.message}`;
  return processedError;
}
/**
 * Runs `fn` synchronously and reports the outcome as a `[value, error]` tuple:
 * `[result, null]` on success, `[null, Error]` when `fn` throws.
 */
function tryCatchSync(fn, operationName) {
  let value;
  try {
    value = fn();
  } catch (rawError) {
    return [null, toError(rawError, operationName)];
  }
  return [value, null];
}
/**
 * Awaits a promise, or the result of calling `fn` when it is a function, and
 * reports the outcome as a `[value, error]` tuple: `[result, null]` on success,
 * `[null, Error]` on a throw or rejection.
 */
async function tryCatchAsync(fn, operationName) {
  try {
    let pending = fn;
    if (typeof fn === "function") {
      pending = fn();
    }
    const value = await pending;
    return [value, null];
  } catch (rawError) {
    return [null, toError(rawError, operationName)];
  }
}
/**
 * Overloaded try/catch helper returning `[value, error]` tuples.
 *
 * - function: called immediately; a returned Promise is handed to
 *   `tryCatchAsync`, any other return value yields `[value, null]`, and a
 *   synchronous throw yields `[null, Error]`.
 * - Promise: handed to `tryCatchAsync`.
 * - anything else: returned as `[value, null]`.
 */
var tryCatch = ((fn, operationName) => {
  if (typeof fn !== "function") {
    return fn instanceof Promise ? tryCatchAsync(fn, operationName) : [fn, null];
  }
  let outcome;
  try {
    outcome = fn();
  } catch (rawError) {
    return [null, toError(rawError, operationName)];
  }
  if (outcome instanceof Promise) {
    return tryCatchAsync(outcome, operationName);
  }
  return [outcome, null];
});
// Expose the explicit variants as properties of the overloaded helper, so
// callers can use `tryCatch.sync(...)` / `tryCatch.async(...)`.
tryCatch.sync = tryCatchSync;
tryCatch.async = tryCatchAsync;
// src/evals/eval.ts
// Generator for eval version ids, built with nanoid's `customAlphabet` from the
// digits and uppercase letters with size argument 10.
var createVersionId = customAlphabet("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ", 10);
// Property key under which `attachRunTaskFailureDetails` stores task failure
// details on an error. `Symbol.for` uses the global symbol registry, so the same
// key is obtained wherever this description string is used.
var RUN_TASK_FAILURE_DETAILS = Symbol.for("axiom.eval.runTaskFailureDetails");
/**
 * Adapts a suite hook so that `fn` receives the suite object whether the test
 * runner passes it as the second argument or as the first.
 *
 * @param fn - Async hook taking the suite as its only argument.
 * @returns An async function suitable for `beforeAll` / `afterAll`.
 */
function withCompatibleSuiteHook(fn) {
// NOTE(review): the empty destructuring pattern and the use of `arguments` look
// deliberate (presumably the runner inspects the hook's first parameter) —
// confirm against the supported Vitest versions before changing this signature.
return async function({}, maybeSuite) {
// Prefer the second argument; when it is null/undefined fall back to the first.
const suite = maybeSuite ?? arguments[0];
await fn(suite);
};
}
/**
 * Normalises `error` with `toError` and stores `details` on it under the
 * `RUN_TASK_FAILURE_DETAILS` symbol.
 *
 * @returns The normalised error carrying the details.
 */
function attachRunTaskFailureDetails(error, details) {
  return Object.assign(toError(error), { [RUN_TASK_FAILURE_DETAILS]: details });
}
/**
 * Reads the details stored under the `RUN_TASK_FAILURE_DETAILS` symbol.
 *
 * @returns The stored details, or `undefined` when `error` is not a non-null
 *   object or carries none.
 */
function getRunTaskFailureDetails(error) {
  const isObjectLike = typeof error === "object" && error !== null;
  return isObjectLike ? error[RUN_TASK_FAILURE_DETAILS] : void 0;
}
/**
 * Public entry point for declaring an eval.
 *
 * Records the eval, capability, optional step and scorer names via
 * `recordName`, then kicks off `registerEval`. Registration is not awaited; a
 * rejection is passed to `console.error`.
 *
 * @param name - Eval name.
 * @param params - Eval definition (`capability`, optional `step`, `scorers`, ...).
 */
function Eval(name, params) {
  recordName("eval", name);
  recordName("capability", params.capability);
  const { step, scorers } = params;
  if (step) {
    recordName("step", step);
  }
  // Scorers without a name are recorded with an empty name.
  for (const scorer of scorers || []) {
    recordName("scorer", getScorerName(scorer, ""));
  }
  registerEval(name, params).catch(console.error);
}
/**
 * Snapshots the flag values that fall under the given flag path prefixes.
 *
 * Default flags from the config scope are merged with the global overrides
 * (overrides win); entries whose key starts with any of `configFlags` are kept
 * and converted with `dotNotationToNested`.
 *
 * @param configFlags - Flag path prefixes; missing or empty yields `{}`.
 */
function captureFlagConfig(configFlags) {
  if (!configFlags || configFlags.length === 0) {
    return {};
  }
  const scope = getConfigScope();
  const defaults = scope?.getAllDefaultFlags?.() ?? {};
  const overrides = getGlobalFlagOverrides();
  const inScope = {};
  for (const [flagPath, flagValue] of Object.entries({ ...defaults, ...overrides })) {
    if (!configFlags.some((prefix) => flagPath.startsWith(prefix))) {
      continue;
    }
    inScope[flagPath] = flagValue;
  }
  return dotNotationToNested(inScope);
}
/**
 * Returns the scorer's `name`, or `fallback` (default "unknown") when the name
 * is missing or empty.
 */
var getScorerName = (scorer, fallback = "unknown") => {
  const { name } = scorer;
  if (name) {
    return name;
  }
  return fallback;
};
/**
 * Registers one eval as a Vitest `describe` suite.
 *
 * Run settings are read through Vitest `inject` (baseline, debug, list,
 * overrides, axiomConfig, runId, consoleUrl). Inside the suite this declares:
 *  - a `beforeAll` hook that starts the suite span and, outside debug/list mode,
 *    creates the evaluation through `EvaluationApiClient` and loads the baseline;
 *  - one concurrent `it` per collection record, each running `opts.trials`
 *    trials (task, then scorers) and storing the outcome on `task.meta.case`;
 *  - an `afterAll` hook that ends the suite span, calls `flush` and, outside
 *    debug/list mode, marks the evaluation as completed.
 *
 * @param evalName - Suite name; also used in span names and attributes.
 * @param opts - Eval definition. Fields read here: `data`, `timeout`, `trials`,
 *   `scorers`, `task`, `metadata`, `configFlags`, `capability`, `step`.
 * @returns Whatever `describe(...)` returns.
 * @throws AxiomCLIError when no `axiomConfig` was injected.
 */
async function registerEval(evalName, opts) {
// Bare property read with no effect here (NOTE(review): looks like a build leftover — confirm).
opts.data;
// `data` is either a loader function or the collection (or a promise of it) itself.
const collectionPromise = typeof opts.data === "function" ? opts.data() : opts.data;
const user = getGitUserInfo();
const baselineId = inject("baseline");
const isDebug = inject("debug");
const isList = inject("list");
const injectedOverrides = inject("overrides");
const axiomConfig = inject("axiomConfig");
const runId = inject("runId");
const consoleUrl = inject("consoleUrl");
if (!axiomConfig) {
throw new AxiomCLIError("Axiom config not found");
}
// A per-eval timeout takes precedence over the configured default.
const timeoutMs = opts.timeout ?? axiomConfig?.eval.timeoutMs;
// Started here but only awaited in beforeAll; disabled in debug and list modes.
const instrumentationReady = ensureInstrumentationInitialized(axiomConfig, {
enabled: !isDebug && !isList
});
const result = await describe(
evalName,
async () => {
const collection = await collectionPromise;
const evaluationApiClient = new EvaluationApiClient(axiomConfig, consoleUrl);
const evalVersion = createVersionId();
// Suite-wide state shared between the hooks and the cases below.
let evalId = "";
let suiteStart;
let suiteSpan;
let suiteContext;
let instrumentationError = void 0;
let baseline = void 0;
const allOutOfScopeFlags = [];
let finalConfigSnapshot;
const handleBeforeAll = async (suite) => {
// Apply injected flag overrides; a failure to apply them is ignored.
if (injectedOverrides && Object.keys(injectedOverrides).length > 0) {
try {
setGlobalFlagOverrides(injectedOverrides);
} catch {
}
}
// Provisional metadata; `id` is still empty here and the object is replaced
// with the complete version at the end of this hook.
suite.meta.evaluation = {
id: evalId,
name: evalName,
version: evalVersion,
runId,
orgId: void 0,
baseline: baseline ?? void 0,
configFlags: opts.configFlags
};
// An instrumentation init failure is remembered rather than thrown here; it is
// reported in `registrationStatus` and rethrown in afterAll.
const [, instrumentationInitError] = await tryCatchAsync(instrumentationReady);
if (instrumentationInitError) {
instrumentationError = instrumentationInitError;
}
suiteSpan = startSpan(`eval ${evalName}-${evalVersion}`, {
attributes: {
[Attr.GenAI.Operation.Name]: "eval",
[Attr.Eval.Name]: evalName,
[Attr.Eval.Version]: evalVersion,
[Attr.Eval.Type]: "regression",
// TODO: where to get experiment type value from?
[Attr.Eval.Tags]: JSON.stringify(["offline"]),
[Attr.Eval.Collection.ID]: "custom",
// TODO: where to get collection split value from?
[Attr.Eval.Collection.Name]: "custom",
// TODO: where to get collection name from?
[Attr.Eval.Collection.Size]: collection.length,
// capability
[Attr.Eval.Capability.Name]: opts.capability,
[Attr.Eval.Step.Name]: opts.step ?? void 0,
// metadata
[Attr.Eval.Metadata]: JSON.stringify(opts.metadata),
// run
[Attr.Eval.Run.ID]: runId,
// user info
[Attr.Eval.User.Name]: user?.name,
[Attr.Eval.User.Email]: user?.email
}
});
// The suite span's trace id doubles as the evaluation id.
evalId = suiteSpan.spanContext().traceId;
suite.meta.evaluation.id = evalId;
suiteSpan.setAttribute(Attr.Eval.ID, evalId);
// Context that makes the suite span the parent of every case span.
suiteContext = trace.setSpan(context.active(), suiteSpan);
const flagConfig = captureFlagConfig(opts.configFlags);
suite.meta.evaluation.flagConfig = flagConfig;
const flagConfigJson = JSON.stringify(flagConfig);
suiteSpan.setAttribute(Attr.Eval.Config.Flags, flagConfigJson);
let createEvalResponse;
// No API calls are made in debug or list mode.
if (!isDebug && !isList) {
createEvalResponse = await evaluationApiClient.createEvaluation({
id: evalId,
name: evalName,
capability: opts.capability,
step: opts.step,
dataset: axiomConfig.eval.dataset,
version: evalVersion,
baselineId: baselineId ?? void 0,
runId,
totalCases: collection.length,
config: { overrides: injectedOverrides },
configTimeoutMs: timeoutMs,
metadata: opts.metadata,
status: "running"
});
}
const orgId = createEvalResponse?.data?.orgId;
// The baseline id comes from the create response, not directly from the injected value.
const resolvedBaselineId = createEvalResponse?.data?.baselineId;
if (!isDebug && !isList && !!resolvedBaselineId) {
const [baselineResult, baselineError] = await tryCatchAsync(
() => findEvaluationCases(resolvedBaselineId, axiomConfig)
);
if (baselineError) {
console.error(`Failed to load baseline: ${errorToString(baselineError)}`);
// Keep the first recorded error; a baseline failure is treated like an
// instrumentation failure (reported in registrationStatus, thrown in afterAll).
instrumentationError = instrumentationError || baselineError;
} else {
baseline = baselineResult;
}
}
if (baseline) {
suiteSpan.setAttribute(Attr.Eval.Baseline.ID, baseline.id);
suiteSpan.setAttribute(Attr.Eval.Baseline.Name, baseline.name);
suiteSpan.setAttribute(Attr.Eval.Baseline.Version, baseline.version);
}
// Final metadata. NOTE(review): this replaces the object assigned above, so the
// `flagConfig` set on the provisional object is not carried over — confirm intended.
suite.meta.evaluation = {
id: evalId,
name: evalName,
version: evalVersion,
runId,
orgId: orgId ?? void 0,
baseline: baseline ?? void 0,
configFlags: opts.configFlags,
registrationStatus: instrumentationError ? {
status: "failed",
error: errorToString(instrumentationError)
} : { status: "success" },
trials: opts.trials
};
suiteStart = performance.now();
};
const handleAfterAll = async (suite) => {
// NOTE(review): this throws before the suite span is ended, before `flush`, and
// before `updateEvaluation` — confirm that skipping them on error is intended.
if (instrumentationError) {
throw instrumentationError;
}
// Collapse the individual out-of-scope flag accesses into one summary per flag
// path (access count plus first/last access time).
const flagSummary = /* @__PURE__ */ new Map();
for (const flag of allOutOfScopeFlags) {
if (flagSummary.has(flag.flagPath)) {
const existing = flagSummary.get(flag.flagPath);
existing.count++;
existing.firstAccessedAt = Math.min(existing.firstAccessedAt, flag.accessedAt);
existing.lastAccessedAt = Math.max(existing.lastAccessedAt, flag.accessedAt);
} else {
flagSummary.set(flag.flagPath, {
flagPath: flag.flagPath,
count: 1,
firstAccessedAt: flag.accessedAt,
lastAccessedAt: flag.accessedAt,
stackTrace: flag.stackTrace
});
}
}
if (suite.meta.evaluation && suiteSpan) {
suite.meta.evaluation.outOfScopeFlags = Array.from(flagSummary.entries()).map(
([_flagPath, stats]) => stats
);
const allDefaults = getConfigScope()?.getAllDefaultFlags();
// `finalConfigSnapshot` is whichever case snapshot was written last.
const pickedFlags = finalConfigSnapshot?.pickedFlags;
const overrides = injectedOverrides ?? getGlobalFlagOverrides();
suite.meta.evaluation.configEnd = {
flags: allDefaults,
pickedFlags,
overrides
};
}
// The suite span is always closed with status OK here, independent of case results.
suiteSpan?.setStatus({ code: SpanStatusCode.OK });
suiteSpan?.end();
// A flush failure is recorded in registrationStatus instead of being thrown.
const [, flushError] = await tryCatchAsync(flush);
if (flushError) {
if (suite.meta.evaluation) {
suite.meta.evaluation.registrationStatus = {
status: "failed",
error: errorToString(flushError)
};
}
}
const durationMs = Math.round(performance.now() - suiteStart);
const successCases = suite.tasks.filter(
(task) => task.meta.case?.status === "success"
).length;
// Cases still marked "pending" are counted as errored.
const erroredCases = suite.tasks.filter(
(task) => task.meta.case?.status === "fail" || task.meta.case?.status === "pending"
).length;
if (!isDebug && !isList) {
await evaluationApiClient.updateEvaluation({
id: evalId,
status: "completed",
totalCases: collection.length,
successCases,
erroredCases,
durationMs
});
}
};
beforeAll(withCompatibleSuiteHook(handleBeforeAll));
afterAll(withCompatibleSuiteHook(handleAfterAll));
// One concurrent test per collection record; each record is tagged with its position.
await it.concurrent.for(
collection.map((d, index) => ({ ...d, index }))
)("case", async (data, { task }) => {
const start = performance.now();
if (!suiteContext) {
throw new Error(
"[Axiom AI] Suite context not initialized. This is likely a bug \u2013 instrumentation should complete before tests run."
);
}
let outOfScopeFlags = [];
await startActiveSpan(
`case ${data.index}`,
{
attributes: {
[Attr.GenAI.Operation.Name]: "eval.case",
[Attr.Eval.ID]: evalId,
[Attr.Eval.Name]: evalName,
[Attr.Eval.Version]: evalVersion,
[Attr.Eval.Case.Index]: data.index,
[Attr.Eval.Case.Input]: typeof data.input === "string" ? data.input : JSON.stringify(data.input),
[Attr.Eval.Case.Expected]: typeof data.expected === "string" ? data.expected : JSON.stringify(data.expected),
[Attr.Eval.Case.Metadata]: data.metadata ? JSON.stringify(data.metadata) : void 0,
// user info
[Attr.Eval.User.Name]: user?.name,
[Attr.Eval.User.Email]: user?.email
}
},
async (caseSpan) => {
const caseContext = trace.setSpan(context.active(), caseSpan);
// At least one trial is always run.
const trials = Math.max(1, opts.trials ?? 1);
// Set when the case fails because of trial errors, so the catch block below
// can tell that expected failure apart from unexpected exceptions.
let intentionalTrialFailureError;
let caseFinalConfigSnapshot;
caseSpan.setAttribute(Attr.Eval.Case.Trials, trials);
try {
// Score values per scorer name, one entry per trial.
const perScorerTrials = {};
// One slot per trial; stays null when the trial succeeded.
const trialErrors = Array.from({ length: trials }, () => null);
const trialFailures = [];
let lastOutput;
let successfulTaskDuration = 0;
// Trials run one after another.
for (let trialIndex = 0; trialIndex < trials; trialIndex++) {
try {
await startActiveSpan(
`trial ${trialIndex}`,
{
attributes: {
[Attr.GenAI.Operation.Name]: "eval.trial",
[Attr.Eval.Trial.Index]: trialIndex,
[Attr.Eval.ID]: evalId,
[Attr.Eval.Name]: evalName,
[Attr.Eval.Version]: evalVersion
}
},
async (trialSpan) => {
const trialContext = trace.setSpan(context.active(), trialSpan);
try {
const result2 = await runTask(
trialContext,
{
id: evalId,
version: evalVersion,
name: evalName
},
{
index: data.index,
input: data.input,
expected: data.expected,
scorers: opts.scorers,
task: opts.task,
metadata: opts.metadata,
configFlags: opts.configFlags,
capability: opts.capability,
step: opts.step
}
);
const { output: output2, duration } = result2;
lastOutput = output2;
successfulTaskDuration += duration;
outOfScopeFlags.push(...result2.outOfScopeFlags);
caseFinalConfigSnapshot = {
flags: result2.finalFlags || {},
pickedFlags: opts.configFlags,
overrides: result2.overrides
};
// All scorers of this trial run in parallel, each in its own span.
await Promise.all(
opts.scorers.map(async (scorer) => {
const scorerName = getScorerName(scorer);
return startActiveSpan(
`score ${scorerName}`,
{
attributes: {
[Attr.GenAI.Operation.Name]: "eval.score",
[Attr.Eval.Tags]: JSON.stringify(["offline"]),
[Attr.Eval.ID]: evalId,
[Attr.Eval.Name]: evalName,
[Attr.Eval.Version]: evalVersion,
[Attr.Eval.Trial.Index]: trialIndex
}
},
async (scorerSpan) => {
const scorerStart = performance.now();
try {
const [result3, scorerError] = await tryCatchAsync(
() => scorer({
input: data.input,
output: output2,
expected: data.expected,
trialIndex
})
);
// A scorer that throws or returns a falsy result is recorded as score 0 with
// an ERROR span status; it does not fail the trial.
if (scorerError || !result3) {
const scorerDuration = Math.round(
performance.now() - scorerStart
);
console.error(
`ERROR: scorer ${scorerName} failed. Cause:
`,
scorerError
);
const msg = errorToString(scorerError);
const metadata2 = {
duration: scorerDuration,
startedAt: scorerStart,
error: msg
};
(perScorerTrials[scorerName] ?? (perScorerTrials[scorerName] = [])).push(0);
scorerSpan.setAttributes({
[Attr.Eval.Score.Name]: scorerName,
[Attr.Eval.Score.Metadata]: JSON.stringify(metadata2)
});
scorerSpan.setStatus({
code: SpanStatusCode.ERROR,
message: msg
});
return;
}
const scoreDuration = Math.round(performance.now() - scorerStart);
const scoreValue = result3.score;
// Timing fields first, so metadata returned by the scorer can override them.
const metadata = Object.assign(
{ duration: scoreDuration, startedAt: scorerStart },
result3.metadata
);
(perScorerTrials[scorerName] ?? (perScorerTrials[scorerName] = [])).push(scoreValue);
const aggregation = scorer.aggregation ?? Mean();
scorerSpan.setAttributes({
[Attr.Eval.Score.Name]: scorerName,
[Attr.Eval.Score.Value]: scoreValue,
[Attr.Eval.Score.Metadata]: JSON.stringify(metadata),
[Attr.Eval.Score.Aggregation]: aggregation.type
});
// A scorer can report an error through its metadata without throwing.
if (metadata.error) {
const msg = errorToString(metadata.error);
scorerSpan.setStatus({
code: SpanStatusCode.ERROR,
message: msg
});
}
} finally {
scorerSpan.end();
}
},
trialContext
);
})
);
} catch (error) {
// The trial failed: record the error for this trial index and give every
// scorer a 0 for this trial.
const taskFailureDetails = getRunTaskFailureDetails(error);
const failure = toError(error);
const msg = errorToString(failure);
const spanErrorMessage = failure.message || msg;
trialErrors[trialIndex] = msg;
trialFailures.push(failure);
trialSpan.setAttributes({
[Attr.Eval.Trial.Error]: spanErrorMessage
});
for (const scorer of opts.scorers) {
const scorerName = getScorerName(scorer);
(perScorerTrials[scorerName] ?? (perScorerTrials[scorerName] = [])).push(0);
}
// Keep the flag information captured by runTask before the task threw.
if (taskFailureDetails) {
outOfScopeFlags.push(...taskFailureDetails.outOfScopeFlags);
caseFinalConfigSnapshot = {
flags: taskFailureDetails.finalFlags || {},
pickedFlags: opts.configFlags,
overrides: taskFailureDetails.overrides
};
}
throw failure;
}
},
caseContext
);
} catch {
// Deliberately swallowed: the failure was already recorded in
// trialErrors/trialFailures above, and the remaining trials must still run.
}
}
// Aggregate each scorer's per-trial values into one case-level score.
const scores = {};
for (const scorer of opts.scorers) {
const scorerName = getScorerName(scorer);
const trialsArr = perScorerTrials[scorerName] ?? [];
const aggregation = scorer.aggregation ?? Mean();
const aggregatedValue = trialsArr.length > 0 ? aggregation.aggregate(trialsArr) : 0;
scores[scorerName] = {
name: scorerName,
score: aggregatedValue,
trials: trialsArr,
aggregation: aggregation.type,
threshold: aggregation.threshold,
metadata: {}
};
}
// The reported output is the one from the last trial whose task succeeded.
const output = lastOutput;
const failedTrials = trialFailures.length;
const succeededTrials = trials - failedTrials;
const trialSummary = {
total: trials,
succeeded: succeededTrials,
failed: failedTrials
};
caseSpan.setAttribute(Attr.Eval.Case.Scores, JSON.stringify(scores ? scores : {}));
if (output !== void 0) {
caseSpan.setAttribute(
Attr.Eval.Case.Output,
typeof output === "string" ? output : JSON.stringify(output)
);
}
// Recorded as "success" first; downgraded to "fail" just below when any trial failed.
task.meta.case = {
index: data.index,
name: evalName,
expected: data.expected,
input: data.input,
output,
metadata: data.metadata,
scores,
status: "success",
errors: [],
trialErrors,
trialSummary,
duration: successfulTaskDuration,
startedAt: start,
outOfScopeFlags,
pickedFlags: opts.configFlags
};
if (failedTrials > 0) {
const error = new Error(
`Eval case ${data.index} failed with ${failedTrials} trial error(s).`
);
intentionalTrialFailureError = error;
caseSpan.setStatus({
code: SpanStatusCode.ERROR,
message: error.message
});
task.meta.case.status = "fail";
task.meta.case.errors = trialFailures;
throw error;
}
allOutOfScopeFlags.push(...outOfScopeFlags);
} catch (e) {
// NOTE(review): logs every case failure to stdout, including the intentional
// trial-failure error — confirm this is not a debugging leftover.
console.log(e);
const error = toError(e);
// Expected path: trial errors were already written to task.meta.case, so only
// keep the flags and rethrow.
if (e === intentionalTrialFailureError && task.meta.case) {
task.meta.case.status = "fail";
task.meta.case.errors = task.meta.case.errors?.length ? task.meta.case.errors : [error];
allOutOfScopeFlags.push(...outOfScopeFlags);
throw e;
}
// Unexpected exception: rebuild the case record as failed, using whatever the
// current eval context still holds.
const ctx = getEvalContext();
const ctxOutOfScopeFlags = ctx.outOfScopeFlags || [];
if (ctxOutOfScopeFlags.length > 0) {
outOfScopeFlags.push(...ctxOutOfScopeFlags);
}
const ctxFlags = ctx.flags || {};
if (!caseFinalConfigSnapshot && Object.keys(ctxFlags).length > 0) {
caseFinalConfigSnapshot = {
flags: ctxFlags,
pickedFlags: opts.configFlags,
overrides: ctx.overrides
};
}
const failedScores = {};
for (const scorer of opts.scorers) {
const scorerName = getScorerName(scorer);
failedScores[scorerName] = {
name: scorerName,
score: 0,
trials: [],
metadata: {
duration: 0,
startedAt: start,
error: error.message
}
};
}
task.meta.case = {
name: evalName,
index: data.index,
expected: data.expected,
input: data.input,
output: String(e),
metadata: data.metadata,
scores: failedScores,
status: "fail",
errors: [error],
startedAt: start,
duration: Math.round(performance.now() - start),
outOfScopeFlags,
pickedFlags: opts.configFlags
};
allOutOfScopeFlags.push(...outOfScopeFlags);
throw e;
} finally {
// Best effort: compare the flags seen during the case with the defaults and
// report those that were replaced or newly introduced. Errors here are ignored.
try {
const accessedFlags = caseFinalConfigSnapshot?.flags || {};
const accessed = Object.keys(accessedFlags);
const allDefaults = getConfigScope()?.getAllDefaultFlags?.() ?? {};
const runtimeFlags = {};
for (const key of accessed) {
const value = accessedFlags[key];
if (key in allDefaults) {
const replaced = !deepEqual(value, allDefaults[key]);
if (replaced) {
runtimeFlags[key] = { kind: "replaced", value, default: allDefaults[key] };
}
} else {
runtimeFlags[key] = { kind: "introduced", value };
}
}
if (!isDebug && Object.keys(runtimeFlags).length > 0) {
const serialized = JSON.stringify(runtimeFlags);
caseSpan.setAttribute("eval.case.config.runtime_flags", serialized);
}
if (task.meta.case) {
task.meta.case.runtimeFlags = runtimeFlags;
}
} catch {
}
// Cases run concurrently, so the suite-level snapshot ends up being the one
// from whichever case reaches this point last.
if (caseFinalConfigSnapshot) {
finalConfigSnapshot = caseFinalConfigSnapshot;
}
}
},
suiteContext
);
});
},
timeoutMs
);
return result;
}
/**
 * Collapses streamed chunks into a single result: an empty list yields "",
 * all-string chunks are concatenated, and otherwise the last chunk is returned.
 */
var joinArrayOfUnknownResults = (results) => {
  const count = results.length;
  if (count === 0) {
    return "";
  }
  const isText = (chunk) => typeof chunk === "string";
  return results.every(isText) ? results.join("") : results[count - 1];
};
/**
 * Runs the task with `{ input, expected }`.
 *
 * When the awaited result is an async iterable, every chunk is collected and
 * combined with `joinArrayOfUnknownResults`; any other result is returned as-is.
 */
var executeTask = async (task, input, expected) => {
  const produced = await task({ input, expected });
  const isAsyncIterable = typeof produced === "object" && produced && Symbol.asyncIterator in produced;
  if (!isAsyncIterable) {
    return produced;
  }
  const collected = [];
  for await (const piece of produced) {
    collected.push(piece);
  }
  return joinArrayOfUnknownResults(collected);
};
/**
 * Runs the eval task for one trial inside a `task` span and an eval context.
 *
 * @param caseContext - Context passed to `startActiveSpan` for the task span.
 * @param evaluation - `{ id, name, version }` of the running evaluation, copied
 *   onto the span attributes.
 * @param opts - Trial options; this function reads `task`, `input`, `expected`
 *   and `configFlags`.
 * @returns `{ output, duration, outOfScopeFlags, finalFlags, overrides }`, where
 *   `duration` is the rounded task run time in milliseconds and the flag fields
 *   are read from the eval context after the task finished.
 * @throws The task's error, normalised to an `Error` and carrying the same
 *   duration/flag details (see `attachRunTaskFailureDetails`).
 */
var runTask = async (caseContext, evaluation, opts) => {
  // Anonymous functions report an empty-string `name` rather than null or
  // undefined, so `||` is required for the fallback to ever apply.
  const taskName = opts.task.name || "anonymous";
  return startActiveSpan(
    `task`,
    {
      attributes: {
        [Attr.GenAI.Operation.Name]: "eval.task",
        [Attr.Eval.Task.Name]: taskName,
        [Attr.Eval.Task.Type]: "llm_completion",
        // TODO: How to determine task type?
        [Attr.Eval.ID]: evaluation.id,
        [Attr.Eval.Name]: evaluation.name,
        [Attr.Eval.Version]: evaluation.version
      }
    },
    async (taskSpan) => {
      const { output, duration, outOfScopeFlags, finalFlags, overrides } = await withEvalContext(
        { pickedFlags: opts.configFlags },
        async () => {
          const start = performance.now();
          try {
            const output2 = await executeTask(opts.task, opts.input, opts.expected);
            const duration2 = Math.round(performance.now() - start);
            taskSpan.setAttributes({
              [Attr.Eval.Task.Output]: typeof output2 === "string" ? output2 : JSON.stringify(output2)
            });
            // Read the context only after the task ran, so flag accesses made
            // by the task are included.
            const ctx = getEvalContext();
            return {
              output: output2,
              duration: duration2,
              outOfScopeFlags: ctx.outOfScopeFlags || [],
              finalFlags: ctx.flags || {},
              overrides: ctx.overrides
            };
          } catch (error) {
            // Preserve what the context recorded up to the failure by attaching
            // it to the rethrown error.
            const ctx = getEvalContext();
            const duration2 = Math.round(performance.now() - start);
            throw attachRunTaskFailureDetails(error, {
              duration: duration2,
              outOfScopeFlags: ctx.outOfScopeFlags || [],
              finalFlags: ctx.flags || {},
              overrides: ctx.overrides
            });
          }
        }
      );
      return {
        output,
        duration,
        outOfScopeFlags,
        finalFlags,
        overrides
      };
    },
    caseContext
  );
};
export {
AxiomReporter,
Eval
};
//# sourceMappingURL=evals.js.map