axiom
Version:
The Axiom AI SDK provides an API to wrap your AI calls with observability instrumentation, as well as offline evals.
966 lines (955 loc) • 33.6 kB
JavaScript
import {
recordName
} from "./chunk-CLH5OLB6.js";
import {
AxiomReporter,
ensureInstrumentationInitialized,
flush,
resolveAxiomConnection,
startActiveSpan,
startSpan
} from "./chunk-EHMYAMMT.js";
import "./chunk-3EAYJ4JJ.js";
import "./chunk-KX7Z2MF4.js";
import {
Attr,
dotNotationToNested,
getConfigScope,
getEvalContext,
getGlobalFlagOverrides,
setGlobalFlagOverrides,
withEvalContext
} from "./chunk-PZCJLQFO.js";
import {
AxiomCLIError,
errorToString
} from "./chunk-S65FSMB3.js";
import "./chunk-X2LH7XLM.js";
import {
__publicField,
init_esm_shims
} from "./chunk-4VNFFUM5.js";
// src/evals.ts
init_esm_shims();
// src/evals/eval.ts
init_esm_shims();
import { afterAll, beforeAll, describe, inject, it } from "vitest";
import { context, SpanStatusCode, trace } from "@opentelemetry/api";
import { customAlphabet } from "nanoid";
// src/evals/git-info.ts
init_esm_shims();
import { execSync } from "child_process";
/**
 * Reads the committer identity from the local git configuration.
 *
 * @returns {{ name: string, email: string } | null} The trimmed `user.name` and
 *   `user.email` values, or `null` when either lookup fails (git missing, key unset, ...).
 */
function getGitUserInfo() {
  // stderr is discarded: without an explicit `stdio`, execSync forwards the child's
  // stderr to this process, so a missing git binary would print shell noise even
  // though the failure is handled below.
  const readConfig = (key) => execSync(`git config --get ${key}`, { stdio: ["ignore", "pipe", "ignore"] }).toString().trim();
  try {
    const name = readConfig("user.name");
    const email = readConfig("user.email");
    return { name, email };
  } catch {
    // Best effort: user info is optional metadata, so any failure maps to null.
    return null;
  }
}
// src/evals/eval.service.ts
init_esm_shims();
// src/utils/fetcher.ts
init_esm_shims();
/**
 * Builds a `fetch` wrapper bound to one API base URL and set of credentials.
 *
 * @param {{ baseUrl: string, token: string, orgId?: string }} connection
 * @returns {(path: string, options?: object) => Promise<Response>} A function that
 *   resolves `path` against `baseUrl` and calls the global `fetch` with JSON and auth
 *   headers. Caller-supplied headers are kept, but `content-type`, `authorization`
 *   and (when `orgId` is set) `X-AXIOM-ORG-ID` always take precedence.
 */
var createFetcher = ({
  baseUrl,
  token,
  orgId
}) => {
  return (path, options) => fetch(new URL(path, baseUrl).toString(), {
    ...options,
    headers: {
      // `options` is optional; optional chaining avoids a TypeError when it is omitted.
      ...options?.headers,
      "content-type": "application/json",
      authorization: `Bearer ${token}`,
      ...orgId ? { "X-AXIOM-ORG-ID": orgId } : {}
    }
  });
};
// src/util/traces.ts
init_esm_shims();
/**
 * Looks up an attribute by its dotted key.
 *
 * The literal key is first looked for inside `obj.custom`; when it is not there,
 * the key is split on "." and followed segment by segment through `obj` itself.
 *
 * @param {unknown} obj - Attribute bag to read from.
 * @param {string} accessKey - Dotted attribute key, e.g. "a.b.c".
 * @returns {unknown} The value found, or `undefined` when `obj` is not an object or
 *   the path runs into a non-object before it is exhausted.
 */
function getCustomOrRegularAttribute(obj, accessKey) {
  const isObjectLike = (candidate) => typeof candidate === "object" && candidate !== null;
  if (!isObjectLike(obj)) {
    return void 0;
  }
  const segments = accessKey.split(".");
  const customBag = obj.custom;
  if (isObjectLike(customBag) && accessKey in customBag) {
    return customBag[accessKey];
  }
  let node = obj;
  for (let i = 0; i < segments.length; i += 1) {
    if (!isObjectLike(node)) {
      return void 0;
    }
    node = node[segments[i]];
  }
  return node;
}
/**
 * Reads an attribute via `getCustomOrRegularAttribute` and keeps it only if it is a string.
 *
 * @param {unknown} obj - Attribute bag to read from.
 * @param {string} key - Dotted attribute key.
 * @returns {string | undefined} The string value, or `undefined` for any other type.
 */
function getCustomOrRegularString(obj, key) {
  const candidate = getCustomOrRegularAttribute(obj, key);
  if (typeof candidate !== "string") {
    return void 0;
  }
  return candidate;
}
/**
 * Reads an attribute via `getCustomOrRegularAttribute` and coerces it to a number.
 *
 * @param {unknown} obj - Attribute bag to read from.
 * @param {string} key - Dotted attribute key.
 * @returns {number | undefined} Number values are returned as-is; strings are parsed
 *   with `Number()`. Blank strings, unparseable strings and all other types yield
 *   `undefined`.
 */
function getCustomOrRegularNumber(obj, key) {
  const value = getCustomOrRegularAttribute(obj, key);
  if (typeof value === "number") {
    return value;
  }
  if (typeof value !== "string") {
    return void 0;
  }
  // Number("") and Number("   ") are 0, which would report a missing value as zero.
  if (value.trim() === "") {
    return void 0;
  }
  const parsed = Number(value);
  return Number.isNaN(parsed) ? void 0 : parsed;
}
// src/evals/eval.service.ts
/**
 * Thin client for the `/api/v3/evaluations` endpoints.
 *
 * The connection (endpoint URL, token, org id) is resolved once in the constructor via
 * `resolveAxiomConnection`, and every request goes through a fetcher from `createFetcher`.
 */
var EvaluationApiClient = class {
  /**
   * @param {object} config - Axiom configuration passed to `resolveAxiomConnection`.
   * @param {string} [consoleUrl] - Optional console URL, forwarded to `resolveAxiomConnection`.
   */
  constructor(config, consoleUrl) {
    __publicField(this, "fetcher");
    const { consoleEndpointUrl, token, orgId } = resolveAxiomConnection(config, consoleUrl);
    // A missing token is sent as an empty bearer value rather than the string "undefined".
    this.fetcher = createFetcher({ baseUrl: consoleEndpointUrl, token: token ?? "", orgId });
  }
  /**
   * Creates an evaluation record.
   *
   * @param {object} evaluation - Payload, sent as the JSON request body.
   * @returns {Promise<any>} The parsed JSON response body.
   * @throws {AxiomCLIError} When the response status is not OK.
   */
  async createEvaluation(evaluation) {
    const resp = await this.fetcher(`/api/v3/evaluations`, {
      method: "POST",
      body: JSON.stringify(evaluation)
    });
    if (!resp.ok) {
      // statusText is empty over HTTP/2, so the numeric status is included as well.
      const reason = [resp.status, resp.statusText].filter(Boolean).join(" ");
      throw new AxiomCLIError(`Failed to create evaluation: ${reason}`);
    }
    return resp.json();
  }
  /**
   * Patches an existing evaluation, identified by `evaluation.id`.
   *
   * @param {object} evaluation - Fields to update; must contain `id`.
   * @returns {Promise<any>} The parsed JSON response body.
   * @throws {AxiomCLIError} When the response status is not OK.
   */
  async updateEvaluation(evaluation) {
    const resp = await this.fetcher(`/api/v3/evaluations/${evaluation.id}`, {
      method: "PATCH",
      body: JSON.stringify(evaluation)
    });
    if (!resp.ok) {
      // statusText is empty over HTTP/2, so the numeric status is included as well.
      const reason = [resp.status, resp.statusText].filter(Boolean).join(" ");
      throw new AxiomCLIError(`Failed to update evaluation: ${reason}`);
    }
    return resp.json();
  }
};
/**
 * Queries the configured dataset for all spans of one evaluation trace and assembles
 * them into an evaluation tree.
 *
 * @param {string} evalId - Trace id of the evaluation to load.
 * @param {object} config - Axiom configuration passed to `resolveAxiomConnection`.
 * @returns {Promise<object | null>} The result of `buildSpanTree`, or `null` when the
 *   query returned no matches.
 * @throws {Error} When the query response status is not OK.
 */
var findEvaluationCases = async (evalId, config) => {
  const { dataset, url, token, orgId } = resolveAxiomConnection(config);
  const apl = `['${dataset}'] | where trace_id == "${evalId}" | order by _time`;
  const headers = new Headers({
    Authorization: `Bearer ${token}`,
    "Content-Type": "application/json",
    ...orgId ? { "X-AXIOM-ORG-ID": orgId } : {}
  });
  const resp = await fetch(`${url}/v1/datasets/_apl?format=legacy`, {
    headers,
    method: "POST",
    body: JSON.stringify({ apl })
  });
  if (!resp.ok) {
    // Check the status before trusting the body: an error response may not be JSON
    // (e.g. a proxy error page), and that must not mask the real failure.
    let detail;
    try {
      detail = (await resp.json())?.message;
    } catch {
      detail = void 0;
    }
    throw new Error(`Failed to query evaluation cases: ${detail || resp.statusText}`);
  }
  const payload = await resp.json();
  // Tolerate a successful response that carries no `matches` array.
  const matches = Array.isArray(payload?.matches) ? payload.matches : [];
  return matches.length ? buildSpanTree(matches) : null;
};
/**
 * Converts the root eval span of a trace into an evaluation summary object.
 *
 * Tags and flag config may be stored either as JSON strings or as already-structured
 * values; strings are parsed with `JSON.parse`, everything else is passed through.
 *
 * @param {object} span - A span row with `_time` and `data` (attributes, duration,
 *   status, trace_id).
 * @returns {object} The evaluation summary, with an empty `cases` array for the caller
 *   to fill.
 */
var mapSpanToEval = (span) => {
  const attrs = span.data.attributes;
  const readString = (key) => getCustomOrRegularString(attrs, key);
  const parseWhenString = (raw) => typeof raw === "string" ? JSON.parse(raw) : raw;
  const rawFlagConfig = getCustomOrRegularAttribute(attrs, Attr.Eval.Config.Flags);
  const rawTags = getCustomOrRegularAttribute(attrs, Attr.Eval.Tags);
  return {
    id: readString(Attr.Eval.ID),
    name: readString(Attr.Eval.Name),
    type: readString(Attr.Eval.Type),
    version: readString(Attr.Eval.Version),
    collection: {
      name: readString(Attr.Eval.Collection.Name),
      size: getCustomOrRegularNumber(attrs, Attr.Eval.Collection.Size)
    },
    baseline: {
      id: readString(Attr.Eval.Baseline.ID),
      name: readString(Attr.Eval.Baseline.Name)
    },
    duration: span.data.duration,
    status: span.data.status.code,
    traceId: span.data.trace_id,
    runAt: span._time,
    // A missing (falsy) tags attribute becomes an empty list.
    tags: rawTags ? parseWhenString(rawTags) : [],
    user: {
      name: readString(Attr.Eval.User.Name),
      email: readString(Attr.Eval.User.Email)
    },
    cases: [],
    // A missing (falsy) flag config stays undefined.
    flagConfig: rawFlagConfig ? parseWhenString(rawFlagConfig) : void 0
  };
};
/**
 * Converts a case span row into the case summary used by the evaluation tree.
 *
 * @param {object} item - A span row with `_time` and `data` (attributes, duration,
 *   status, span_id, trace_id).
 * @returns {object} The case summary. `scores` is parsed from JSON when stored as a
 *   string and defaults to `{}` when absent.
 */
var mapSpanToCase = (item) => {
  const data = item.data;
  // Durations that are a plain number of seconds ("1.23456s") are rounded to two
  // decimals. Anything else ending in "s" ("250ms", "1m30s") is passed through
  // unchanged instead of being turned into "NaNs".
  const formatDuration = (raw) => {
    if (raw === void 0 || raw === null) {
      return "-";
    }
    if (typeof raw !== "string") {
      return String(raw);
    }
    if (!raw.endsWith("s")) {
      return raw;
    }
    const numericPart = raw.slice(0, -1);
    const seconds = Number(numericPart);
    if (numericPart.trim() === "" || !Number.isFinite(seconds)) {
      return raw;
    }
    return `${seconds.toFixed(2)}s`;
  };
  const scores = getCustomOrRegularAttribute(data.attributes, Attr.Eval.Case.Scores);
  const caseData = {
    index: getCustomOrRegularNumber(data.attributes, Attr.Eval.Case.Index),
    input: getCustomOrRegularString(data.attributes, Attr.Eval.Case.Input),
    output: getCustomOrRegularString(data.attributes, Attr.Eval.Case.Output),
    expected: getCustomOrRegularString(data.attributes, Attr.Eval.Case.Expected),
    duration: formatDuration(data.duration),
    status: data.status.code,
    // undefined would be more honest, but an empty object lets callers do
    // `baseline.scores[name]` without crashing
    scores: scores ? typeof scores === "string" ? JSON.parse(scores) : scores : {},
    runAt: item._time,
    spanId: data.span_id,
    traceId: data.trace_id
  };
  return caseData;
};
/**
 * Assembles the flat list of spans of one evaluation trace into an evaluation tree:
 * the eval span becomes the root, each span whose name starts with "case" becomes a
 * case, and each case gets its first child "task" span (with that task's first child
 * "chat" span) plus its child "eval.score" spans attached.
 *
 * @param {object[]} spans - Span rows of a single trace.
 * @returns {object | null} The evaluation with `cases` sorted by index, or `null` when
 *   `spans` is empty or contains no span whose operation name is "eval".
 */
var buildSpanTree = (spans) => {
  if (!spans.length) {
    return null;
  }
  // Not every span in the trace is guaranteed to carry `gen_ai` attributes, so the
  // operation name is read defensively instead of throwing.
  const operationOf = (span) => span.data.attributes?.gen_ai?.operation?.name;
  const hasNamePrefix = (span, prefix) => typeof span.data.name === "string" && span.data.name.startsWith(prefix);
  const isChildOf = (span, parent) => span.data.parent_span_id === parent.data.span_id;
  // Reads the chat-related attributes of a chat span, defaulting missing values.
  const toChatData = (chatSpan) => {
    const attrs = chatSpan.data.attributes;
    return {
      operation: getCustomOrRegularString(attrs, "operation") ?? "",
      capability: getCustomOrRegularString(attrs, "capability") ?? "",
      step: getCustomOrRegularString(attrs, "step") ?? "",
      request: {
        max_token: getCustomOrRegularString(attrs, "request.max_token") ?? "",
        model: getCustomOrRegularString(attrs, "request.model") ?? "",
        temperature: getCustomOrRegularNumber(attrs, "request.temperature") ?? 0
      },
      response: {
        finish_reasons: getCustomOrRegularString(attrs, "response.finish_reasons") ?? ""
      },
      usage: {
        input_tokens: getCustomOrRegularNumber(attrs, "usage.input_tokens") ?? 0,
        output_tokens: getCustomOrRegularNumber(attrs, "usage.output_tokens") ?? 0
      }
    };
  };
  const evalSpan = spans.find((span) => operationOf(span) === "eval");
  if (!evalSpan) {
    return null;
  }
  const rootSpan = mapSpanToEval(evalSpan);
  const caseSpans = spans.filter((span) => hasNamePrefix(span, "case"));
  for (const caseSpan of caseSpans) {
    const caseData = mapSpanToCase(caseSpan);
    // Only the first task span of a case is used.
    const taskSpan = spans.find((span) => hasNamePrefix(span, "task") && isChildOf(span, caseSpan));
    if (taskSpan) {
      // Only the first chat span of the task is used.
      const chatSpan = spans.find((span) => hasNamePrefix(span, "chat") && isChildOf(span, taskSpan));
      caseData.task = {
        name: taskSpan.data.name,
        output: getCustomOrRegularString(taskSpan.data.attributes, "output") || "",
        trial: getCustomOrRegularNumber(taskSpan.data.attributes, "trial") || 0,
        type: getCustomOrRegularString(taskSpan.data.attributes, "type") || "",
        error: getCustomOrRegularString(taskSpan.data.attributes, "error") || "",
        chat: chatSpan ? toChatData(chatSpan) : {
          operation: "",
          capability: "",
          step: "",
          request: { max_token: "", model: "", temperature: 0 },
          response: { finish_reasons: "" },
          usage: { input_tokens: 0, output_tokens: 0 }
        }
      };
    }
    const scoreSpans = spans.filter(
      (span) => operationOf(span) === "eval.score" && isChildOf(span, caseSpan)
    );
    if (scoreSpans.length > 0) {
      // Score spans, when present, replace whatever scores the case span itself carried.
      caseData.scores = {};
      for (const score of scoreSpans) {
        const name = getCustomOrRegularString(score.data.attributes, Attr.Eval.Score.Name) ?? "";
        const value = getCustomOrRegularNumber(score.data.attributes, Attr.Eval.Score.Value) ?? 0;
        const metadataRaw = getCustomOrRegularString(
          score.data.attributes,
          Attr.Eval.Score.Metadata
        );
        let metadata = {};
        try {
          metadata = metadataRaw ? JSON.parse(metadataRaw) : {};
        } catch {
          // Malformed metadata is deliberately ignored; the score itself is still reported.
        }
        caseData.scores[name] = {
          name,
          value,
          metadata: {
            error: score.data.attributes.error,
            ...metadata
          }
        };
      }
    }
    rootSpan.cases.push(caseData);
  }
  // A case without a numeric index sorts as 0 rather than producing a NaN comparison.
  rootSpan.cases.sort((a, b) => (a.index ?? 0) - (b.index ?? 0));
  return rootSpan;
};
// src/util/deep-equal.ts
init_esm_shims();
/**
 * Structural equality check.
 *
 * Values are equal when they are identical (`===` or `Object.is`, so NaN equals NaN
 * and +0 equals -0), or when they are objects with the same prototype whose contents
 * match: arrays, maps and sets are delegated to their dedicated helpers, dates are
 * compared by timestamp, regular expressions by their string form, and everything
 * else by enumerable own keys, recursively.
 *
 * @param {unknown} data
 * @param {unknown} other
 * @returns {boolean} Whether the two values are deeply equal.
 */
function deepEqual(data, other) {
  if (data === other || Object.is(data, other)) {
    return true;
  }
  const bothObjects = typeof data === "object" && typeof other === "object";
  if (!bothObjects || data === null || other === null) {
    return false;
  }
  if (Object.getPrototypeOf(data) !== Object.getPrototypeOf(other)) {
    return false;
  }
  if (Array.isArray(data)) {
    return isDeepEqualArrays(data, other);
  }
  if (data instanceof Map) {
    return isDeepEqualMaps(data, other);
  }
  if (data instanceof Set) {
    return isDeepEqualSets(data, other);
  }
  if (data instanceof Date) {
    return data.getTime() === other.getTime();
  }
  if (data instanceof RegExp) {
    return data.toString() === other.toString();
  }
  const ownKeys = Object.keys(data);
  if (ownKeys.length !== Object.keys(other).length) {
    return false;
  }
  // Same key count on both sides, so it is enough to check that every key of `data`
  // exists on `other` with an equal value.
  return ownKeys.every((key) => key in other && deepEqual(data[key], other[key]));
}
/**
 * Compares two arrays element by element with `deepEqual`.
 *
 * @param {unknown[]} data
 * @param {unknown[]} other
 * @returns {boolean} True when both arrays have the same length and every pair of
 *   elements at the same position is deeply equal.
 */
function isDeepEqualArrays(data, other) {
  const { length } = data;
  if (length !== other.length) {
    return false;
  }
  for (let position = 0; position < length; position += 1) {
    if (!deepEqual(data[position], other[position])) {
      return false;
    }
  }
  return true;
}
/**
 * Compares two maps: same size, same keys (by `Map.prototype.has`), and values that
 * are deeply equal per key.
 *
 * @param {Map<unknown, unknown>} data
 * @param {Map<unknown, unknown>} other
 * @returns {boolean}
 */
function isDeepEqualMaps(data, other) {
  if (data.size !== other.size) {
    return false;
  }
  for (const [entryKey, entryValue] of data) {
    const matches = other.has(entryKey) && deepEqual(entryValue, other.get(entryKey));
    if (!matches) {
      return false;
    }
  }
  return true;
}
/**
 * Compares two sets irrespective of iteration order: each member of `data` must be
 * deeply equal to a distinct member of `other`.
 *
 * @param {Set<unknown>} data
 * @param {Set<unknown>} other
 * @returns {boolean}
 */
function isDeepEqualSets(data, other) {
  if (data.size !== other.size) {
    return false;
  }
  // Matched members are removed from this pool so each one can pair up only once.
  const unmatched = Array.from(other);
  for (const member of data) {
    const matchAt = unmatched.findIndex((candidate) => deepEqual(member, candidate));
    if (matchAt === -1) {
      return false;
    }
    unmatched.splice(matchAt, 1);
  }
  return true;
}
// src/evals/eval.ts
// Version-id generator built with nanoid's `customAlphabet`: the alphabet is the digits
// 0-9 plus uppercase A-Z, and 10 is the size argument passed to nanoid.
var createVersionId = customAlphabet("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ", 10);
/**
 * Public entry point for declaring an eval.
 *
 * Records the eval, capability, optional step and scorer names through `recordName`,
 * then starts registration with `registerEval`. Registration runs in the background;
 * a rejection is reported with `console.error` instead of being thrown.
 *
 * @param {string} name - Eval name.
 * @param {object} params - Eval definition (`capability`, optional `step`, optional
 *   `scorers`, plus whatever `registerEval` reads).
 * @returns {void}
 */
function Eval(name, params) {
  recordName("eval", name);
  recordName("capability", params.capability);
  if (params.step) {
    recordName("step", params.step);
  }
  // Scorers without a name are recorded under an empty string here.
  for (const scorer of params.scorers || []) {
    recordName("scorer", getScorerName(scorer, ""));
  }
  registerEval(name, params).catch(console.error);
}
/**
 * Snapshots the flag values that fall under the given flag-path prefixes.
 *
 * Default flags from the config scope are overlaid with the global overrides, the
 * entries whose key starts with one of `configFlags` are kept, and the result is
 * converted with `dotNotationToNested`.
 *
 * @param {string[] | undefined} configFlags - Flag-path prefixes to include.
 * @returns {object} The result of `dotNotationToNested`, or `{}` when no prefixes
 *   are given.
 */
function captureFlagConfig(configFlags) {
  if (!configFlags || configFlags.length === 0) {
    return {};
  }
  const defaults = getConfigScope()?.getAllDefaultFlags?.() ?? {};
  // Overrides win over defaults for the same key.
  const effective = { ...defaults, ...getGlobalFlagOverrides() };
  const inScope = {};
  for (const flagPath of Object.keys(effective)) {
    if (configFlags.some((prefix) => flagPath.startsWith(prefix))) {
      inScope[flagPath] = effective[flagPath];
    }
  }
  return dotNotationToNested(inScope);
}
/**
 * Returns a scorer's `name`, or `fallback` when the name is empty or missing.
 *
 * @param {{ name?: string }} scorer - Scorer function or object.
 * @param {string} [fallback="unknown"] - Name to use when the scorer has none.
 * @returns {string}
 */
var getScorerName = (scorer, fallback = "unknown") => {
  const { name: declaredName } = scorer;
  if (declaredName) {
    return declaredName;
  }
  return fallback;
};
/**
 * Registers one eval as a vitest `describe` suite.
 *
 * Inside the suite:
 *  - `beforeAll` applies injected flag overrides, waits for instrumentation, opens the
 *    suite span, creates the evaluation through the API (skipped in debug/list mode)
 *    and loads the baseline, if the API resolved one.
 *  - every collection entry becomes a concurrent "case" test that runs the task and
 *    all scorers inside their own spans and stores the outcome on `task.meta.case`.
 *  - `afterAll` summarizes out-of-scope flag accesses, ends the suite span, flushes
 *    telemetry and reports the final counts through the API (skipped in debug/list mode).
 *
 * @param {string} evalName - Name of the eval; used as the suite name.
 * @param {object} opts - Eval definition: `data` (collection, promise, or factory),
 *   `task`, optional `scorers`, `capability`, optional `step`, `metadata`,
 *   `configFlags` and `timeout`.
 * @returns {Promise<unknown>} Whatever vitest's `describe` returns.
 * @throws {AxiomCLIError} When no Axiom config has been provided via `inject`.
 */
async function registerEval(evalName, opts) {
  // `data` may be the collection itself, a promise of it, or a factory returning either.
  const collectionPromise = typeof opts.data === "function" ? opts.data() : opts.data;
  // `Eval` treats `scorers` as optional, so an absent list means "no scorers" here too.
  const scorers = opts.scorers ?? [];
  const user = getGitUserInfo();
  const baselineId = inject("baseline");
  const isDebug = inject("debug");
  const isList = inject("list");
  const injectedOverrides = inject("overrides");
  const axiomConfig = inject("axiomConfig");
  const runId = inject("runId");
  const consoleUrl = inject("consoleUrl");
  if (!axiomConfig) {
    throw new AxiomCLIError("Axiom config not found");
  }
  const timeoutMs = opts.timeout ?? axiomConfig?.eval.timeoutMs;
  // Instrumentation is disabled in debug and list mode.
  const instrumentationReady = ensureInstrumentationInitialized(axiomConfig, {
    enabled: !isDebug && !isList
  });
  const result = await describe(
    evalName,
    async () => {
      const collection = await collectionPromise;
      const evaluationApiClient = new EvaluationApiClient(axiomConfig, consoleUrl);
      const evalVersion = createVersionId();
      // State shared between the hooks and the case tests below.
      let evalId = "";
      let suiteStart;
      let suiteSpan;
      let suiteContext;
      let instrumentationError = void 0;
      let baseline = void 0;
      const allOutOfScopeFlags = [];
      let finalConfigSnapshot;
      beforeAll(async (suite) => {
        if (injectedOverrides && Object.keys(injectedOverrides).length > 0) {
          try {
            setGlobalFlagOverrides(injectedOverrides);
          } catch {
            // Best effort: a failure to apply overrides must not abort the suite.
          }
        }
        // Preliminary metadata; it is replaced with the complete record at the end of this hook.
        suite.meta.evaluation = {
          id: evalId,
          name: evalName,
          version: evalVersion,
          runId,
          orgId: void 0,
          baseline: baseline ?? void 0,
          configFlags: opts.configFlags
        };
        try {
          await instrumentationReady;
        } catch (error) {
          // Remembered and rethrown from afterAll so the cases still get to run.
          instrumentationError = error;
        }
        suiteSpan = startSpan(`eval ${evalName}-${evalVersion}`, {
          attributes: {
            [Attr.GenAI.Operation.Name]: "eval",
            [Attr.Eval.Name]: evalName,
            [Attr.Eval.Version]: evalVersion,
            [Attr.Eval.Type]: "regression",
            // TODO: where to get experiment type value from?
            [Attr.Eval.Tags]: [],
            [Attr.Eval.Collection.ID]: "custom",
            // TODO: where to get collection split value from?
            [Attr.Eval.Collection.Name]: "custom",
            // TODO: where to get collection name from?
            [Attr.Eval.Collection.Size]: collection.length,
            // capability
            [Attr.Eval.Capability.Name]: opts.capability,
            [Attr.Eval.Step.Name]: opts.step ?? void 0,
            // metadata
            [Attr.Eval.Metadata]: JSON.stringify(opts.metadata),
            // run
            [Attr.Eval.Run.ID]: runId,
            // user info
            [Attr.Eval.User.Name]: user?.name,
            [Attr.Eval.User.Email]: user?.email
          }
        });
        // The trace id of the suite span doubles as the evaluation id.
        evalId = suiteSpan.spanContext().traceId;
        suite.meta.evaluation.id = evalId;
        suiteSpan.setAttribute(Attr.Eval.ID, evalId);
        suiteContext = trace.setSpan(context.active(), suiteSpan);
        const flagConfig = captureFlagConfig(opts.configFlags);
        suite.meta.evaluation.flagConfig = flagConfig;
        const flagConfigJson = JSON.stringify(flagConfig);
        suiteSpan.setAttribute(Attr.Eval.Config.Flags, flagConfigJson);
        let createEvalResponse;
        if (!isDebug && !isList) {
          createEvalResponse = await evaluationApiClient.createEvaluation({
            id: evalId,
            name: evalName,
            capability: opts.capability,
            step: opts.step,
            dataset: axiomConfig.eval.dataset,
            version: evalVersion,
            baselineId: baselineId ?? void 0,
            runId,
            totalCases: collection.length,
            config: { overrides: injectedOverrides },
            configTimeoutMs: timeoutMs,
            metadata: opts.metadata,
            status: "running"
          });
        }
        const orgId = createEvalResponse?.data?.orgId;
        const resolvedBaselineId = createEvalResponse?.data?.baselineId;
        try {
          if (!isDebug && !isList && !!resolvedBaselineId) {
            baseline = await findEvaluationCases(resolvedBaselineId, axiomConfig);
          }
        } catch (error) {
          console.error(`Failed to load baseline: ${errorToString(error)}`);
          instrumentationError = instrumentationError || error;
        }
        if (baseline) {
          suiteSpan.setAttribute(Attr.Eval.Baseline.ID, baseline.id);
          suiteSpan.setAttribute(Attr.Eval.Baseline.Name, baseline.name);
          suiteSpan.setAttribute(Attr.Eval.Baseline.Version, baseline.version);
        }
        suite.meta.evaluation = {
          id: evalId,
          name: evalName,
          version: evalVersion,
          runId,
          orgId: orgId ?? void 0,
          baseline: baseline ?? void 0,
          configFlags: opts.configFlags,
          // Carried over explicitly: replacing the object would otherwise drop the
          // flag config captured earlier in this hook.
          flagConfig,
          registrationStatus: instrumentationError ? {
            status: "failed",
            error: errorToString(instrumentationError)
          } : { status: "success" }
        };
        suiteStart = performance.now();
      });
      afterAll(async (suite) => {
        if (instrumentationError) {
          throw instrumentationError;
        }
        const tags = ["offline"];
        suiteSpan?.setAttribute(Attr.Eval.Tags, JSON.stringify(tags));
        // Collapse individual out-of-scope flag accesses into one summary per flag path.
        const flagSummary = /* @__PURE__ */ new Map();
        for (const flag of allOutOfScopeFlags) {
          if (flagSummary.has(flag.flagPath)) {
            const existing = flagSummary.get(flag.flagPath);
            existing.count++;
            existing.firstAccessedAt = Math.min(existing.firstAccessedAt, flag.accessedAt);
            existing.lastAccessedAt = Math.max(existing.lastAccessedAt, flag.accessedAt);
          } else {
            flagSummary.set(flag.flagPath, {
              flagPath: flag.flagPath,
              count: 1,
              firstAccessedAt: flag.accessedAt,
              lastAccessedAt: flag.accessedAt,
              stackTrace: flag.stackTrace
            });
          }
        }
        if (suite.meta.evaluation && suiteSpan) {
          suite.meta.evaluation.outOfScopeFlags = Array.from(flagSummary.values());
          const allDefaults = getConfigScope()?.getAllDefaultFlags();
          const pickedFlags = finalConfigSnapshot?.pickedFlags;
          const overrides = injectedOverrides ?? getGlobalFlagOverrides();
          suite.meta.evaluation.configEnd = {
            flags: allDefaults,
            pickedFlags,
            overrides
          };
        }
        suiteSpan?.setStatus({ code: SpanStatusCode.OK });
        suiteSpan?.end();
        try {
          await flush();
        } catch (flushError) {
          if (suite.meta.evaluation) {
            suite.meta.evaluation.registrationStatus = {
              status: "failed",
              error: errorToString(flushError)
            };
          }
        }
        // `suiteStart` is only set at the very end of beforeAll; avoid reporting NaN.
        const durationMs = suiteStart === void 0 ? 0 : Math.round(performance.now() - suiteStart);
        // A task may carry no case metadata (e.g. it failed before recording any), so
        // the status is read defensively.
        const successCases = suite.tasks.filter(
          (task) => task.meta.case?.status === "success"
        ).length;
        const erroredCases = suite.tasks.filter((task) => {
          const status = task.meta.case?.status;
          return status === "fail" || status === "pending";
        }).length;
        if (!isDebug && !isList) {
          await evaluationApiClient.updateEvaluation({
            id: evalId,
            status: "completed",
            totalCases: collection.length,
            successCases,
            erroredCases,
            durationMs
          });
        }
      });
      await it.concurrent.for(
        collection.map((d, index) => ({ ...d, index }))
      )("case", async (data, { task }) => {
        const start = performance.now();
        if (!suiteContext) {
          throw new Error(
            "[Axiom AI] Suite context not initialized. This is likely a bug \u2013 instrumentation should complete before tests run."
          );
        }
        let outOfScopeFlags = [];
        await startActiveSpan(
          `case ${data.index}`,
          {
            attributes: {
              [Attr.GenAI.Operation.Name]: "eval.case",
              [Attr.Eval.ID]: evalId,
              [Attr.Eval.Name]: evalName,
              [Attr.Eval.Version]: evalVersion,
              [Attr.Eval.Case.Index]: data.index,
              [Attr.Eval.Case.Input]: typeof data.input === "string" ? data.input : JSON.stringify(data.input),
              [Attr.Eval.Case.Expected]: typeof data.expected === "string" ? data.expected : JSON.stringify(data.expected),
              [Attr.Eval.Case.Metadata]: data.metadata ? JSON.stringify(data.metadata) : void 0,
              // user info
              [Attr.Eval.User.Name]: user?.name,
              [Attr.Eval.User.Email]: user?.email
            }
          },
          async (caseSpan) => {
            const caseContext = trace.setSpan(context.active(), caseSpan);
            try {
              const taskResult = await runTask(
                caseContext,
                {
                  id: evalId,
                  version: evalVersion,
                  name: evalName
                },
                {
                  index: data.index,
                  input: data.input,
                  expected: data.expected,
                  scorers: opts.scorers,
                  task: opts.task,
                  metadata: opts.metadata,
                  configFlags: opts.configFlags,
                  capability: opts.capability,
                  step: opts.step
                }
              );
              const { output, duration } = taskResult;
              outOfScopeFlags = taskResult.outOfScopeFlags;
              // Shared across cases: the last case to finish its task determines the snapshot.
              finalConfigSnapshot = {
                flags: taskResult.finalFlags || {},
                pickedFlags: opts.configFlags,
                overrides: taskResult.overrides
              };
              const scoreList = await Promise.all(
                scorers.map(async (scorer) => {
                  const scorerName = getScorerName(scorer);
                  return startActiveSpan(
                    `score ${scorerName}`,
                    {
                      attributes: {
                        [Attr.GenAI.Operation.Name]: "eval.score",
                        [Attr.Eval.ID]: evalId,
                        [Attr.Eval.Name]: evalName,
                        [Attr.Eval.Version]: evalVersion
                      }
                    },
                    async (scorerSpan) => {
                      const scorerStart = performance.now();
                      try {
                        const scorerResult = await scorer({
                          input: data.input,
                          output,
                          expected: data.expected
                        });
                        const scorerDuration = Math.round(performance.now() - scorerStart);
                        const scoreValue = scorerResult.score;
                        // Scorer-provided metadata wins over the timing fields on key clashes.
                        const metadata = Object.assign(
                          { duration: scorerDuration, startedAt: scorerStart },
                          scorerResult.metadata
                        );
                        scorerSpan.setAttributes({
                          [Attr.Eval.Score.Name]: scorerName,
                          [Attr.Eval.Score.Value]: scoreValue,
                          [Attr.Eval.Score.Metadata]: JSON.stringify(metadata)
                        });
                        if (metadata.error) {
                          const msg = errorToString(metadata.error);
                          scorerSpan.setStatus({
                            code: SpanStatusCode.ERROR,
                            message: msg
                          });
                        }
                        return {
                          name: scorerName,
                          score: scoreValue,
                          metadata
                        };
                      } catch (error) {
                        // A throwing scorer does not fail the case; it yields a null score
                        // with the error recorded in its metadata.
                        const scorerDuration = Math.round(performance.now() - scorerStart);
                        console.error(`ERROR: scorer ${scorerName} failed. Cause:
`, error);
                        const msg = errorToString(error);
                        const metadata = {
                          duration: scorerDuration,
                          startedAt: scorerStart,
                          error: msg
                        };
                        scorerSpan.setAttributes({
                          [Attr.Eval.Score.Name]: scorerName,
                          [Attr.Eval.Score.Value]: void 0,
                          [Attr.Eval.Score.Metadata]: JSON.stringify(metadata)
                        });
                        scorerSpan.setStatus({
                          code: SpanStatusCode.ERROR,
                          message: msg
                        });
                        return {
                          name: scorerName,
                          score: null,
                          metadata
                        };
                      } finally {
                        scorerSpan.end();
                      }
                    },
                    caseContext
                  );
                })
              );
              const scores = Object.fromEntries(scoreList.map((s) => [s.name, s]));
              caseSpan.setAttributes({
                [Attr.Eval.Case.Output]: typeof output === "string" ? output : JSON.stringify(output),
                [Attr.Eval.Case.Scores]: JSON.stringify(scores)
              });
              task.meta.case = {
                index: data.index,
                name: evalName,
                expected: data.expected,
                input: data.input,
                output,
                metadata: data.metadata,
                scores,
                status: "success",
                errors: [],
                duration,
                startedAt: start,
                outOfScopeFlags,
                pickedFlags: opts.configFlags
              };
              allOutOfScopeFlags.push(...outOfScopeFlags);
            } catch (e) {
              console.error(e);
              // Anything can be thrown, not only Error instances.
              const errorMessage = typeof e?.message === "string" ? e.message : String(e);
              const ctx = getEvalContext();
              outOfScopeFlags = ctx.outOfScopeFlags || [];
              // Every scorer gets a zero score so the case still reports a full score set.
              // Keys use getScorerName, matching the success path above.
              const failedScores = {};
              for (const scorer of scorers) {
                const scorerName = getScorerName(scorer);
                failedScores[scorerName] = {
                  name: scorerName,
                  score: 0,
                  metadata: {
                    duration: 0,
                    startedAt: start,
                    error: errorMessage
                  }
                };
              }
              task.meta.case = {
                name: evalName,
                index: data.index,
                expected: data.expected,
                input: data.input,
                output: String(e),
                metadata: data.metadata,
                scores: failedScores,
                status: "fail",
                errors: [e],
                startedAt: start,
                duration: Math.round(performance.now() - start),
                outOfScopeFlags,
                pickedFlags: opts.configFlags
              };
              allOutOfScopeFlags.push(...outOfScopeFlags);
              // Rethrow so vitest marks the test as failed.
              throw e;
            } finally {
              try {
                // Compare the flags recorded in the snapshot against the defaults: a flag is
                // "replaced" when its value differs from the default, and "introduced" when
                // there is no default for it at all.
                const accessedFlags = finalConfigSnapshot?.flags || {};
                const accessed = Object.keys(accessedFlags);
                const allDefaults = getConfigScope()?.getAllDefaultFlags?.() ?? {};
                const runtimeFlags = {};
                for (const key of accessed) {
                  const value = accessedFlags[key];
                  if (key in allDefaults) {
                    const replaced = !deepEqual(value, allDefaults[key]);
                    if (replaced) {
                      runtimeFlags[key] = { kind: "replaced", value, default: allDefaults[key] };
                    }
                  } else {
                    runtimeFlags[key] = { kind: "introduced", value };
                  }
                }
                if (!isDebug && Object.keys(runtimeFlags).length > 0) {
                  const serialized = JSON.stringify(runtimeFlags);
                  caseSpan.setAttribute("eval.case.config.runtime_flags", serialized);
                }
                if (task.meta.case) {
                  task.meta.case.runtimeFlags = runtimeFlags;
                }
              } catch {
                // Best effort: runtime-flag reporting must never change the outcome of a case.
              }
            }
          },
          suiteContext
        );
      });
    },
    timeoutMs
  );
  return result;
}
/**
 * Reduces the chunks collected from a streamed task result to a single value.
 *
 * @param {unknown[]} results - Collected chunks, in arrival order.
 * @returns {unknown} `""` for no chunks; the concatenation when every chunk is a
 *   string; otherwise the last chunk.
 */
var joinArrayOfUnknownResults = (results) => {
  const chunkCount = results.length;
  if (chunkCount === 0) {
    return "";
  }
  const hasNonString = results.some((chunk) => typeof chunk !== "string");
  return hasNonString ? results[chunkCount - 1] : results.join("");
};
/**
 * Runs a task and resolves its final output.
 *
 * When the task returns an async iterable, it is drained and the chunks are combined
 * with `joinArrayOfUnknownResults`; any other result is returned unchanged.
 *
 * @param {Function} task - Called with `{ input, expected }`.
 * @param {unknown} input
 * @param {unknown} expected
 * @returns {Promise<unknown>} The task output.
 */
var executeTask = async (task, input, expected) => {
  const outcome = await task({ input, expected });
  const isAsyncIterable = typeof outcome === "object" && outcome !== null && Symbol.asyncIterator in outcome;
  if (!isAsyncIterable) {
    return outcome;
  }
  const collected = [];
  for await (const piece of outcome) {
    collected.push(piece);
  }
  return joinArrayOfUnknownResults(collected);
};
/**
 * Runs the eval task for one case inside a "task" span and an eval context.
 *
 * @param {unknown} caseContext - Context of the parent case span, passed to `startActiveSpan`.
 * @param {{ id: string, name: string, version: string }} evaluation - Identity of the
 *   running evaluation, recorded on the span.
 * @param {object} opts - Case options; `task`, `input`, `expected` and `configFlags`
 *   are read here.
 * @returns {Promise<{ output: unknown, duration: number, outOfScopeFlags: unknown[],
 *   finalFlags: object, overrides: unknown }>} The task output, its duration in
 *   milliseconds, and the flag information read from the eval context.
 */
var runTask = async (caseContext, evaluation, opts) => {
  // A function's `name` is "" (never null/undefined) when it is anonymous, so `||` is
  // required for the fallback to ever apply.
  const taskName = opts.task.name || "anonymous";
  return startActiveSpan(
    `task`,
    {
      attributes: {
        [Attr.GenAI.Operation.Name]: "eval.task",
        [Attr.Eval.Task.Name]: taskName,
        [Attr.Eval.Task.Type]: "llm_completion",
        // TODO: How to determine task type?
        [Attr.Eval.ID]: evaluation.id,
        [Attr.Eval.Name]: evaluation.name,
        [Attr.Eval.Version]: evaluation.version
      }
    },
    async (taskSpan) => {
      const taskResult = await withEvalContext(
        { pickedFlags: opts.configFlags },
        async () => {
          const start = performance.now();
          const output = await executeTask(opts.task, opts.input, opts.expected);
          const duration = Math.round(performance.now() - start);
          taskSpan.setAttributes({
            [Attr.Eval.Task.Output]: JSON.stringify(output)
          });
          // Read the context after the task ran so it reflects what the task accessed.
          const ctx = getEvalContext();
          return {
            output,
            duration,
            outOfScopeFlags: ctx.outOfScopeFlags || [],
            finalFlags: ctx.flags || {},
            overrides: ctx.overrides
          };
        }
      );
      return taskResult;
    },
    caseContext
  );
};
// src/evals/scorers.ts
init_esm_shims();
// src/evals/scorer.factory.ts
init_esm_shims();
/**
 * Wraps a scoring function so that it always yields a `{ score, metadata? }` object
 * and carries the given name.
 *
 * Numbers become `{ score }`; booleans become a 1/0 score flagged as boolean in the
 * metadata; any other result is passed through unchanged. Promise and thenable
 * results are normalized once they resolve.
 *
 * @param {string} name - Scorer name, exposed as the wrapper's `name` property.
 * @param {Function} fn - Scoring function; may return a value, a promise or a thenable.
 * @returns {Function} The wrapped scorer.
 */
function createScorer(name, fn) {
  const normalizeScore = (res) => {
    if (typeof res === "number") {
      return { score: res };
    }
    if (typeof res === "boolean") {
      return {
        score: res ? 1 : 0,
        metadata: {
          [Attr.Eval.Score.IsBoolean]: true
        }
      };
    }
    return res;
  };
  // `instanceof Promise` misses thenables (custom promise implementations, promises
  // from another realm). Their resolved value would then skip normalization.
  const isThenable = (value) => value !== null && (typeof value === "object" || typeof value === "function") && typeof value.then === "function";
  const scorer = (args) => {
    const res = fn(args);
    if (isThenable(res)) {
      return Promise.resolve(res).then(normalizeScore);
    }
    return normalizeScore(res);
  };
  // Arrow functions get an inferred, read-only name; redefine it with the scorer's name.
  Object.defineProperty(scorer, "name", {
    value: name,
    configurable: true,
    enumerable: true
  });
  return scorer;
}
export {
AxiomReporter,
Eval,
createScorer as Scorer
};
//# sourceMappingURL=evals.js.map