// @axiomhq/ai (evals.js)
// Axiom AI SDK provides an API to wrap your AI calls with observability instrumentation.
import {
flush,
startSpan
} from "./chunk-Y2CPJRWO.js";
import {
Attr
} from "./chunk-SI6RXF4J.js";
import {
AxiomReporter
} from "./chunk-CKSTM4QJ.js";
import "./chunk-KEXKKQVW.js";
// src/evals/eval.ts
import { context, SpanStatusCode, trace } from "@opentelemetry/api";
import { afterAll, describe, it } from "vitest";
var DEFAULT_TIMEOUT = 1e4; // suite timeout in milliseconds (10 seconds)
var generateExperimentId = () => {
return crypto.randomUUID();
};
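// Eval registers an evaluation suite with vitest. `name` becomes the experiment/suite
// name and `params` supplies the dataset, task, scorers, and pass threshold that
// registerEval reads below. Registration errors are only logged because the call runs
// at module-evaluation time, outside any test context.
//
// Hedged usage sketch (the suite name, dataset values, and the `exactMatch` scorer are
// illustrative assumptions, not part of this module; the option shapes follow what
// registerEval consumes):
//
//   Eval("ping-eval", {
//     data: async () => [{ input: "ping", expected: "pong" }],
//     task: async (input) => (input === "ping" ? "pong" : input), // may also return an async iterable of string chunks
//     scorers: [
//       async function exactMatch({ output, expected }) {
//         return { score: output === expected ? 1 : 0 };
//       },
//     ],
//     threshold: 0.5, // a score >= threshold counts as passed
//   });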
var Eval = (name, params) => {
registerEval(name, params).catch(console.error);
};
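// Builds the vitest `describe` block for an eval: one suite-level span wraps the whole
// run, each dataset case becomes a concurrent test with its own case span, and every
// scorer result is recorded on a child score span. Passing `modifier: "skip"` in
// vitestOpts skips the suite and avoids loading the dataset.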
async function registerEval(evalName, opts, vitestOpts = {}) {
const describeFn = vitestOpts.modifier === "skip" ? describe.skip : describe;
const datasetPromise = vitestOpts.modifier === "skip" ? Promise.resolve([]) : opts.data();
const result = await describeFn(
evalName,
async () => {
const dataset = await datasetPromise;
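      // Suite-level span describing the experiment; several attribute values are
      // placeholders for now (see the TODO comments below).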
const suiteSpan = startSpan(`eval ${evalName}`, {
attributes: {
[Attr.GenAI.Operation.Name]: "eval",
[Attr.Eval.Experiment.ID]: generateExperimentId(),
[Attr.Eval.Experiment.Name]: evalName,
[Attr.Eval.Experiment.Type]: "regression",
// TODO: where to get experiment type value from?
[Attr.Eval.Experiment.Tags]: [],
// TODO: where to get experiment tags from?
[Attr.Eval.Experiment.Version]: "1.0.0",
// TODO: where to get experiment version from?
// [Attr.Eval.Experiment.Group]: "default", // TODO: where to get experiment group from?
// [Attr.Eval.Experiment.BaseID]: "default", // TODO: where to get experiment base id from?
// [Attr.Eval.Experiment.BaseName]: "default", // TODO: where to get experiment base name from?
[Attr.Eval.Experiment.Trials]: 1,
// TODO: implement trials
[Attr.Eval.Dataset.Name]: "test",
// TODO: where to get dataset name from?
[Attr.Eval.Dataset.Split]: "test",
// TODO: where to get dataset split value from?
[Attr.Eval.Dataset.Size]: dataset.length
}
});
const suiteContext = trace.setSpan(context.active(), suiteSpan);
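      // End the suite span and flush pending spans once all cases have completed.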
afterAll(async () => {
suiteSpan.setStatus({ code: SpanStatusCode.OK });
suiteSpan.end();
await flush();
});
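      // Each dataset case runs as a concurrent vitest test; `data.index` preserves the
      // original ordering and the case span is parented to the suite span.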
await it.concurrent.for(dataset.map((d, index) => ({ ...d, index })))(
evalName,
async (data, { task }) => {
const start = performance.now();
const caseSpan = startSpan(
`case ${evalName}_${data.index}`,
{
attributes: {
[Attr.GenAI.Operation.Name]: "eval.case",
[Attr.Eval.Case.ID]: `${evalName}_${data.index}`,
[Attr.Eval.Case.Index]: data.index,
[Attr.Eval.Case.Input]: data.input,
[Attr.Eval.Case.Expected]: data.expected
}
},
suiteContext
);
const caseContext = trace.setSpan(context.active(), caseSpan);
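          // Execute the task under the case context, then apply each configured scorer
          // to its output; the combined result is stored on `task.meta.eval`.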
try {
const { output, duration } = await runTask(caseContext, {
index: data.index,
expected: data.expected,
input: data.input,
scorers: opts.scorers,
task: opts.task,
threshold: opts.threshold
});
const scores = await Promise.all(
opts.scorers.map(async (scorer) => {
const scorerSpan = startSpan(
`score ${scorer.name}`,
{
attributes: {
[Attr.GenAI.Operation.Name]: "eval.score"
}
},
caseContext
);
if (typeof scorer === "function") {
const start2 = performance.now();
const result2 = await scorer({
input: data.input,
output,
expected: data.expected
});
const duration2 = Math.round(performance.now() - start2);
const scoreValue = result2.score;
const passed = scoreValue >= opts.threshold;
scorerSpan.setAttributes({
[Attr.Eval.Score.Name]: scorer.name,
[Attr.Eval.Score.Value]: scoreValue,
[Attr.Eval.Score.Threshold]: opts.threshold,
[Attr.Eval.Score.Passed]: passed
});
if (!passed) {
                    scorerSpan.recordException(new Error(`Score ${scoreValue} is below threshold ${opts.threshold}`));
scorerSpan.setStatus({ code: SpanStatusCode.ERROR });
} else {
scorerSpan.setStatus({ code: SpanStatusCode.OK });
}
scorerSpan.end();
return { ...result2, duration: duration2, startedAt: start2 };
                } else {
                  // Non-function scorers are not handled here; end the span so it is not leaked.
                  scorerSpan.end();
                  return void 0;
                }
})
);
caseSpan.setAttributes({
[Attr.Eval.Case.Output]: output
              // TODO: what if output is other than a string?
});
caseSpan.setStatus({ code: SpanStatusCode.OK });
task.meta.eval = {
order: data.index,
name: evalName,
expected: data.expected,
input: data.input,
output,
scores,
status: "success",
errors: [],
duration,
startedAt: start,
threshold: opts.threshold
};
} catch (e) {
caseSpan.recordException(e);
caseSpan.setStatus({ code: SpanStatusCode.ERROR });
task.meta.eval = {
name: evalName,
order: data.index,
expected: data.expected,
input: data.input,
output: e,
scores: [],
status: "fail",
errors: [e],
startedAt: start,
duration: Math.round(performance.now() - start),
threshold: opts.threshold
};
throw e;
} finally {
caseSpan.end();
}
}
);
},
DEFAULT_TIMEOUT
);
return result;
}
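// Concatenates streamed chunks into a single string; only primitive chunks (string,
// number, boolean) are supported.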
var joinArrayOfUnknownResults = (results) => {
return results.reduce((acc, result) => {
if (typeof result === "string" || typeof result === "number" || typeof result === "boolean") {
return `${acc}${result}`;
}
throw new Error(
`Cannot display results of stream: stream contains non-string, non-number, non-boolean chunks.`
);
}, "");
};
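// Runs the user-provided task. If the task returns an async iterable (for example a
// token stream), the chunks are collected and joined into one string; otherwise the
// resolved value is returned as-is.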
var executeTask = async (task, input, expected) => {
const taskResultOrStream = await task(input, expected);
if (typeof taskResultOrStream === "object" && taskResultOrStream && Symbol.asyncIterator in taskResultOrStream) {
const chunks = [];
for await (const chunk of taskResultOrStream) {
chunks.push(chunk);
}
return joinArrayOfUnknownResults(chunks);
}
return taskResultOrStream;
};
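// Wraps a single task execution in a child span of the case span, records the task
// output on that span, and returns the output together with the elapsed time.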
var runTask = async (caseContext, opts) => {
  const taskName = opts.task.name || "anonymous"; // Function.name is "" (not nullish) for anonymous functions
const taskSpan = startSpan(
`task`,
{
attributes: {
[Attr.GenAI.Operation.Name]: "eval.task",
[Attr.Eval.Task.Name]: taskName,
[Attr.Eval.Task.Type]: "llm_completion",
// TODO: How to determine task type?
[Attr.Eval.Task.Trial]: 1
}
},
caseContext
);
const { output, duration } = await context.with(
trace.setSpan(context.active(), taskSpan),
async () => {
const start = performance.now();
const output2 = await executeTask(opts.task, opts.input, opts.expected);
const duration2 = Math.round(performance.now() - start);
taskSpan.setAttributes({
[Attr.Eval.Task.Output]: output2
        // TODO: what if output is other than a string?
});
taskSpan.setStatus({ code: SpanStatusCode.OK });
taskSpan.end();
return { output: output2, duration: duration2 };
}
);
return {
output,
duration
};
};
export {
AxiomReporter as experimental_AxiomReporter,
Eval as experimental_Eval
};
//# sourceMappingURL=evals.js.map