@axiomhq/ai

The Axiom AI SDK provides an API for wrapping your AI calls with observability instrumentation.

230 lines (229 loc) 8.25 kB
import { flush, startSpan } from "./chunk-Y2CPJRWO.js";
import { Attr } from "./chunk-SI6RXF4J.js";
import { AxiomReporter } from "./chunk-CKSTM4QJ.js";
import "./chunk-KEXKKQVW.js";

// src/evals/eval.ts
import { context, SpanStatusCode, trace } from "@opentelemetry/api";
import { afterAll, describe, it } from "vitest";
var DEFAULT_TIMEOUT = 1e4;
var generateExperimentId = () => {
  return crypto.randomUUID();
};
var Eval = (name, params) => {
  registerEval(name, params).catch(console.error);
};
async function registerEval(evalName, opts, vitestOpts = {}) {
  const describeFn = vitestOpts.modifier === "skip" ? describe.skip : describe;
  const datasetPromise = vitestOpts.modifier === "skip" ? Promise.resolve([]) : opts.data();
  const result = await describeFn(
    evalName,
    async () => {
      const dataset = await datasetPromise;
      const suiteSpan = startSpan(`eval ${evalName}`, {
        attributes: {
          [Attr.GenAI.Operation.Name]: "eval",
          [Attr.Eval.Experiment.ID]: generateExperimentId(),
          [Attr.Eval.Experiment.Name]: evalName,
          [Attr.Eval.Experiment.Type]: "regression", // TODO: where to get experiment type value from?
          [Attr.Eval.Experiment.Tags]: [], // TODO: where to get experiment tags from?
          [Attr.Eval.Experiment.Version]: "1.0.0", // TODO: where to get experiment version from?
          // [Attr.Eval.Experiment.Group]: "default", // TODO: where to get experiment group from?
          // [Attr.Eval.Experiment.BaseID]: "default", // TODO: where to get experiment base id from?
          // [Attr.Eval.Experiment.BaseName]: "default", // TODO: where to get experiment base name from?
          [Attr.Eval.Experiment.Trials]: 1, // TODO: implement trials
          [Attr.Eval.Dataset.Name]: "test", // TODO: where to get dataset name from?
          [Attr.Eval.Dataset.Split]: "test", // TODO: where to get dataset split value from?
          [Attr.Eval.Dataset.Size]: dataset.length
        }
      });
      const suiteContext = trace.setSpan(context.active(), suiteSpan);
      afterAll(async () => {
        suiteSpan.setStatus({ code: SpanStatusCode.OK });
        suiteSpan.end();
        await flush();
      });
      await it.concurrent.for(dataset.map((d, index) => ({ ...d, index })))(
        evalName,
        async (data, { task }) => {
          const start = performance.now();
          const caseSpan = startSpan(
            `case ${evalName}_${data.index}`,
            {
              attributes: {
                [Attr.GenAI.Operation.Name]: "eval.case",
                [Attr.Eval.Case.ID]: `${evalName}_${data.index}`,
                [Attr.Eval.Case.Index]: data.index,
                [Attr.Eval.Case.Input]: data.input,
                [Attr.Eval.Case.Expected]: data.expected
              }
            },
            suiteContext
          );
          const caseContext = trace.setSpan(context.active(), caseSpan);
          try {
            const { output, duration } = await runTask(caseContext, {
              index: data.index,
              expected: data.expected,
              input: data.input,
              scorers: opts.scorers,
              task: opts.task,
              threshold: opts.threshold
            });
            const scores = await Promise.all(
              opts.scorers.map(async (scorer) => {
                const scorerSpan = startSpan(
                  `score ${scorer.name}`,
                  {
                    attributes: {
                      [Attr.GenAI.Operation.Name]: "eval.score"
                    }
                  },
                  caseContext
                );
                if (typeof scorer === "function") {
                  const start2 = performance.now();
                  const result2 = await scorer({
                    input: data.input,
                    output,
                    expected: data.expected
                  });
                  const duration2 = Math.round(performance.now() - start2);
                  const scoreValue = result2.score;
                  const passed = scoreValue >= opts.threshold;
                  scorerSpan.setAttributes({
                    [Attr.Eval.Score.Name]: scorer.name,
                    [Attr.Eval.Score.Value]: scoreValue,
                    [Attr.Eval.Score.Threshold]: opts.threshold,
                    [Attr.Eval.Score.Passed]: passed
                  });
                  if (!passed) {
                    scorerSpan.recordException(new Error(`Score did not pass`));
                    scorerSpan.setStatus({ code: SpanStatusCode.ERROR });
                  } else {
                    scorerSpan.setStatus({ code: SpanStatusCode.OK });
                  }
                  scorerSpan.end();
                  return { ...result2, duration: duration2, startedAt: start2 };
                } else {
                }
              })
            );
            caseSpan.setAttributes({
              [Attr.Eval.Case.Output]: output
              // TODO: what if output is other than a string?,
            });
            caseSpan.setStatus({ code: SpanStatusCode.OK });
            task.meta.eval = {
              order: data.index,
              name: evalName,
              expected: data.expected,
              input: data.input,
              output,
              scores,
              status: "success",
              errors: [],
              duration,
              startedAt: start,
              threshold: opts.threshold
            };
          } catch (e) {
            caseSpan.recordException(e);
            caseSpan.setStatus({ code: SpanStatusCode.ERROR });
            task.meta.eval = {
              name: evalName,
              order: data.index,
              expected: data.expected,
              input: data.input,
              output: e,
              scores: [],
              status: "fail",
              errors: [e],
              startedAt: start,
              duration: Math.round(performance.now() - start),
              threshold: opts.threshold
            };
            throw e;
          } finally {
            caseSpan.end();
          }
        }
      );
    },
    DEFAULT_TIMEOUT
  );
  return result;
}
var joinArrayOfUnknownResults = (results) => {
  return results.reduce((acc, result) => {
    if (typeof result === "string" || typeof result === "number" || typeof result === "boolean") {
      return `${acc}${result}`;
    }
    throw new Error(
      `Cannot display results of stream: stream contains non-string, non-number, non-boolean chunks.`
    );
  }, "");
};
var executeTask = async (task, input, expected) => {
  const taskResultOrStream = await task(input, expected);
  if (typeof taskResultOrStream === "object" && taskResultOrStream && Symbol.asyncIterator in taskResultOrStream) {
    const chunks = [];
    for await (const chunk of taskResultOrStream) {
      chunks.push(chunk);
    }
    return joinArrayOfUnknownResults(chunks);
  }
  return taskResultOrStream;
};
var runTask = async (caseContext, opts) => {
  const taskName = opts.task.name ?? "anonymous";
  const taskSpan = startSpan(
    `task`,
    {
      attributes: {
        [Attr.GenAI.Operation.Name]: "eval.task",
        [Attr.Eval.Task.Name]: taskName,
        [Attr.Eval.Task.Type]: "llm_completion", // TODO: How to determine task type?
        [Attr.Eval.Task.Trial]: 1
      }
    },
    caseContext
  );
  const { output, duration } = await context.with(
    trace.setSpan(context.active(), taskSpan),
    async () => {
      const start = performance.now();
      const output2 = await executeTask(opts.task, opts.input, opts.expected);
      const duration2 = Math.round(performance.now() - start);
      taskSpan.setAttributes({
        [Attr.Eval.Task.Output]: output2
        // TODO: what if output is other than a string?,
      });
      taskSpan.setStatus({ code: SpanStatusCode.OK });
      taskSpan.end();
      return { output: output2, duration: duration2 };
    }
  );
  return { output, duration };
};
export {
  AxiomReporter as experimental_AxiomReporter,
  Eval as experimental_Eval
};
//# sourceMappingURL=evals.js.map
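
The module's public surface is the export at the bottom of the file: Eval is re-exported as experimental_Eval and AxiomReporter as experimental_AxiomReporter. The sketch below shows how an eval might be declared, based only on how registerEval reads its options above (data, task, scorers, threshold); the import path "@axiomhq/ai/evals" and the exactMatch scorer are assumptions for illustration, not part of this file, so check the package's exports map and documentation for the real entry point.

import { experimental_Eval as Eval } from "@axiomhq/ai/evals"; // assumed entry point

// Scorers are plain named async functions: the span name and Eval.Score.Name
// attribute come from scorer.name, and the returned object needs a numeric `score`.
const exactMatch = async ({ output, expected }) => ({
  score: output === expected ? 1 : 0
});

Eval("capital-cities", {
  // `data` is awaited and must resolve to an array of { input, expected } cases.
  data: async () => [{ input: "Capital of France?", expected: "Paris" }],
  // `task` receives (input, expected) and may return a value directly or an async
  // iterable of string/number/boolean chunks, which executeTask joins into a string.
  task: async (input) => "Paris", // replace with a real model call
  scorers: [exactMatch],
  // Each score is compared against this threshold to set Eval.Score.Passed.
  threshold: 1
});

Because the registered suite is built from vitest's describe/it and writes its per-case results to task.meta.eval, a file like this would be executed by vitest, presumably with experimental_AxiomReporter registered as a custom reporter so that metadata is picked up and forwarded to Axiom.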