UNPKG

axiom

Version:

Axiom AI SDK provides an API to wrap your AI calls with observability instrumentation.

251 lines (248 loc) 8.84 kB
import { AxiomReporter, flush, startSpan } from "./chunk-MNOTFSB6.js";
import { Attr } from "./chunk-EFEYUIIG.js";
import "./chunk-KEXKKQVW.js";

// src/evals/eval.ts
import { afterAll, describe, it } from "vitest";
import { context, SpanStatusCode, trace } from "@opentelemetry/api";
import { customAlphabet } from "nanoid";

// src/evals/git-info.ts
import { execSync } from "child_process";

/**
 * Reads the git user name and email by shelling out to `git config`.
 *
 * Best-effort by design: any failure of either command (git missing, value
 * not configured, ...) yields `null` instead of throwing.
 *
 * @returns {{ name: string, email: string } | null}
 */
function getGitUserInfo() {
  try {
    const name = execSync("git config --get user.name").toString().trim();
    const email = execSync("git config --get user.email").toString().trim();
    return { name, email };
  } catch {
    return null;
  }
}

// src/evals/eval.ts

// Passed as the third argument to `describe` (presumably a timeout in
// milliseconds -- confirm against the vitest version in use).
const DEFAULT_TIMEOUT = 1e4;

// Generates the 10-character lowercase alphanumeric id attached to each eval run.
const nanoid = customAlphabet("1234567890abcdefghijklmnopqrstuvwxyz", 10);

/**
 * Serializes a value for use as a span attribute: strings pass through
 * unchanged, everything else is JSON-encoded.
 *
 * @param {unknown} value
 * @returns {string | undefined} `undefined` when `JSON.stringify` yields it
 *   (e.g. for an `undefined` input).
 */
const toAttributeString = (value) =>
  typeof value === "string" ? value : JSON.stringify(value);

/**
 * Registers an eval. Registration is asynchronous; a failure during
 * registration is reported through `console.error` rather than thrown.
 *
 * @param {string} name - Eval name, used for the suite and for each case.
 * @param {object} params - Eval options; `data`, `task`, `scorers` and
 *   `threshold` are the properties read by this module.
 * @returns {void}
 */
const Eval = (name, params) => {
  registerEval(name, params).catch(console.error);
};

/**
 * Runs one scorer inside its own `score <name>` span.
 *
 * The span is always ended, including when the scorer throws; in that case
 * the exception is recorded on the span and rethrown to the caller.
 *
 * @param {object} caseContext - Context used as the parent of the scorer span.
 * @param {Function} scorer - Called with `{ input, output, expected }`; its
 *   result must expose a `score` that is compared against `threshold`.
 * @param {{ input: unknown, output: unknown, expected: unknown }} args
 * @param {number} threshold - A score `>= threshold` counts as passed.
 * @returns {Promise<object>} The scorer result plus
 *   `metadata: { duration, startedAt, error }`, where `error` is `null` on pass.
 */
const runScorer = async (caseContext, scorer, args, threshold) => {
  const scorerSpan = startSpan(
    `score ${scorer.name}`,
    {
      attributes: {
        [Attr.GenAI.Operation.Name]: "eval.score"
      }
    },
    caseContext
  );
  const startedAt = performance.now();
  try {
    const result = await scorer(args);
    const duration = Math.round(performance.now() - startedAt);
    const scoreValue = result.score;
    const passed = scoreValue >= threshold;
    const error = passed ? null : `Score didn't pass`;
    scorerSpan.setAttributes({
      [Attr.Eval.Score.Name]: scorer.name,
      [Attr.Eval.Score.Value]: scoreValue,
      [Attr.Eval.Score.Threshold]: threshold,
      [Attr.Eval.Score.Passed]: passed
    });
    if (passed) {
      scorerSpan.setStatus({ code: SpanStatusCode.OK });
    } else {
      scorerSpan.setStatus({ code: SpanStatusCode.ERROR, message: error });
    }
    return {
      ...result,
      metadata: {
        duration,
        startedAt,
        error
      }
    };
  } catch (e) {
    // A throwing scorer must still leave a closed, error-marked span behind.
    scorerSpan.recordException(e);
    scorerSpan.setStatus({ code: SpanStatusCode.ERROR });
    throw e;
  } finally {
    scorerSpan.end();
  }
};

/**
 * Declares the vitest suite for an eval: one suite span, and one concurrent
 * test (with its own case span) per dataset entry.
 *
 * @param {string} evalName - Name of the suite; also the name of every test.
 * @param {object} opts - Eval options. `opts.data()` supplies the dataset;
 *   `opts.task`, `opts.scorers` and `opts.threshold` drive each case.
 * @param {{ modifier?: string }} [vitestOpts] - With `modifier === "skip"`
 *   the suite is declared via `describe.skip` and `opts.data()` is not called.
 * @returns {Promise<unknown>} Whatever the `describe` call resolves to.
 */
async function registerEval(evalName, opts, vitestOpts = {}) {
  const isSkipped = vitestOpts.modifier === "skip";
  const describeFn = isSkipped ? describe.skip : describe;
  // Start loading the dataset right away; it is awaited inside the suite body.
  const datasetPromise = isSkipped ? Promise.resolve([]) : opts.data();
  const user = getGitUserInfo();
  const result = await describeFn(
    evalName,
    async () => {
      const dataset = await datasetPromise;
      const id = nanoid();
      const suiteSpan = startSpan(`eval ${evalName}-${id}`, {
        attributes: {
          [Attr.GenAI.Operation.Name]: "eval",
          [Attr.Eval.ID]: id,
          [Attr.Eval.Name]: evalName,
          [Attr.Eval.Type]: "regression",
          // TODO: where to get experiment type value from?
          [Attr.Eval.Tags]: [],
          // TODO: where to get experiment tags from?
          [Attr.Eval.Trials]: 1,
          // TODO: implement trials
          [Attr.Eval.Collection.Name]: "unknown",
          // TODO: where to get dataset name from?
          [Attr.Eval.Collection.Split]: "unknown",
          // TODO: where to get dataset split value from?
          [Attr.Eval.Collection.Size]: dataset.length,
          // user info
          [Attr.Eval.User.Name]: user?.name,
          [Attr.Eval.User.Email]: user?.email
        }
      });
      const suiteContext = trace.setSpan(context.active(), suiteSpan);

      // Close the suite span and flush telemetry once every case has run.
      afterAll(async () => {
        const tags = ["offline"];
        suiteSpan.setAttribute(Attr.Eval.Tags, JSON.stringify(tags));
        suiteSpan.setStatus({ code: SpanStatusCode.OK });
        suiteSpan.end();
        await flush();
      });

      await it.concurrent.for(dataset.map((d, index) => ({ ...d, index })))(
        evalName,
        async (data, { task }) => {
          const caseName = data.name ?? `${evalName}_${data.index}`;
          const start = performance.now();
          const caseSpan = startSpan(
            `case ${caseName}`,
            {
              attributes: {
                [Attr.GenAI.Operation.Name]: "eval.case",
                [Attr.Eval.Case.ID]: caseName,
                [Attr.Eval.Case.Index]: data.index,
                [Attr.Eval.Case.Input]: toAttributeString(data.input),
                [Attr.Eval.Case.Expected]: toAttributeString(data.expected),
                // user info
                ["eval.user.name"]: user?.name,
                ["eval.user.email"]: user?.email
              }
            },
            suiteContext
          );
          const caseContext = trace.setSpan(context.active(), caseSpan);
          try {
            const { output, duration } = await runTask(caseContext, {
              index: data.index,
              expected: data.expected,
              input: data.input,
              scorers: opts.scorers,
              task: opts.task,
              threshold: opts.threshold
            });
            // Scorers run concurrently; each one gets its own argument object.
            const scoreList = await Promise.all(
              opts.scorers.map((scorer) =>
                runScorer(
                  caseContext,
                  scorer,
                  {
                    input: data.input,
                    output,
                    expected: data.expected
                  },
                  opts.threshold
                )
              )
            );
            const scores = Object.fromEntries(scoreList.map((s) => [s.name, s]));
            caseSpan.setAttributes({
              [Attr.Eval.Case.Output]: toAttributeString(output),
              [Attr.Eval.Case.Scores]: JSON.stringify(scores)
            });
            caseSpan.setStatus({ code: SpanStatusCode.OK });
            task.meta.eval = {
              index: data.index,
              name: evalName,
              expected: data.expected,
              input: data.input,
              output,
              scores,
              status: "success",
              errors: [],
              duration,
              startedAt: start,
              threshold: opts.threshold
            };
          } catch (e) {
            caseSpan.recordException(e);
            caseSpan.setStatus({ code: SpanStatusCode.ERROR });
            task.meta.eval = {
              name: evalName,
              index: data.index,
              expected: data.expected,
              input: data.input,
              output: e,
              scores: {},
              status: "fail",
              errors: [e],
              startedAt: start,
              duration: Math.round(performance.now() - start),
              threshold: opts.threshold
            };
            // Rethrow so the test itself is reported as failed.
            throw e;
          } finally {
            caseSpan.end();
          }
        }
      );
    },
    DEFAULT_TIMEOUT
  );
  return result;
}

/**
 * Concatenates stream chunks into a single string.
 *
 * @param {unknown[]} results - Chunks collected from an async iterable.
 * @returns {string} The chunks joined in order; `""` for an empty list.
 * @throws {Error} If any chunk is not a string, number or boolean.
 */
const joinArrayOfUnknownResults = (results) => {
  return results.reduce((acc, result) => {
    if (typeof result === "string" || typeof result === "number" || typeof result === "boolean") {
      return `${acc}${result}`;
    }
    throw new Error(
      `Cannot display results of stream: stream contains non-string, non-number, non-boolean chunks.`
    );
  }, "");
};

/**
 * Invokes the task and normalizes its result.
 *
 * @param {Function} task - Called as `task(input, expected)`.
 * @param {unknown} input
 * @param {unknown} expected
 * @returns {Promise<unknown>} The task result as-is, or, when the result is an
 *   async iterable, the concatenation of all of its chunks as a string.
 */
const executeTask = async (task, input, expected) => {
  const taskResultOrStream = await task(input, expected);
  if (typeof taskResultOrStream === "object" && taskResultOrStream && Symbol.asyncIterator in taskResultOrStream) {
    const chunks = [];
    for await (const chunk of taskResultOrStream) {
      chunks.push(chunk);
    }
    return joinArrayOfUnknownResults(chunks);
  }
  return taskResultOrStream;
};

/**
 * Executes the eval task inside a `task` span and measures its duration.
 *
 * The span is always ended; when the task (or serializing its output) throws,
 * the exception is recorded on the span and rethrown.
 *
 * @param {object} caseContext - Context used as the parent of the task span.
 * @param {object} opts - Reads `opts.task`, `opts.input` and `opts.expected`.
 * @returns {Promise<{ output: unknown, duration: number }>} The task output
 *   and the elapsed time in rounded milliseconds.
 */
const runTask = async (caseContext, opts) => {
  // A function's `name` is "" (never null/undefined) when it is anonymous,
  // so `||` is required for the fallback to ever apply.
  const taskName = opts.task.name || "anonymous";
  const taskSpan = startSpan(
    `task`,
    {
      attributes: {
        [Attr.GenAI.Operation.Name]: "eval.task",
        [Attr.Eval.Task.Name]: taskName,
        [Attr.Eval.Task.Type]: "llm_completion",
        // TODO: How to determine task type?
        [Attr.Eval.Task.Trial]: 1
      }
    },
    caseContext
  );
  const { output, duration } = await context.with(
    trace.setSpan(context.active(), taskSpan),
    async () => {
      const start = performance.now();
      try {
        const taskOutput = await executeTask(opts.task, opts.input, opts.expected);
        const elapsed = Math.round(performance.now() - start);
        // Unlike the case span, the output is always JSON-encoded here, so a
        // string output is stored with surrounding quotes (kept for
        // backward compatibility of the emitted attribute).
        taskSpan.setAttributes({
          [Attr.Eval.Task.Output]: JSON.stringify(taskOutput)
        });
        taskSpan.setStatus({ code: SpanStatusCode.OK });
        return { output: taskOutput, duration: elapsed };
      } catch (e) {
        taskSpan.recordException(e);
        taskSpan.setStatus({ code: SpanStatusCode.ERROR });
        throw e;
      } finally {
        taskSpan.end();
      }
    }
  );
  return { output, duration };
};

export {
  AxiomReporter as experimental_AxiomReporter,
  Eval as experimental_Eval
};
//# sourceMappingURL=evals.js.map