axiom
Version:
Axiom AI SDK provides an API to wrap your AI calls with observability instrumentation.
251 lines (248 loc) • 8.84 kB
JavaScript
import {
AxiomReporter,
flush,
startSpan
} from "./chunk-MNOTFSB6.js";
import {
Attr
} from "./chunk-EFEYUIIG.js";
import "./chunk-KEXKKQVW.js";
// src/evals/eval.ts
import { afterAll, describe, it } from "vitest";
import { context, SpanStatusCode, trace } from "@opentelemetry/api";
import { customAlphabet } from "nanoid";
// src/evals/git-info.ts
import { execSync } from "child_process";
/**
 * Reads the current git user's name and email via `git config`.
 *
 * @returns {{ name: string, email: string } | null} The trimmed values, or
 *   `null` when either `git config` invocation fails.
 */
function getGitUserInfo() {
  // Capture stdout only. Without an explicit `stdio`, execSync forwards the
  // child's stderr to this process, so a missing git binary or a non-repo
  // directory would print diagnostics even though the failure is handled here.
  const execOptions = { encoding: "utf8", stdio: ["ignore", "pipe", "ignore"] };
  try {
    const name = execSync("git config --get user.name", execOptions).trim();
    const email = execSync("git config --get user.email", execOptions).trim();
    return { name, email };
  } catch {
    // Best-effort lookup: any failure is reported to the caller as null.
    return null;
  }
}
// src/evals/eval.ts
// Passed as the last argument of the suite-level describe call in registerEval.
// NOTE(review): presumably a timeout in milliseconds (10 000) — confirm against vitest.
var DEFAULT_TIMEOUT = 1e4;
// ID generator built over digits and lowercase ASCII letters; the second
// argument (10) is handed to nanoid's customAlphabet — presumably the ID length.
// Used in registerEval to produce the per-run eval id.
var nanoid = customAlphabet("1234567890abcdefghijklmnopqrstuvwxyz", 10);
/**
 * Fire-and-forget entry point: kicks off registration of an eval suite and
 * reports a rejected registration through `console.error`.
 *
 * @param {string} name - Eval name, forwarded to registerEval.
 * @param {object} params - Eval options, forwarded to registerEval unchanged.
 * @returns {void}
 */
var Eval = (name, params) => {
  const pendingRegistration = registerEval(name, params);
  // The promise is intentionally not returned; failures are only logged.
  pendingRegistration.catch(console.error);
};
/**
 * Registers an eval as a test suite: one suite span for the run, and one
 * concurrent test case (with its own span) per dataset record. Each case runs
 * the task through runTask, then every scorer, and stores a summary on
 * `task.meta.eval`.
 *
 * @param {string} evalName - Suite name; also used to derive span and case names.
 * @param {object} opts - Eval options. This function reads `opts.data` (called
 *   with no arguments, its result is awaited), `opts.task`, `opts.scorers`
 *   (array of async functions returning an object with a `score`) and
 *   `opts.threshold` (a score passes when `score >= threshold`).
 * @param {object} [vitestOpts] - When `vitestOpts.modifier === "skip"`,
 *   `describe.skip` is used and `opts.data` is not called (empty dataset).
 * @returns {Promise<*>} Whatever the awaited describe call yields.
 */
async function registerEval(evalName, opts, vitestOpts = {}) {
  const describeFn = vitestOpts.modifier === "skip" ? describe.skip : describe;
  // Start loading the dataset right away; it is awaited inside the suite body.
  const datasetPromise = vitestOpts.modifier === "skip" ? Promise.resolve([]) : opts.data();
  const user = getGitUserInfo();
  const result = await describeFn(
    evalName,
    async () => {
      const dataset = await datasetPromise;
      const id = nanoid();
      const suiteSpan = startSpan(`eval ${evalName}-${id}`, {
        attributes: {
          [Attr.GenAI.Operation.Name]: "eval",
          [Attr.Eval.ID]: id,
          [Attr.Eval.Name]: evalName,
          [Attr.Eval.Type]: "regression",
          // TODO: where to get experiment type value from?
          [Attr.Eval.Tags]: [],
          // TODO: where to get experiment tags from?
          [Attr.Eval.Trials]: 1,
          // TODO: implement trials
          [Attr.Eval.Collection.Name]: "unknown",
          // TODO: where to get dataset name from?
          [Attr.Eval.Collection.Split]: "unknown",
          // TODO: where to get dataset split value from?
          [Attr.Eval.Collection.Size]: dataset.length,
          // user info
          [Attr.Eval.User.Name]: user?.name,
          [Attr.Eval.User.Email]: user?.email
        }
      });
      const suiteContext = trace.setSpan(context.active(), suiteSpan);
      // Close the suite span and flush once every case in the suite has run.
      afterAll(async () => {
        const tags = ["offline"];
        suiteSpan.setAttribute(Attr.Eval.Tags, JSON.stringify(tags));
        suiteSpan.setStatus({ code: SpanStatusCode.OK });
        suiteSpan.end();
        await flush();
      });
      await it.concurrent.for(dataset.map((d, index) => ({ ...d, index })))(
        evalName,
        async (data, { task }) => {
          const caseName = data.name ?? `${evalName}_${data.index}`;
          const start = performance.now();
          const caseSpan = startSpan(
            `case ${caseName}`,
            {
              attributes: {
                [Attr.GenAI.Operation.Name]: "eval.case",
                [Attr.Eval.Case.ID]: caseName,
                [Attr.Eval.Case.Index]: data.index,
                [Attr.Eval.Case.Input]: typeof data.input === "string" ? data.input : JSON.stringify(data.input),
                [Attr.Eval.Case.Expected]: typeof data.expected === "string" ? data.expected : JSON.stringify(data.expected),
                // user info
                ["eval.user.name"]: user?.name,
                ["eval.user.email"]: user?.email
              }
            },
            suiteContext
          );
          const caseContext = trace.setSpan(context.active(), caseSpan);
          try {
            const { output, duration } = await runTask(caseContext, {
              index: data.index,
              expected: data.expected,
              input: data.input,
              scorers: opts.scorers,
              task: opts.task,
              threshold: opts.threshold
            });
            const scoreList = await Promise.all(
              opts.scorers.map(async (scorer) => {
                const scorerSpan = startSpan(
                  `score ${scorer.name}`,
                  {
                    attributes: {
                      [Attr.GenAI.Operation.Name]: "eval.score"
                    }
                  },
                  caseContext
                );
                const start2 = performance.now();
                try {
                  const result2 = await scorer({
                    input: data.input,
                    output,
                    expected: data.expected
                  });
                  const duration2 = Math.round(performance.now() - start2);
                  const scoreValue = result2.score;
                  const passed = scoreValue >= opts.threshold;
                  // null when the score passed, otherwise the failure message.
                  let failureMessage = null;
                  scorerSpan.setAttributes({
                    [Attr.Eval.Score.Name]: scorer.name,
                    [Attr.Eval.Score.Value]: scoreValue,
                    [Attr.Eval.Score.Threshold]: opts.threshold,
                    [Attr.Eval.Score.Passed]: passed
                  });
                  if (!passed) {
                    failureMessage = `Score didn't pass`;
                    scorerSpan.setStatus({
                      code: SpanStatusCode.ERROR,
                      message: failureMessage
                    });
                  } else {
                    scorerSpan.setStatus({ code: SpanStatusCode.OK });
                  }
                  return {
                    ...result2,
                    metadata: { duration: duration2, startedAt: start2, error: failureMessage }
                  };
                } catch (scorerError) {
                  // A throwing scorer must still produce a finished, failed span;
                  // the error is rethrown so the case-level catch handles it.
                  scorerSpan.recordException(scorerError);
                  scorerSpan.setStatus({
                    code: SpanStatusCode.ERROR,
                    message: scorerError instanceof Error ? scorerError.message : String(scorerError)
                  });
                  throw scorerError;
                } finally {
                  scorerSpan.end();
                }
              })
            );
            // Keyed by the `name` field of each scorer result.
            const scores = Object.fromEntries(scoreList.map((s) => [s.name, s]));
            caseSpan.setAttributes({
              [Attr.Eval.Case.Output]: typeof output === "string" ? output : JSON.stringify(output),
              [Attr.Eval.Case.Scores]: JSON.stringify(scores)
            });
            caseSpan.setStatus({ code: SpanStatusCode.OK });
            task.meta.eval = {
              index: data.index,
              name: evalName,
              expected: data.expected,
              input: data.input,
              output,
              scores,
              status: "success",
              errors: [],
              duration,
              startedAt: start,
              threshold: opts.threshold
            };
          } catch (e) {
            caseSpan.recordException(e);
            caseSpan.setStatus({ code: SpanStatusCode.ERROR });
            task.meta.eval = {
              name: evalName,
              index: data.index,
              expected: data.expected,
              input: data.input,
              output: e,
              scores: {},
              status: "fail",
              errors: [e],
              startedAt: start,
              duration: Math.round(performance.now() - start),
              threshold: opts.threshold
            };
            // Rethrow so the test runner still sees the case as failed.
            throw e;
          } finally {
            caseSpan.end();
          }
        }
      );
    },
    DEFAULT_TIMEOUT
  );
  return result;
}
/**
 * Concatenates collected stream chunks into one string.
 *
 * @param {Array<unknown>} results - Chunks; each must be a string, number or boolean.
 * @returns {string} All chunks rendered and joined with no separator.
 * @throws {Error} On the first chunk of any other type.
 */
var joinArrayOfUnknownResults = (results) => {
  const printableTypes = ["string", "number", "boolean"];
  const rendered = results.map((chunk) => {
    if (!printableTypes.includes(typeof chunk)) {
      throw new Error(
        `Cannot display results of stream: stream contains non-string, non-number, non-boolean chunks.`
      );
    }
    return `${chunk}`;
  });
  return rendered.join("");
};
/**
 * Invokes the task and normalises its result.
 *
 * @param {Function} task - Called as `task(input, expected)`; its result is awaited.
 * @param {*} input - First argument passed to the task.
 * @param {*} expected - Second argument passed to the task.
 * @returns {Promise<*>} The task's result as-is, or — when the result is an
 *   object exposing `Symbol.asyncIterator` — every chunk drained and joined
 *   via joinArrayOfUnknownResults.
 */
var executeTask = async (task, input, expected) => {
  const outcome = await task(input, expected);
  const isAsyncStream = outcome !== null && typeof outcome === "object" && Symbol.asyncIterator in outcome;
  if (!isAsyncStream) {
    return outcome;
  }
  // Drain the stream completely before joining.
  const collected = [];
  for await (const piece of outcome) {
    collected.push(piece);
  }
  return joinArrayOfUnknownResults(collected);
};
/**
 * Runs the eval task inside its own "task" span, parented to the case context.
 *
 * @param {*} caseContext - Context handed to startSpan as the parent for the task span.
 * @param {object} opts - Reads `opts.task` (the function to run), `opts.input`
 *   and `opts.expected` (forwarded to the task through executeTask).
 * @returns {Promise<{ output: *, duration: number }>} The task output and the
 *   elapsed time rounded to whole `performance.now()` units.
 * @throws Rethrows whatever the task (or serialising its output) throws, after
 *   recording it on the span.
 */
var runTask = async (caseContext, opts) => {
  // `||` rather than `??`: an anonymous function's `name` is the empty
  // string, which is not nullish and would otherwise never fall back.
  const taskName = opts.task.name || "anonymous";
  const taskSpan = startSpan(
    `task`,
    {
      attributes: {
        [Attr.GenAI.Operation.Name]: "eval.task",
        [Attr.Eval.Task.Name]: taskName,
        [Attr.Eval.Task.Type]: "llm_completion",
        // TODO: How to determine task type?
        [Attr.Eval.Task.Trial]: 1
      }
    },
    caseContext
  );
  const { output, duration } = await context.with(
    trace.setSpan(context.active(), taskSpan),
    async () => {
      const start = performance.now();
      try {
        const output2 = await executeTask(opts.task, opts.input, opts.expected);
        const duration2 = Math.round(performance.now() - start);
        taskSpan.setAttributes({
          [Attr.Eval.Task.Output]: JSON.stringify(output2)
        });
        taskSpan.setStatus({ code: SpanStatusCode.OK });
        return { output: output2, duration: duration2 };
      } catch (taskError) {
        // Mark the span as failed before propagating, so a throwing task
        // does not leave an unfinished span with no status behind.
        taskSpan.recordException(taskError);
        taskSpan.setStatus({
          code: SpanStatusCode.ERROR,
          message: taskError instanceof Error ? taskError.message : String(taskError)
        });
        throw taskError;
      } finally {
        taskSpan.end();
      }
    }
  );
  return {
    output,
    duration
  };
};
export {
AxiomReporter as experimental_AxiomReporter,
Eval as experimental_Eval
};
//# sourceMappingURL=evals.js.map