evalite
Version:
Test your LLM-powered apps with a TypeScript-native, Vitest-based eval runner. No API key required.
328 lines • 13.4 kB
JavaScript
import { mkdir, writeFile } from "fs/promises";
import path from "path";
import { describe, inject, it } from "vitest";
import { reportTraceLocalStorage } from "./traces.js";
import { writeFileQueueLocalStorage } from "./write-file-queue-local-storage.js";
import { createEvaliteFileIfNeeded } from "./utils.js";
import { FILES_LOCATION } from "./backend-only-constants.js";
import { createScorer } from "./index.js";
import { serializeAnnotation } from "./reporter/events.js";
/**
 * Concatenates the chunks collected from a streamed task result into a single
 * string, in the order given and with no separator.
 *
 * @param {unknown[]} results - Collected stream chunks.
 * @returns {string} The chunks joined together ("" for an empty array).
 * @throws {Error} If any chunk is not a string, number, or boolean.
 */
const joinArrayOfUnknownResults = (results) => {
  const displayableTypes = ["string", "number", "boolean"];
  let joined = "";
  results.forEach((chunk) => {
    // Objects, null, undefined, etc. have no sensible text form here.
    if (!displayableTypes.includes(typeof chunk)) {
      throw new Error(`Cannot display results of stream: stream contains non-string, non-number, non-boolean chunks.`);
    }
    joined = `${joined}${chunk}`;
  });
  return joined;
};
/**
 * Returns a version of `obj` that is safe to serialize.
 *
 * If `structuredClone` accepts the value, the original reference is returned
 * untouched. Otherwise the value is rebuilt through a JSON round-trip in which
 * non-serializable leaves are replaced by placeholders:
 * functions -> "[Function]", symbols -> "[Symbol]", bigints -> "<digits>n",
 * and references back to an enclosing object -> "[Circular]".
 *
 * @param {unknown} obj - Any value.
 * @returns {unknown} `obj` itself when cloneable, otherwise a JSON-safe copy.
 */
const makeSerializable = (obj) => {
  try {
    structuredClone(obj);
    return obj; // Already serializable, return as-is
  } catch {
    // Objects on the path from the root down to the value being visited.
    // Tracking ancestors (rather than every object seen) means the same object
    // referenced twice without a cycle is still expanded normally.
    const ancestors = [];
    // Must be a `function`, not an arrow: JSON.stringify calls the replacer
    // with `this` bound to the object that holds `value`.
    const replacer = function (_key, value) {
      if (typeof value === "function") {
        return "[Function]";
      }
      if (typeof value === "symbol") {
        return "[Symbol]";
      }
      if (typeof value === "bigint") {
        return `${value.toString()}n`;
      }
      if (typeof value !== "object" || value === null) {
        return value;
      }
      // Unwind to the current holder so `ancestors` only contains the
      // chain of objects that actually enclose `value`.
      while (ancestors.length > 0 && ancestors[ancestors.length - 1] !== this) {
        ancestors.pop();
      }
      if (ancestors.includes(value)) {
        // Without this, JSON.stringify throws on cyclic structures.
        return "[Circular]";
      }
      ancestors.push(value);
      return value;
    };
    return JSON.parse(JSON.stringify(obj, replacer));
  }
};
/**
 * Invokes a task and normalizes what it produces.
 *
 * A result that is an object exposing `Symbol.asyncIterator` is treated as a
 * stream: it is drained completely and its chunks are joined into one string.
 * Any other result is returned unchanged.
 *
 * @param {Function} task - Called as `task(input, variant)`; may be async.
 * @param {unknown} input - First argument forwarded to the task.
 * @param {unknown} variant - Second argument forwarded to the task.
 * @returns {Promise<unknown>} The task result, or the joined stream contents.
 */
const executeTask = async (task, input, variant) => {
  const result = await task(input, variant);
  const isAsyncIterable =
    result !== null &&
    typeof result === "object" &&
    Symbol.asyncIterator in result;
  if (!isAsyncIterable) {
    return result;
  }
  const collected = [];
  for await (const piece of result) {
    collected.push(piece);
  }
  return joinArrayOfUnknownResults(collected);
};
/**
 * Runs one eval case end to end: executes the task, scores the output with
 * every configured scorer in parallel, then builds the optional custom columns.
 *
 * @param {object} opts
 * @param {Function} opts.task - Task passed to `executeTask`.
 * @param {unknown} opts.input - Input for the task, scorers, and columns.
 * @param {unknown} opts.expected - Expected value handed to scorers and columns.
 * @param {unknown} opts.variant - Forwarded to the task as its second argument.
 * @param {Array} [opts.scorers] - Scorer functions, or option objects that are
 *   turned into scorers with `createScorer`.
 * @param {Function} [opts.columns] - Optional column builder.
 * @param {Array} [opts.traces] - Passed through to the column builder.
 * @returns {Promise<{output: unknown, scores: Array, duration: number, columns: Array}>}
 *   `duration` is the rounded task time in milliseconds (scoring excluded).
 */
const runTask = async (opts) => {
  const startedAt = performance.now();
  const output = await executeTask(opts.task, opts.input, opts.variant);
  const duration = Math.round(performance.now() - startedAt);

  const scoringJobs = (opts.scorers || []).map(async (scorerOrOpts) => {
    const scorer =
      typeof scorerOrOpts === "function"
        ? scorerOrOpts
        : createScorer(scorerOrOpts);
    // Each scorer gets its own argument object.
    return scorer({
      input: opts.input,
      output,
      expected: opts.expected,
    });
  });
  const scores = await Promise.all(scoringJobs);

  const builtColumns = await opts.columns?.({
    input: opts.input,
    output,
    expected: opts.expected,
    scores,
    traces: opts.traces,
  });

  return {
    output,
    scores,
    duration,
    columns: builtColumns || [],
  };
};
/**
 * Registers an eval under `evalName` using the given options.
 */
export const evalite = (evalName, opts) => {
  return registerEvalite(evalName, opts);
};
/**
 * Registers the eval with the "skip" modifier.
 */
evalite.skip = (evalName, opts) => {
  const skipOptions = { modifier: "skip" };
  return registerEvalite(evalName, opts, skipOptions);
};
/**
 * @deprecated Use `evalite.skip` instead.
 */
evalite.experimental_skip = evalite.skip;
/**
 * Builds a registrar that registers the same eval once per variant.
 * Each variant's `input` is supplied to `opts.task` as its second argument,
 * and `evalName` doubles as the variant group.
 */
evalite.each = (variants) => (evalName, opts) => {
  for (const variant of variants) {
    const variantOpts = {
      ...opts,
      task: (input) => opts.task(input, variant.input),
    };
    const variantVitestOpts = {
      variantName: variant.name,
      variantGroup: evalName,
    };
    registerEvalite(evalName, variantOpts, variantVitestOpts);
  }
};
/**
 * Calls a dataset loader and captures the outcome as a result object instead
 * of letting a failure propagate.
 *
 * @param {Function} datasetFunction - Sync or async loader taking no arguments.
 * @returns {Promise<{success: true, data: unknown} | {success: false, error: unknown}>}
 *   Never rejects; thrown values and rejections end up in `error`.
 */
const resolveData = async (datasetFunction) => {
  let data;
  try {
    data = await datasetFunction();
  } catch (error) {
    return { success: false, error };
  }
  return { success: true, data };
};
/**
 * Flattens a caught value for reporting. `Error` instances become
 * `{ name, message, stack }` (an Error's own fields are non-enumerable, so
 * JSON.stringify would otherwise render it as `{}`); any other value is
 * returned unchanged.
 */
const serializeErrorForDisplay = (error) =>
  error instanceof Error
    ? {
        name: error.name,
        message: error.message,
        stack: error.stack,
      }
    : error;

/**
 * Registers an eval as a Vitest `describe` block containing one concurrent
 * test per (data point x trial).
 *
 * Each test sends a RESULT_STARTED annotation, runs the task with its scorers
 * and column builder, and then sends a RESULT_SUBMITTED annotation with status
 * "success" or "fail". Failures are rethrown so Vitest marks the test failed.
 *
 * @param {string} evalName - Base name; ` [variantName]` is appended for variants.
 * @param {object} opts - Eval options. Fields read here: `data` (array or
 *   loader function), `task`, `scorers`, `columns` /
 *   `experimental_customColumns`, `trialCount`.
 * @param {object} [vitestOpts] - Optional `modifier` ("skip"), `variantName`,
 *   and `variantGroup`.
 * @returns Whatever the selected `describe` function returns.
 */
function registerEvalite(evalName, opts, vitestOpts = {}) {
  const describeFn = vitestOpts.modifier === "skip" ? describe.skip : describe;
  // Skipped evals never invoke their dataset loader; the load is started
  // eagerly (at registration time) for everything else.
  const datasetPromise =
    vitestOpts.modifier === "skip"
      ? Promise.resolve({ success: true, data: [] })
      : typeof opts.data === "function"
        ? resolveData(opts.data)
        : Promise.resolve({ success: true, data: opts.data });
  const fullEvalName = vitestOpts.variantName
    ? `${evalName} [${vitestOpts.variantName}]`
    : evalName;
  return describeFn(fullEvalName, async () => {
    const datasetResult = await datasetPromise;
    if (!datasetResult.success) {
      // The dataset could not be loaded: register a single test that reports
      // the failure and then fails with the original error.
      it(fullEvalName, async ({ annotate, task }) => {
        await annotate(
          serializeAnnotation({
            type: "RESULT_SUBMITTED",
            result: {
              evalName: fullEvalName,
              filepath: task.file.filepath,
              order: 0,
              status: "fail",
              variantName: vitestOpts.variantName,
              variantGroup: vitestOpts.variantGroup,
              trialIndex: undefined,
              duration: 0,
              expected: null,
              input: null,
              // Same error shape as the task-failure path below.
              output: serializeErrorForDisplay(datasetResult.error),
              scores: [],
              traces: [],
              renderedColumns: [],
            },
          }),
        );
        throw datasetResult.error;
      });
      return;
    }
    const dataset = datasetResult.data;
    // Filter dataset if any entry has `only: true`
    const hasOnlyFlag = dataset.some((d) => d.only === true);
    const filteredDataset = hasOnlyFlag
      ? dataset.filter((d) => d.only === true)
      : dataset;
    // Get trialCount from opts or config (opts wins)
    const configTrialCount = inject("trialCount");
    const trialCount = opts.trialCount ?? configTrialCount ?? 1;
    // Expand dataset with trials
    const expandedDataset = [];
    for (let dataIndex = 0; dataIndex < filteredDataset.length; dataIndex++) {
      const dataPoint = filteredDataset[dataIndex];
      for (let trialIndex = 0; trialIndex < trialCount; trialIndex++) {
        expandedDataset.push({
          input: dataPoint.input,
          expected: dataPoint.expected,
          dataIndex,
          // Only reported when trials are actually repeated.
          trialIndex: trialCount > 1 ? trialIndex : undefined,
          index: expandedDataset.length,
        });
      }
    }
    // Create individual tests manually to avoid serialization
    // This allows non-serializable data (like Zod schemas) in closures
    for (const data of expandedDataset) {
      it.concurrent(fullEvalName, async ({ task, annotate }) => {
        if (!annotate || typeof annotate !== "function") {
          throw new Error("Evalite requires Vitest 3.2.4 or later for the annotations API. Please upgrade: `npm install vitest@latest`");
        }
        const cwd = inject("cwd");
        const rootDir = path.join(cwd, FILES_LOCATION);
        // Send RESULT_STARTED annotation immediately
        await annotate(
          serializeAnnotation({
            type: "RESULT_STARTED",
            initialResult: {
              evalName: fullEvalName,
              filepath: task.file.filepath,
              order: data.index,
              variantName: vitestOpts.variantName,
              variantGroup: vitestOpts.variantGroup,
              status: "running",
              trialIndex: data.trialIndex,
            },
          }),
        );
        const start = performance.now();
        // File writes requested through the queue are started immediately and
        // collected here so the test can wait for them before finishing.
        const filePromises = [];
        writeFileQueueLocalStorage.enterWith(async (filePath, buffer) => {
          const func = async () => {
            await mkdir(path.dirname(filePath), { recursive: true });
            await writeFile(filePath, buffer);
          };
          const promise = func();
          filePromises.push(promise);
        });
        const traces = [];
        reportTraceLocalStorage.enterWith((trace) => traces.push(trace));
        const [inputForMeta, expectedForMeta] = await Promise.all([
          createEvaliteFileIfNeeded({ rootDir, input: data.input }),
          createEvaliteFileIfNeeded({ rootDir, input: data.expected }),
        ]);
        // Ensure data is serializable
        const serializableInput = makeSerializable(inputForMeta);
        const serializableExpected = makeSerializable(expectedForMeta);
        try {
          // Pass raw data (from closure) to scorers - allows non-serializable data
          const { output, scores, columns } = await runTask({
            expected: data.expected,
            input: data.input,
            variant: undefined,
            scorers: opts.scorers,
            task: opts.task,
            columns: opts.columns || opts.experimental_customColumns,
            traces,
          });
          const [outputWithFiles, tracesWithFiles, renderedColumns] =
            await Promise.all([
              createEvaliteFileIfNeeded({
                rootDir,
                input: output,
              }),
              handleFilesInTraces(rootDir, traces),
              handleFilesInColumns(rootDir, columns),
            ]);
          const serializableOutput = makeSerializable(outputWithFiles);
          // Send RESULT_SUBMITTED annotation
          await annotate(
            serializeAnnotation({
              type: "RESULT_SUBMITTED",
              result: {
                evalName: fullEvalName,
                filepath: task.file.filepath,
                order: data.index,
                // Wall-clock time for the whole case, including scoring and
                // file handling.
                duration: Math.round(performance.now() - start),
                expected: serializableExpected,
                input: serializableInput,
                output: serializableOutput,
                scores,
                traces: tracesWithFiles,
                status: "success",
                renderedColumns,
                variantName: vitestOpts.variantName,
                variantGroup: vitestOpts.variantGroup,
                trialIndex: data.trialIndex,
              },
            }),
          );
        } catch (e) {
          const duration = Math.round(performance.now() - start);
          // Serialize error for better display in UI
          const serializedError = serializeErrorForDisplay(e);
          // Send RESULT_SUBMITTED annotation for failure
          await annotate(
            serializeAnnotation({
              type: "RESULT_SUBMITTED",
              result: {
                evalName: fullEvalName,
                filepath: task.file.filepath,
                order: data.index,
                duration,
                expected: serializableExpected,
                input: serializableInput,
                output: serializedError,
                scores: [],
                traces: await handleFilesInTraces(rootDir, traces),
                status: "fail",
                renderedColumns: [],
                variantName: vitestOpts.variantName,
                variantGroup: vitestOpts.variantGroup,
                trialIndex: data.trialIndex,
              },
            }),
          );
          // Let already-started file writes finish before the test ends so
          // they are not left floating. allSettled is used so a write failure
          // cannot replace the original error that is rethrown below.
          await Promise.allSettled(filePromises);
          throw e;
        }
        await Promise.all(filePromises);
      });
    }
  });
}
/**
 * Passes every column's `value` through `createEvaliteFileIfNeeded` (all
 * columns concurrently) and returns copies of the columns with `value`
 * replaced by the result.
 *
 * @param {string} rootDir - Forwarded to `createEvaliteFileIfNeeded`.
 * @param {Array<object>} columns - Column objects carrying a `value` field.
 * @returns {Promise<Array<object>>} New column objects in the original order.
 */
const handleFilesInColumns = async (rootDir, columns) => {
  const resolveColumn = async (column) => {
    const value = await createEvaliteFileIfNeeded({
      rootDir,
      input: column.value,
    });
    return { ...column, value };
  };
  const resolvedColumns = await Promise.all(
    columns.map((column) => resolveColumn(column)),
  );
  return resolvedColumns;
};
/**
 * Passes every trace's `input` and `output` through
 * `createEvaliteFileIfNeeded` (concurrently, both per trace and across traces)
 * and returns copies of the traces with those two fields replaced.
 *
 * @param {string} rootDir - Forwarded to `createEvaliteFileIfNeeded`.
 * @param {Array<object>} traces - Trace objects carrying `input` and `output`.
 * @returns {Promise<Array<object>>} New trace objects in the original order.
 */
const handleFilesInTraces = async (rootDir, traces) => {
  const resolveValue = (value) =>
    createEvaliteFileIfNeeded({
      rootDir,
      input: value,
    });
  const resolveTrace = async (trace) => {
    const [input, output] = await Promise.all([
      resolveValue(trace.input),
      resolveValue(trace.output),
    ]);
    return { ...trace, input, output };
  };
  const resolvedTraces = await Promise.all(
    traces.map((trace) => resolveTrace(trace)),
  );
  return resolvedTraces;
};
//# sourceMappingURL=evalite.js.map