evalite
Version:
Test your LLM-powered apps with a TypeScript-native, Vitest-based eval runner. No API key required.
209 lines • 7.89 kB
JavaScript
import { FILES_LOCATION } from "@evalite/core";
import { mkdir, writeFile } from "fs/promises";
import path from "path";
import { describe, inject, it } from "vitest";
import { reportTraceLocalStorage } from "./traces.js";
import { writeFileQueueLocalStorage } from "./write-file-queue-local-storage.js";
import { createEvaliteFileIfNeeded } from "./utils.js";
const joinArrayOfUnknownResults = (results) => {
return results.reduce((acc, result) => {
if (typeof result === "string" ||
typeof result === "number" ||
typeof result === "boolean") {
return `${acc}${result}`;
}
throw new Error(`Cannot display results of stream: stream contains non-string, non-number, non-boolean chunks.`);
}, "");
};
const executeTask = async (task, input) => {
const taskResultOrStream = await task(input);
if (typeof taskResultOrStream === "object" &&
taskResultOrStream &&
Symbol.asyncIterator in taskResultOrStream) {
const chunks = [];
for await (const chunk of taskResultOrStream) {
chunks.push(chunk);
}
return joinArrayOfUnknownResults(chunks);
}
return taskResultOrStream;
};
const runTask = async (opts) => {
const start = performance.now();
const output = await executeTask(opts.task, opts.input);
const duration = Math.round(performance.now() - start);
const experimental_columns = (await opts.experimental_customColumns?.({
input: opts.input,
output,
expected: opts.expected,
})) || [];
const scores = await Promise.all(opts.scorers.map(async (scorerOrOpts) => {
if (typeof scorerOrOpts === "function") {
return scorerOrOpts({
input: opts.input,
output,
expected: opts.expected,
});
}
else {
return createScorer(scorerOrOpts)({
input: opts.input,
output,
expected: opts.expected,
});
}
}));
return {
output,
scores,
duration,
experimental_columns,
};
};
export const evalite = (evalName, opts) => registerEvalite(evalName, opts);
evalite.experimental_skip = (evalName, opts) => registerEvalite(evalName, opts, { modifier: "skip" });
function registerEvalite(evalName, opts, vitestOpts = {}) {
const describeFn = vitestOpts.modifier === "skip" ? describe.skip : describe;
const datasetPromise = vitestOpts.modifier === "skip" ? Promise.resolve([]) : opts.data();
return describeFn(evalName, async () => {
const dataset = await datasetPromise;
it.concurrent.for(dataset.map((d, index) => ({ ...d, index })))(evalName, async (data, { task }) => {
const cwd = inject("cwd");
const rootDir = path.join(cwd, FILES_LOCATION);
task.meta.evalite = {
duration: undefined,
initialResult: {
evalName: evalName,
filepath: task.file.filepath,
order: data.index,
},
};
const start = performance.now();
const filePromises = [];
writeFileQueueLocalStorage.enterWith(async (filePath, buffer) => {
const func = async () => {
await mkdir(path.dirname(filePath), { recursive: true });
await writeFile(filePath, buffer);
};
const promise = func();
filePromises.push(promise);
});
const traces = [];
reportTraceLocalStorage.enterWith((trace) => traces.push(trace));
const [input, expected] = await Promise.all([
createEvaliteFileIfNeeded({ rootDir, input: data.input }),
createEvaliteFileIfNeeded({ rootDir, input: data.expected }),
]);
try {
const { output, scores, duration, experimental_columns } = await runTask({
expected: data.expected,
input: data.input,
scorers: opts.scorers,
task: opts.task,
experimental_customColumns: opts.experimental_customColumns,
});
const [outputWithFiles, tracesWithFiles, renderedColumns] = await Promise.all([
createEvaliteFileIfNeeded({
rootDir,
input: output,
}),
handleFilesInTraces(rootDir, traces),
handleFilesInColumns(rootDir, experimental_columns),
]);
task.meta.evalite = {
result: {
evalName: evalName,
filepath: task.file.filepath,
order: data.index,
duration,
expected: expected,
input: input,
output: outputWithFiles,
scores,
traces: tracesWithFiles,
status: "success",
renderedColumns,
},
duration: Math.round(performance.now() - start),
};
}
catch (e) {
task.meta.evalite = {
result: {
evalName: evalName,
filepath: task.file.filepath,
order: data.index,
duration: Math.round(performance.now() - start),
expected: expected,
input: input,
output: e,
scores: [],
traces: await handleFilesInTraces(rootDir, traces),
status: "fail",
renderedColumns: [],
},
duration: Math.round(performance.now() - start),
};
throw e;
}
await Promise.all(filePromises);
});
});
}
export const createScorer = (opts) => {
return async (input) => {
const score = await opts.scorer(input);
if (typeof score === "object") {
if (typeof score.score !== "number") {
throw new Error(`The scorer '${opts.name}' must return a number.`);
}
return {
score: score.score,
metadata: score.metadata,
description: opts.description,
name: opts.name,
};
}
if (typeof score !== "number") {
throw new Error(`The scorer '${opts.name}' must return a number.`);
}
return {
description: opts.description,
name: opts.name,
score,
};
};
};
export * from "./evalite-file.js";
const handleFilesInColumns = async (rootDir, columns) => {
return await Promise.all(columns.map(async (column) => {
const file = await createEvaliteFileIfNeeded({
rootDir,
input: column.value,
});
return {
...column,
value: file,
};
}));
};
const handleFilesInTraces = async (rootDir, traces) => {
return await Promise.all(traces.map(async (trace) => {
const [input, output] = await Promise.all([
createEvaliteFileIfNeeded({
rootDir,
input: trace.input,
}),
createEvaliteFileIfNeeded({
rootDir,
input: trace.output,
}),
]);
return {
...trace,
input,
output,
};
}));
};
//# sourceMappingURL=index.js.map