UNPKG

evalite

Version:

Test your LLM-powered apps with a TypeScript-native, Vitest-based eval runner. No API key required.

328 lines 13.4 kB
import { mkdir, writeFile } from "fs/promises";
import path from "path";
import { describe, inject, it } from "vitest";
import { reportTraceLocalStorage } from "./traces.js";
import { writeFileQueueLocalStorage } from "./write-file-queue-local-storage.js";
import { createEvaliteFileIfNeeded } from "./utils.js";
import { FILES_LOCATION } from "./backend-only-constants.js";
import { createScorer } from "./index.js";
import { serializeAnnotation } from "./reporter/events.js";

/**
 * Concatenates the chunks collected from a streamed task result into a single string.
 *
 * @param {unknown[]} results - Stream chunks, in arrival order.
 * @returns {string} The chunks joined without a separator ("" for an empty array).
 * @throws {Error} If any chunk is not a string, number or boolean.
 */
const joinArrayOfUnknownResults = (results) => {
  return results.reduce((acc, result) => {
    if (typeof result === "string" || typeof result === "number" || typeof result === "boolean") {
      return `${acc}${result}`;
    }
    throw new Error(`Cannot display results of stream: stream contains non-string, non-number, non-boolean chunks.`);
  }, "");
};

/**
 * Returns a value that can be structured-cloned.
 *
 * If `structuredClone` accepts `obj`, the very same reference is returned. Otherwise the value
 * is round-tripped through JSON, with functions, symbols and bigints replaced by string
 * placeholders.
 *
 * NOTE(review): the JSON fallback still throws for circular structures, so a value that is both
 * circular and contains a function will make this helper throw.
 *
 * @param {unknown} obj - Value to sanitise.
 * @returns {unknown} `obj` itself, or a JSON-derived copy.
 */
const makeSerializable = (obj) => {
  try {
    structuredClone(obj);
    return obj; // Already serializable, return as-is
  } catch {
    // Use JSON stringify/parse to handle non-serializable values
    return JSON.parse(JSON.stringify(obj, (key, value) => {
      if (typeof value === "function") {
        return "[Function]";
      }
      if (typeof value === "symbol") {
        return "[Symbol]";
      }
      if (typeof value === "bigint") {
        return value.toString() + "n";
      }
      return value;
    }));
  }
};

/**
 * Serializes a thrown value for reporting.
 *
 * `Error` instances are flattened to `{ name, message, stack }`; any other thrown value is
 * returned unchanged.
 *
 * @param {unknown} error - The thrown value.
 * @returns {unknown} A plain-object view of the error, or the original value.
 */
const serializeError = (error) => {
  if (error instanceof Error) {
    return {
      name: error.name,
      message: error.message,
      stack: error.stack,
    };
  }
  return error;
};

/**
 * Guards against test contexts that do not provide the `annotate` function.
 *
 * @param {unknown} annotate - The `annotate` member of the Vitest test context.
 * @throws {Error} If `annotate` is not a function.
 */
const assertAnnotateAvailable = (annotate) => {
  if (typeof annotate !== "function") {
    throw new Error("Evalite requires Vitest 3.2.4 or later for the annotations API. Please upgrade: `npm install vitest@latest`");
  }
};

/**
 * Invokes the task and, if it yields an async iterable, drains it into a single string.
 *
 * @param {Function} task - Called as `task(input, variant)`.
 * @param {unknown} input - Input for the task.
 * @param {unknown} variant - Forwarded to the task as its second argument.
 * @returns {Promise<unknown>} The task's result, or the joined stream chunks.
 */
const executeTask = async (task, input, variant) => {
  const taskResultOrStream = await task(input, variant);
  if (typeof taskResultOrStream === "object" &&
    taskResultOrStream &&
    Symbol.asyncIterator in taskResultOrStream) {
    const chunks = [];
    for await (const chunk of taskResultOrStream) {
      chunks.push(chunk);
    }
    return joinArrayOfUnknownResults(chunks);
  }
  return taskResultOrStream;
};

/**
 * Runs the task once, then all scorers concurrently, then the optional columns callback.
 *
 * @param {object} opts
 * @param {Function} opts.task - The task under evaluation.
 * @param {unknown} opts.input - Input handed to the task and to every scorer.
 * @param {unknown} opts.expected - Expected value handed to every scorer.
 * @param {unknown} opts.variant - Forwarded to the task.
 * @param {Array<Function|object>} [opts.scorers] - Scorer functions, or option objects that are
 *   passed to `createScorer` first.
 * @param {Function} [opts.columns] - Optional callback producing extra columns.
 * @param {unknown[]} opts.traces - Forwarded to the columns callback.
 * @returns {Promise<{output: unknown, scores: unknown[], duration: number, columns: unknown[]}>}
 *   `duration` is the rounded task time in milliseconds (scorers are not included).
 */
const runTask = async (opts) => {
  const start = performance.now();
  const output = await executeTask(opts.task, opts.input, opts.variant);
  const duration = Math.round(performance.now() - start);
  const scores = await Promise.all((opts.scorers || []).map(async (scorerOrOpts) => {
    const scorer = typeof scorerOrOpts === "function"
      ? scorerOrOpts
      : createScorer(scorerOrOpts);
    // Each scorer gets its own argument object so one scorer cannot affect another.
    return scorer({
      input: opts.input,
      output,
      expected: opts.expected,
    });
  }));
  const columns = (await opts.columns?.({
    input: opts.input,
    output,
    expected: opts.expected,
    scores,
    traces: opts.traces,
  })) || [];
  return {
    output,
    scores,
    duration,
    columns,
  };
};

/**
 * Registers an eval as a Vitest `describe` block.
 *
 * @param {string} evalName - Display name of the eval.
 * @param {object} opts - Eval options (`data`, `task`, `scorers`, `columns`, `trialCount`, ...).
 */
export const evalite = (evalName, opts) => registerEvalite(evalName, opts);

/** Registers the eval through `describe.skip`, with an empty dataset. */
evalite.skip = (evalName, opts) => registerEvalite(evalName, opts, { modifier: "skip" });

/**
 * @deprecated Use `evalite.skip` instead.
 */
evalite.experimental_skip = evalite.skip;

/**
 * Registers the same eval once per variant.
 *
 * @param {Array<{name: string, input: unknown}>} variants - Each variant's `input` is passed to
 *   the task as its second argument.
 * @returns {Function} `(evalName, opts) => void`, which registers one eval per variant.
 */
evalite.each = (variants) => {
  return (evalName, opts) => {
    for (const variant of variants) {
      registerEvalite(evalName, {
        ...opts,
        task: (input) => opts.task(input, variant.input),
      }, { variantName: variant.name, variantGroup: evalName });
    }
  };
};

/**
 * Calls the dataset factory and captures the outcome instead of throwing.
 *
 * @param {Function} datasetFunction - Sync or async factory returning the dataset.
 * @returns {Promise<{success: true, data: unknown} | {success: false, error: unknown}>}
 */
const resolveData = async (datasetFunction) => {
  try {
    return {
      success: true,
      data: await datasetFunction(),
    };
  } catch (e) {
    return {
      success: false,
      error: e,
    };
  }
};

/**
 * Registers a `describe` block containing one concurrent test per data point and trial.
 *
 * @param {string} evalName - Base name of the eval.
 * @param {object} opts - Eval options.
 * @param {object} [vitestOpts]
 * @param {"skip"} [vitestOpts.modifier] - When "skip", `describe.skip` is used and the dataset is
 *   not loaded.
 * @param {string} [vitestOpts.variantName] - Appended to the eval name as ` [name]`.
 * @param {string} [vitestOpts.variantGroup] - Reported alongside every result.
 * @returns Whatever `describe` / `describe.skip` returns.
 */
function registerEvalite(evalName, opts, vitestOpts = {}) {
  const isSkipped = vitestOpts.modifier === "skip";
  const describeFn = isSkipped ? describe.skip : describe;
  // Start loading the dataset right away; the result is awaited inside the describe callback.
  const datasetPromise = isSkipped
    ? Promise.resolve({ success: true, data: [] })
    : typeof opts.data === "function"
      ? resolveData(opts.data)
      : Promise.resolve({ success: true, data: opts.data });
  const fullEvalName = vitestOpts.variantName
    ? `${evalName} [${vitestOpts.variantName}]`
    : evalName;
  return describeFn(fullEvalName, async () => {
    const datasetResult = await datasetPromise;
    if (!datasetResult.success) {
      // Surface the dataset failure as a single failing test.
      it(fullEvalName, async ({ annotate, task }) => {
        assertAnnotateAvailable(annotate);
        await annotate(serializeAnnotation({
          type: "RESULT_SUBMITTED",
          result: {
            evalName: fullEvalName,
            filepath: task.file.filepath,
            order: 0,
            status: "fail",
            variantName: vitestOpts.variantName,
            variantGroup: vitestOpts.variantGroup,
            trialIndex: undefined,
            duration: 0,
            expected: null,
            input: null,
            // Same error shape as the task-failure path below.
            output: serializeError(datasetResult.error),
            scores: [],
            traces: [],
            renderedColumns: [],
          },
        }));
        throw datasetResult.error;
      });
      return;
    }
    const dataset = datasetResult.data;
    // Filter dataset if any entry has `only: true`
    const hasOnlyFlag = dataset.some((d) => d.only === true);
    const filteredDataset = hasOnlyFlag
      ? dataset.filter((d) => d.only === true)
      : dataset;
    // Get trialCount from opts or config (opts wins)
    const configTrialCount = inject("trialCount");
    const trialCount = opts.trialCount ?? configTrialCount ?? 1;
    // Expand dataset with trials
    const expandedDataset = [];
    for (let dataIndex = 0; dataIndex < filteredDataset.length; dataIndex++) {
      const dataPoint = filteredDataset[dataIndex];
      for (let trialIndex = 0; trialIndex < trialCount; trialIndex++) {
        expandedDataset.push({
          input: dataPoint.input,
          expected: dataPoint.expected,
          dataIndex,
          // Only report a trial index when there is more than one trial.
          trialIndex: trialCount > 1 ? trialIndex : undefined,
          index: expandedDataset.length,
        });
      }
    }
    // Create individual tests manually to avoid serialization
    // This allows non-serializable data (like Zod schemas) in closures
    for (const data of expandedDataset) {
      it.concurrent(fullEvalName, async ({ task, annotate }) => {
        assertAnnotateAvailable(annotate);
        const cwd = inject("cwd");
        const rootDir = path.join(cwd, FILES_LOCATION);
        // Send RESULT_STARTED annotation immediately
        await annotate(serializeAnnotation({
          type: "RESULT_STARTED",
          initialResult: {
            evalName: fullEvalName,
            filepath: task.file.filepath,
            order: data.index,
            variantName: vitestOpts.variantName,
            variantGroup: vitestOpts.variantGroup,
            status: "running",
            trialIndex: data.trialIndex,
          },
        }));
        const start = performance.now();
        // File writes requested while this test runs are started immediately and
        // collected here so they can be awaited before the test finishes.
        const filePromises = [];
        writeFileQueueLocalStorage.enterWith(async (filePath, buffer) => {
          const func = async () => {
            await mkdir(path.dirname(filePath), { recursive: true });
            await writeFile(filePath, buffer);
          };
          const promise = func();
          filePromises.push(promise);
        });
        const traces = [];
        reportTraceLocalStorage.enterWith((trace) => traces.push(trace));
        const [inputForMeta, expectedForMeta] = await Promise.all([
          createEvaliteFileIfNeeded({ rootDir, input: data.input }),
          createEvaliteFileIfNeeded({ rootDir, input: data.expected }),
        ]);
        // Ensure data is serializable
        const serializableInput = makeSerializable(inputForMeta);
        const serializableExpected = makeSerializable(expectedForMeta);
        try {
          // Pass raw data (from closure) to scorers - allows non-serializable data
          const { output, scores, columns } = await runTask({
            expected: data.expected,
            input: data.input,
            variant: undefined,
            scorers: opts.scorers,
            task: opts.task,
            columns: opts.columns || opts.experimental_customColumns,
            traces,
          });
          const [outputWithFiles, tracesWithFiles, renderedColumns] = await Promise.all([
            createEvaliteFileIfNeeded({
              rootDir,
              input: output,
            }),
            handleFilesInTraces(rootDir, traces),
            handleFilesInColumns(rootDir, columns),
          ]);
          const serializableOutput = makeSerializable(outputWithFiles);
          // Send RESULT_SUBMITTED annotation
          await annotate(serializeAnnotation({
            type: "RESULT_SUBMITTED",
            result: {
              evalName: fullEvalName,
              filepath: task.file.filepath,
              order: data.index,
              duration: Math.round(performance.now() - start),
              expected: serializableExpected,
              input: serializableInput,
              output: serializableOutput,
              scores,
              traces: tracesWithFiles,
              status: "success",
              renderedColumns,
              variantName: vitestOpts.variantName,
              variantGroup: vitestOpts.variantGroup,
              trialIndex: data.trialIndex,
            },
          }));
        } catch (e) {
          const duration = Math.round(performance.now() - start);
          try {
            // Send RESULT_SUBMITTED annotation for failure
            await annotate(serializeAnnotation({
              type: "RESULT_SUBMITTED",
              result: {
                evalName: fullEvalName,
                filepath: task.file.filepath,
                order: data.index,
                duration,
                expected: serializableExpected,
                input: serializableInput,
                // Serialize error for better display in UI
                output: serializeError(e),
                scores: [],
                traces: await handleFilesInTraces(rootDir, traces),
                status: "fail",
                renderedColumns: [],
                variantName: vitestOpts.variantName,
                variantGroup: vitestOpts.variantGroup,
                trialIndex: data.trialIndex,
              },
            }));
          } finally {
            // Let every queued write finish (or fail) before rethrowing, so no promise is left
            // floating. allSettled is used so a write error cannot mask the original failure.
            await Promise.allSettled(filePromises);
          }
          throw e;
        }
        // On success a failed file write must still fail the test.
        await Promise.all(filePromises);
      });
    }
  });
}

/**
 * Passes every column's `value` through `createEvaliteFileIfNeeded`.
 *
 * @param {string} rootDir - Directory handed to `createEvaliteFileIfNeeded`.
 * @param {Array<{value: unknown}>} columns - Columns returned by the columns callback.
 * @returns {Promise<object[]>} Copies of the columns with `value` replaced by the helper's result.
 */
const handleFilesInColumns = async (rootDir, columns) => {
  return await Promise.all(columns.map(async (column) => {
    const file = await createEvaliteFileIfNeeded({
      rootDir,
      input: column.value,
    });
    return {
      ...column,
      value: file,
    };
  }));
};

/**
 * Passes every trace's `input` and `output` through `createEvaliteFileIfNeeded`.
 *
 * @param {string} rootDir - Directory handed to `createEvaliteFileIfNeeded`.
 * @param {Array<{input: unknown, output: unknown}>} traces - Traces reported during the test.
 * @returns {Promise<object[]>} Copies of the traces with `input` and `output` replaced.
 */
const handleFilesInTraces = async (rootDir, traces) => {
  return await Promise.all(traces.map(async (trace) => {
    const [input, output] = await Promise.all([
      createEvaliteFileIfNeeded({
        rootDir,
        input: trace.input,
      }),
      createEvaliteFileIfNeeded({
        rootDir,
        input: trace.output,
      }),
    ]);
    return {
      ...trace,
      input,
      output,
    };
  }));
};
//# sourceMappingURL=evalite.js.map