evalite

Test your LLM-powered apps with a TypeScript-native, Vitest-based eval runner. No API key required.
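The runner in this file (dist/run-evalite.js) discovers files matching **/*.eval.?(m)ts. For orientation, a minimal eval file looks roughly like the sketch below; the evalite(name, options) shape and the Levenshtein scorer from autoevals follow the evalite docs, but treat the exact imports and option names as an assumption rather than a guarantee:

// my-greeting.eval.ts (illustrative file name)
import { evalite } from "evalite";
import { Levenshtein } from "autoevals";

evalite("Greeting eval", {
  // Each case supplies an input and (optionally) an expected value.
  data: async () => [{ input: "Hello", expected: "Hello World!" }],
  // The task under test; in a real project this would call your LLM.
  task: async (input) => input + " World!",
  // Scorers grade the output; Levenshtein compares output to expected.
  scorers: [Levenshtein],
});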

import { mkdir, writeFile } from "fs/promises";
import path from "path";
import { Writable } from "stream";
import { createVitest, registerConsoleShortcuts } from "vitest/node";
import { createInMemoryStorage } from "./storage/in-memory.js";
import { computeAverageScores } from "./storage/utils.js";
import { DB_LOCATION, FILES_LOCATION } from "./backend-only-constants.js";
import { DEFAULT_SERVER_PORT } from "./constants.js";
import EvaliteReporter from "./reporter.js";
import { createServer } from "./server.js";
import { createSqliteStorage } from "./storage/sqlite.js";
import { loadEvaliteConfig } from "./config.js";

const exportResultsToJSON = async (opts) => {
  const latestFullRunResults = await opts.storage.runs.getMany({
    runType: "full",
    orderBy: "created_at",
    orderDirection: "desc",
    limit: 1,
  });
  const latestFullRun = latestFullRunResults[0];
  if (!latestFullRun) {
    throw new Error("No completed run found to export");
  }
  const allEvals = await opts.storage.evals.getMany({
    runIds: [latestFullRun.id],
    statuses: ["fail", "success"],
  });
  const evalResults = await opts.storage.results.getMany({
    evalIds: allEvals.map((e) => e.id),
  });
  const allScores = await opts.storage.scores.getMany({
    resultIds: evalResults.map((r) => r.id),
  });
  const allTraces = await opts.storage.traces.getMany({
    resultIds: evalResults.map((r) => r.id),
  });
  const resultsAverageScores = computeAverageScores(allScores);
  // Group results by eval and transform to camelCase
  const outputData = {
    run: {
      id: latestFullRun.id,
      runType: latestFullRun.runType,
      createdAt: latestFullRun.created_at,
    },
    evals: allEvals.map((evaluation) => {
      const resultsForEval = evalResults.filter((r) => r.eval_id === evaluation.id);
      const scoresForEval = allScores.filter((s) =>
        resultsForEval.some((r) => r.id === s.result_id)
      );
      const evalAvgScore =
        scoresForEval.length > 0
          ? scoresForEval.reduce((sum, s) => sum + s.score, 0) / scoresForEval.length
          : 0;
      return {
        id: evaluation.id,
        name: evaluation.name,
        filepath: evaluation.filepath,
        duration: evaluation.duration,
        status: evaluation.status,
        variantName: evaluation.variant_name,
        variantGroup: evaluation.variant_group,
        createdAt: evaluation.created_at,
        averageScore: evalAvgScore,
        results: resultsForEval.map((result) => {
          const resultAvgScore = resultsAverageScores.find((r) => r.result_id === result.id);
          const scoresForResult = allScores.filter((s) => s.result_id === result.id);
          const tracesForResult = allTraces.filter((t) => t.result_id === result.id);
          return {
            id: result.id,
            duration: result.duration,
            input: result.input,
            output: result.output,
            expected: result.expected,
            status: result.status,
            colOrder: result.col_order,
            renderedColumns: result.rendered_columns,
            createdAt: result.created_at,
            averageScore: resultAvgScore?.average ?? 0,
            scores: scoresForResult.map((score) => ({
              id: score.id,
              name: score.name,
              score: score.score,
              description: score.description,
              metadata: score.metadata,
              createdAt: score.created_at,
            })),
            traces: tracesForResult.map((trace) => ({
              id: trace.id,
              input: trace.input,
              output: trace.output,
              startTime: trace.start_time,
              endTime: trace.end_time,
              inputTokens: trace.input_tokens,
              outputTokens: trace.output_tokens,
              totalTokens: trace.total_tokens,
              colOrder: trace.col_order,
            })),
          };
        }),
      };
    }),
  };
  const absolutePath = path.isAbsolute(opts.outputPath)
    ? opts.outputPath
    : path.join(opts.cwd, opts.outputPath);
  await mkdir(path.dirname(absolutePath), { recursive: true });
  await writeFile(absolutePath, JSON.stringify(outputData, null, 2), "utf-8");
  console.log(`\nResults exported to: ${absolutePath}`);
};

/**
 * Run Evalite programmatically via the Node API.
 *
 * This is the official Node API for running evaluations programmatically.
 * It provides full control over eval execution including path filtering,
 * watch mode, score thresholds, and result exporting.
 *
 * @param opts - Configuration options for running evaluations
 * @param opts.path - Optional path filter to run specific eval files (defaults to undefined, which runs all evals)
 * @param opts.cwd - Working directory (defaults to process.cwd())
 * @param opts.testOutputWritable - Optional writable stream for test output
 * @param opts.mode - Execution mode: "watch-for-file-changes", "run-once-and-exit", or "run-once-and-serve"
 * @param opts.scoreThreshold - Optional score threshold (0-100) to fail the process if scores are below
 * @param opts.outputPath - Optional path to write test results in JSON format after completion
 *
 * @example
 * ```typescript
 * import { runEvalite } from "evalite/runner";
 *
 * // Run once and exit - simplified usage
 * await runEvalite({
 *   mode: "run-once-and-exit",
 *   scoreThreshold: 80,
 *   outputPath: "./results.json"
 * });
 *
 * // Watch mode for development
 * await runEvalite({
 *   mode: "watch-for-file-changes"
 * });
 *
 * // Run specific eval file with custom working directory
 * await runEvalite({
 *   path: "tests/my-eval.eval.ts",
 *   cwd: "/path/to/project",
 *   mode: "run-once-and-exit"
 * });
 * ```
 */
export const runEvalite = async (opts) => {
  const cwd = opts.cwd ?? process.cwd();
  const filesLocation = path.join(cwd, FILES_LOCATION);
  await mkdir(filesLocation, { recursive: true });
  // Load config file if present
  const config = await loadEvaliteConfig(cwd);
  // Merge options: opts (highest priority) > config > defaults
  let storage = opts.storage;
  if (!storage && config?.storage) {
    // Call config storage factory (may be async)
    storage = await config.storage();
  }
  if (!storage) {
    const dbLocation = path.join(cwd, DB_LOCATION);
    storage = await createSqliteStorage(dbLocation);
  }
  const scoreThreshold = opts.scoreThreshold ?? config?.scoreThreshold;
  const hideTable = opts.hideTable ?? config?.hideTable;
  const serverPort = config?.server?.port ?? DEFAULT_SERVER_PORT;
  const testTimeout = config?.testTimeout;
  const maxConcurrency = config?.maxConcurrency;
  const setupFiles = config?.setupFiles;
  const filters = opts.path ? [opts.path] : undefined;
  process.env.EVALITE_REPORT_TRACES = "true";
  let server = undefined;
  if (
    !opts.disableServer &&
    (opts.mode === "watch-for-file-changes" || opts.mode === "run-once-and-serve")
  ) {
    server = createServer({
      storage: storage,
    });
    server.start(serverPort);
  }
  let exitCode = undefined;
  const vitest = await createVitest(
    "test",
    {
      // Everything passed here cannot be
      // overridden by the user
      root: cwd,
      include: ["**/*.eval.?(m)ts"],
      watch: opts.mode === "watch-for-file-changes",
      reporters: [
        new EvaliteReporter({
          logNewState: (newState) => {
            server?.updateState(newState);
          },
          port: serverPort,
          isWatching:
            opts.mode === "watch-for-file-changes" ||
            opts.mode === "run-once-and-serve",
          storage: storage,
          scoreThreshold: scoreThreshold,
          modifyExitCode: (code) => {
            exitCode = code;
          },
          mode: opts.mode,
          hideTable: hideTable,
        }),
      ],
      mode: "test",
      browser: undefined,
    },
    {
      plugins: [
        {
          name: "evalite-config-plugin",
          // Everything inside this config CAN be overridden by user's vite.config.ts
          // EXCEPT when evalite.config.ts explicitly sets values - those override vite.config.ts
          // When we moved to Vitest v4, I found a strange type error where
          // `config` was not being inferred correctly. In the TS playground,
          // this code works fine, so it may be some kind of package resolution issue.
          // Since this code is fully tested, I feel OK with an 'any' for now.
          config(config) {
            config.test ??= {};
            // If evalite.config.ts specifies these values, override user's vite.config.ts
            // Otherwise use vite.config.ts value or fallback to default
            if (testTimeout !== undefined) {
              config.test.testTimeout = testTimeout;
            } else {
              config.test.testTimeout ??= 30_000;
            }
            if (maxConcurrency !== undefined) {
              config.test.maxConcurrency = maxConcurrency;
            }
            // Note: no fallback for maxConcurrency - let Vitest use its own default
            if (setupFiles !== undefined) {
              config.test.setupFiles = setupFiles;
            }
            // Note: no fallback for setupFiles - let Vitest use its own default
            config.test.sequence ??= {};
            config.test.sequence.concurrent ??= true;
          },
          // See comment about any on config() above
          configResolved(config) {
            if (opts.configDebugMode) {
              const debugMessage = `[Evalite Config Debug] testTimeout: ${config.test?.testTimeout}, maxConcurrency: ${config.test?.maxConcurrency}\n`;
              if (opts.testOutputWritable) {
                opts.testOutputWritable.write(debugMessage);
              } else {
                process.stdout.write(debugMessage);
              }
            }
          },
        },
      ],
    },
    {
      stdout: opts.testOutputWritable || process.stdout,
      stderr: opts.testOutputWritable || process.stderr,
    }
  );
  vitest.provide("cwd", cwd);
  vitest.provide("trialCount", config?.trialCount);
  await vitest.start(filters);
  const dispose = registerConsoleShortcuts(vitest, process.stdin, process.stdout);
  const shouldKeepRunning =
    vitest.shouldKeepServer() || opts.mode === "run-once-and-serve";
  if (!shouldKeepRunning) {
    dispose();
    await vitest.close();
    if (opts.outputPath) {
      await exportResultsToJSON({
        storage,
        outputPath: opts.outputPath,
        cwd,
      });
    }
    if (typeof exitCode === "number") {
      process.exit(exitCode);
    }
  }
  return vitest;
};

/**
 * @deprecated Use `runEvalite` instead. This export will be removed in a future version.
 */
export const runVitest = runEvalite;
//# sourceMappingURL=run-evalite.js.map
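A sketch of consuming the JSON that exportResultsToJSON writes to opts.outputPath, for example as a CI gate. The shape mirrors outputData above (run, evals[].name, evals[].averageScore, evals[].status); the file name and the 0.8 threshold are illustrative, and the 0-1 score scale is an assumption based on how the averages are computed here rather than a documented contract:

// check-results.mjs (hypothetical CI script, run after `runEvalite` with outputPath set)
import { readFile } from "fs/promises";

const raw = await readFile("./results.json", "utf-8");
const { run, evals } = JSON.parse(raw);

// Print a per-eval summary using the camelCase fields produced above.
for (const e of evals) {
  console.log(`${e.name}: averageScore=${e.averageScore.toFixed(3)} (${e.status})`);
}

// Fail the build if any eval's average score drops below a chosen threshold.
const failing = evals.filter((e) => e.averageScore < 0.8);
if (failing.length > 0) {
  console.error(`Run ${run.id}: ${failing.length} eval(s) below threshold`);
  process.exit(1);
}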