evalite
Test your LLM-powered apps with a TypeScript-native, Vitest-based eval runner. No API key required.
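For context, a minimal eval file that this reporter renders looks roughly like the sketch below (based on evalite's documented usage; the Levenshtein scorer comes from the separate autoevals package and is illustrative):

// example.eval.ts - a minimal sketch of an eval file, per evalite's documented API
import { evalite } from "evalite";
import { Levenshtein } from "autoevals";

evalite("My Eval", {
  // data: the inputs and expected outputs to evaluate against
  data: async () => [{ input: "Hello", expected: "Hello World!" }],
  // task: the (often LLM-powered) function under test
  task: async (input) => input + " World!",
  // scorers: grade each output; Levenshtein measures distance from `expected`
  scorers: [Levenshtein],
});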
reporter.js (JavaScript):
import path from "node:path";
import { getTestName } from "@vitest/runner/utils";
import { parseStacktrace } from "@vitest/utils/source-map";
import c from "tinyrainbow";
import { EvaliteRunner } from "./reporter/EvaliteRunner.js";
import { deserializeAnnotation } from "./reporter/events.js";
import {
    renderDetailedTable,
    renderErrorsSummary,
    renderInitMessage,
    renderScoreDisplay,
    renderServeModeFinalMessage,
    renderSummaryStats,
    renderTask,
    renderThreshold,
    renderWatcherStart,
} from "./reporter/rendering.js";
import { average, max } from "./utils.js";
const F_POINTER = "❯";
const separator = c.dim(" > ");
export default class EvaliteReporter {
opts;
runner;
ctx;
start;
constructor(opts) {
this.opts = opts;
this.runner = new EvaliteRunner({
storage: opts.storage,
logNewState: opts.logNewState,
modifyExitCode: opts.modifyExitCode,
scoreThreshold: opts.scoreThreshold,
});
}
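// Vitest lifecycle: record the start time, print the startup banner, and tell the runner a full run has begun for every known file.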
onInit(ctx) {
this.ctx = ctx;
this.start = performance.now();
renderInitMessage(this.ctx.logger, {
isWatching: this.opts.isWatching,
port: this.opts.port,
});
this.runner.sendEvent({
type: "RUN_BEGUN",
filepaths: this.ctx.state.getFiles().map((f) => f.filepath),
runType: "full",
});
}
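// Called when the watcher starts waiting for changes; renders the watch-mode summary, including whether the last run failed the configured score threshold.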
onWatcherStart(files = [], errors = []) {
const failedDueToThreshold = this.runner.getDidLastRunFailThreshold() === "yes";
renderWatcherStart(this.ctx.logger, {
files,
errors,
failedDueToThreshold,
scoreThreshold: this.opts.scoreThreshold,
});
}
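// Mirrors Vitest's default reporter: honour silent / "passed-only" mode and the user's onConsoleLog hook.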
shouldLog(log, taskState) {
if (this.ctx.config.silent === true ||
(this.ctx.config.silent === "passed-only" && taskState !== "failed")) {
return false;
}
if (this.ctx.config.onConsoleLog) {
const task = log.taskId
? this.ctx.state.idMap.get(log.taskId)
: undefined;
const entity = task && this.ctx.state.getReportedEntity(task);
if (this.ctx.config.onConsoleLog(log.content, log.type, entity) === false) {
return false;
}
}
return true;
}
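// Builds a "file:line:column > suite > test" label used as the console log header.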
getFullName(task, sep) {
if (task === task.file) {
return task.name;
}
let name = task.file.name;
if (task.location) {
name += c.dim(`:${task.location.line}:${task.location.column}`);
}
name += sep;
name += getTestName(task, sep);
return name;
}
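// Streams user console output, prefixed with the originating test's name and, when an origin stack is available, its frames (the frame inside the test file is highlighted).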
onUserConsoleLog(log, taskState) {
if (!this.shouldLog(log, taskState)) {
return;
}
const output = log.type === "stdout"
? this.ctx.logger.outputStream
: this.ctx.logger.errorStream;
const write = (msg) => {
output.write(msg);
};
let headerText = "unknown test";
const task = log.taskId ? this.ctx.state.idMap.get(log.taskId) : undefined;
if (task) {
headerText = this.getFullName(task, separator);
}
else if (log.taskId && log.taskId !== "__vitest__unknown_test__") {
headerText = log.taskId;
}
write(c.gray(log.type + c.dim(` | ${headerText}\n`)) + log.content);
if (log.origin) {
// browser logs don't include a trailing newline like Node.js logs do, so add one
if (log.browser) {
write("\n");
}
const project = task
? this.ctx.getProjectByName(task.file.projectName || "")
: this.ctx.getRootProject();
const stack = log.browser
? project.browser?.parseStacktrace(log.origin) || []
: parseStacktrace(log.origin);
const highlight = task && stack.find((i) => i.file === task.file.filepath);
for (const frame of stack) {
const color = frame === highlight ? c.cyan : c.gray;
const relativePath = path.relative(project.config.root, frame.file);
const positions = [
frame.method,
`${relativePath}:${c.dim(`${frame.line}:${frame.column}`)}`,
]
.filter(Boolean)
.join(" ");
write(color(` ${c.dim(F_POINTER)} ${positions}\n`));
}
}
write("\n");
}
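// Watch-mode reruns only cover the changed files, so report them to the runner as a partial run.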
onWatcherRerun(files, trigger) {
this.runner.sendEvent({
type: "RUN_BEGUN",
filepaths: files,
runType: "partial",
});
}
onTestRunEnd = async (testModules, unhandledErrors, reason) => {
this.runner.sendEvent({
type: "RUN_ENDED",
});
// Wait for all queued events to complete
await this.runner.waitForCompletion();
// Print errors first (mimicking DefaultReporter's reportSummary -> printErrorsSummary flow)
renderErrorsSummary(this.ctx.logger, {
testModules: testModules,
errors: unhandledErrors,
});
// Then print test summary
this.customTestSummary(testModules);
};
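// Evalite-specific run summary: average score, optional threshold check, run stats, a detailed per-eval table for single-file runs, and a final message in serve mode.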
customTestSummary(modules) {
const tests = modules.flatMap((module) => Array.from(module.children.allTests()));
const failedTestsCount = tests.filter((test) => test.result().state === "failed").length;
// Get scores from runner's collected results
const scores = this.runner.getAllScores();
const averageScore = scores.length === 0 ? null : average(scores, (score) => score.score ?? 0);
this.runner.handleTestSummary({
failedTasksCount: failedTestsCount,
averageScore,
});
this.ctx.logger.log("");
renderScoreDisplay(this.ctx.logger, failedTestsCount, averageScore);
if (typeof this.opts.scoreThreshold === "number") {
renderThreshold(this.ctx.logger, this.opts.scoreThreshold, averageScore);
}
renderSummaryStats(this.ctx.logger, {
totalFiles: modules.length,
maxDuration: max(tests, (test) => test.diagnostic()?.duration ?? 0),
totalEvals: tests.length,
});
if (modules.length === 1 && !this.opts.hideTable) {
const successfulResults = this.runner.getSuccessfulResults();
if (successfulResults.length > 0) {
renderDetailedTable(this.ctx.logger, successfulResults, failedTestsCount);
}
}
if (this.opts.mode === "run-once-and-serve") {
renderServeModeFinalMessage(this.ctx.logger, this.opts.port);
}
}
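// Suite-level hooks are intentionally no-ops: evalite renders per file and per eval, not per suite.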
onTestSuiteReady(testSuite) {
return;
}
onTestSuiteResult(testSuite) {
return;
}
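// Eval test cases report progress to the reporter through Vitest annotations; forward the deserialized events to the runner.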
onTestCaseAnnotate(testCase, annotation) {
const data = deserializeAnnotation(annotation.message);
if (!data) {
// Not an evalite annotation - ignore
return;
}
if (data.type === "RESULT_STARTED") {
this.runner.sendEvent({
type: "RESULT_STARTED",
initialResult: data.initialResult,
});
}
else if (data.type === "RESULT_SUBMITTED") {
this.runner.sendEvent({
type: "RESULT_SUBMITTED",
result: data.result,
});
}
}
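// Collection errors (e.g. a file that fails to import) mean the module never ran: render it as failed, print the errors, and force a non-zero exit code.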
onTestModuleCollected(mod) {
const errors = mod.errors();
// A module-level error has been detected
if (errors.length > 0) {
renderTask({
logger: this.ctx.logger,
result: {
filePath: path.relative(this.ctx.config.root, mod.moduleId),
status: "fail",
scores: [],
numberOfEvals: "unknown",
},
});
for (const error of errors) {
this.ctx.logger.printError(error);
}
this.opts.modifyExitCode(1);
}
}
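// Render the file as running as soon as its evals are collected.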
onTestModuleStart(mod) {
const tests = Array.from(mod.children.allTests());
renderTask({
logger: this.ctx.logger,
result: {
filePath: path.relative(this.ctx.config.root, mod.moduleId),
status: "running",
scores: [],
numberOfEvals: tests.length,
},
});
return;
}
onTestCaseResult(test) {
// Check if we received a RESULT_SUBMITTED annotation
const hasResultSubmitted = test.annotations().some((annotation) => {
const data = deserializeAnnotation(annotation.message);
return data?.type === "RESULT_SUBMITTED";
});
// If we already got a RESULT_SUBMITTED, nothing to do
if (hasResultSubmitted) {
return;
}
// Check if we have a RESULT_STARTED annotation
const resultStartedAnnotation = test.annotations().find((annotation) => {
const data = deserializeAnnotation(annotation.message);
return data?.type === "RESULT_STARTED";
});
if (!resultStartedAnnotation) {
// No evalite annotations at all - not an evalite test
return;
}
// Test finished but never submitted a result - likely timeout
const data = deserializeAnnotation(resultStartedAnnotation.message);
if (data && data.type === "RESULT_STARTED") {
this.runner.sendEvent({
type: "RESULT_SUBMITTED",
result: {
evalName: data.initialResult.evalName,
filepath: data.initialResult.filepath,
order: data.initialResult.order,
duration: 0,
expected: "",
input: "",
output: null,
scores: [],
traces: [],
status: "fail",
renderedColumns: [],
variantName: data.initialResult.variantName,
variantGroup: data.initialResult.variantGroup,
trialIndex: data.initialResult.trialIndex,
},
});
}
}
printAnnotations(_test, _console, _padding) {
// Evalite uses annotations internally for reporter communication.
// Users cannot add custom annotations via the Evalite API,
// so we suppress all annotation output.
return;
}
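// Final render for the file: failed if any eval failed, otherwise success, with the scores gathered by the runner.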
onTestModuleEnd(mod) {
const tests = Array.from(mod.children.allTests());
const hasFailed = tests.some((test) => test.result().state === "failed");
const scores = this.runner.getScoresForModule(mod.moduleId);
renderTask({
logger: this.ctx.logger,
result: {
filePath: path.relative(this.ctx.config.root, mod.moduleId),
status: hasFailed ? "fail" : "success",
numberOfEvals: tests.length,
scores,
},
});
}
}
//# sourceMappingURL=reporter.js.map