UNPKG

evalite

Version:

Test your LLM-powered apps with a TypeScript-native, Vitest-based eval runner. No API key required.

410 lines (406 loc) 14.1 kB
import Database from "better-sqlite3"; import { max } from "./utils.js"; export const createDatabase = (url) => { const db = new Database(url); db.pragma("journal_mode = WAL"); db.exec(` CREATE TABLE IF NOT EXISTS runs ( id INTEGER PRIMARY KEY AUTOINCREMENT, runType TEXT NOT NULL, -- full, partial created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ); CREATE TABLE IF NOT EXISTS evals ( id INTEGER PRIMARY KEY AUTOINCREMENT, run_id INTEGER NOT NULL, name TEXT NOT NULL, filepath TEXT NOT NULL, duration INTEGER NOT NULL, created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, FOREIGN KEY (run_id) REFERENCES runs(id) ); CREATE TABLE IF NOT EXISTS results ( id INTEGER PRIMARY KEY AUTOINCREMENT, eval_id INTEGER NOT NULL, duration INTEGER NOT NULL, input TEXT NOT NULL, -- JSON output TEXT NOT NULL, -- JSON expected TEXT, -- JSON col_order INTEGER NOT NULL, created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, FOREIGN KEY (eval_id) REFERENCES evals(id) ); CREATE TABLE IF NOT EXISTS scores ( id INTEGER PRIMARY KEY AUTOINCREMENT, result_id INTEGER NOT NULL, name TEXT NOT NULL, score FLOAT NOT NULL, description TEXT, metadata TEXT, -- JSON created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, FOREIGN KEY (result_id) REFERENCES results(id) ); CREATE TABLE IF NOT EXISTS traces ( id INTEGER PRIMARY KEY AUTOINCREMENT, result_id INTEGER NOT NULL, input TEXT NOT NULL, -- JSON output TEXT NOT NULL, -- JSON start_time INTEGER NOT NULL, end_time INTEGER NOT NULL, prompt_tokens INTEGER, completion_tokens INTEGER, col_order INTEGER NOT NULL, FOREIGN KEY (result_id) REFERENCES results(id) ); `); // Add status key to evals table try { db.exec(`ALTER TABLE evals ADD COLUMN status TEXT NOT NULL DEFAULT 'success';`); } catch (e) { } // Add status key to results table try { db.exec(`ALTER TABLE results ADD COLUMN status TEXT NOT NULL DEFAULT 'success';`); } catch (e) { } // Add rendered_columns key to results table try { db.exec(`ALTER TABLE results ADD COLUMN rendered_columns TEXT`); } catch (e) { } return db; }; /** * @deprecated */ export const saveRun = (db, { files, runType, }) => { const runId = db .prepare(` INSERT INTO runs (runType) VALUES (@runType) `) .run({ runType }).lastInsertRowid; for (const file of files) { for (const suite of file.tasks) { if (!suite.tasks) { throw new Error("An unknown error occurred - did you nest evalite inside a describe block?"); } const evalId = db .prepare(` INSERT INTO evals (run_id, name, filepath, duration, status) VALUES (@runId, @name, @filepath, @duration, @status) `) .run({ runId, name: suite.name, filepath: file.filepath, duration: max(suite.tasks, (t) => t.meta.evalite?.duration ?? 0), status: suite.result?.state === "fail" ? "fail" : "success", }).lastInsertRowid; for (const task of suite.tasks) { if (task.meta.evalite?.result) { const { duration, input, output, expected, scores, traces, order } = task.meta.evalite.result; const resultId = db .prepare(` INSERT INTO results (eval_id, duration, input, output, expected, col_order) VALUES (@evalId, @duration, @input, @output, @expected, @col_order) `) .run({ evalId, duration, input: JSON.stringify(input), output: JSON.stringify(output), expected: JSON.stringify(expected), col_order: order, }).lastInsertRowid; for (const score of scores) { db.prepare(` INSERT INTO scores (result_id, name, score, description, metadata) VALUES (@resultId, @name, @score, @description, @metadata) `).run({ resultId, name: score.name, score: score.score ?? 0, description: score.description, metadata: JSON.stringify(score.metadata), }); } let traceOrder = 0; for (const trace of traces) { traceOrder += 1; db.prepare(` INSERT INTO traces (result_id, input, output, start_time, end_time, prompt_tokens, completion_tokens, col_order) VALUES (@resultId, @input, @output, @start_time, @end_time, @prompt_tokens, @completion_tokens, @col_order) `).run({ resultId, input: JSON.stringify(trace.input), output: JSON.stringify(trace.output), start_time: Math.round(trace.start), end_time: Math.round(trace.end), prompt_tokens: trace.usage?.promptTokens ?? null, completion_tokens: trace.usage?.completionTokens ?? null, col_order: traceOrder, }); } } } } } }; /** * @deprecated * * Used in existing tests, but in future code should be replaced * by more specific queries. */ export const getEvalsAsRecord = async (db) => { const evals = db.prepare(`SELECT * FROM evals`).all(); const allResults = getResults(db, evals.map((e) => e.id)); const allScores = getScores(db, allResults.map((r) => r.id)); const allTraces = getTraces(db, allResults.map((r) => r.id)); const recordOfEvals = {}; for (const evaluation of evals) { const key = evaluation.name; if (!recordOfEvals[key]) { recordOfEvals[key] = []; } const results = allResults.filter((r) => r.eval_id === evaluation.id); const resultsWithScores = results.map((r) => { const scores = allScores.filter((s) => s.result_id === r.id); const traces = allTraces.filter((t) => t.result_id === r.id); return { ...r, scores, traces }; }); recordOfEvals[key].push({ ...evaluation, results: resultsWithScores, }); } return recordOfEvals; }; export const getEvals = (db, runIds, allowedStatuses) => { return db .prepare(` SELECT * FROM evals WHERE run_id IN (${runIds.join(",")}) AND status IN (${allowedStatuses.map((s) => `'${s}'`).join(",")}) `) .all(); }; export const getResults = (db, evalIds) => { return db .prepare(` SELECT * FROM results WHERE eval_id IN (${evalIds.join(",")}) ORDER BY col_order ASC `) .all() .map((r) => jsonParseFields(r, ["input", "output", "expected", "rendered_columns"])); }; export const getScores = (db, resultIds) => { return db .prepare(` SELECT * FROM scores WHERE result_id IN (${resultIds.join(",")}) `) .all() .map((r) => jsonParseFields(r, ["metadata"])); }; export const getTraces = (db, resultIds) => { return db .prepare(` SELECT * FROM traces WHERE result_id IN (${resultIds.join(",")}) ORDER BY col_order ASC `) .all() .map((t) => jsonParseFields(t, ["input", "output"])); }; export const getMostRecentRun = (db, runType) => { const run = db .prepare(` SELECT * FROM runs WHERE runType = @runType ORDER BY created_at DESC LIMIT 1 `) .get({ runType }); return run; }; export const getPreviousCompletedEval = (db, name, startTime) => { const evaluation = db .prepare(` SELECT * FROM evals WHERE name = @name AND created_at < @startTime AND status != 'running' ORDER BY created_at DESC LIMIT 1 `) .get({ name, startTime }); return evaluation; }; export const getAverageScoresFromResults = (db, resultIds) => { return db .prepare(` SELECT result_id, AVG(score) as average FROM scores WHERE result_id IN (${resultIds.join(",")}) GROUP BY result_id `) .all(); }; export const getEvalsAverageScores = (db, evalIds) => { const result = db .prepare(` SELECT r.eval_id, AVG(s.score) as average FROM scores s JOIN results r ON s.result_id = r.id WHERE r.eval_id IN (${evalIds.join(",")}) GROUP BY r.eval_id `) .all(); return result; }; export const jsonParseFields = (obj, fields) => { const objToReturn = {}; for (const key of Object.keys(obj)) { const value = obj[key]; if (fields.includes(key)) { objToReturn[key] = JSON.parse(value); } else { objToReturn[key] = value; } } return objToReturn; }; /** * Defaults to most recent if timestamp not passed */ export const getEvalByName = (db, opts) => { return db .prepare(` SELECT * FROM evals WHERE name = @name ${opts.timestamp ? "AND created_at = @timestamp" : ""} ${opts.statuses ? `AND status IN (${opts.statuses.map((s) => `'${s}'`).join(",")})` : ""} ORDER BY created_at DESC LIMIT 1 `) .get({ name: opts.name, timestamp: opts.timestamp }); }; export const getHistoricalEvalsWithScoresByName = (db, name) => { return db .prepare(` SELECT evals.*, AVG(scores.score) as average_score FROM evals LEFT JOIN results ON evals.id = results.eval_id LEFT JOIN scores ON results.id = scores.result_id WHERE evals.name = @name AND evals.status != 'running' GROUP BY evals.id ORDER BY evals.created_at ASC `) .all({ name }); }; export const createEvalIfNotExists = ({ db, runId, name, filepath, }) => { let evaluationId = db .prepare(`SELECT id FROM evals WHERE name = @name AND run_id = @runId`) .get({ name, runId })?.id; if (!evaluationId) { evaluationId = db .prepare(`INSERT INTO evals (run_id, name, filepath, duration, status) VALUES (@runId, @name, @filepath, @duration, @status)`) .run({ runId, name, filepath, duration: 0, status: "running", }).lastInsertRowid; } return evaluationId; }; export const createRun = ({ db, runType, }) => { return db .prepare(`INSERT INTO runs (runType) VALUES (@runType)`) .run({ runType }).lastInsertRowid; }; export const insertResult = ({ db, evalId, order, input, expected, output, duration, status, renderedColumns, }) => { return db .prepare(`INSERT INTO results (eval_id, col_order, input, expected, output, duration, status, rendered_columns) VALUES (@eval_id, @col_order, @input, @expected, @output, @duration, @status, @rendered_columns)`) .run({ eval_id: evalId, col_order: order, input: JSON.stringify(input), expected: JSON.stringify(expected), output: JSON.stringify(output), duration, status, rendered_columns: JSON.stringify(renderedColumns), }).lastInsertRowid; }; export const updateResult = ({ db, resultId, output, duration, status, renderedColumns, input, expected, }) => { db.prepare(`UPDATE results SET output = @output, duration = @duration, input = @input, expected = @expected, status = @status, rendered_columns = @rendered_columns WHERE id = @id`).run({ id: resultId, output: JSON.stringify(output), duration, status, rendered_columns: JSON.stringify(renderedColumns), input: JSON.stringify(input), expected: JSON.stringify(expected), }); }; export const insertScore = ({ db, resultId, description, name, score, metadata, }) => { db.prepare(`INSERT INTO scores (result_id, name, score, metadata, description) VALUES (@result_id, @name, @score, @metadata, @description)`).run({ result_id: resultId, description, name, score, metadata: JSON.stringify(metadata), }); }; export const insertTrace = ({ db, resultId, input, output, start, end, promptTokens, completionTokens, order, }) => { db.prepare(`INSERT INTO traces (result_id, input, output, start_time, end_time, prompt_tokens, completion_tokens, col_order) VALUES (@result_id, @input, @output, @start_time, @end_time, @prompt_tokens, @completion_tokens, @col_order)`).run({ result_id: resultId, input: JSON.stringify(input), output: JSON.stringify(output), start_time: Math.round(start), end_time: Math.round(end), prompt_tokens: promptTokens, completion_tokens: completionTokens, col_order: order, }); }; export const updateEvalStatusAndDuration = ({ db, evalId, status, }) => { db.prepare(`UPDATE evals SET status = @status, duration = (SELECT MAX(duration) FROM results WHERE eval_id = @id) WHERE id = @id`).run({ id: evalId, status, }); }; export const findResultByEvalIdAndOrder = ({ db, evalId, order, }) => { return db .prepare(`SELECT id FROM results WHERE eval_id = @eval_id AND col_order = @col_order`) .get({ eval_id: evalId, col_order: order, })?.id; }; export const getAllResultsForEval = ({ db, evalId, }) => { return db .prepare(`SELECT id, status FROM results WHERE eval_id = @eval_id`) .all({ eval_id: evalId }); }; //# sourceMappingURL=db.js.map