bff-eval
Evaluation framework for LLM systems - Node.js CLI
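Typical invocation (flag names inferred from the options object consumed by
runEvaluation below; treat this as a sketch, not documented CLI surface):

  OPENROUTER_API_KEY=sk-... bff-eval --input samples.jsonl --grader ./grader.js --model openai/gpt-4o --output results.jsonl --verbose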
"use strict";
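// TypeScript-emitted interop helpers: __importStar lets the transpiled
// dynamic import of "node:fs/promises" further down consume a CommonJS
// module with ES-module semantics.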
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || (function () {
var ownKeys = function(o) {
ownKeys = Object.getOwnPropertyNames || function (o) {
var ar = [];
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
return ar;
};
return ownKeys(o);
};
return function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
__setModuleDefault(result, mod);
return result;
};
})();
Object.defineProperty(exports, "__esModule", { value: true });
exports.runEvaluation = runEvaluation;
function printLine(message) {
console.log(message);
}
function printTable(data) {
// Format numbers to at most 3 decimal places
const formatValue = (value) => {
if (typeof value === "number") {
// Integers pass through unchanged
if (Number.isInteger(value)) {
return value;
}
// Round to at most 3 decimal places, dropping trailing zeros
return parseFloat(value.toFixed(3));
}
return value;
};
// Process the data
let processedData;
if (Array.isArray(data)) {
processedData = data.map((row) => {
if (typeof row === "object" && row !== null) {
const processedRow = {};
for (const [key, value] of Object.entries(row)) {
processedRow[key] = formatValue(value);
}
return processedRow;
}
return row;
});
}
else if (typeof data === "object" && data !== null) {
processedData = {};
for (const [key, value] of Object.entries(data)) {
processedData[key] = formatValue(value);
}
}
else {
processedData = data;
}
console.table(processedData);
}
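// Shape of the grader module, inferred from how it is used below: its default
// export must expose a render({ model, context, temperature }) method that
// returns a complete OpenAI-style chat-completions request body. A minimal
// hypothetical grader, for illustration only:
//
//   module.exports.default = {
//     render: ({ model, context, temperature }) => ({
//       model,
//       temperature,
//       messages: [
//         { role: "system", content: 'Grade the response. Reply with JSON: {"score": -3..3, "notes": "..."}' },
//         { role: "user", content: JSON.stringify(context) },
//       ],
//     }),
//   };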
async function runEvaluation(options) {
const { input, grader: graderPath, model = "openai/gpt-4o", output, verbose } = options;
console.log("🚀 Starting evaluation...\n");
if (verbose) {
console.log("Evaluation Configuration:");
console.table({
"Input File": input,
"Grader File": graderPath,
"Model": model,
"Output": output || "Console",
});
console.log("");
}
try {
// Load files
console.log("📁 Loading input file...");
const fs = await Promise.resolve().then(() => __importStar(require("node:fs/promises")));
const inputContent = await fs.readFile(input, 'utf-8');
console.log(`✅ Loaded input file: ${input}`);
// Load grader module
console.log(`\n🔧 Loading grader...`);
const path = require("node:path");
// Resolve relative grader paths against the working directory; a bare
// require() would resolve them against this module's own location instead.
const graderModule = require(path.resolve(graderPath));
const grader = graderModule.default || graderModule;
console.log(`✅ Loaded grader: ${graderPath}`);
// Parse JSONL input
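// Shape of one input line, inferred from the fields read below. Everything
// beyond id/userMessage/assistantResponse/expected is kept as metadata; an
// optional numeric "score" acts as a ground-truth label for calibration:
//   {"id":"eval-1","userMessage":"Hi","assistantResponse":"Hello!","expected":"a greeting","score":2}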
console.log("\n📋 Parsing evaluation samples...");
const samples = [];
const lines = inputContent.trim().split("\n");
for (let i = 0; i < lines.length; i++) {
const line = lines[i].trim();
if (!line)
continue;
try {
const sample = JSON.parse(line);
// Generate ID if missing
if (!sample.id) {
sample.id = `eval-${i + 1}`;
}
samples.push(sample);
}
catch (error) {
throw new Error(`Invalid JSON on line ${i + 1}: ${error instanceof Error ? error.message : String(error)}`);
}
}
console.log(`✅ Parsed ${samples.length} samples`);
// Process all samples in parallel
console.log(`\n🤖 Processing ${samples.length} samples in parallel with ${model}...\n`);
// Track completion status
let completedCount = 0;
const startTime = Date.now();
// Create a function to process a single sample
const processSample = async (sample, index) => {
const sampleNumber = index + 1;
const sampleStartTime = performance.now();
try {
// Prepare context for the deck
const context = {
userMessage: sample.userMessage,
assistantResponse: sample.assistantResponse,
};
if (sample.expected) {
context.expected = sample.expected;
}
// Render the grader with the evaluation context
const renderedGrader = grader.render({
model,
context,
temperature: 0,
});
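// renderedGrader is assumed to be a ready-to-send chat-completions request
// body (model, messages, temperature, ...), since it is POSTed verbatim to
// OpenRouter below.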
// Call LLM API via OpenRouter
const apiKey = process.env.OPENROUTER_API_KEY;
if (!apiKey) {
throw new Error(`Missing OPENROUTER_API_KEY environment variable.\n\n` +
`To use bff-eval, you need to set up an OpenRouter API key:\n` +
`1. Sign up for an account at https://openrouter.ai\n` +
`2. Generate an API key from your dashboard\n` +
`3. Set the environment variable:\n` +
` export OPENROUTER_API_KEY="your-api-key-here"\n\n` +
`For more information, visit: https://openrouter.ai/docs`);
}
const response = await fetch(`https://openrouter.ai/api/v1/chat/completions`, {
method: "POST",
headers: {
"Content-Type": "application/json",
"Authorization": `Bearer ${apiKey}`,
"HTTP-Referer": "https://boltfoundry.com",
"X-Title": "Bolt Foundry Eval",
},
body: JSON.stringify(renderedGrader),
});
const endTime = performance.now();
const latencyInMs = endTime - sampleStartTime;
if (!response.ok) {
throw new Error(`API request failed: ${response.status} ${response.statusText}`);
}
const apiResponse = await response.json();
// Optional chaining guards against responses with no choices array
const rawOutput = apiResponse.choices?.[0]?.message?.content;
if (typeof rawOutput !== "string") throw new Error("API response contained no completion text");
// Parse the evaluation result
let output;
try {
output = JSON.parse(rawOutput);
}
catch (_parseError) {
// If the grader fails to return valid JSON, that's a score of 0
output = {
score: 0,
notes: `Grader failed to return valid JSON. Raw output: ${rawOutput}`,
};
}
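// Expected grader output, inferred from the fields read below:
//   {"score": <number, rounded to an integer in [-3, 3]>, "notes": "<rationale>"}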
// Validate the score is an integer in range. Math.round on a missing or
// non-numeric score yields NaN, which would silently pass plain range
// comparisons, so reject non-finite values explicitly.
const score = Math.round(Number(output.score));
if (!Number.isFinite(score) || score < -3 || score > 3) {
throw new Error(`Score ${output.score} is out of valid range [-3, 3]`);
}
// Update progress
completedCount++;
const progress = Math.round((completedCount / samples.length) * 100);
process.stdout.write(`\r⚡ Progress: ${completedCount}/${samples.length} (${progress}%) - Latest: ${sample.id} scored ${score} (${Math.round(latencyInMs)}ms)`);
// Create result
return {
model,
id: sample.id,
iteration: 1,
score,
latencyInMs,
rawOutput,
output,
sample,
sampleMetadata: Object.fromEntries(Object.entries(sample).filter(([key]) => !["id", "userMessage", "assistantResponse", "expected"].includes(key))),
};
}
catch (error) {
// Update progress even on error
completedCount++;
const progress = Math.round((completedCount / samples.length) * 100);
process.stdout.write(`\r⚡ Progress: ${completedCount}/${samples.length} (${progress}%) - Error on ${sample.id}: ${error instanceof Error ? error.message : 'Unknown error'}`);
throw error;
}
};
// Await all samples; note Promise.all is fail-fast, so a single rejected
// sample aborts the entire run (caught at the bottom of this function).
const resultPromises = samples.map((sample, index) => processSample(sample, index));
const results = await Promise.all(resultPromises);
// Clear progress line and show completion
const totalTime = Date.now() - startTime;
process.stdout.write(`\r✅ All ${samples.length} samples processed in ${(totalTime / 1000).toFixed(1)}s (${Math.round(totalTime / samples.length)}ms avg)\n`);
// Output results
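// Each result record has the shape returned by processSample above:
//   { model, id, iteration, score, latencyInMs, rawOutput, output, sample, sampleMetadata }
// With an output path set, records are serialized as JSONL, one per line.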
if (output) {
// Write to file
console.log("\n💾 Writing results to file...");
// Join as JSONL; a trailing newline keeps the file append- and cat-friendly
const outputData = results.map((r) => JSON.stringify(r)).join("\n") + "\n";
await fs.writeFile(output, outputData);
console.log(`✅ Results written to ${output}`);
}
else {
console.log("\n📊 Generating evaluation report...");
// Output to console
printLine("\nEvaluation Results:");
printTable(results.map((r) => ({
"Sample ID": r.id,
"Score": r.score,
"Latency (ms)": Math.round(r.latencyInMs),
})));
// Summary statistics
const scores = results.map((r) => r.score);
const avgScore = scores.reduce((a, b) => a + b, 0) / scores.length;
const avgLatency = results.reduce((a, b) => a + b.latencyInMs, 0) / results.length;
printLine("\nSummary Statistics:");
printTable({
"Average Score": avgScore,
"Average Latency (ms)": avgLatency,
"Total Samples": results.length,
});
// Calibration metrics if ground truth scores are present
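// Three metrics follow: exact-match rate, rate within ±1 point, and mean
// absolute error (MAE) between the grader score and the "score" label.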
const resultsWithGroundTruth = results.filter((r) => r.sampleMetadata && "score" in r.sampleMetadata);
if (resultsWithGroundTruth.length > 0) {
printLine("\nGrader Calibration Metrics:");
// Calculate accuracy metrics
let exactMatches = 0;
let withinOne = 0;
let totalError = 0;
for (const result of resultsWithGroundTruth) {
const groundTruth = result.sampleMetadata?.score;
const diff = Math.abs(result.score - groundTruth);
if (diff === 0)
exactMatches++;
if (diff <= 1)
withinOne++;
totalError += diff;
}
const exactAccuracy = exactMatches / resultsWithGroundTruth.length * 100;
const withinOneAccuracy = withinOne / resultsWithGroundTruth.length * 100;
const avgError = totalError / resultsWithGroundTruth.length;
printTable({
"Exact Match Rate": `${exactAccuracy.toFixed(1)}% (${exactMatches}/${resultsWithGroundTruth.length})`,
"Within ±1 Accuracy": `${withinOneAccuracy.toFixed(1)}% (${withinOne}/${resultsWithGroundTruth.length})`,
"Average Absolute Error": avgError,
"Total Samples with Ground Truth": resultsWithGroundTruth.length,
});
// Show disagreements
const disagreements = resultsWithGroundTruth.filter((r) => r.score !== r.sampleMetadata?.score);
if (disagreements.length > 0) {
printLine("\nDisagreements:");
printTable(disagreements.map((result) => {
const groundTruth = result.sampleMetadata?.score;
return {
"Sample ID": result.id,
"Grader Score": result.score,
"Ground Truth": groundTruth,
"Difference": result.score - groundTruth,
"Assistant Response": result.sample?.assistantResponse || "No response available",
};
}));
// Detailed breakdown of each disagreement
printLine("\nDetailed Disagreement Analysis:");
disagreements.forEach((result, index) => {
const groundTruth = result.sampleMetadata?.score;
printLine(`\n${"=".repeat(80)}`);
printLine(`Disagreement ${index + 1} - Sample ID: ${result.id || "N/A"}`);
printLine(`${"=".repeat(80)}`);
// Show sample data and scores in a single table
printLine("\nSample Details:");
const sampleData = {};
// Display the original sample data
if (result.sample) {
sampleData["User Message"] = result.sample.userMessage;
sampleData["Assistant Response"] =
result.sample.assistantResponse;
// Add expected value if present
if (result.sample.expected) {
sampleData["Expected"] = result.sample.expected;
}
// Add description from metadata if available
if (result.sampleMetadata?.description) {
sampleData["Description"] = result.sampleMetadata.description;
}
// Add any other sample fields (excluding the ones we already displayed)
const excludedFields = [
"id",
"userMessage",
"assistantResponse",
"expected",
"score",
];
Object.entries(result.sample).forEach(([key, value]) => {
if (!excludedFields.includes(key)) {
const displayKey = key
.replace(/([A-Z])/g, " $1")
.replace(/^./, (str) => str.toUpperCase())
.trim();
sampleData[displayKey] = value;
}
});
}
// Add score comparison to the same table
sampleData["Grader Score"] = result.score;
sampleData["Ground Truth"] = groundTruth;
sampleData["Difference"] = result.score - groundTruth;
printTable(sampleData);
// Print grader notes below the table
printLine("\nGrader Notes:");
printLine(result.output.notes || "No notes provided");
});
}
}
}
}
catch (error) {
console.error(`Evaluation failed: ${error instanceof Error ? error.message : String(error)}`);
process.exit(1);
}
}
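// Example programmatic use (a sketch: the module path is an assumption, and
// OPENROUTER_API_KEY must be set in the environment):
//
//   const { runEvaluation } = require("./eval");
//   await runEvaluation({
//     input: "samples.jsonl",
//     grader: require("node:path").resolve("./grader.js"),
//     model: "openai/gpt-4o",
//     verbose: true,
//   });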