UNPKG

ai-functions

Version:

Core AI primitives for building intelligent applications

154 lines 6.11 kB
/**
 * Simple eval runner for AI Functions
 *
 * Runs evals across multiple models and collects results.
 * Does not depend on evalite - uses our own infrastructure.
 */
import { generateObject, generateText } from '../generate.js';
import { schema } from '../schema.js';
import { createModelVariants, getModelPricing } from './models.js';
import { getLogger } from '../logger.js';

/**
 * Fixed token counts used for the cost estimate. The runner does not receive
 * real usage numbers from the task, so every case is costed with these values.
 */
const ESTIMATED_PROMPT_TOKENS = 100;
const ESTIMATED_COMPLETION_TOKENS = 200;

/**
 * Default output function uses logger.info
 */
const defaultOutput = (message) => getLogger().info(message);

/**
 * Arithmetic mean of a list of numbers; an empty list yields 0 instead of NaN.
 */
function mean(values) {
    return values.length > 0 ? values.reduce((sum, value) => sum + value, 0) / values.length : 0;
}

/**
 * Clamp a scorer result into [0, 1].
 *
 * @throws {TypeError} when the value is not numeric (NaN, undefined, ...), so
 *   that a broken scorer cannot silently turn every aggregate into NaN.
 */
function clampScore(raw) {
    const clamped = Math.max(0, Math.min(1, raw));
    if (Number.isNaN(clamped)) {
        throw new TypeError(`Scorer returned a non-numeric score: ${String(raw)}`);
    }
    return clamped;
}

/**
 * Normalise the requested concurrency to an integer batch size of at least 1.
 * A value of 0, a negative number or NaN would otherwise stall the batch loop.
 */
function toBatchSize(concurrency) {
    const floored = Math.floor(Number(concurrency));
    return Math.max(1, floored || 1);
}

/**
 * Run every scorer sequentially against one task output.
 * A scorer that throws (or returns a non-numeric score) is recorded with
 * score 0 and the error text in `metadata.error`; remaining scorers still run.
 */
async function scoreOutput(scorers, evalCase, output) {
    const scores = [];
    for (const s of scorers) {
        const described = s.description ? { description: s.description } : {};
        try {
            const raw = await s.scorer({
                input: evalCase.input,
                output,
                ...(evalCase.expected !== undefined && { expected: evalCase.expected }),
            });
            scores.push({ name: s.name, score: clampScore(raw), ...described });
        }
        catch (err) {
            scores.push({
                name: s.name,
                score: 0,
                ...described,
                metadata: { error: String(err) },
            });
        }
    }
    return scores;
}

/**
 * Estimate the cost of one case from the model's pricing entry.
 * Returns 0 when no pricing is known for the model id.
 * NOTE(review): dividing by 1_000_000 suggests pricing is per million tokens - confirm in models.js.
 */
function estimateCost(modelId) {
    const pricing = getModelPricing(modelId);
    if (!pricing) {
        return 0;
    }
    return ((ESTIMATED_PROMPT_TOKENS * pricing.prompt +
        ESTIMATED_COMPLETION_TOKENS * pricing.completion) /
        1_000_000);
}

/**
 * Execute one model/case job: run the task, score it, log a status line.
 * Never rejects - a task failure is returned as a result with `error` set
 * and a zero score for every scorer.
 */
async function runJob(job, task, scorers, log) {
    const caseStart = Date.now();
    try {
        const taskOutput = await task(job.case.input, job.model);
        // Latency covers the task only, not the time spent in scorers.
        const latencyMs = Date.now() - caseStart;
        const scores = await scoreOutput(scorers, job.case, taskOutput);
        const cost = estimateCost(job.model.id);
        const avgScore = mean(scores.map((s) => s.score));
        const symbol = avgScore >= 0.8 ? 'PASS' : avgScore >= 0.5 ? 'WARN' : 'FAIL';
        log(` ${symbol} ${job.model.name} | ${job.case.name} | ${(avgScore * 100).toFixed(0)}% | ${latencyMs}ms`);
        return {
            model: job.model,
            case: job.case,
            output: taskOutput,
            scores,
            latencyMs,
            cost,
        };
    }
    catch (err) {
        log(` FAIL ${job.model.name} | ${job.case.name} | ERROR: ${err}`);
        return {
            model: job.model,
            case: job.case,
            output: null,
            scores: scorers.map((s) => ({ name: s.name, score: 0 })),
            latencyMs: Date.now() - caseStart,
            cost: 0,
            error: String(err),
        };
    }
}

/**
 * Aggregate results per model id into `{ avgScore, count }`, where `avgScore`
 * is the mean of each result's own mean score.
 * A Map is used while accumulating so that model ids which collide with
 * Object.prototype members (e.g. "constructor") are handled correctly.
 */
function summarizeByModel(results) {
    const totals = new Map();
    for (const result of results) {
        const modelKey = result.model.id;
        let entry = totals.get(modelKey);
        if (entry === undefined) {
            entry = { avgScore: 0, count: 0 };
            totals.set(modelKey, entry);
        }
        // mean() guards the zero-scorer case that previously produced NaN.
        entry.avgScore += mean(result.scores.map((s) => s.score));
        entry.count++;
    }
    for (const entry of totals.values()) {
        entry.avgScore /= entry.count;
    }
    return Object.fromEntries(totals);
}

/**
 * Run an eval suite across models
 *
 * @param {object} options
 * @param {string} options.name - Name of the eval, used in log output and the returned summary.
 * @param {Array} options.cases - Cases to run; each is read for `name`, `input` and optional `expected`.
 * @param {Function} options.task - Called as `task(case.input, model)`; its awaited result is the output to score.
 * @param {Array} options.scorers - Entries with `name`, `scorer` and optional `description`;
 *   `scorer` receives `{ input, output, expected? }` and its result is clamped to [0, 1].
 * @param {number} [options.concurrency=3] - Jobs run per batch; values below 1 are treated as 1.
 * @param {boolean} [options.quiet=false] - When true, nothing is logged.
 * @param {Function} [options.output] - Receives each log line; defaults to the logger's `info`.
 * @param {Array} [options.models] - Models to test; when omitted they are derived from
 *   `createModelVariants` using `options.tiers` / `options.providers` if given.
 * @returns {Promise<object>} `{ name, results, avgScore, byModel, totalCost, totalTime }`
 *   where `totalTime` is in milliseconds and `byModel` is keyed by model id.
 */
export async function runEval(options) {
    const { name, cases, task, scorers, concurrency = 3, quiet = false } = options;
    const log = quiet ? () => { } : options.output ?? defaultOutput;

    // Get models to test
    const variantOptions = {};
    if (options.tiers !== undefined) {
        variantOptions.tiers = options.tiers;
    }
    if (options.providers !== undefined) {
        variantOptions.providers = options.providers;
    }
    const models = options.models ?? createModelVariants(variantOptions).map((v) => v.input);

    const results = [];
    const startTime = Date.now();
    log(`\nRunning eval: ${name}`);
    log(` Models: ${models.map((m) => m.name).join(', ')}`);
    log(` Cases: ${cases.length}`);
    log('');

    // Build all model/case combinations, model-major
    const jobs = models.flatMap((model) => cases.map((evalCase) => ({ model, case: evalCase })));

    // Process in batches with concurrency limit
    const batchSize = toBatchSize(concurrency);
    for (let i = 0; i < jobs.length; i += batchSize) {
        const batch = jobs.slice(i, i + batchSize);
        const batchResults = await Promise.all(batch.map((job) => runJob(job, task, scorers, log)));
        results.push(...batchResults);
    }

    // Calculate summary
    const totalTime = Date.now() - startTime;
    const totalCost = results.reduce((sum, r) => sum + r.cost, 0);
    const avgScore = mean(results.flatMap((r) => r.scores.map((s) => s.score)));
    const byModel = summarizeByModel(results);

    log('');
    log(`Results:`);
    log(` Overall: ${(avgScore * 100).toFixed(1)}%`);
    log(` Time: ${(totalTime / 1000).toFixed(1)}s`);
    log(` Cost: $${totalCost.toFixed(4)}`);
    log('');
    log(' By Model:');
    for (const [modelId, stats] of Object.entries(byModel)) {
        log(` - ${modelId}: ${(stats.avgScore * 100).toFixed(1)}%`);
    }

    return {
        name,
        results,
        avgScore,
        byModel,
        totalCost,
        totalTime,
    };
}

// Re-export helpers
export { generateObject, generateText, schema };
//# sourceMappingURL=runner.js.map