ai-functions
Version:
Core AI primitives for building intelligent applications
243 lines (214 loc) • 7.2 kB
text/typescript
/**
* Simple eval runner for AI Functions
*
* Runs evals across multiple models and collects results.
* Does not depend on evalite - uses our own infrastructure.
*/
import { generateObject, generateText } from '../generate.js'
import { schema } from '../schema.js'
import { createModelVariants, getModelPricing, type EvalModel, type ModelTier } from './models.js'
import { getLogger } from '../logger.js'
/**
* Output function type for eval progress reporting
*/
export type EvalOutputFn = (message: string) => void
/**
* Default output function uses logger.info
*/
const defaultOutput: EvalOutputFn = (message: string) => getLogger().info(message)
export interface EvalCase<TInput = unknown, TExpected = unknown> {
name: string
input: TInput
expected?: TExpected
}
export interface EvalScore {
name: string
score: number
description?: string
metadata?: unknown
}
export interface EvalResult<TOutput = unknown> {
model: EvalModel
case: EvalCase
/** The output from the task. Will be null if an error occurred. */
output: TOutput | null
scores: EvalScore[]
latencyMs: number
cost: number
error?: string
}
export interface EvalSummary {
name: string
results: EvalResult[]
avgScore: number
byModel: Record<string, { avgScore: number; count: number }>
totalCost: number
totalTime: number
}
export interface RunEvalOptions<TInput, TOutput, TExpected> {
name: string
cases: EvalCase<TInput, TExpected>[]
task: (input: TInput, model: EvalModel) => Promise<TOutput>
scorers: Array<{
name: string
description?: string
scorer: (args: {
input: TInput
output: TOutput
expected?: TExpected
}) => number | Promise<number>
}>
models?: EvalModel[]
tiers?: ModelTier[]
providers?: string[]
concurrency?: number
/** Custom output function for progress reporting (defaults to logger.info) */
output?: EvalOutputFn
/** Whether to suppress progress output (defaults to false) */
quiet?: boolean
}
/**
* Run an eval suite across models
*/
export async function runEval<TInput, TOutput, TExpected>(
options: RunEvalOptions<TInput, TOutput, TExpected>
): Promise<EvalSummary> {
const { name, cases, task, scorers, concurrency = 3, quiet = false } = options
const log = quiet ? () => {} : options.output ?? defaultOutput
// Get models to test
const variantOptions: { tiers?: ModelTier[]; providers?: string[] } = {}
if (options.tiers !== undefined) variantOptions.tiers = options.tiers
if (options.providers !== undefined) variantOptions.providers = options.providers
const models = options.models ?? createModelVariants(variantOptions).map((v) => v.input)
const results: EvalResult<TOutput>[] = []
const startTime = Date.now()
log(`\nRunning eval: ${name}`)
log(` Models: ${models.map((m) => m.name).join(', ')}`)
log(` Cases: ${cases.length}`)
log('')
// Run all model/case combinations
const jobs: Array<{ model: EvalModel; case: EvalCase<TInput, TExpected> }> = []
for (const model of models) {
for (const evalCase of cases) {
jobs.push({ model, case: evalCase })
}
}
// Process in batches with concurrency limit
for (let i = 0; i < jobs.length; i += concurrency) {
const batch = jobs.slice(i, i + concurrency)
const batchResults = await Promise.all(
batch.map(async (job) => {
const caseStart = Date.now()
try {
// Run the task
const taskOutput = await task(job.case.input, job.model)
const latencyMs = Date.now() - caseStart
// Run scorers
const scores: EvalScore[] = []
for (const s of scorers) {
try {
const score = await s.scorer({
input: job.case.input,
output: taskOutput,
...(job.case.expected !== undefined && { expected: job.case.expected }),
})
scores.push({
name: s.name,
score: Math.max(0, Math.min(1, score)),
...(s.description && { description: s.description }),
})
} catch (err) {
scores.push({
name: s.name,
score: 0,
...(s.description && { description: s.description }),
metadata: { error: String(err) },
})
}
}
// Calculate cost
const pricing = getModelPricing(job.model.id)
// Estimate tokens - rough approximation
const estimatedPromptTokens = 100
const estimatedCompletionTokens = 200
const cost = pricing
? (estimatedPromptTokens * pricing.prompt +
estimatedCompletionTokens * pricing.completion) /
1_000_000
: 0
const avgScore =
scores.length > 0 ? scores.reduce((sum, s) => sum + s.score, 0) / scores.length : 0
const symbol = avgScore >= 0.8 ? 'PASS' : avgScore >= 0.5 ? 'WARN' : 'FAIL'
log(
` ${symbol} ${job.model.name} | ${job.case.name} | ${(avgScore * 100).toFixed(
0
)}% | ${latencyMs}ms`
)
return {
model: job.model,
case: job.case,
output: taskOutput,
scores,
latencyMs,
cost,
}
} catch (err) {
log(` FAIL ${job.model.name} | ${job.case.name} | ERROR: ${err}`)
return {
model: job.model,
case: job.case,
output: null,
scores: scorers.map((s) => ({ name: s.name, score: 0 })),
latencyMs: Date.now() - caseStart,
cost: 0,
error: String(err),
}
}
})
)
results.push(...batchResults)
}
// Calculate summary
const totalTime = Date.now() - startTime
const totalCost = results.reduce((sum, r) => sum + r.cost, 0)
const allScores = results.flatMap((r) => r.scores.map((s) => s.score))
const avgScore =
allScores.length > 0 ? allScores.reduce((a, b) => a + b, 0) / allScores.length : 0
// Group by model
const byModel: Record<string, { avgScore: number; count: number }> = {}
for (const result of results) {
const modelKey = result.model.id
if (!byModel[modelKey]) {
byModel[modelKey] = { avgScore: 0, count: 0 }
}
const resultAvg = result.scores.reduce((sum, s) => sum + s.score, 0) / result.scores.length
byModel[modelKey].avgScore += resultAvg
byModel[modelKey].count++
}
for (const key of Object.keys(byModel)) {
const entry = byModel[key]
if (entry) {
entry.avgScore /= entry.count
}
}
log('')
log(`Results:`)
log(` Overall: ${(avgScore * 100).toFixed(1)}%`)
log(` Time: ${(totalTime / 1000).toFixed(1)}s`)
log(` Cost: $${totalCost.toFixed(4)}`)
log('')
log(' By Model:')
for (const [modelId, stats] of Object.entries(byModel)) {
log(` - ${modelId}: ${(stats.avgScore * 100).toFixed(1)}%`)
}
return {
name,
results,
avgScore,
byModel,
totalCost,
totalTime,
}
}
// Re-export helpers
export { generateObject, generateText, schema }