ai-functions
Version:
Core AI primitives for building intelligent applications
152 lines (151 loc) • 7.02 kB
JavaScript
#!/usr/bin/env npx tsx
/**
* Run AI Functions Eval Suite
*
* Usage:
* npx tsx evals/run-evals.ts [--fast] [--all] [--math] [--class]
*
* Options:
* --fast Only run fast-tier models (default)
* --all Run all models
* --math Run only math eval
* --class Run only classification eval
*/
import { runEval, generateObject, schema } from '../src/eval/runner.js';
// Command-line flags: everything after the runtime binary and the script path.
const args = process.argv.slice(2);
const [runAll, runMath, runClass] = ['--all', '--math', '--class'].map((flag) => args.includes(flag));
// True when at least one eval was requested by name (--math and/or --class).
const runSingle = [runMath, runClass].some(Boolean);
// Tier list handed to each eval: all three with --all, otherwise only 'fast'.
const tiers = ['best', 'fast', 'cheap'].filter((tier) => runAll || tier === 'fast');

// Startup banner, a blank line, then the selected tiers (one console.log per line).
for (const line of [
  '╔════════════════════════════════════════════════════════════════╗',
  '║ AI Functions Eval Suite ║',
  '╚════════════════════════════════════════════════════════════════╝',
  '',
  `Tiers: ${tiers.join(', ')}`,
]) {
  console.log(line);
}
/**
 * Math eval.
 *
 * Builds the "Math" eval definition (five arithmetic / word-problem cases,
 * a task that requests a structured `{ answer, reasoning }` object from the
 * given model, and two scorers) and hands it to `runEval` together with the
 * module-level `tiers`.
 *
 * @returns the value produced by `runEval` for this eval.
 */
async function runMathEval() {
  // Keeps the case table compact: name / problem text / expected number.
  const mathCase = (name, problem, expected) => ({ name, input: { problem }, expected });

  const cases = [
    mathCase('Simple addition', 'What is 15 + 27?', 42),
    mathCase('Division', 'What is 144 / 12?', 12),
    mathCase('Multiplication', 'What is 7 * 8?', 56),
    mathCase('Word problem', 'A store sells 45 apples at $2 each. What is the total revenue?', 90),
    mathCase('Multi-step', 'A company has 120 employees. 40% work in engineering, and 25% of engineers are senior. How many senior engineers?', 12),
  ];

  // Requests a structured answer for one problem from one model.
  const solveProblem = async (input, model) => {
    const response = await generateObject({
      model: model.id,
      schema: schema({
        answer: 'The numeric answer (number)',
        reasoning: 'Step by step reasoning',
      }),
      prompt: `Solve this math problem:\n\n${input.problem}`,
    });
    return response.object;
  };

  const correctAnswerScorer = {
    name: 'Correct Answer',
    description: 'Whether the numeric answer is correct',
    // 1 when the answer is within 0.01 of the expected value, otherwise 0.
    scorer: ({ output, expected }) => {
      const distance = Math.abs(output.answer - expected);
      if (distance < 0.01) {
        return 1;
      }
      return 0;
    },
  };

  const showsWorkScorer = {
    name: 'Shows Work',
    description: 'Whether model explains reasoning',
    // Length-based heuristic: missing or under 20 chars -> 0.2,
    // over 50 chars -> 1, anything in between -> 0.6.
    scorer: ({ output }) => {
      const { reasoning } = output;
      const length = reasoning ? reasoning.length : 0;
      if (length < 20) {
        return 0.2;
      }
      return length > 50 ? 1 : 0.6;
    },
  };

  return runEval({
    name: 'Math',
    cases,
    tiers,
    task: solveProblem,
    scorers: [correctAnswerScorer, showsWorkScorer],
  });
}
/**
 * Classification eval.
 *
 * Builds the "Classification" eval definition (three sentiment cases and
 * three support-ticket cases, a task that requests a structured
 * `{ category, confidence }` object from the given model, and two scorers)
 * and hands it to `runEval` together with the module-level `tiers`.
 *
 * @returns the value produced by `runEval` for this eval.
 */
async function runClassificationEval() {
  // Each factory builds a fresh `options` array so no two cases share one.
  const sentimentCase = (name, text, expected) => ({
    name,
    input: { text, options: ['positive', 'negative', 'neutral'] },
    expected,
  });
  const ticketCase = (name, text, expected) => ({
    name,
    input: { text, options: ['account', 'billing', 'technical', 'shipping'] },
    expected,
  });

  const cases = [
    sentimentCase('Positive sentiment', 'This product exceeded my expectations!', 'positive'),
    sentimentCase('Negative sentiment', 'The delivery was late and packaging damaged.', 'negative'),
    sentimentCase('Neutral sentiment', 'The product arrived as described.', 'neutral'),
    ticketCase('Account ticket', 'I need to reset my password', 'account'),
    ticketCase('Billing ticket', 'When will my refund be processed?', 'billing'),
    ticketCase('Technical ticket', 'The app crashes when uploading images', 'technical'),
  ];

  // Requests a structured classification of one text from one model.
  const classifyText = async (input, model) => {
    const { options, text } = input;
    // The category field is described as the options joined with ' | '.
    const categorySpec = options.join(' | ');
    const response = await generateObject({
      model: model.id,
      schema: schema({
        category: categorySpec,
        confidence: 'Confidence 0-1 (number)',
      }),
      prompt: `Classify this text into one of: ${options.join(', ')}\n\nText: "${text}"`,
    });
    return response.object;
  };

  const accuracyScorer = {
    name: 'Accuracy',
    description: 'Whether classification is correct',
    // Strict equality against the expected label: 1 on match, 0 otherwise.
    scorer: ({ output, expected }) => Number(output.category === expected),
  };

  const validCategoryScorer = {
    name: 'Valid Category',
    description: 'Whether output is a valid option',
    // 1 when the predicted label is one of the case's options, 0 otherwise.
    scorer: ({ input, output }) => Number(input.options.includes(output.category)),
  };

  return runEval({
    name: 'Classification',
    cases,
    tiers,
    task: classifyText,
    scorers: [accuracyScorer, validCategoryScorer],
  });
}
/**
 * Runs the selected evals one after another and prints an overall summary.
 *
 * With neither --math nor --class given, both evals run; otherwise only the
 * requested ones do. The summary prints each eval's average score as a
 * percentage, then the mean of those averages, the summed cost, and the
 * summed time divided by 1000 and labelled as seconds.
 */
async function main() {
  // [was it requested by flag?, how to run it], in execution order.
  const evalTable = [
    [runMath, runMathEval],
    [runClass, runClassificationEval],
  ];

  const results = [];
  for (const [requested, runSelectedEval] of evalTable) {
    if (requested || !runSingle) {
      results.push(await runSelectedEval());
    }
  }

  // Overall summary: blank line, then the banner.
  for (const line of [
    '',
    '╔════════════════════════════════════════════════════════════════╗',
    '║ Summary ║',
    '╚════════════════════════════════════════════════════════════════╝',
  ]) {
    console.log(line);
  }

  const totals = { score: 0, cost: 0, time: 0 };
  for (const { name, avgScore, totalCost, totalTime } of results) {
    console.log(`\n${name}: ${(avgScore * 100).toFixed(1)}%`);
    totals.score += avgScore;
    totals.cost += totalCost;
    totals.time += totalTime;
  }

  const overallPercent = (totals.score / results.length) * 100;
  const elapsedSeconds = totals.time / 1000;
  console.log('');
  console.log(`Overall: ${overallPercent.toFixed(1)}%`);
  console.log(`Total Cost: $${totals.cost.toFixed(4)}`);
  console.log(`Total Time: ${elapsedSeconds.toFixed(1)}s`);
}
// Entry point: run the suite. On failure, log the error and mark the process
// as failed so shells and CI see a non-zero exit code instead of a silent 0.
// `process.exitCode` (rather than `process.exit`) lets pending output flush.
main().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});