ai-functions
Version:
Core AI primitives for building intelligent applications
106 lines (103 loc) • 3.82 kB
JavaScript
/**
* Writing Quality Eval (LLM-as-Judge)
*
* Tests model writing capabilities using LLM-as-judge scoring.
* Uses a strong model (sonnet) to judge output quality.
*/
import { evalite } from 'evalite';
import { generateText, generateObject } from '../src/generate.js';
import { schema } from '../src/schema.js';
import { createModelVariants } from '../src/eval/models.js';
// Use sonnet as the judge model
const JUDGE_MODEL = 'sonnet';
// Writing test cases
const TEST_CASES = [
{
name: 'Professional email',
prompt: 'Write a professional email declining a meeting invitation politely.',
criteria: ['Polite tone', 'Clear explanation', 'Proper email format'],
},
{
name: 'Product description',
prompt: 'Write a product description for wireless earbuds targeting tech-savvy consumers.',
criteria: ['Highlights features', 'Compelling language', 'Clear value proposition'],
},
{
name: 'Explanation',
prompt: 'Explain how photosynthesis works in simple terms for a high school student.',
criteria: ['Accurate content', 'Clear language', 'Logical flow'],
},
];
const modelVariants = createModelVariants({ tiers: ['fast'] });
evalite.each(modelVariants)('Writing Quality', {
data: TEST_CASES.map(tc => ({ input: tc })),
task: async (input, variant) => {
const model = variant;
const startTime = Date.now();
// Generate the writing
const { text, usage } = await generateText({
model: model.id,
prompt: input.prompt,
});
const latencyMs = Date.now() - startTime;
return {
text,
testName: input.name,
criteria: input.criteria,
modelId: model.id,
modelName: model.name,
provider: model.provider,
latencyMs,
usage,
};
},
scorers: [
// LLM-as-judge for quality
{
name: 'Writing Quality',
description: 'LLM judge evaluation of writing quality',
scorer: async ({ input, output }) => {
const { object } = await generateObject({
model: JUDGE_MODEL,
schema: schema({
clarity: 'How clear is the writing? (number 0-1)',
engagement: 'How engaging is the content? (number 0-1)',
accuracy: 'How well does it meet the criteria? (number 0-1)',
reasoning: 'Brief explanation',
}),
prompt: `Evaluate this writing on a scale of 0-1.
Criteria: ${input.criteria.join(', ')}
Writing:
"""
${output.text}
"""`,
});
const avg = (object.clarity + object.engagement + object.accuracy) / 3;
return {
score: avg,
metadata: object,
};
},
},
// Word count check
{
name: 'Appropriate Length',
description: 'Whether output has reasonable length',
scorer: ({ output }) => {
const words = output.text.split(/\s+/).length;
if (words < 20)
return { score: 0.3, metadata: { words } };
if (words > 500)
return { score: 0.7, metadata: { words } };
return { score: 1, metadata: { words } };
},
},
],
columns: ({ output }) => [
{ label: 'Model', value: output.modelName },
{ label: 'Test', value: output.testName },
{ label: 'Words', value: output.text.split(/\s+/).length },
{ label: 'Latency', value: `${output.latencyMs}ms` },
],
trialCount: 2, // Run twice for variance
});