ai-functions
Version:
Core AI primitives for building intelligent applications
118 lines (101 loc) • 3.45 kB
text/typescript
/**
* Writing Quality Eval (LLM-as-Judge)
*
* Tests model writing capabilities using LLM-as-judge scoring.
* Uses a strong model (sonnet) to judge output quality.
*/
import { evalite } from 'evalite'
import { generateText, generateObject } from '../src/generate.js'
import { schema } from '../src/schema.js'
import { createModelVariants, type EvalModel } from '../src/eval/models.js'
// Use sonnet as the judge model.
// This identifier is passed as `model` to generateObject in the 'Writing Quality'
// scorer, so every evaluated variant is graded by the same judge.
const JUDGE_MODEL = 'sonnet'
/** One writing task plus the rubric the judge is asked to apply to its output. */
interface WritingTestCase {
  /** Short label for the case. */
  name: string
  /** Instruction sent to the model under evaluation. */
  prompt: string
  /** Qualities the judge is asked to look for in the response. */
  criteria: string[]
}

/**
 * Writing test cases.
 *
 * Each entry becomes one eval input: `prompt` is what the evaluated model is
 * asked to write, and `criteria` is joined into the judge's prompt.
 */
const TEST_CASES: WritingTestCase[] = [
  {
    name: 'Professional email',
    prompt: 'Write a professional email declining a meeting invitation politely.',
    criteria: ['Polite tone', 'Clear explanation', 'Proper email format'],
  },
  {
    name: 'Product description',
    prompt: 'Write a product description for wireless earbuds targeting tech-savvy consumers.',
    criteria: ['Highlights features', 'Compelling language', 'Clear value proposition'],
  },
  {
    name: 'Explanation',
    prompt: 'Explain how photosynthesis works in simple terms for a high school student.',
    criteria: ['Accurate content', 'Clear language', 'Logical flow'],
  },
]
// Model variants the eval is run against; only the 'fast' tier is requested.
// NOTE(review): the shape of each variant comes from createModelVariants in
// ../src/eval/models.js — the task below treats each one as an EvalModel; confirm there.
const modelVariants = createModelVariants({ tiers: ['fast'] })
/**
 * Counts whitespace-separated words in `text`.
 *
 * Surrounding whitespace is trimmed first so that an empty or blank string
 * yields 0 (splitting it directly would report 1) and a trailing newline does
 * not add a phantom word.
 *
 * @param text - The generated writing.
 * @returns The number of non-empty, whitespace-delimited tokens.
 */
function countWords(text: string): number {
  const trimmed = text.trim()
  return trimmed === '' ? 0 : trimmed.split(/\s+/).length
}

/**
 * Coerces a single judge-supplied rating to a finite number in [0, 1].
 *
 * The judge's structured output is not guaranteed to contain real numbers:
 * a numeric string would turn `+` into string concatenation, and a missing
 * or out-of-range value would produce NaN or a score outside the 0-1 scale.
 *
 * @param value - Raw rating taken from the judge's response object.
 * @returns The rating clamped to [0, 1]; 0 when it cannot be read as a finite number.
 */
function toUnitScore(value: unknown): number {
  const rating = typeof value === 'number' ? value : Number(value)
  if (!Number.isFinite(rating)) return 0
  return Math.min(1, Math.max(0, rating))
}

/**
 * 'Writing Quality' eval, run once per entry in `modelVariants`.
 *
 * For every test case the variant's model generates text for the case prompt;
 * the result is then scored by an LLM judge (JUDGE_MODEL) and by a simple
 * word-count heuristic.
 */
evalite.each(modelVariants)('Writing Quality', {
  data: TEST_CASES.map(tc => ({ input: tc })),

  // Generate the writing with the variant's model and record how long it took.
  task: async (input, variant) => {
    const model = variant as EvalModel
    const startTime = Date.now()

    const { text, usage } = await generateText({
      model: model.id,
      prompt: input.prompt,
    })

    // Wall-clock time around the generateText call only.
    const latencyMs = Date.now() - startTime

    return {
      text,
      testName: input.name,
      criteria: input.criteria,
      modelId: model.id,
      modelName: model.name,
      provider: model.provider,
      latencyMs,
      usage,
    }
  },

  scorers: [
    // LLM-as-judge for quality
    {
      name: 'Writing Quality',
      description: 'LLM judge evaluation of writing quality',
      scorer: async ({ input, output }) => {
        // NOTE(review): the judge is shown the criteria and the writing, but not
        // the original task prompt — confirm that is intentional.
        const { object } = await generateObject({
          model: JUDGE_MODEL,
          schema: schema({
            clarity: 'How clear is the writing? (number 0-1)',
            engagement: 'How engaging is the content? (number 0-1)',
            accuracy: 'How well does it meet the criteria? (number 0-1)',
            reasoning: 'Brief explanation',
          }),
          // The template body is kept flush-left so the prompt text carries no
          // leading indentation.
          prompt: `Evaluate this writing on a scale of 0-1.
Criteria: ${(input.criteria as string[]).join(', ')}
Writing:
"""
${output.text}
"""`,
        })

        // Final score is the unweighted mean of the three ratings, each
        // sanitised so a malformed judge response cannot yield NaN or a value
        // outside [0, 1]. The raw judge object is still exposed as metadata.
        const ratings = [object.clarity, object.engagement, object.accuracy].map(toUnitScore)
        const avg = ratings.reduce((sum, rating) => sum + rating, 0) / ratings.length

        return {
          score: avg,
          metadata: object,
        }
      },
    },
    // Word count check
    {
      name: 'Appropriate Length',
      description: 'Whether output has reasonable length',
      scorer: ({ output }) => {
        const words = countWords(output.text as string)
        // Very short outputs are penalised more heavily than very long ones.
        if (words < 20) return { score: 0.3, metadata: { words } }
        if (words > 500) return { score: 0.7, metadata: { words } }
        return { score: 1, metadata: { words } }
      },
    },
  ],

  // Extra columns shown alongside each result row.
  columns: ({ output }) => [
    { label: 'Model', value: output.modelName },
    { label: 'Test', value: output.testName },
    { label: 'Words', value: countWords(output.text as string) },
    { label: 'Latency', value: `${output.latencyMs}ms` },
  ],

  trialCount: 2, // Run twice for variance
})