ai-functions

Version:

Core AI primitives for building intelligent applications

118 lines (101 loc) • 3.45 kB

text/typescript

/**
 * Writing Quality Eval (LLM-as-Judge)
 *
 * Tests model writing capabilities using LLM-as-judge scoring.
 * Uses a strong model (sonnet) to judge output quality.
 *
 * Each test case prompt is sent to every model variant via `generateText`;
 * the resulting text is then scored by two scorers:
 *   - "Writing Quality": the judge model rates clarity, engagement and
 *     accuracy, and the three ratings are averaged.
 *   - "Appropriate Length": a simple word-count heuristic.
 */

import { evalite } from 'evalite'
import { generateText, generateObject } from '../src/generate.js'
import { schema } from '../src/schema.js'
import { createModelVariants, type EvalModel } from '../src/eval/models.js'

// Use sonnet as the judge model
const JUDGE_MODEL = 'sonnet'

// Writing test cases: each has a display name, the prompt given to the model
// under test, and the criteria handed to the judge.
const TEST_CASES = [
  {
    name: 'Professional email',
    prompt: 'Write a professional email declining a meeting invitation politely.',
    criteria: ['Polite tone', 'Clear explanation', 'Proper email format'],
  },
  {
    name: 'Product description',
    prompt: 'Write a product description for wireless earbuds targeting tech-savvy consumers.',
    criteria: ['Highlights features', 'Compelling language', 'Clear value proposition'],
  },
  {
    name: 'Explanation',
    prompt: 'Explain how photosynthesis works in simple terms for a high school student.',
    criteria: ['Accurate content', 'Clear language', 'Logical flow'],
  },
]

// Model variants under test, requested with the 'fast' tier only.
const modelVariants = createModelVariants({ tiers: ['fast'] })

/**
 * Counts whitespace-separated words in `text`.
 *
 * Trims first so that empty or whitespace-only text yields 0 (a bare
 * `split(/\s+/)` would report 1) and leading/trailing whitespace does not
 * inflate the count.
 *
 * @param text - The text to measure.
 * @returns The number of words; 0 for empty or whitespace-only text.
 */
function countWords(text: string): number {
  const trimmed = text.trim()
  return trimmed === '' ? 0 : trimmed.split(/\s+/).length
}

/**
 * Coerces a judge-provided rating to a finite number in the range [0, 1].
 *
 * The judge's output is model-generated, so it may be missing, non-numeric,
 * or out of range; such values must not turn the averaged score into NaN or
 * push it outside 0-1.
 *
 * @param value - Raw rating taken from the judge's structured output.
 * @returns The rating clamped to [0, 1], or 0 when it is not a finite number.
 */
function toUnitScore(value: unknown): number {
  const rating = Number(value)
  if (!Number.isFinite(rating)) return 0
  return Math.min(1, Math.max(0, rating))
}

evalite.each(modelVariants)('Writing Quality', {
  data: TEST_CASES.map(tc => ({ input: tc })),

  task: async (input, variant) => {
    const model = variant as EvalModel
    const startTime = Date.now()

    // Generate the writing
    const { text, usage } = await generateText({
      model: model.id,
      prompt: input.prompt,
    })

    // Wall-clock time of the generateText call only (judging is not included).
    const latencyMs = Date.now() - startTime

    return {
      text,
      testName: input.name,
      criteria: input.criteria,
      modelId: model.id,
      modelName: model.name,
      provider: model.provider,
      latencyMs,
      usage,
    }
  },

  scorers: [
    // LLM-as-judge for quality
    {
      name: 'Writing Quality',
      description: 'LLM judge evaluation of writing quality',
      scorer: async ({ input, output }) => {
        const { object } = await generateObject({
          model: JUDGE_MODEL,
          schema: schema({
            clarity: 'How clear is the writing? (number 0-1)',
            engagement: 'How engaging is the content? (number 0-1)',
            accuracy: 'How well does it meet the criteria? (number 0-1)',
            reasoning: 'Brief explanation',
          }),
          prompt: `Evaluate this writing on a scale of 0-1. Criteria: ${(input.criteria as string[]).join(', ')} Writing: """ ${output.text} """`,
        })

        // Average the three ratings, sanitising each one first so a malformed
        // judge response cannot yield NaN or an out-of-range score.
        const avg =
          (toUnitScore(object.clarity) +
            toUnitScore(object.engagement) +
            toUnitScore(object.accuracy)) /
          3

        return {
          score: avg,
          // Raw judge output (including its reasoning) is kept for inspection.
          metadata: object,
        }
      },
    },

    // Word count check
    {
      name: 'Appropriate Length',
      description: 'Whether output has reasonable length',
      scorer: ({ output }) => {
        const words = countWords(output.text as string)
        // Fewer than 20 words is penalised most; more than 500 only mildly.
        if (words < 20) return { score: 0.3, metadata: { words } }
        if (words > 500) return { score: 0.7, metadata: { words } }
        return { score: 1, metadata: { words } }
      },
    },
  ],

  columns: ({ output }) => [
    { label: 'Model', value: output.modelName },
    { label: 'Test', value: output.testName },
    { label: 'Words', value: countWords(output.text as string) },
    { label: 'Latency', value: `${output.latencyMs}ms` },
  ],

  trialCount: 2, // Run twice for variance
})