ai-functions
Version:
Core AI primitives for building intelligent applications
144 lines (128 loc) • 4.16 kB
text/typescript
/**
* Structured Output Eval
*
* Tests model ability to generate valid structured JSON output
* matching specified schemas across all providers.
*/
import { evalite } from 'evalite'
import { generateObject } from '../src/generate.js'
import { schema } from '../src/schema.js'
import { createModelVariants, getModelPricing, type EvalModel } from '../src/eval/models.js'
/**
 * Fixtures for the structured-output eval.
 *
 * Each entry carries:
 * - `name`          – label shown in the results table
 * - `prompt`        – instruction sent to the model
 * - `schema`        – plain-language field descriptions, handed to `schema()`
 *                     before being passed to `generateObject`
 * - `expectedTypes` – per-field type name ('string' | 'number' | 'array' |
 *                     'object') that the "Type Accuracy" scorer compares against
 */
const TEST_CASES = [
  // Two flat string fields.
  {
    name: 'Simple object',
    prompt: 'Generate a greeting in French',
    schema: {
      greeting: 'A friendly greeting',
      language: 'The language of the greeting',
    },
    expectedTypes: { greeting: 'string', language: 'string' },
  },

  // Mix of string and numeric fields.
  {
    name: 'With numbers',
    prompt: 'Generate info about Tokyo',
    schema: {
      name: 'City name',
      population: 'Population in millions (number)',
      area: 'Area in square kilometers (number)',
    },
    expectedTypes: { name: 'string', population: 'number', area: 'number' },
  },

  // Array-valued fields, described by a single-element array.
  {
    name: 'With arrays',
    prompt: 'Generate a simple pasta recipe',
    schema: {
      title: 'Recipe title',
      ingredients: ['List of ingredients'],
      steps: ['Cooking steps'],
    },
    expectedTypes: { title: 'string', ingredients: 'array', steps: 'array' },
  },

  // Pipe-separated alternatives plus a numeric score.
  {
    name: 'With enum',
    prompt: 'Analyze sentiment: "I love this product!"',
    schema: {
      sentiment: 'positive | negative | neutral',
      confidence: 'Confidence score 0-1 (number)',
    },
    expectedTypes: { sentiment: 'string', confidence: 'number' },
  },

  // Nested objects; only the top-level fields are type-checked by the scorer.
  {
    name: 'Nested object',
    prompt: 'Generate a fictional person living in Japan',
    schema: {
      person: { name: 'Full name', age: 'Age (number)' },
      address: { city: 'City name', country: 'Country name' },
    },
    expectedTypes: { person: 'object', address: 'object' },
  },
]
// Test across models - start with fast tier for quick iteration
// Only the 'fast' tier is requested from createModelVariants; the result is the
// variant list handed to evalite.each, so each test case runs once per variant.
const modelVariants = createModelVariants({ tiers: ['fast'] })
/**
 * Structured Output eval: runs every entry of TEST_CASES against every model
 * variant, then scores the generated object on field types and on latency.
 */
evalite.each(modelVariants)('Structured Output', {
  data: TEST_CASES.map((tc) => ({ input: tc })),

  /**
   * Generates one structured object for the test case with the given variant
   * and returns it together with the metadata the scorers and columns need
   * (expected types, model identity, wall-clock latency, estimated cost, usage).
   */
  task: async (input, variant) => {
    const model = variant as EvalModel

    // Wall-clock time around the whole generateObject call.
    const startTime = Date.now()
    const { object, usage } = await generateObject({
      model: model.id,
      schema: schema(input.schema),
      prompt: input.prompt,
    })
    const latencyMs = Date.now() - startTime

    // Calculate cost from language-models pricing.
    // NOTE(review): the division by 1_000_000 presumes pricing.prompt and
    // pricing.completion are numeric prices per million tokens — confirm
    // against getModelPricing. Missing pricing or usage counts as zero.
    const pricing = getModelPricing(model.id)
    const cost = pricing
      ? ((usage?.promptTokens ?? 0) * pricing.prompt + (usage?.completionTokens ?? 0) * pricing.completion) / 1_000_000
      : 0

    return {
      object,
      expectedTypes: input.expectedTypes,
      testName: input.name,
      modelId: model.id,
      modelName: model.name,
      provider: model.provider,
      latencyMs,
      cost,
      usage,
    }
  },

  scorers: [
    // Type accuracy: fraction of expected top-level fields whose runtime type
    // matches the expected type name.
    {
      name: 'Type Accuracy',
      description: 'Whether output fields have correct types',
      scorer: ({ output }) => {
        const expected = output.expectedTypes as Record<string, string>
        const expectedEntries = Object.entries(expected)

        // Nothing to verify: return 0 instead of the NaN that 0 / 0 would give.
        if (expectedEntries.length === 0) return { score: 0 }

        // A missing or non-object result is treated as "every field absent"
        // rather than letting the property lookup throw inside the scorer.
        const raw: unknown = output.object
        const obj = typeof raw === 'object' && raw !== null ? (raw as Record<string, unknown>) : {}

        let correct = 0
        for (const [field, expectedType] of expectedEntries) {
          const val = obj[field]
          // typeof null is 'object', so null must be singled out or it would
          // be accepted wherever an object is expected.
          let actualType: string
          if (val === null) {
            actualType = 'null'
          } else if (Array.isArray(val)) {
            actualType = 'array'
          } else {
            actualType = typeof val
          }
          if (actualType === expectedType) correct++
        }
        return { score: correct / expectedEntries.length }
      },
    },

    // Latency: 1 below 2s, 0 above 10s, linear in between.
    {
      name: 'Latency',
      // Description corrected to match the thresholds implemented below.
      description: 'Response time (full score under 2s, zero at 10s or more)',
      scorer: ({ output }) => {
        const ms = output.latencyMs as number
        if (ms < 2000) return { score: 1 }
        if (ms > 10000) return { score: 0 }
        return { score: 1 - (ms - 2000) / 8000 }
      },
    },
  ],

  // Extra columns rendered for each result row.
  columns: ({ output }) => [
    { label: 'Model', value: output.modelName },
    { label: 'Test', value: output.testName },
    { label: 'Latency', value: `${output.latencyMs}ms` },
    { label: 'Cost', value: `$${(output.cost as number).toFixed(6)}` },
  ],
})