aiwg

Version:

Deployment tool and support utility for AI context. Copies agents, skills, commands, rules, and behaviors into the paths each AI platform reads (Claude Code, Codex, Copilot, Cursor, Warp, OpenClaw, and 6 more) so one source of truth works across 10 platfo

aiwg.io

jmagly/aiwg

242 lines (209 loc) • 7.56 kB

text/typescript

/** * AIWG Evaluation Runner — orchestrates test execution across AIWG-specific dimensions, * with optional delegation to the matric-eval framework for standard benchmarks. * * Extends matric-eval via composition: @matric/eval-client */ import { promises as fs } from 'fs'; import path from 'path'; import yaml from 'js-yaml'; import { createClient, type MatricEvalClient, type EvalSummary } from '@matric/eval-client'; import type { GenerationModel, TestCase, EvalResult, DimensionScore, EvalReport } from './models/types.js'; import { DIMENSION_WEIGHTS, scoreTier, calculateOverall } from './scoring/weights.js'; export interface RunnerOptions { dimensions?: string[]; outputFormat?: 'json' | 'markdown'; verbose?: boolean; /** Include standard matric-eval benchmark scores when matric-eval binary is available */ includeMatricBenchmarks?: boolean; } export class AiwgEvalRunner { private model: GenerationModel; private datasetsDir: string; private matricClient: MatricEvalClient; constructor(model: GenerationModel, datasetsDir: string) { this.model = model; this.datasetsDir = datasetsDir; this.matricClient = createClient(); } async run(options: RunnerOptions = {}): Promise<EvalReport> { const dimensions = options.dimensions || Object.keys(DIMENSION_WEIGHTS); const results: EvalResult[] = []; const dimensionScores: DimensionScore[] = []; for (const dimension of dimensions) { const testCases = await this.loadTestCases(dimension); if (testCases.length === 0) { if (options.verbose) console.log(` Skipping ${dimension}: no test cases found`); continue; } if (options.verbose) console.log(`\nEvaluating: ${dimension} (${testCases.length} test cases)`); const dimResults: EvalResult[] = []; for (const testCase of testCases) { if (options.verbose) console.log(` Running: ${testCase.id}`); const result = await this.evaluateTestCase(testCase); dimResults.push(result); results.push(result); } const totalScore = dimResults.reduce((s, r) => s + r.score, 0); const maxScore = dimResults.reduce((s, r) => s + r.maxScore, 0); const dimScore = maxScore > 0 ? Math.round((totalScore / maxScore) * 100) : 0; dimensionScores.push({ dimension, score: dimScore, tier: scoreTier(dimScore), testCases: testCases.length, passed: dimResults.filter((r) => r.score / r.maxScore >= 0.5).length, }); } const dimScoreMap: Record<string, number> = {}; for (const ds of dimensionScores) { dimScoreMap[ds.dimension] = ds.score; } const overall = calculateOverall(dimScoreMap); const totalLatency = results.reduce((s, r) => s + r.latencyMs, 0); // Optionally include standard matric-eval benchmark scores let matricSummary: EvalSummary | undefined; if (options.includeMatricBenchmarks && (await this.matricClient.isAvailable())) { if (options.verbose) console.log('\nRunning standard matric-eval benchmarks...'); matricSummary = await this.matricClient.run({ models: [this.model.name], tier: 'smoke', logLevel: options.verbose ? 'INFO' : 'WARNING', }); } return { model: this.model.name, backend: 'ollama', date: new Date().toISOString(), aiwgVersion: '2026.3.2', dimensions: dimensionScores, overall, overallTier: scoreTier(overall), totalLatencyMs: totalLatency, matricBenchmarks: matricSummary ? matricSummary.results.find((r) => r.model === this.model.name) : undefined, }; } private async loadTestCases(dimension: string): Promise<TestCase[]> { const dimDir = path.join(this.datasetsDir, dimension); try { const files = await fs.readdir(dimDir); const testCases: TestCase[] = []; for (const file of files) { if (!file.endsWith('.yaml') && !file.endsWith('.yml')) continue; const content = await fs.readFile(path.join(dimDir, file), 'utf-8'); const parsed = yaml.load(content) as TestCase; if (parsed?.id) testCases.push(parsed); } return testCases; } catch { return []; } } private async evaluateTestCase(testCase: TestCase): Promise<EvalResult> { const startTime = Date.now(); try { const result = await this.model.generate(testCase.prompt, { temperature: 0, maxTokens: 2048, }); const latencyMs = Date.now() - startTime; const { score, maxScore, details } = scoreResponse(result.text, testCase); return { testCaseId: testCase.id, dimension: testCase.dimension, score, maxScore, details, latencyMs, modelResponse: result.text.slice(0, 500), }; } catch (error) { return { testCaseId: testCase.id, dimension: testCase.dimension, score: 0, maxScore: 1, details: { error: (error as Error).message }, latencyMs: Date.now() - startTime, modelResponse: '', }; } } } /** Backward-compatible alias — use AiwgEvalRunner for new code */ export const EvalRunner = AiwgEvalRunner; function scoreResponse(response: string, testCase: TestCase): { score: number; maxScore: number; details: Record<string, unknown> } { const scoring = testCase.scoring; const expected = testCase.expected; let score = 0; let maxScore = 0; const details: Record<string, unknown> = {}; // Check for expected tool calls if (expected.tool_calls) { const toolCalls = expected.tool_calls as Array<{ tool: string; params_contain?: Record<string, string> }>; for (const tc of toolCalls) { maxScore += scoring.correct_tool || 0.5; if (response.toLowerCase().includes(tc.tool.toLowerCase())) { score += scoring.correct_tool || 0.5; details[`tool_${tc.tool}`] = 'found'; } if (tc.params_contain) { maxScore += scoring.correct_params || 0.5; const paramsFound = Object.values(tc.params_contain).every((v) => response.includes(String(v)) ); if (paramsFound) { score += scoring.correct_params || 0.5; details[`params_${tc.tool}`] = 'correct'; } } } } // Check for expected content if (expected.contains) { const contains = expected.contains as string[]; for (const c of contains) { maxScore += 1; if (response.toLowerCase().includes(c.toLowerCase())) { score += 1; details[`contains_${c}`] = true; } } } // Check for forbidden content if (expected.must_not_contain) { const forbidden = expected.must_not_contain as string[]; maxScore += scoring.no_hallucination || 0.2; const clean = forbidden.every((f) => !response.toLowerCase().includes(f.toLowerCase())); if (clean) { score += scoring.no_hallucination || 0.2; details.no_hallucination = true; } } // Check JSON/YAML validity if (expected.valid_json) { maxScore += 1; try { JSON.parse(response); score += 1; details.valid_json = true; } catch { details.valid_json = false; } } if (expected.valid_yaml) { maxScore += 1; try { yaml.load(response); score += 1; details.valid_yaml = true; } catch { details.valid_yaml = false; } } // Fallback: at least 1 point max if (maxScore === 0) maxScore = 1; return { score, maxScore, details }; }