aiwg
Version:
Deployment tool and support utility for AI context. Copies agents, skills, commands, rules, and behaviors into the paths each AI platform reads (Claude Code, Codex, Copilot, Cursor, Warp, OpenClaw, and 6 more) so one source of truth works across 10 platfo
242 lines (209 loc) • 7.56 kB
text/typescript
/**
* AIWG Evaluation Runner — orchestrates test execution across AIWG-specific dimensions,
* with optional delegation to the matric-eval framework for standard benchmarks.
*
* Extends matric-eval via composition: @matric/eval-client
*/
import { promises as fs } from 'fs';
import path from 'path';
import yaml from 'js-yaml';
import { createClient, type MatricEvalClient, type EvalSummary } from '@matric/eval-client';
import type { GenerationModel, TestCase, EvalResult, DimensionScore, EvalReport } from './models/types.js';
import { DIMENSION_WEIGHTS, scoreTier, calculateOverall } from './scoring/weights.js';
export interface RunnerOptions {
dimensions?: string[];
outputFormat?: 'json' | 'markdown';
verbose?: boolean;
/** Include standard matric-eval benchmark scores when matric-eval binary is available */
includeMatricBenchmarks?: boolean;
}
export class AiwgEvalRunner {
private model: GenerationModel;
private datasetsDir: string;
private matricClient: MatricEvalClient;
constructor(model: GenerationModel, datasetsDir: string) {
this.model = model;
this.datasetsDir = datasetsDir;
this.matricClient = createClient();
}
async run(options: RunnerOptions = {}): Promise<EvalReport> {
const dimensions = options.dimensions || Object.keys(DIMENSION_WEIGHTS);
const results: EvalResult[] = [];
const dimensionScores: DimensionScore[] = [];
for (const dimension of dimensions) {
const testCases = await this.loadTestCases(dimension);
if (testCases.length === 0) {
if (options.verbose) console.log(` Skipping ${dimension}: no test cases found`);
continue;
}
if (options.verbose) console.log(`\nEvaluating: ${dimension} (${testCases.length} test cases)`);
const dimResults: EvalResult[] = [];
for (const testCase of testCases) {
if (options.verbose) console.log(` Running: ${testCase.id}`);
const result = await this.evaluateTestCase(testCase);
dimResults.push(result);
results.push(result);
}
const totalScore = dimResults.reduce((s, r) => s + r.score, 0);
const maxScore = dimResults.reduce((s, r) => s + r.maxScore, 0);
const dimScore = maxScore > 0 ? Math.round((totalScore / maxScore) * 100) : 0;
dimensionScores.push({
dimension,
score: dimScore,
tier: scoreTier(dimScore),
testCases: testCases.length,
passed: dimResults.filter((r) => r.score / r.maxScore >= 0.5).length,
});
}
const dimScoreMap: Record<string, number> = {};
for (const ds of dimensionScores) {
dimScoreMap[ds.dimension] = ds.score;
}
const overall = calculateOverall(dimScoreMap);
const totalLatency = results.reduce((s, r) => s + r.latencyMs, 0);
// Optionally include standard matric-eval benchmark scores
let matricSummary: EvalSummary | undefined;
if (options.includeMatricBenchmarks && (await this.matricClient.isAvailable())) {
if (options.verbose) console.log('\nRunning standard matric-eval benchmarks...');
matricSummary = await this.matricClient.run({
models: [this.model.name],
tier: 'smoke',
logLevel: options.verbose ? 'INFO' : 'WARNING',
});
}
return {
model: this.model.name,
backend: 'ollama',
date: new Date().toISOString(),
aiwgVersion: '2026.3.2',
dimensions: dimensionScores,
overall,
overallTier: scoreTier(overall),
totalLatencyMs: totalLatency,
matricBenchmarks: matricSummary
? matricSummary.results.find((r) => r.model === this.model.name)
: undefined,
};
}
private async loadTestCases(dimension: string): Promise<TestCase[]> {
const dimDir = path.join(this.datasetsDir, dimension);
try {
const files = await fs.readdir(dimDir);
const testCases: TestCase[] = [];
for (const file of files) {
if (!file.endsWith('.yaml') && !file.endsWith('.yml')) continue;
const content = await fs.readFile(path.join(dimDir, file), 'utf-8');
const parsed = yaml.load(content) as TestCase;
if (parsed?.id) testCases.push(parsed);
}
return testCases;
} catch {
return [];
}
}
private async evaluateTestCase(testCase: TestCase): Promise<EvalResult> {
const startTime = Date.now();
try {
const result = await this.model.generate(testCase.prompt, {
temperature: 0,
maxTokens: 2048,
});
const latencyMs = Date.now() - startTime;
const { score, maxScore, details } = scoreResponse(result.text, testCase);
return {
testCaseId: testCase.id,
dimension: testCase.dimension,
score,
maxScore,
details,
latencyMs,
modelResponse: result.text.slice(0, 500),
};
} catch (error) {
return {
testCaseId: testCase.id,
dimension: testCase.dimension,
score: 0,
maxScore: 1,
details: { error: (error as Error).message },
latencyMs: Date.now() - startTime,
modelResponse: '',
};
}
}
}
/** Backward-compatible alias — use AiwgEvalRunner for new code */
export const EvalRunner = AiwgEvalRunner;
function scoreResponse(response: string, testCase: TestCase): { score: number; maxScore: number; details: Record<string, unknown> } {
const scoring = testCase.scoring;
const expected = testCase.expected;
let score = 0;
let maxScore = 0;
const details: Record<string, unknown> = {};
// Check for expected tool calls
if (expected.tool_calls) {
const toolCalls = expected.tool_calls as Array<{ tool: string; params_contain?: Record<string, string> }>;
for (const tc of toolCalls) {
maxScore += scoring.correct_tool || 0.5;
if (response.toLowerCase().includes(tc.tool.toLowerCase())) {
score += scoring.correct_tool || 0.5;
details[`tool_${tc.tool}`] = 'found';
}
if (tc.params_contain) {
maxScore += scoring.correct_params || 0.5;
const paramsFound = Object.values(tc.params_contain).every((v) =>
response.includes(String(v))
);
if (paramsFound) {
score += scoring.correct_params || 0.5;
details[`params_${tc.tool}`] = 'correct';
}
}
}
}
// Check for expected content
if (expected.contains) {
const contains = expected.contains as string[];
for (const c of contains) {
maxScore += 1;
if (response.toLowerCase().includes(c.toLowerCase())) {
score += 1;
details[`contains_${c}`] = true;
}
}
}
// Check for forbidden content
if (expected.must_not_contain) {
const forbidden = expected.must_not_contain as string[];
maxScore += scoring.no_hallucination || 0.2;
const clean = forbidden.every((f) => !response.toLowerCase().includes(f.toLowerCase()));
if (clean) {
score += scoring.no_hallucination || 0.2;
details.no_hallucination = true;
}
}
// Check JSON/YAML validity
if (expected.valid_json) {
maxScore += 1;
try {
JSON.parse(response);
score += 1;
details.valid_json = true;
} catch {
details.valid_json = false;
}
}
if (expected.valid_yaml) {
maxScore += 1;
try {
yaml.load(response);
score += 1;
details.valid_yaml = true;
} catch {
details.valid_yaml = false;
}
}
// Fallback: at least 1 point max
if (maxScore === 0) maxScore = 1;
return { score, maxScore, details };
}