aiwg

Version:

Deployment tool and support utility for AI context. Copies agents, skills, commands, rules, and behaviors into the paths each AI platform reads (Claude Code, Codex, Copilot, Cursor, Warp, OpenClaw, and 6 more) so one source of truth works across 10 platforms.

aiwg.io

jmagly/aiwg

64 lines (56 loc) • 1.5 kB

text/typescript

View Raw

/**
 * Model types for AIWG evaluation suite
 * Extends matric-eval types from @matric/eval-client
 */

/**
 * Optional settings passed to {@link GenerationModel.generate}.
 * Every field may be omitted; this file does not define any defaults.
 */
export interface GenerationOptions {
  temperature?: number;
  maxTokens?: number;
  stopSequences?: string[];
  stream?: boolean;
}

/**
 * Value resolved by {@link GenerationModel.generate}.
 */
export interface GenerationResult {
  /** Generated text. */
  text: string;
  /** Count of tokens generated. */
  tokensGenerated: number;
  /** Total generation time. NOTE(review): unit is not stated in this file — confirm with implementers. */
  totalTime: number;
  /** Optional time to first token. NOTE(review): unit is not stated in this file — confirm. */
  timeToFirstToken?: number;
}

/**
 * Contract for a model that can be evaluated: a read-only name plus an
 * asynchronous `generate` call.
 */
export interface GenerationModel {
  readonly name: string;
  /**
   * @param prompt - Prompt text handed to the model.
   * @param options - Optional generation settings.
   * @returns A promise of the generation result.
   */
  generate(prompt: string, options?: GenerationOptions): Promise<GenerationResult>;
}

/**
 * A single evaluation test case.
 */
export interface TestCase {
  id: string;
  dimension: string;
  difficulty: 'basic' | 'intermediate' | 'advanced';
  prompt: string;
  /** Free-form expected data; its schema is not defined in this file. */
  expected: Record<string, unknown>;
  /** Named numeric scoring values. NOTE(review): presumably per-criterion points or weights — confirm with the scorer. */
  scoring: Record<string, number>;
}

/**
 * Outcome recorded for one test case.
 */
export interface EvalResult {
  /** NOTE(review): presumably matches {@link TestCase.id} — confirm against the producer. */
  testCaseId: string;
  dimension: string;
  score: number;
  maxScore: number;
  /** Free-form detail data; its schema is not defined in this file. */
  details: Record<string, unknown>;
  /** Latency for this case (milliseconds, going by the field name). */
  latencyMs: number;
  modelResponse: string;
}

/**
 * Tier label attached to a score.
 *
 * Declared once and shared by {@link DimensionScore.tier} and
 * {@link EvalReport.overallTier} so the two unions cannot drift apart.
 */
export type EvalTier = 'opus' | 'sonnet' | 'haiku' | 'not-recommended';

/**
 * Score summary for a single dimension.
 */
export interface DimensionScore {
  dimension: string;
  score: number;
  tier: EvalTier;
  /** Number of test cases counted for this dimension. */
  testCases: number;
  /** Number of those test cases counted as passed. */
  passed: number;
}

/**
 * Complete evaluation report for one model.
 */
export interface EvalReport {
  model: string;
  backend: string;
  /** Report date as a string; the exact format is not specified in this file. */
  date: string;
  aiwgVersion: string;
  dimensions: DimensionScore[];
  overall: number;
  overallTier: EvalTier;
  /** Total latency (milliseconds, going by the field name). */
  totalLatencyMs: number;
  /** Standard benchmark scores from matric-eval, present when --include-matric-benchmarks is set */
  matricBenchmarks?: import('@matric/eval-client').ModelResult;
}