claude-flow
Version:
Ruflo - Enterprise AI agent orchestration for Claude Code. Deploy 60+ specialized agents in coordinated swarms with self-learning, fault-tolerant consensus, vector memory, and MCP integration
597 lines • 32.3 kB
JavaScript
/**
* V3 CLI gaia-bench Command — ADR-133-PR8 + ADR-135 Tracks A/B/D/E/Q + ADR-136 Track Q
*
* Runs GAIA benchmark questions through the claude-flow agent loop and
* reports pass-rate, cost, and per-question results.
*
* Contract (matches gaia-benchmark.yml workflow expectations):
* node bin/cli.js gaia-bench run \
* --level <1|2|3> \
* --limit <N> \
* --models <csv> \
* --output json
*
* JSON output shape:
* {
* level: number,
* model: string,
* summary: { total, passed, passRate, estCostUsd, hardnessDist? },
* results: [{ task_id, question, model, correct, answer, expected_output, error }]
* }
*
* Integration (iter 39 — ADR-135):
* Wires standalone track modules into the CLI so they are usable end-to-end.
* - Track A (--voting-attempts N) : multi-attempt self-consistency voting
* - Track B (--planning-interval N) : periodic planning checkpoints in gaia-agent
* - Track D (--enable-critic) : adversarial critic review after agent answer
* - Track E (--decompose) : question decomposition for multi-step Qs
* - Track Q (--hardness-routing) : hardness-based compute allocation
*
* Precedence when flags combine:
* --hardness-routing overrides --max-turns and --voting-attempts per question.
* --voting-attempts > 1 takes precedence over --enable-critic (cost containment).
* --decompose works independently; sub-question answers feed into voting/critic/plain.
*
* Refs: ADR-133, ADR-135, ADR-136, #2165, iter 28/34/36/37/39
*/
import { output } from '../output.js';
// ---------------------------------------------------------------------------
// Pricing constants for cost estimation
// ---------------------------------------------------------------------------
const MODEL_PRICING = {
'claude-haiku-4-5': { inputPerM: 0.25, outputPerM: 1.25 },
'claude-haiku-3': { inputPerM: 0.25, outputPerM: 1.25 },
'claude-sonnet-4-5': { inputPerM: 3.0, outputPerM: 15.0 },
'claude-sonnet-4-6': { inputPerM: 3.0, outputPerM: 15.0 },
'claude-opus-4-5': { inputPerM: 15.0, outputPerM: 75.0 },
};
function estimateCost(model, totalInputTokens, totalOutputTokens) {
const pricing = MODEL_PRICING[model] ?? { inputPerM: 3.0, outputPerM: 15.0 };
return ((totalInputTokens / 1_000_000) * pricing.inputPerM +
(totalOutputTokens / 1_000_000) * pricing.outputPerM);
}
// ---------------------------------------------------------------------------
// run subcommand
// ---------------------------------------------------------------------------
const runCommand = {
name: 'run',
description: 'Run GAIA benchmark questions against one or more models',
options: [
{
name: 'level',
short: 'l',
type: 'number',
description: 'GAIA difficulty level: 1 (easiest), 2, or 3',
default: '1',
},
{
name: 'limit',
short: 'n',
type: 'number',
description: 'Maximum number of questions to run (default: all)',
},
{
name: 'models',
short: 'm',
type: 'string',
description: 'Comma-separated list of model IDs to test',
default: 'claude-haiku-4-5',
},
{
name: 'output',
short: 'o',
type: 'string',
description: 'Output format: text or json',
default: 'text',
},
{
name: 'concurrency',
short: 'c',
type: 'number',
description: 'Number of questions to run in parallel (default: 3)',
default: '3',
},
{
name: 'smoke-only',
type: 'boolean',
description: 'Use the 5-question smoke fixture instead of real HF dataset (no HF token required)',
default: 'false',
},
{
name: 'max-turns',
type: 'number',
description: 'Maximum agent turns per question (default: 12). Overridden per-question when --hardness-routing is enabled.',
default: '12',
},
{
name: 'judge-model',
type: 'string',
description: 'Model for LLM-as-judge scoring (default: claude-sonnet-4-6)',
default: 'claude-sonnet-4-6',
},
{
name: 'voting-attempts',
type: 'number',
description: 'Number of parallel attempts for majority-vote self-consistency (default: 1 = no voting). N>1 costs Nx per question. Recommended: 3 (+5-10pp L1 lift per ADR-135 Track A). Overridden per-question when --hardness-routing is enabled.',
default: '1',
},
{
name: 'hardness-routing',
type: 'boolean',
description: 'ADR-136 Track Q: enable hardness-based compute routing. Trains a linear classifier from historical result JSONs and allocates: easy=Haiku/4t/1-attempt, medium=Sonnet/8t/1-attempt, hard=Sonnet/12t/3-vote. Overrides --max-turns and --voting-attempts per question.',
default: 'false',
},
{
name: 'hardness-verbose',
type: 'boolean',
description: 'ADR-136 Track Q: log hardness prediction for each question (requires --hardness-routing).',
default: 'false',
},
{
name: 'enable-critic',
type: 'boolean',
description: 'ADR-135 Track D: enable adversarial critic review after agent answer (+3-5pp L1 lift expected). Skipped when --voting-attempts > 1 (cost containment — voting takes precedence).',
default: 'false',
},
{
name: 'decompose',
type: 'boolean',
description: 'ADR-135 Track E: decompose complex questions into 1-5 sub-questions before solving (+5-10pp on multi-step Qs, ~30-40% of L1 set). Each sub-question runs through voting/critic/plain independently; sub-answers are synthesized before judging.',
default: 'false',
},
{
name: 'planning-interval',
type: 'number',
description: 'ADR-135 Track B: inject a planning checkpoint every N tool_use turns (default: 4, set 0 to disable). Based on smolagents finding — prevents tunnel-vision on bad strategies.',
default: '4',
},
{
name: 'enable-convergence',
type: 'boolean',
description: 'iter 62: enable convergence layer — forces a final commit when max_turns, loop, or token_overflow is detected (default: true). Disabling is for ablation only.',
default: 'true',
},
],
examples: [
{
command: 'claude-flow gaia-bench run --level 1 --limit 10 --models claude-haiku-4-5 --output json',
description: 'Run 10 Level-1 questions with Haiku, JSON output',
},
{
command: 'claude-flow gaia-bench run --level 1 --limit 10 --models claude-haiku-4-5,claude-sonnet-4-6',
description: 'Compare Haiku vs Sonnet on 10 Level-1 questions',
},
{
command: 'claude-flow gaia-bench run --smoke-only --output json',
description: 'Quick smoke test (5 fixture questions, no HF token needed)',
},
{
command: 'claude-flow gaia-bench run --level 1 --limit 20 --models claude-haiku-4-5 --voting-attempts 3 --output json',
description: 'Self-consistency voting: run each question 3x, majority-vote (ADR-135 Track A, +5-10pp expected)',
},
{
command: 'claude-flow gaia-bench run --level 1 --models claude-sonnet-4-6 --hardness-routing --output json',
description: 'ADR-136 Track Q: auto-route questions to Haiku/Sonnet based on predicted difficulty',
},
{
command: 'claude-flow gaia-bench run --level 1 --models claude-sonnet-4-6 --enable-critic --output json',
description: 'ADR-135 Track D: adversarial critic reviews each answer before submission (+3-5pp expected)',
},
{
command: 'claude-flow gaia-bench run --level 1 --models claude-sonnet-4-6 --decompose --output json',
description: 'ADR-135 Track E: decompose complex questions into sub-questions (+5-10pp on multi-step Qs)',
},
{
command: 'claude-flow gaia-bench run --level 1 --models claude-sonnet-4-6 --hardness-routing --enable-critic --planning-interval 4',
description: 'Recommended config: hardness routing + critic + planning checkpoints (~$2/run est.)',
},
],
action: async (ctx) => {
const level = parseInt(String(ctx.flags.level ?? '1'), 10);
const limit = ctx.flags.limit ? parseInt(String(ctx.flags.limit), 10) : undefined;
const modelsRaw = String(ctx.flags.models ?? 'claude-haiku-4-5');
const models = modelsRaw.split(',').map((m) => m.trim()).filter(Boolean);
const outputFormat = String(ctx.flags.output ?? 'text');
const concurrency = parseInt(String(ctx.flags.concurrency ?? '3'), 10);
// Parser converts --smoke-only to camelCase "smokeOnly"
const smokeOnly = ctx.flags['smokeOnly'] === true || ctx.flags['smokeOnly'] === 'true' ||
ctx.flags['smoke-only'] === true || ctx.flags['smoke-only'] === 'true';
// Parser converts --max-turns to maxTurns, --judge-model to judgeModel, --voting-attempts to votingAttempts
// NOTE: default must match DEFAULT_MAX_TURNS in benchmarks/gaia-agent.ts
const maxTurns = parseInt(String(ctx.flags['maxTurns'] ?? ctx.flags['max-turns'] ?? '12'), 10);
const judgeModel = String(ctx.flags['judgeModel'] ?? ctx.flags['judge-model'] ?? 'claude-sonnet-4-6');
// votingAttempts=1 means no voting (backward-compat default). N>1 routes through runGaiaAgentWithVoting.
const votingAttempts = parseInt(String(ctx.flags['votingAttempts'] ?? ctx.flags['voting-attempts'] ?? '1'), 10);
const useVoting = votingAttempts > 1;
// ADR-136 Track Q: hardness-based routing.
const hardnessRouting = ctx.flags['hardnessRouting'] === true || ctx.flags['hardnessRouting'] === 'true' ||
ctx.flags['hardness-routing'] === true || ctx.flags['hardness-routing'] === 'true';
const hardnessVerbose = ctx.flags['hardnessVerbose'] === true || ctx.flags['hardnessVerbose'] === 'true' ||
ctx.flags['hardness-verbose'] === true || ctx.flags['hardness-verbose'] === 'true';
// ADR-135 Track D: adversarial critic.
// Voting takes precedence over critic when both are enabled (cost containment).
const enableCritic = !useVoting && (ctx.flags['enableCritic'] === true || ctx.flags['enableCritic'] === 'true' ||
ctx.flags['enable-critic'] === true || ctx.flags['enable-critic'] === 'true');
// ADR-135 Track E: question decomposition.
const enableDecompose = ctx.flags['decompose'] === true || ctx.flags['decompose'] === 'true';
// ADR-135 Track B: planning interval (passed through to runGaiaAgent via agentOpts).
const planningInterval = parseInt(String(ctx.flags['planningInterval'] ?? ctx.flags['planning-interval'] ?? '4'), 10);
// iter 62: convergence layer — default ON, disable with --no-enable-convergence.
// Note: boolean false is falsy, so we check for explicit false values only.
const enableConvergence = !(ctx.flags['enableConvergence'] === false || ctx.flags['enableConvergence'] === 'false' ||
ctx.flags['enable-convergence'] === false || ctx.flags['enable-convergence'] === 'false');
// Dynamic imports to avoid loading at startup.
// NOTE: gaia-*.ts sources are pre-compiled under dist/src/benchmarks/ only --
// they are NOT in the src/ include glob so TypeScript cannot resolve them
// statically. We resolve the absolute path from import.meta.url at runtime
// and cast to `any` to bypass the static-analysis check.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const benchmarksBase = new URL('../benchmarks/', import.meta.url).href;
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const { loadGaia } = (await import(benchmarksBase + 'gaia-loader.js'));
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const { runGaiaAgent } = (await import(benchmarksBase + 'gaia-agent.js'));
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const { judgeAnswer } = (await import(benchmarksBase + 'gaia-judge.js'));
// ADR-135 Track A: voting wrapper (imported when --voting-attempts > 1 OR hardness routing triggers it).
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const { runGaiaAgentWithVoting } = (useVoting || hardnessRouting)
// eslint-disable-next-line @typescript-eslint/no-explicit-any
? (await import(benchmarksBase + 'gaia-voting.js'))
: { runGaiaAgentWithVoting: null };
// ADR-135 Track D: critic wrapper (only imported when --enable-critic and no voting).
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const { runGaiaAgentWithCritic } = enableCritic
// eslint-disable-next-line @typescript-eslint/no-explicit-any
? (await import(benchmarksBase + 'gaia-critic.js'))
: { runGaiaAgentWithCritic: null };
// ADR-135 Track E: decomposer (only imported when --decompose).
// eslint-disable-next-line @typescript-eslint/no-explicit-any
let decomposeQuestion = null;
// eslint-disable-next-line @typescript-eslint/no-explicit-any
let synthesizeFromSubAnswers = null;
if (enableDecompose) {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const decomposerMod = (await import(benchmarksBase + 'gaia-decomposer.js'));
decomposeQuestion = decomposerMod.decomposeQuestion;
synthesizeFromSubAnswers = decomposerMod.synthesizeFromSubAnswers;
}
// ADR-136 Track Q: hardness predictor.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
let hardnessPredictor = null;
if (hardnessRouting) {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const { HardnessPredictor } = (await import(benchmarksBase + 'gaia-hardness/predictor.js'));
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const { loadTrainingData } = (await import(benchmarksBase + 'gaia-hardness/train-data-loader.js'));
hardnessPredictor = new HardnessPredictor({ conservativeMode: true });
const trainingData = loadTrainingData([], hardnessVerbose);
if (trainingData.length >= 10) {
hardnessPredictor.train(trainingData);
}
// If < 10 examples: cold-start (medium for all) -- documented fallback.
}
// Only print to stderr so stdout stays clean for JSON consumers
const log = (msg) => {
if (outputFormat !== 'json') {
output.writeln(msg);
}
else {
process.stderr.write(msg + '\n');
}
};
log('');
log(output.bold(`GAIA Benchmark -- Level ${level}${smokeOnly ? ' [SMOKE]' : ''}`));
log(output.dim('-'.repeat(60)));
log(`Models : ${models.join(', ')}`);
log(`Limit : ${limit ?? 'all'}`);
log(`Concurrency: ${concurrency}`);
if (useVoting && !hardnessRouting) {
log(`Voting : ${votingAttempts}x self-consistency (ADR-135 Track A) -- cost ~${votingAttempts}x per question`);
}
if (enableCritic) {
log(`Critic : ADR-135 Track D enabled -- adversarial review after each answer`);
}
if (enableDecompose) {
log(`Decompose: ADR-135 Track E enabled -- multi-step questions will be split into sub-questions`);
}
if (planningInterval > 0) {
log(`Planning: ADR-135 Track B -- checkpoint every ${planningInterval} turns`);
}
if (hardnessRouting) {
const trainedStatus = hardnessPredictor?.isTrained
? 'trained (classifier active)'
: 'cold-start (no training data -> all medium)';
log(`Hardness: ADR-136 Track Q enabled -- ${trainedStatus}`);
log(' easy=Haiku/4t/1-attempt medium=Sonnet/8t/1-attempt hard=Sonnet/12t/3-vote');
}
log('');
// Load questions
// eslint-disable-next-line @typescript-eslint/no-explicit-any
let questions;
try {
questions = await loadGaia({ level, limit, smokeOnly });
}
catch (err) {
const msg = err instanceof Error ? err.message : String(err);
if (outputFormat === 'json') {
process.stdout.write(JSON.stringify({ error: `Failed to load GAIA dataset: ${msg}` }, null, 2) + '\n');
}
else {
output.writeln(output.error(`Failed to load GAIA dataset: ${msg}`));
}
return { success: false };
}
log(`Loaded : ${questions.length} questions`);
log('');
const allModelOutputs = [];
// Resolve the API key once (shared across all per-question agent calls).
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const { resolveAnthropicApiKey } = (await import(benchmarksBase + 'gaia-agent.js'));
const apiKey = resolveAnthropicApiKey();
for (const model of models) {
log(output.bold(`Running model: ${model}`));
log(output.dim('-'.repeat(40)));
const results = [];
let totalInputTokens = 0;
let totalOutputTokens = 0;
let totalTurns = 0;
let totalWallMs = 0;
// ADR-136 Track Q: hardness distribution tracking.
const hardnessDist = { easy: 0, medium: 0, hard: 0 };
// Process questions in batches of `concurrency`
for (let i = 0; i < questions.length; i += concurrency) {
const batch = questions.slice(i, Math.min(i + concurrency, questions.length));
const batchResults = await Promise.all(batch.map(async (q) => {
const qIdx = i + batch.indexOf(q) + 1;
// ADR-136 Track Q: predict hardness and set per-question compute budget.
let effectiveModel = model;
let effectiveMaxTurns = maxTurns;
let effectiveVotingAttempts = votingAttempts;
let predictedDifficulty;
let predictedConfidence;
if (hardnessRouting && hardnessPredictor) {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const prediction = hardnessPredictor.predict(q);
predictedDifficulty = prediction.difficulty;
predictedConfidence = prediction.confidence;
hardnessDist[predictedDifficulty]++;
// Override compute budget from hardness policy.
const budget = prediction.budget;
effectiveModel = budget.model === 'haiku'
? 'claude-haiku-4-5'
: (model.includes('sonnet') ? model : 'claude-sonnet-4-6');
effectiveMaxTurns = budget.maxTurns;
effectiveVotingAttempts = budget.votingAttempts;
if (hardnessVerbose) {
log(` [${qIdx}/${questions.length}] ${q.task_id} hardness=${predictedDifficulty}` +
` conf=${((predictedConfidence ?? 0) * 100).toFixed(0)}%` +
` -> ${effectiveModel} / ${effectiveMaxTurns}t / ${effectiveVotingAttempts}-attempt`);
}
else {
log(` [${qIdx}/${questions.length}] ${q.task_id} [${predictedDifficulty}] -- ${String(q.question).slice(0, 50)}...`);
}
}
else {
log(` [${qIdx}/${questions.length}] ${q.task_id} -- ${String(q.question).slice(0, 60)}...`);
}
const useThisVoting = effectiveVotingAttempts > 1;
// Critic is suppressed when voting is active (same precedence rule as the global flag).
const useThisCritic = enableCritic && !useThisVoting && runGaiaAgentWithCritic;
// Shared agent options (Track B planning interval + iter 62 convergence wired here).
const agentOpts = {
model: effectiveModel,
maxTurns: effectiveMaxTurns,
planningInterval,
apiKey,
enableConvergence,
};
// ADR-135 Track E: decompose the question if enabled.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
let decomposedResult = null;
// eslint-disable-next-line @typescript-eslint/no-explicit-any
let questionsToSolve = [q];
if (enableDecompose && decomposeQuestion) {
try {
decomposedResult = await decomposeQuestion(q.question, { apiKey });
if (decomposedResult?.decomposed === true && Array.isArray(decomposedResult.subQuestions) && decomposedResult.subQuestions.length > 1) {
questionsToSolve = decomposedResult.subQuestions.map((sq) => ({ ...q, question: sq }));
log(` decomposed into ${questionsToSolve.length} sub-questions`);
}
}
catch {
// Graceful fallback: treat as atomic question.
decomposedResult = null;
}
}
// Solve each (sub-)question.
const subAnswers = [];
let lastAgentResult = null; // eslint-disable-line @typescript-eslint/no-explicit-any
let solveError;
for (const sq of questionsToSolve) {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
let agentResult;
try {
if (useThisVoting && runGaiaAgentWithVoting) {
// ADR-135 Track A: multi-attempt majority voting.
agentResult = await runGaiaAgentWithVoting(sq, {
...agentOpts,
attempts: effectiveVotingAttempts,
});
const vr = agentResult;
log(` vote-method=${vr.votingMethod ?? '?'} agreement=${vr.agreementCount ?? '?'}/${effectiveVotingAttempts}`);
}
else if (useThisCritic) {
// ADR-135 Track D: critic-wrapped agent.
agentResult = await runGaiaAgentWithCritic(sq, {
...agentOpts,
enableCritic: true,
});
const cr = agentResult;
log(` critic-verdict=${cr.criticVerdict ?? '?'}`);
}
else {
agentResult = await runGaiaAgent(sq, agentOpts);
}
}
catch (err) {
solveError = err instanceof Error ? err.message : String(err);
log(` ERROR: ${solveError}`);
break;
}
subAnswers.push(agentResult.finalAnswer ?? '');
lastAgentResult = agentResult;
}
if (solveError || !lastAgentResult) {
return {
task_id: q.task_id,
question: q.question,
model: effectiveModel,
correct: false,
answer: null,
expected_output: q.final_answer,
error: solveError ?? 'no result',
hardnessDifficulty: predictedDifficulty,
hardnessConfidence: predictedConfidence,
decomposed: decomposedResult?.decomposed === true,
};
}
// ADR-135 Track E: synthesize sub-answers if decomposed.
let finalAnswer = subAnswers[0] ?? null;
if (decomposedResult?.decomposed === true && questionsToSolve.length > 1 && synthesizeFromSubAnswers) {
try {
const synth = await synthesizeFromSubAnswers(decomposedResult, subAnswers, { apiKey });
finalAnswer = synth.finalAnswer ?? finalAnswer;
}
catch {
// Graceful fallback: use first sub-answer.
}
}
// Judge the answer
// eslint-disable-next-line @typescript-eslint/no-explicit-any
let judgeResult;
try {
judgeResult = await judgeAnswer({ id: q.task_id, expected: q.final_answer, questionText: q.question }, finalAnswer, { judgeModel });
}
catch (err) {
const errorMsg = err instanceof Error ? err.message : String(err);
judgeResult = {
questionId: q.task_id,
passed: false,
scoringPath: 'exact-match',
candidateAnswer: finalAnswer ?? '',
groundTruth: q.final_answer,
judgeReason: `Judge error: ${errorMsg}`,
};
}
const verdict = judgeResult.passed ? output.success('PASS') : output.error('FAIL');
log(` ${verdict} answer="${finalAnswer ?? 'null'}" expected="${q.final_answer}"` +
` turns=${lastAgentResult.turns} ${(lastAgentResult.wallMs / 1000).toFixed(1)}s`);
return {
task_id: q.task_id,
question: q.question,
model: effectiveModel,
correct: judgeResult.passed,
answer: finalAnswer,
expected_output: q.final_answer,
error: lastAgentResult.error,
turns: lastAgentResult.turns,
wallMs: lastAgentResult.wallMs,
inputTokens: lastAgentResult.totalInputTokens,
outputTokens: lastAgentResult.totalOutputTokens,
hardnessDifficulty: predictedDifficulty,
hardnessConfidence: predictedConfidence,
decomposed: decomposedResult?.decomposed === true,
};
}));
for (const r of batchResults) {
results.push(r);
totalInputTokens += r.inputTokens ?? 0;
totalOutputTokens += r.outputTokens ?? 0;
totalTurns += r.turns ?? 0;
totalWallMs += r.wallMs ?? 0;
}
}
const passed = results.filter((r) => r.correct).length;
const total = results.length;
const passRate = total > 0 ? passed / total : 0;
const estCostUsd = estimateCost(model, totalInputTokens, totalOutputTokens);
const modelOutput = {
level,
model,
summary: {
total,
passed,
passRate,
estCostUsd,
meanTurns: total > 0 ? totalTurns / total : 0,
meanWallMs: total > 0 ? totalWallMs / total : 0,
...(hardnessRouting ? { hardnessDist } : {}),
},
results,
};
allModelOutputs.push(modelOutput);
log('');
log(output.bold(`Results for ${model}:`));
log(` Pass rate : ${passed}/${total} (${(passRate * 100).toFixed(1)}%)`);
log(` Est. cost : $${estCostUsd.toFixed(4)}`);
log(` Mean turns: ${modelOutput.summary.meanTurns.toFixed(1)}`);
log(` Mean time : ${(modelOutput.summary.meanWallMs / 1000).toFixed(1)}s per question`);
if (hardnessRouting) {
log(` Hardness : easy=${hardnessDist.easy} medium=${hardnessDist.medium} hard=${hardnessDist.hard}`);
}
log('');
}
// Output results
if (outputFormat === 'json') {
if (allModelOutputs.length === 1) {
// Single model: emit flat object (matches workflow contract)
process.stdout.write(JSON.stringify(allModelOutputs[0], null, 2) + '\n');
}
else {
// Multiple models: emit array
process.stdout.write(JSON.stringify(allModelOutputs, null, 2) + '\n');
}
}
else {
// Print summary table
output.writeln(output.bold('Summary'));
output.writeln(output.dim('-'.repeat(60)));
for (const m of allModelOutputs) {
const pct = (m.summary.passRate * 100).toFixed(1);
output.writeln(`${m.model.padEnd(28)} ${m.summary.passed}/${m.summary.total} (${pct}%)` +
` cost=$${m.summary.estCostUsd.toFixed(4)}` +
` turns=${m.summary.meanTurns.toFixed(1)}`);
}
}
return { success: true };
},
};
// ---------------------------------------------------------------------------
// Main gaia-bench command
// ---------------------------------------------------------------------------
export const gaiaBenchCommand = {
name: 'gaia-bench',
description: 'GAIA benchmark harness -- measure agent pass-rate on real GAIA questions',
subcommands: [runCommand],
examples: [
{
command: 'claude-flow gaia-bench run --level 1 --limit 10 --models claude-haiku-4-5 --output json',
description: 'Mini Level-1 run with Haiku, JSON output',
},
{
command: 'claude-flow gaia-bench run --smoke-only',
description: 'Quick smoke test with built-in fixture (no HF token)',
},
{
command: 'claude-flow gaia-bench run --level 1 --models claude-sonnet-4-6 --hardness-routing --output json',
description: 'ADR-136 Track Q: tiered compute routing by predicted question difficulty',
},
{
command: 'claude-flow gaia-bench run --level 1 --models claude-sonnet-4-6 --hardness-routing --enable-critic --planning-interval 4',
description: 'Recommended config: all tracks active, ~$2/run estimated',
},
],
};
export default gaiaBenchCommand;
//# sourceMappingURL=gaia-bench.js.map