claude-flow
Version:
Ruflo - Enterprise AI agent orchestration for Claude Code. Deploy 60+ specialized agents in coordinated swarms with self-learning, fault-tolerant consensus, vector memory, and MCP integration
136 lines • 6.19 kB
JavaScript
/**
* GAIA End-to-End Smoke — ADR-133
*
* Wires gaia-agent.ts + gaia-judge.ts into a single end-to-end pipeline:
*
* for each question in SMOKE_FIXTURE:
* 1. runGaiaAgent(question) — Haiku agent loop, ≤8 turns
* 2. judgeAnswer(question, result.finalAnswer) — exact-match fast-path,
* Sonnet LLM-judge only if exact-match misses
*
* Reports: pass rate, total cost, mean turn count.
* Asserts: ≥ 3/5 questions pass (lenient — smoke fixture is not trivial).
*
* Cost discipline:
* - Agent: claude-haiku-4-5 at $0.25/$1.25 per M tokens
* - Judge: claude-sonnet-4-6 at $3/$15 per M tokens (only when needed)
* - Expected total for 5 questions × ~2 turns × Haiku + 1-2 Sonnet
* judge calls ≈ $0.02
*
* Usage:
* ANTHROPIC_API_KEY=sk-ant-... npx tsx src/benchmarks/gaia-e2e-smoke.ts
*
* Refs: ADR-133, #2156
*/
import * as os from 'node:os';
import * as path from 'node:path';
import { SMOKE_FIXTURE, } from './gaia-loader.js';
import { runGaiaAgent, } from './gaia-agent.js';
import { judgeAnswer, } from './gaia-judge.js';
// ---------------------------------------------------------------------------
// Configuration
// ---------------------------------------------------------------------------
/**
 * Agent model — Haiku only for cost discipline.
 * Passed as the `model` option to runGaiaAgent for every question.
 */
const AGENT_MODEL = 'claude-haiku-4-5';
/**
 * Judge model — Sonnet for semantic judgments (only when exact-match fails).
 * Handed to judgeAnswer as `judgeModel` in its options object.
 */
const JUDGE_MODEL = 'claude-sonnet-4-6';
/** Minimum pass rate for smoke to succeed, as a fraction (3 of 5 questions = 60%). */
const MIN_PASS_RATE = 3 / 5;
// Haiku pricing in USD per million tokens (input / output).
// Used by runE2ESmoke to estimate the agent cost from the reported token counts.
// NOTE(review): confirm these rates against the current published pricing for AGENT_MODEL.
const HAIKU_IN = 0.25;
const HAIKU_OUT = 1.25;
// Sonnet pricing in USD per million tokens (input / output).
// NOTE(review): not referenced anywhere else in this file — runE2ESmoke takes the
// judge cost from judgeResult.judgeCostUsd instead; kept for reference.
const SONNET_IN = 3.0;
const SONNET_OUT = 15.0;
// ---------------------------------------------------------------------------
// Runner
// ---------------------------------------------------------------------------
/**
 * Runs the GAIA end-to-end smoke.
 *
 * Every question in SMOKE_FIXTURE is sent through runGaiaAgent, the agent's
 * final answer is scored with judgeAnswer, and a report is printed: one
 * progress line per question, a summary (pass rate, mean turns, agent/judge
 * cost), and a per-question detail section.
 *
 * Side effects: writes to stdout/stderr and calls process.exit(1) when
 * ANTHROPIC_API_KEY is missing or blank, or when the pass rate is below
 * MIN_PASS_RATE. An empty fixture is reported as 0/0 (0%) and fails the
 * pass-rate check.
 *
 * @returns {Promise<void>} Resolves after the report is printed and the
 *   pass-rate check has succeeded.
 */
async function runE2ESmoke() {
  // Longest question prefix echoed on the per-question progress line.
  const PREVIEW_CHARS = 60;

  const hasKey = !!(process.env.ANTHROPIC_API_KEY?.trim());
  if (!hasKey) {
    console.error('ANTHROPIC_API_KEY is required for the end-to-end smoke.\n' +
      'Set it with: export ANTHROPIC_API_KEY=sk-ant-...');
    process.exit(1);
  }

  // Directory handed to judgeAnswer as `cacheDir`, under the user's home directory.
  const cacheDir = path.join(os.homedir(), '.cache', 'ruflo', 'gaia', 'judgments');
  const judgeOpts = { judgeModel: JUDGE_MODEL, cacheDir };
  const questions = SMOKE_FIXTURE;
  const rows = [];
  const requiredPct = (MIN_PASS_RATE * 100).toFixed(0);

  console.log(`\n=== GAIA End-to-End Smoke (${questions.length} questions) ===\n`);
  console.log(`Agent: ${AGENT_MODEL} | Judge: ${JUDGE_MODEL}\n` +
    `Questions: ${questions.length} | Min pass rate: ${requiredPct}%\n`);

  // Questions run one at a time so each progress line is completed by its own verdict.
  for (const q of questions) {
    // Only show an ellipsis when the question text was actually cut.
    const preview = q.question.length > PREVIEW_CHARS
      ? `${q.question.slice(0, PREVIEW_CHARS)}...`
      : q.question;
    process.stdout.write(`[${q.task_id}] "${preview}" `);

    // Run agent
    const agentResult = await runGaiaAgent(q, { model: AGENT_MODEL });

    // Judge
    const judgeResult = await judgeAnswer(
      { id: q.task_id, expected: q.final_answer, questionText: q.question },
      agentResult.finalAnswer,
      judgeOpts,
    );
    rows.push({ question: q, agentResult, judgeResult });

    const verdict = judgeResult.passed ? '\x1b[32mPASS\x1b[0m' : '\x1b[31mFAIL\x1b[0m';
    const scoringPath = judgeResult.scoringPath;
    console.log(`${verdict} (turns=${agentResult.turns}, ` +
      `answer="${agentResult.finalAnswer ?? 'null'}", ` +
      `expected="${q.final_answer}", path=${scoringPath})`);
  }

  // ── Summary ──
  console.log('\n--- Summary ---\n');
  const passed = rows.filter((r) => r.judgeResult.passed).length;
  const total = rows.length;
  let totalAgentCostUsd = 0;
  let totalJudgeCostUsd = 0;
  let totalTurns = 0;
  for (const { agentResult, judgeResult } of rows) {
    totalTurns += agentResult.turns;
    // Token counts are priced per million tokens.
    totalAgentCostUsd +=
      (agentResult.totalInputTokens / 1_000_000) * HAIKU_IN +
      (agentResult.totalOutputTokens / 1_000_000) * HAIKU_OUT;
    totalJudgeCostUsd += judgeResult.judgeCostUsd ?? 0;
  }
  const totalCostUsd = totalAgentCostUsd + totalJudgeCostUsd;
  // Guard the 0/0 case so an empty fixture reports zeros instead of NaN.
  const passRate = total > 0 ? passed / total : 0;
  const meanTurns = total > 0 ? totalTurns / total : 0;

  console.log(`Pass rate : ${passed}/${total} (${(passRate * 100).toFixed(0)}%)`);
  console.log(`Mean turns : ${meanTurns.toFixed(1)}`);
  console.log(`Agent cost : $${totalAgentCostUsd.toFixed(5)} (Haiku)`);
  console.log(`Judge cost : $${totalJudgeCostUsd.toFixed(5)} (Sonnet, only when needed)`);
  console.log(`Total cost : $${totalCostUsd.toFixed(5)}`);

  // ── Per-row detail ──
  console.log('\n--- Per-question detail ---\n');
  for (const { question, agentResult, judgeResult } of rows) {
    const verdict = judgeResult.passed ? 'PASS' : 'FAIL';
    console.log(` ${verdict} ${question.task_id} turns=${agentResult.turns} ` +
      `path=${judgeResult.scoringPath} ` +
      `answer="${agentResult.finalAnswer ?? 'null'}" ` +
      `expected="${question.final_answer}"`);
    if (judgeResult.judgeReason) {
      console.log(` reason: ${judgeResult.judgeReason}`);
    }
    if (agentResult.error) {
      console.log(` error : ${agentResult.error}`);
    }
  }

  // ── Assertion ──
  console.log('');
  if (passRate >= MIN_PASS_RATE) {
    console.log(`\x1b[32mSmoke PASSED\x1b[0m — ${passed}/${total} ≥ ${requiredPct}% required.\n`);
  }
  else {
    console.error(`\x1b[31mSmoke FAILED\x1b[0m — ${passed}/${total} < ${requiredPct}% required.\n`);
    process.exit(1);
  }
}
// ---------------------------------------------------------------------------
// Entry point
// ---------------------------------------------------------------------------
// Launch the smoke only when the script path given to Node (argv[1]) names
// this file, in either its TypeScript source or compiled JavaScript form.
// Importing the module from elsewhere therefore does not start a run.
const [, invokedScript] = process.argv;
const isMain = Boolean(invokedScript) &&
  ['gaia-e2e-smoke.ts', 'gaia-e2e-smoke.js'].some((entryName) => invokedScript.endsWith(entryName));
if (isMain) {
  // Any rejection from the run is reported and turned into a non-zero exit code.
  runE2ESmoke().catch((err) => {
    console.error('E2E smoke failed:', err);
    process.exit(1);
  });
}
export { runE2ESmoke };
//# sourceMappingURL=gaia-e2e-smoke.js.map