claude-flow
Version:
Ruflo - Enterprise AI agent orchestration for Claude Code. Deploy 60+ specialized agents in coordinated swarms with self-learning, fault-tolerant consensus, vector memory, and MCP integration
327 lines • 14.2 kB
JavaScript
/**
* GAIA Critic Smoke Tests — ADR-135 Track D
*
* Tests for the adversarial critic agent in gaia-critic.ts.
* All tests use mocked responses — NO live API calls.
*
* Test coverage:
* 1. Critic returns "pass" → no retry, immediately returns candidate
* 2. Critic returns "fail" with suggestedRevision → verdict carries the revision (the retry path is covered by test 3)
* 3. Critic returns "fail" twice → retries exhausted, returns last candidate
* 4. Critic returns "uncertain" → treated as "pass", no retry
* 5. API error in critic → graceful fallback, returns candidate as-is
* 6. Malformed JSON from critic → fallback parser extracts verdict
*
* Usage (no API key required):
* npx tsx src/benchmarks/gaia-critic.smoke.ts
*
* Refs: ADR-135, #2156
*/
import { criticReview, runGaiaAgentWithCritic, } from './gaia-critic.js';
// ---------------------------------------------------------------------------
// Test helpers
// ---------------------------------------------------------------------------
// Running tallies of assertion outcomes: assert() increments one of them per
// check, and main() prints both and exits non-zero when `failed` is above zero.
let passed = 0;
let failed = 0;
/**
 * Record the outcome of a single check.
 *
 * Never throws: a falsy condition is logged to stderr and counted in `failed`,
 * a truthy one is logged to stdout and counted in `passed`.
 *
 * @param {*} condition - Value evaluated for truthiness.
 * @param {string} message - Label printed next to the PASS/FAIL marker.
 */
function assert(condition, message) {
  if (!condition) {
    console.error(` FAIL ${message}`);
    failed += 1;
    return;
  }
  console.log(` PASS ${message}`);
  passed += 1;
}
/**
 * Check two values with strict equality (`===`) and report through assert().
 *
 * @param {*} actual - Value produced by the code under test.
 * @param {*} expected - Value it is compared against.
 * @param {string} message - Label; the expected/actual pair is appended to it.
 */
function assertEqual(actual, expected, message) {
  const isMatch = actual === expected;
  const report = `${message} (expected ${String(expected)}, got ${String(actual)})`;
  assert(isMatch, report);
}
/**
 * A minimal GaiaQuestion fixture shared by every test in this file.
 *
 * It is passed as the question argument to criticReview and
 * runGaiaAgentWithCritic, and its `task_id` is reused by makeAgentResult as
 * `questionId`. `file_name` / `file_path` are null here — presumably meaning
 * "no attachment"; confirm against the GaiaQuestion type.
 */
const FIXTURE_QUESTION = {
task_id: 'smoke-001',
level: 1,
question: 'What is the capital of France?',
final_answer: 'Paris',
file_name: null,
file_path: null,
};
/**
 * Build a minimal GaiaAgentResult-shaped object for the fixture question.
 *
 * @param {*} finalAnswer - Stored verbatim as `finalAnswer`.
 * @param {number} [turns=2] - Stored verbatim as `turns`.
 * @returns {object} Result carrying the fixture's task id plus fixed tool-call,
 *   token and wall-time figures.
 */
function makeAgentResult(finalAnswer, turns = 2) {
  const { task_id: questionId } = FIXTURE_QUESTION;
  // Constant bookkeeping figures; only the answer and turn count vary per call.
  const fixedMetrics = {
    toolCallsByName: { web_search: 1 },
    totalInputTokens: 100,
    totalOutputTokens: 50,
    wallMs: 1200,
  };
  return { questionId, finalAnswer, turns, ...fixedMetrics };
}
/**
 * Replace `globalThis.fetch` with a stub that replays canned responses.
 *
 * Each call to the stub consumes the next entry of `responses`, in order. Once
 * the list is exhausted, the last entry is replayed for every further call.
 *
 * Entry shapes:
 *   - `{ ok: true, body }` resolves to `{ ok: true, status: 200 }` whose
 *     `json()` yields `body` (default `{}`) and whose `text()` yields its JSON
 *     serialisation.
 *   - `{ ok: false, status, text }` resolves to `{ ok: false }` with `status`
 *     (default 500), a `text()` yielding `text` (default 'Internal error') and
 *     a `json()` that rejects.
 *
 * If no usable entry exists (empty or non-array `responses`, or a nullish
 * entry), the stub rejects with a descriptive Error instead of an opaque
 * TypeError from dereferencing `undefined`.
 *
 * @param {Array<object>} responses - Canned responses, in expected call order.
 * @returns {() => void} Function that reinstates the fetch that was installed
 *   when this mock was created. Call it in a `finally` block.
 */
function installFetchMock(responses) {
  const queue = Array.isArray(responses) ? responses : [];
  const original = globalThis.fetch;
  let callIndex = 0;
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  globalThis.fetch = async (_url, _init) => {
    // Clamp to the final entry so surplus calls replay the last response.
    const resp = queue[Math.min(callIndex, queue.length - 1)];
    callIndex++;
    if (resp == null) {
      throw new Error(`installFetchMock: no mock response configured for fetch call #${callIndex}`);
    }
    if (!resp.ok) {
      return {
        ok: false,
        status: resp.status ?? 500,
        text: async () => resp.text ?? 'Internal error',
        json: async () => { throw new Error('not ok'); },
      };
    }
    return {
      ok: true,
      status: 200,
      json: async () => resp.body ?? {},
      text: async () => JSON.stringify(resp.body ?? {}),
    };
  };
  return () => { globalThis.fetch = original; };
}
/**
 * Build a mock Anthropic response body whose single text block is the
 * JSON-serialised critic verdict.
 *
 * @param {string} verdict - Verdict label placed in the payload.
 * @param {string} reasoning - Reasoning text placed in the payload.
 * @param {string} [suggestedRevision=''] - Revision placed in the payload.
 * @returns {{content: Array<{type: string, text: string}>, usage: object}}
 *   Body with one text block and fixed token usage (200 in / 40 out).
 */
function mockCriticResponse(verdict, reasoning, suggestedRevision = '') {
  const verdictPayload = { verdict, reasoning, suggestedRevision };
  const textBlock = { type: 'text', text: JSON.stringify(verdictPayload) };
  return {
    content: [textBlock],
    usage: { input_tokens: 200, output_tokens: 40 },
  };
}
/**
 * Mocking strategy.
 *
 * gaia-agent's runGaiaAgent is imported directly by the module under test, so
 * it cannot be swapped out from here without rewriting the import mechanism.
 * Because the agent and the critic both call the Anthropic API, the tests
 * intercept at the fetch level instead (see installFetchMock).
 *
 * Tests 1, 2, 5 and 6 call criticReview directly with a single canned critic
 * response. Tests 3 and 4 call runGaiaAgentWithCritic and queue the agent and
 * critic responses in the order the calls are expected to be made.
 */
// ---------------------------------------------------------------------------
// Test 1: critic returns "pass" → no retry
// ---------------------------------------------------------------------------
/**
 * Test 1 — a "pass" verdict from the critic is surfaced unchanged.
 *
 * Calls criticReview against one canned "pass" response and checks the
 * verdict, that reasoning is non-empty, that cost is non-negative and that no
 * error flag is set. The fetch mock is always removed afterwards.
 */
async function test1_criticPass() {
  console.log('\nTest 1: critic returns "pass" → no retry');
  const cannedResponses = [
    { ok: true, body: mockCriticResponse('pass', 'Answer is correct.') },
  ];
  const trace = {
    steps: [{ tool: 'web_search', result: 'Paris is the capital' }],
    turns: 2,
  };
  const criticOptions = { model: 'claude-sonnet-4-6', apiKey: 'test-key' };
  const restore = installFetchMock(cannedResponses);
  try {
    const review = await criticReview(FIXTURE_QUESTION, 'Paris', trace, criticOptions);
    assertEqual(review.verdict, 'pass', 'verdict is "pass"');
    assert(review.reasoning.length > 0, 'reasoning is non-empty');
    assert(review.costUsd >= 0, 'costUsd is non-negative');
    assert(!review.error, 'no error flag');
  } finally {
    restore();
  }
}
// ---------------------------------------------------------------------------
// Test 2: critic returns "fail" with suggestedRevision
// ---------------------------------------------------------------------------
/**
 * Test 2 — a "fail" verdict carries the critic's suggested revision.
 *
 * Calls criticReview with the wrong candidate ("Lyon") against one canned
 * "fail" response and checks that the verdict and the suggested revision
 * ("Paris") come back intact. No retry is exercised here.
 */
async function test2_criticFail() {
  console.log('\nTest 2: critic returns "fail" with suggestedRevision');
  const failBody = mockCriticResponse('fail', 'The answer is the wrong city.', 'Paris');
  const trace = {
    steps: [{ tool: 'web_search', result: 'Lyon is in France' }],
    turns: 1,
  };
  const criticOptions = { model: 'claude-sonnet-4-6', apiKey: 'test-key' };
  const restore = installFetchMock([{ ok: true, body: failBody }]);
  try {
    const review = await criticReview(FIXTURE_QUESTION, 'Lyon', trace, criticOptions);
    assertEqual(review.verdict, 'fail', 'verdict is "fail"');
    const revision = review.suggestedRevision ?? '';
    assert(revision.length > 0, 'suggestedRevision is non-empty');
    assertEqual(review.suggestedRevision, 'Paris', 'suggestedRevision is "Paris"');
  } finally {
    restore();
  }
}
// ---------------------------------------------------------------------------
// Test 3: critic returns "fail" twice → retries exhausted
// ---------------------------------------------------------------------------
/**
 * Test 3 — two consecutive "fail" verdicts exhaust the retry budget.
 *
 * Exercised through runGaiaAgentWithCritic, so both the agent's and the
 * critic's API calls are served by the fetch mock. Responses are queued in the
 * order the calls are expected (enableCritic=true, maxRetries=1):
 *   1. agent attempt 1  → answers "Lyon"
 *   2. critic review 1  → "fail"
 *   3. agent attempt 2  → answers "Marseille"
 *   4. critic review 2  → "fail"
 */
async function test3_retriesExhausted() {
  console.log('\nTest 3: critic fails twice → retries exhausted, returns last candidate');
  // Agent replies are complete Anthropic messages ending the turn with a
  // FINAL_ANSWER line, which (per the original note) the agent code parses
  // when stop_reason is end_turn.
  const agentMessage = (id, text) => ({
    id,
    type: 'message',
    role: 'assistant',
    stop_reason: 'end_turn',
    content: [{ type: 'text', text }],
    usage: { input_tokens: 150, output_tokens: 20 },
    model: 'claude-haiku-4-5',
  });
  const callSequence = [
    { ok: true, body: agentMessage('msg_01', 'FINAL_ANSWER: Lyon') }, // agent call 1
    { ok: true, body: mockCriticResponse('fail', 'Wrong city.', 'Paris') }, // critic 1
    { ok: true, body: agentMessage('msg_02', 'FINAL_ANSWER: Marseille') }, // agent call 2 (retry)
    { ok: true, body: mockCriticResponse('fail', 'Still wrong.', 'Paris') }, // critic 2
  ];
  const runOptions = {
    enableCritic: true,
    apiKey: 'test-key',
    criticOptions: { apiKey: 'test-key', maxRetries: 1 },
  };
  const restore = installFetchMock(callSequence);
  try {
    const outcome = await runGaiaAgentWithCritic(FIXTURE_QUESTION, runOptions);
    const verdicts = outcome.criticVerdicts;
    assertEqual(outcome.retriesAttempted, 1, 'retriesAttempted is 1');
    assertEqual(verdicts.length, 2, 'two critic verdicts collected');
    assertEqual(verdicts[0].verdict, 'fail', 'first verdict is fail');
    assertEqual(verdicts[1].verdict, 'fail', 'second verdict is fail');
    // The last candidate is handed back even though the critic never passed it.
    assert(outcome.finalAnswer !== null, 'finalAnswer is non-null (last candidate returned)');
  } finally {
    restore();
  }
}
// ---------------------------------------------------------------------------
// Test 4: critic returns "uncertain" → treated as "pass", no retry
// ---------------------------------------------------------------------------
/**
 * Test 4 — an "uncertain" verdict is treated like a pass (no retry).
 *
 * Part one checks that criticReview surfaces "uncertain" as-is. Part two runs
 * runGaiaAgentWithCritic with one agent reply and one "uncertain" critic reply
 * and checks that no retry was attempted. The second mock is installed on top
 * of the first and the two are unwound in reverse order.
 */
async function test4_uncertainAsPass() {
  console.log('\nTest 4: critic returns "uncertain" → treated as pass, no retry');
  const criticOnly = [
    { ok: true, body: mockCriticResponse('uncertain', 'Cannot verify without more context.') },
  ];
  const restoreOuter = installFetchMock(criticOnly);
  try {
    const review = await criticReview(
      FIXTURE_QUESTION,
      'Paris',
      { steps: [], turns: 1 },
      { model: 'claude-sonnet-4-6', apiKey: 'test-key' },
    );
    assertEqual(review.verdict, 'uncertain', 'verdict is "uncertain"');
    // Orchestrator-level check: "uncertain" is not "fail", so the retry loop
    // should not fire. Exercised via runGaiaAgentWithCritic with a minimal
    // agent reply that already contains a final answer.
    const agentReply = {
      id: 'msg_03',
      type: 'message',
      role: 'assistant',
      stop_reason: 'end_turn',
      content: [{ type: 'text', text: 'FINAL_ANSWER: Paris' }],
      usage: { input_tokens: 100, output_tokens: 15 },
      model: 'claude-haiku-4-5',
    };
    const agentThenCritic = [
      { ok: true, body: agentReply }, // agent call
      { ok: true, body: mockCriticResponse('uncertain', 'Cannot verify.') }, // critic
    ];
    const restoreInner = installFetchMock(agentThenCritic);
    try {
      const outcome = await runGaiaAgentWithCritic(FIXTURE_QUESTION, {
        enableCritic: true,
        apiKey: 'test-key',
        criticOptions: { apiKey: 'test-key', maxRetries: 1 },
      });
      const verdicts = outcome.criticVerdicts;
      assertEqual(outcome.retriesAttempted, 0, 'no retries for uncertain verdict');
      assertEqual(verdicts.length, 1, 'one critic verdict collected');
      assertEqual(verdicts[0].verdict, 'uncertain', 'verdict is uncertain');
    } finally {
      restoreInner();
    }
  } finally {
    restoreOuter();
  }
}
// ---------------------------------------------------------------------------
// Test 5: API error in critic → graceful fallback
// ---------------------------------------------------------------------------
/**
 * Test 5 — an API error inside the critic degrades gracefully.
 *
 * The fetch mock answers with HTTP 529 ("Overloaded"). criticReview is expected
 * to resolve rather than throw, reporting "uncertain" with the error flag set,
 * zero cost and reasoning that mentions the failure.
 */
async function test5_apiErrorFallback() {
  console.log('\nTest 5: API error in critic → graceful fallback, original candidate returned');
  const overloaded = { ok: false, status: 529, text: 'Overloaded' };
  const trace = { steps: [], turns: 1 };
  const criticOptions = { model: 'claude-sonnet-4-6', apiKey: 'test-key' };
  const restore = installFetchMock([overloaded]);
  try {
    const review = await criticReview(FIXTURE_QUESTION, 'Paris', trace, criticOptions);
    // The critic signals the failure through the verdict object, not an exception.
    assertEqual(review.verdict, 'uncertain', 'verdict is "uncertain" on API error');
    assert(review.error === true, 'error flag is set');
    assert(review.costUsd === 0, 'costUsd is 0 on error');
    assert(review.reasoning.includes('failed'), 'reasoning mentions failure');
  } finally {
    restore();
  }
}
// ---------------------------------------------------------------------------
// Test 6: malformed JSON from critic → fallback parser
// ---------------------------------------------------------------------------
/**
 * Test 6 — a verdict embedded in surrounding prose is still recovered.
 *
 * The canned critic reply is free text with a JSON fragment in the middle.
 * criticReview is expected to extract the "fail" verdict from that fragment
 * without raising its error flag.
 */
async function test6_malformedJson() {
  console.log('\nTest 6: malformed JSON from critic → fallback parser extracts verdict');
  // Prose before and after the JSON object, as a model might produce.
  const proseWithJson = 'After careful review, I believe the answer is wrong. {"verdict":"fail","reasoning":"Incorrect city","suggestedRevision":"Paris"} The agent should try again.';
  const malformedBody = {
    content: [{ type: 'text', text: proseWithJson }],
    usage: { input_tokens: 180, output_tokens: 60 },
  };
  const trace = { steps: [], turns: 1 };
  const criticOptions = { model: 'claude-sonnet-4-6', apiKey: 'test-key' };
  const restore = installFetchMock([{ ok: true, body: malformedBody }]);
  try {
    const review = await criticReview(FIXTURE_QUESTION, 'Lyon', trace, criticOptions);
    assertEqual(review.verdict, 'fail', 'fallback parser extracts "fail" verdict');
    assert(!review.error, 'no error flag for recoverable parse');
  } finally {
    restore();
  }
}
// ---------------------------------------------------------------------------
// Run all tests
// ---------------------------------------------------------------------------
/**
 * Run every smoke test in order, print a summary and set the exit status.
 *
 * Each test is isolated: if one throws, the error is logged, counted as a
 * failure, and the remaining tests still run, so the summary is always
 * printed. The process exits with status 1 when any assertion failed or any
 * test threw; otherwise it ends normally.
 */
async function main() {
  console.log('=== GAIA Critic Smoke Tests (ADR-135 Track D) ===');
  console.log('All tests use mocked responses — no live API calls.\n');
  const tests = [
    test1_criticPass,
    test2_criticFail,
    test3_retriesExhausted,
    test4_uncertainAsPass,
    test5_apiErrorFallback,
    test6_malformedJson,
  ];
  // Sequential on purpose: every test swaps out the process-wide fetch.
  for (const test of tests) {
    try {
      await test();
    } catch (err) {
      console.error(`\nUnexpected test runner error in ${test.name}:`, err);
      failed++;
    }
  }
  console.log(`\n=== Results: ${passed} passed, ${failed} failed ===`);
  if (failed > 0) {
    process.exit(1);
  }
}
// Last-resort guard: anything escaping main() still produces a failing exit code.
main().catch((err) => {
  console.error(err);
  process.exit(1);
});
//# sourceMappingURL=gaia-critic.smoke.js.map