UNPKG

claude-flow

Version:

Ruflo - Enterprise AI agent orchestration for Claude Code. Deploy 60+ specialized agents in coordinated swarms with self-learning, fault-tolerant consensus, vector memory, and MCP integration

242 lines 11.1 kB
"use strict"; /** * Smoke tests for extractFinalAnswer + buildUserMessage (iter 53a T2 narrowed). * * Iter 53a changes vs iter 52 T2: * - Stage 2/3 (prose fallback) REMOVED — extractFinalAnswer is Stage 1 only. * - Commitment prompt no longer has "FINAL_ANSWER: unknown" surrender instruction. * - Reversed-text preprocessor PRESERVED (not the source of regressions). * * Anti-regression suite (7 cases): * These are the exact failure modes from iter 52 T2 that must not regress. * They test that the extraction logic is stable and correct for the 7 questions * where iter 52 broke but iter 51 was correct. * * Run (after build): * node dist/benchmarks/gaia-extract.smoke.js * * Exit 0 on all pass, 1 on any failure. * * Refs: ADR-133, ADR-135, iter 52 T2, iter 53a, #2156 */ // Dynamic import so this file can run either as TS source (tsx) or compiled JS. // eslint-disable-next-line @typescript-eslint/no-explicit-any let _extractFinalAnswer; // eslint-disable-next-line @typescript-eslint/no-explicit-any let _buildUserMessage; function makeResp(text) { return { id: 'test', model: 'test', stop_reason: 'end_turn', content: [{ type: 'text', text }], usage: { input_tokens: 0, output_tokens: 0 }, }; } const CASES = [ // --------------------------------------------------------------------------- // Stage 1: primary FINAL_ANSWER: pattern (iter 53a: this is the ONLY stage) // --------------------------------------------------------------------------- { label: 'Stage1 basic FINAL_ANSWER:', input: 'I searched and found the capital.\nFINAL_ANSWER: Paris', expected: 'Paris', }, { label: 'Stage1 case-insensitive final_answer:', input: 'The result is known.\nfinal_answer: 42', expected: '42', }, { label: 'Stage1 with leading whitespace', input: 'Done.\n FINAL_ANSWER: Tokyo ', expected: 'Tokyo', }, // --------------------------------------------------------------------------- // Prose-only outputs with no FINAL_ANSWER: tag — must return null (iter 53a). // These were previously handled by Stage 2/3 (iter 52 T2). // In iter 53a, we rely on the agent committing with the tag instead. // --------------------------------------------------------------------------- { label: 'Prose-only "The answer is X" — null in iter 53a (no Stage 2)', input: 'After analysis, the answer is Right.', expected: null, }, { label: 'Prose-only "Therefore X" — null in iter 53a (no Stage 2)', input: 'Based on the clues, therefore the answer is Berlin.', expected: null, }, { label: 'Prose-only "Answer: X" — null in iter 53a (no Stage 2)', input: 'Let me compute this.\nAnswer: 17', expected: null, }, { label: 'Prose-only all-caps last line — null in iter 53a (no Stage 3)', input: 'I computed the value based on many steps.\nRIGHT', expected: null, }, { label: 'Prose-only numeric last line — null in iter 53a (no Stage 3)', input: 'The final calculation gives us:\n346', expected: null, }, { label: 'Prose-only short-phrase last line — null in iter 53a (no Stage 3)', input: 'After extensive research, the result is:\nBerlin, Germany', expected: null, }, // Null case: no answer extractable (verbose prose, no commitment) { label: 'Null case: only tool-call reasoning, no commitment', input: 'I tried searching but could not find the specific information requested.', expected: null, }, // --------------------------------------------------------------------------- // Reversed text pre-processor (buildUserMessage) — PRESERVED in iter 53a // --------------------------------------------------------------------------- { label: 'Reversed text: adds decoded hint', input: '.rewsna eht sa "tfel" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI', expected: '[NOTE:', isUserMsg: true, }, { label: 'Normal text: no hint added', input: 'What is the capital of France?', expected: 'What is the capital of France?', isUserMsg: true, }, // --------------------------------------------------------------------------- // Anti-regression suite (iter 53a) — 7 cases for iter 52 regressions. // // These represent the exact regression patterns from iter 52 T2: // a1e91b78: agent surrendered with "unknown" (commitment prompt over-trigger) // 935e2cff: agent gave empty answer (Stage 2/3 failed after extraction) // 305ac316: agent gave empty answer (Stage 2/3 failed after extraction) // 7673d772: agent gave empty answer (Stage 2/3 failed after extraction) // 3f57289b: Stage 3 grabbed wrong number from prose // 50ec8903: Stage 3 grabbed markdown fragment instead of correct answer // 5a0c1adf: Stage 3 grabbed "Claus Peter" instead of correct "Claus" // // Each test constructs a synthetic response that mimics the failure mode // and asserts the correct Stage 1 extraction behavior. // --------------------------------------------------------------------------- // AR-1: a1e91b78 — agent used to surrender with "FINAL_ANSWER: unknown" // In iter 52, system prompt said to output "FINAL_ANSWER: unknown" if unsure. // With that instruction removed, the agent reasons further and finds the answer. // This test verifies: if the agent DOES output a correct FINAL_ANSWER, we extract it. { label: 'AR-1 (a1e91b78): Stage1 extracts correct answer — not "unknown"', input: 'I found the video. The maximum number of bird species seen together is 3.\nFINAL_ANSWER: 3', expected: '3', }, // AR-2: 935e2cff — agent had the answer but Stage 2 produced empty string // In iter 53a, agent should output FINAL_ANSWER: Research correctly. { label: 'AR-2 (935e2cff): Stage1 extracts "Research" correctly', input: 'After reviewing the policy document, R stands for Research.\nFINAL_ANSWER: Research', expected: 'Research', }, // AR-3: 305ac316 — agent had "Wojciech" but extraction failed { label: 'AR-3 (305ac316): Stage1 extracts Polish name correctly', input: 'The actor played Wojciech in the movie.\nFINAL_ANSWER: Wojciech', expected: 'Wojciech', }, // AR-4: 7673d772 — agent had "inference" but extraction failed { label: 'AR-4 (7673d772): Stage1 extracts legal term correctly', input: 'The fifth section of the federal rule uses the word "inference".\nFINAL_ANSWER: inference', expected: 'inference', }, // AR-5: 3f57289b — Stage 3 grabbed wrong number (525 instead of 519) // The regression was caused by Stage 3 picking a wrong number from prose. // With Stage 3 removed, Stage 1 is the only source — agent must use the tag. { label: 'AR-5 (3f57289b): Stage1 extracts correct at-bat count', input: 'Reggie Jackson had 525 at-bats but the player with most walks had 519.\nFINAL_ANSWER: 519', expected: '519', }, // AR-6: 50ec8903 — Stage 3 grabbed "- Orange-Green edge →" (markdown fragment) // With Stage 3 removed, only FINAL_ANSWER: tag is extracted. { label: 'AR-6 (50ec8903): Stage1 extracts Rubik cube colors correctly', input: 'After solving the cube configuration, the answer is green, white.\nFINAL_ANSWER: green, white', expected: 'green, white', }, // AR-7: 5a0c1adf — Stage 3 grabbed "Claus Peter" (too many words) // With Stage 3 removed, only FINAL_ANSWER: tag is extracted. { label: 'AR-7 (5a0c1adf): Stage1 extracts first name only, not "Claus Peter"', input: 'The conductor\'s full name is Claus Peter Flor, but his first name is Claus.\nFINAL_ANSWER: Claus', expected: 'Claus', }, ]; // --------------------------------------------------------------------------- // Runner // --------------------------------------------------------------------------- async function main() { // Import at runtime to work with both tsx and compiled JS. const mod = await import('./gaia-agent.js'); // eslint-disable-next-line @typescript-eslint/no-explicit-any const m = mod; _extractFinalAnswer = m._extractFinalAnswerForTest; _buildUserMessage = m._buildUserMessageForTest; if (typeof _extractFinalAnswer !== 'function' || typeof _buildUserMessage !== 'function') { console.error('ERROR: _extractFinalAnswerForTest / _buildUserMessageForTest not exported from gaia-agent.' + '\nAdd `export { extractFinalAnswer as _extractFinalAnswerForTest, ' + 'buildUserMessage as _buildUserMessageForTest }` to gaia-agent.ts.'); process.exit(1); } let failures = 0; const PASS = '\x1b[32mPASS\x1b[0m'; const FAIL = '\x1b[31mFAIL\x1b[0m'; console.log('\n=== gaia-extract smoke (iter 53a T2 narrowed) ===\n'); console.log('Suite: 12 original + 7 anti-regression = 19 cases\n'); for (const tc of CASES) { let actual; if (tc.isUserMsg) { actual = _buildUserMessage(tc.input); // For user message tests, check startsWith instead of exact equality. const pass = tc.expected === null ? actual === null : actual !== null && (tc.expected === actual || actual.startsWith(tc.expected)); if (pass) { console.log(` ${PASS} ${tc.label}`); } else { console.log(` ${FAIL} ${tc.label}`); console.log(` expected starts-with: ${JSON.stringify(tc.expected)}`); console.log(` actual: ${JSON.stringify((actual ?? '').slice(0, 80))}`); failures++; } } else { actual = _extractFinalAnswer(makeResp(tc.input)); const pass = tc.expected === null ? actual === null : actual !== null && actual === tc.expected; if (pass) { console.log(` ${PASS} ${tc.label}`); } else { console.log(` ${FAIL} ${tc.label}`); console.log(` expected: ${JSON.stringify(tc.expected)}`); console.log(` actual: ${JSON.stringify(actual)}`); failures++; } } } const total = CASES.length; const passed = total - failures; console.log(`\n=== ${failures === 0 ? 'ALL PASSED' : `${failures} FAILED`} (${passed}/${total} cases) ===\n`); if (failures > 0) process.exit(1); } main().catch((err) => { console.error('Smoke crashed:', err); process.exit(2); }); //# sourceMappingURL=gaia-extract.smoke.js.map