UNPKG

claude-flow

Version:

Ruflo - Enterprise AI agent orchestration for Claude Code. Deploy 60+ specialized agents in coordinated swarms with self-learning, fault-tolerant consensus, vector memory, and MCP integration

437 lines 19.9 kB
/**
 * GAIA Judge — ADR-133-PR6
 *
 * Two-stage answer scorer for the GAIA benchmark:
 *
 *   Stage 1 — Fast path (no API call):
 *     1a. normalised exact-match.
 *         Normalise = lowercase + strip surrounding whitespace + strip one pair
 *         of surrounding single/double quotes + collapse internal whitespace
 *         runs to one space.
 *         Roughly 30 % of GAIA Level-1 answers satisfy this.
 *     1b. unit-aware numeric match (see `unitAwareNumberMatch`).
 *
 *   Stage 2 — LLM-as-judge: when the fast path fails, ask Claude Sonnet whether
 *     the candidate answer is semantically equivalent to the ground truth.
 *     The prompt embeds GAIA's official evaluation guideline (see
 *     https://huggingface.co/datasets/gaia-benchmark/GAIA for full spec).
 *
 * Caching: judgment results are persisted under
 *   ~/.cache/ruflo/gaia/judgments/<hash>.json
 * keyed on (question_id, candidate_answer, model_id, JUDGE_PROMPT_VERSION).
 * Re-running the same pair hits the cache and returns instantly.
 *
 * API pattern: raw fetch() against https://api.anthropic.com/v1/messages —
 * mirrors the pattern established in gaia-agent.ts (ADR-133-PR3).
 *
 * Refs: ADR-133, #2156
 */
import { createHash } from 'node:crypto';
import * as fs from 'node:fs';
import * as path from 'node:path';
import * as os from 'node:os';
import { execSync } from 'node:child_process';

// ---------------------------------------------------------------------------
// Constants
// ---------------------------------------------------------------------------
const ANTHROPIC_API_URL = 'https://api.anthropic.com/v1/messages';
const ANTHROPIC_API_VERSION = '2023-06-01';
const DEFAULT_JUDGE_MODEL = 'claude-sonnet-4-6';
const DEFAULT_CACHE_DIR = path.join(os.homedir(), '.cache', 'ruflo', 'gaia', 'judgments');

/**
 * Bump this string whenever the judge prompt changes so stale cached verdicts
 * are automatically invalidated (different key → cache miss).
 */
const JUDGE_PROMPT_VERSION = 'v1';

// Sonnet pricing (input/output per million tokens, 2026-05-27).
const SONNET_INPUT_COST_PER_M = 3.0;
const SONNET_OUTPUT_COST_PER_M = 15.0;

// ---------------------------------------------------------------------------
// Normalisation
// ---------------------------------------------------------------------------
/**
 * GAIA normalisation as specified in the dataset paper:
 *   - strip surrounding whitespace
 *   - lowercase
 *   - strip a single pair of surrounding quotes (single or double)
 *   - collapse internal whitespace runs to one space
 *
 * @param raw - Answer text; `null`/`undefined` normalise to `''`. Non-string
 *              values are coerced with `String()` instead of throwing.
 * @returns The normalised string.
 */
export function normaliseAnswer(raw) {
    if (raw == null)
        return '';
    let s = String(raw).trim().toLowerCase();
    // Strip one pair of surrounding quotes. The length guard stops a lone quote
    // character from counting as both the opening and the closing quote.
    const isQuoted = s.length >= 2 &&
        ((s.startsWith('"') && s.endsWith('"')) ||
            (s.startsWith("'") && s.endsWith("'")));
    if (isQuoted) {
        s = s.slice(1, -1);
    }
    // Collapse internal whitespace
    return s.replace(/\s+/g, ' ').trim();
}

// ---------------------------------------------------------------------------
// Unit-aware numeric matching
// ---------------------------------------------------------------------------
/** Scale words looked for in the question text, with their multipliers. */
const MULTIPLIERS = Object.freeze([
    ['trillion', 1e12],
    ['billion', 1e9],
    ['million', 1e6],
    ['thousand', 1e3],
    ['hundred', 1e2],
]);

/** Relative tolerance for direct equality: absorbs float rounding only. */
const EXACT_REL_TOL = 1e-9;

/** Relative tolerance (1 %) for comparisons made after applying a multiplier. */
const SCALED_REL_TOL = 0.01;

/** Leading numeric literal: optional sign, digits/decimal point, optional exponent. */
const NUMERIC_PREFIX_RE = /^[-+]?(?:\d+\.?\d*|\.\d+)(?:e[-+]?\d+)?/i;

/**
 * Parse a number that may carry thousands separators and a trailing unit
 * suffix ("17,000 hours" → 17000).
 *
 * Returns NaN when the text does not start with a number, or when digits remain
 * after the leading number ("1990-05-01", "10:30", "192.168.1.1"): such values
 * are not a single quantity and must not be compared on their numeric prefix.
 */
function parseLooseNumber(text) {
    const compact = String(text ?? '').replace(/[,\s]/g, '');
    const match = NUMERIC_PREFIX_RE.exec(compact);
    if (!match)
        return NaN;
    const remainder = compact.slice(match[0].length);
    if (/\d/.test(remainder))
        return NaN;
    return Number.parseFloat(match[0]);
}

/** True when `a` and `b` differ by at most `relTol` relative to the larger magnitude. */
function approxEqual(a, b, relTol) {
    return Math.abs(a - b) <= relTol * Math.max(Math.abs(a), Math.abs(b));
}

/**
 * Attempt to match a candidate numeric answer to an expected answer where the
 * question implies a unit scale.
 *
 * Examples that this catches:
 *   candidate="17000", expected="17", question contains "thousand"
 *     → candidate / 1000 ≈ expected → MATCH
 *   candidate="17", expected="17000", question contains "thousand"
 *     → candidate × 1000 ≈ expected → MATCH (reverse direction)
 *
 * Returns true when the two values are numerically equal ("17" vs "17.0"), or
 * when they agree under one of the scale multipliers mentioned in the question
 * text. Returns false for non-numeric inputs or when no multiplier matches.
 *
 * Direct equality uses a float-rounding epsilon only, so distinct integers such
 * as "1999" and "1998" never match. Scaled comparisons allow 1 % relative
 * slack with no absolute term, so "0" never matches a non-zero value.
 *
 * @param candidate - The raw string from the model (may include commas/spaces).
 * @param expected - The raw ground-truth string.
 * @param questionText - The original question (used to detect multiplier words).
 * @returns Whether the candidate matches the expected value numerically.
 */
export function unitAwareNumberMatch(candidate, expected, questionText) {
    const candNum = parseLooseNumber(candidate);
    const expNum = parseLooseNumber(expected);
    if (Number.isNaN(candNum) || Number.isNaN(expNum))
        return false;
    // Exact numeric equality (handles "17" vs "17.0", etc.)
    if (approxEqual(candNum, expNum, EXACT_REL_TOL))
        return true;
    if (!questionText)
        return false;
    const qLower = questionText.toLowerCase();
    for (const [word, mult] of MULTIPLIERS) {
        if (!qLower.includes(word))
            continue;
        // Model returned raw, expected is already in scaled units
        // e.g. model says "17000", question asks "how many thousand hours", expected is "17"
        if (approxEqual(candNum / mult, expNum, SCALED_REL_TOL))
            return true;
        // Reverse: model returned scaled, expected is raw
        if (approxEqual(candNum, expNum / mult, SCALED_REL_TOL))
            return true;
    }
    return false;
}

// ---------------------------------------------------------------------------
// Cache helpers
// ---------------------------------------------------------------------------
/**
 * Derive the cache file stem: SHA-256 hex of
 * (questionId, candidateAnswer, judgeModel, JUDGE_PROMPT_VERSION).
 */
function cacheKey(questionId, candidateAnswer, judgeModel) {
    const raw = `${questionId}||${candidateAnswer}||${judgeModel}||${JUDGE_PROMPT_VERSION}`;
    return createHash('sha256').update(raw).digest('hex');
}

/**
 * Read and JSON-parse `<cacheDir>/<key>.json`.
 * Returns null when the file is missing, unreadable, or not valid JSON.
 */
function cacheRead(cacheDir, key) {
    const file = path.join(cacheDir, `${key}.json`);
    try {
        const txt = fs.readFileSync(file, 'utf-8');
        return JSON.parse(txt);
    }
    catch {
        return null;
    }
}

/** Write `result` as pretty-printed JSON to `<cacheDir>/<key>.json` (best effort). */
function cacheWrite(cacheDir, key, result) {
    try {
        fs.mkdirSync(cacheDir, { recursive: true });
        fs.writeFileSync(path.join(cacheDir, `${key}.json`), JSON.stringify(result, null, 2), 'utf-8');
    }
    catch {
        // Non-fatal — cache write failure should not abort a benchmark run.
    }
}

// ---------------------------------------------------------------------------
// API key resolution (mirrors gaia-agent.ts resolveAnthropicApiKey)
// ---------------------------------------------------------------------------
/**
 * Resolve the Anthropic API key, in order: the supplied value, the
 * ANTHROPIC_API_KEY environment variable, then GCP Secret Manager via gcloud.
 *
 * @param supplied - Optional explicit key.
 * @returns The trimmed key.
 * @throws Error when no source yields a non-empty key.
 */
function resolveApiKey(supplied) {
    if (supplied && supplied.trim())
        return supplied.trim();
    const envKey = process.env.ANTHROPIC_API_KEY;
    if (envKey && envKey.trim())
        return envKey.trim();
    try {
        // stderr is discarded through `stdio` rather than a `2>/dev/null`
        // redirect, which only works in POSIX shells.
        const out = execSync('gcloud secrets versions access latest --secret=ANTHROPIC_API_KEY', {
            encoding: 'utf-8',
            timeout: 10_000,
            stdio: ['ignore', 'pipe', 'ignore'],
        }).trim();
        if (out)
            return out;
    }
    catch {
        /* fall through */
    }
    throw new Error('ANTHROPIC_API_KEY not found. Set the env var or store it in GCP Secret Manager under ' +
        '"ANTHROPIC_API_KEY".');
}

// ---------------------------------------------------------------------------
// LLM-as-judge prompt
// ---------------------------------------------------------------------------
/**
 * Build the judge system prompt.
 *
 * References the GAIA official scoring guideline:
 *   "The evaluation of the answer is done by exact string match after
 *    normalisation. For numerical answers, units are ignored unless
 *    the question explicitly asks for them. For named-entity answers,
 *    common aliases are accepted. For open-ended questions where an exact
 *    match is not possible, the answer is judged correct if it is semantically
 *    equivalent to the ground truth and contains all required information."
 * Source: https://huggingface.co/datasets/gaia-benchmark/GAIA (README, §Evaluation)
 *
 * NOTE: bump JUDGE_PROMPT_VERSION whenever this text changes.
 */
function buildJudgeSystemPrompt() {
    return [
        'You are a precise judge evaluating whether a candidate answer to a',
        'question-answering benchmark is correct.',
        '',
        'SCORING RULES (from the GAIA benchmark specification):',
        '1. Exact-string equivalence (after normalisation) is always correct.',
        '2. For NUMERICAL answers: ignore units unless the question explicitly requests them.',
        '   "3.14" and "approximately 3.14" for "what is pi to 2 decimal places" are both correct.',
        '3. For NAMED-ENTITY answers: accept common aliases and alternative spellings.',
        '   "UK" and "United Kingdom" are equivalent.',
        '4. For LIST answers: the candidate must contain all required items; extra items are ok.',
        '5. Do NOT accept answers that are vague or incomplete when the ground truth is specific.',
        '   "a European city" is NOT correct if the ground truth is "Paris".',
        '',
        'You MUST respond with a single JSON object on one line, exactly this shape:',
        '{"passed": true, "reason": "..."} or {"passed": false, "reason": "..."}',
        'The "reason" must be 200 characters or fewer.',
        'Do not output anything outside the JSON object.',
    ].join('\n');
}

/** Build the user turn: question, ground truth, candidate, and the verdict request. */
function buildJudgeUserMessage(question, groundTruth, candidate) {
    return [
        `QUESTION: ${question}`,
        `GROUND TRUTH: ${groundTruth}`,
        `CANDIDATE ANSWER: ${candidate}`,
        '',
        'Is the candidate answer correct per the scoring rules above?',
    ].join('\n');
}

/**
 * Turn the judge model's raw text into `{ passed, reason }`.
 *
 * Only the boolean `true` (or the string "true") counts as a pass, so a
 * stringly-typed `"passed": "false"` is never read as a pass. When the text
 * cannot be parsed as JSON, falls back to scanning for a literal
 * `"passed": true`.
 */
function parseJudgeVerdict(rawText) {
    try {
        // The model might wrap the JSON in a code fence — strip it
        const unfenced = rawText
            .replace(/^```(?:json)?\s*/i, '')
            .replace(/\s*```$/, '')
            .trim();
        // The model might also add prose around the object — keep the {...} span.
        const start = unfenced.indexOf('{');
        const end = unfenced.lastIndexOf('}');
        const jsonStr = start !== -1 && end > start ? unfenced.slice(start, end + 1) : unfenced;
        const parsed = JSON.parse(jsonStr);
        const verdict = parsed.passed;
        return {
            passed: verdict === true || String(verdict).toLowerCase() === 'true',
            reason: String(parsed.reason ?? '').slice(0, 200),
        };
    }
    catch {
        // Fallback: scan the text for obvious pass/fail signals
        const lower = rawText.toLowerCase();
        return {
            passed: lower.includes('"passed": true') || lower.includes('"passed":true'),
            reason: `parse error — raw: ${rawText.slice(0, 100)}`,
        };
    }
}

/**
 * Send one judge request to the Anthropic Messages API.
 *
 * @param systemPrompt - System prompt (scoring rules).
 * @param userMessage - User turn (question / ground truth / candidate).
 * @param model - Model id.
 * @param apiKey - Anthropic API key.
 * @returns `{ passed, reason, tokensIn, tokensOut }`; token counts default to 0
 *          when the response carries no usage block.
 * @throws Error on a non-2xx HTTP response (message includes status and body).
 */
async function callJudge(systemPrompt, userMessage, model, apiKey) {
    const messages = [
        { role: 'user', content: userMessage },
    ];
    const body = JSON.stringify({
        model,
        max_tokens: 256,
        system: systemPrompt,
        messages,
    });
    const response = await fetch(ANTHROPIC_API_URL, {
        method: 'POST',
        headers: {
            'Content-Type': 'application/json',
            'x-api-key': apiKey,
            'anthropic-version': ANTHROPIC_API_VERSION,
        },
        body,
    });
    if (!response.ok) {
        const errText = await response.text().catch(() => '(no body)');
        throw new Error(`Anthropic API error ${response.status}: ${errText}`);
    }
    const data = (await response.json());
    // A response with no content/usage yields a "parse error" verdict instead
    // of a TypeError.
    const blocks = Array.isArray(data?.content) ? data.content : [];
    const textBlock = blocks.find((c) => c?.type === 'text');
    const rawText = textBlock?.text ?? '';
    // Parse the JSON the model produced
    const { passed, reason } = parseJudgeVerdict(rawText);
    return {
        passed,
        reason,
        tokensIn: data?.usage?.input_tokens ?? 0,
        tokensOut: data?.usage?.output_tokens ?? 0,
    };
}

// ---------------------------------------------------------------------------
// Public API
// ---------------------------------------------------------------------------
/**
 * Judge a single GAIA answer.
 *
 * @param question - Object with `id` (task_id), `expected` (ground truth),
 *                   and optional `questionText` (the full question string,
 *                   used for unit-aware numeric matching in Stage 1).
 * @param candidateAnswer - The answer produced by the agent; `null` counts as a miss.
 *                          Non-string values are coerced with `String()`.
 * @param options - Optional overrides (`judgeModel`, `cacheDir`, `apiKey`, `skipCache`).
 * @returns - JudgeResult with pass/fail, scoring path, and cost metrics.
 * @throws Error when Stage 2 is needed and no API key can be resolved, or the
 *         API call fails.
 */
export async function judgeAnswer(question, candidateAnswer, options) {
    const judgeModel = options?.judgeModel ?? DEFAULT_JUDGE_MODEL;
    const cacheDir = options?.cacheDir ?? DEFAULT_CACHE_DIR;
    const candidate = candidateAnswer == null ? '' : String(candidateAnswer);
    // ── Stage 0: null / empty candidate is always a miss ──
    if (!candidate.trim()) {
        return {
            questionId: question.id,
            passed: false,
            scoringPath: 'exact-match',
            candidateAnswer: candidate,
            groundTruth: question.expected,
        };
    }
    // ── Stage 1a: normalised exact-match (no API call) ──
    const normCandidate = normaliseAnswer(candidate);
    const normExpected = normaliseAnswer(question.expected);
    if (normCandidate === normExpected) {
        return {
            questionId: question.id,
            passed: true,
            scoringPath: 'exact-match',
            candidateAnswer: candidate,
            groundTruth: question.expected,
        };
    }
    // ── Stage 1b: unit-aware numeric match (no API call) ──
    // Handles cases like model returns "17000" but expected is "17" and
    // the question asks "how many thousand hours".
    if (unitAwareNumberMatch(normCandidate, normExpected, question.questionText)) {
        return {
            questionId: question.id,
            passed: true,
            scoringPath: 'exact-match',
            candidateAnswer: candidate,
            groundTruth: question.expected,
        };
    }
    // ── Cache lookup (before calling the LLM) ──
    const key = cacheKey(question.id, candidate, judgeModel);
    if (!options?.skipCache) {
        const cached = cacheRead(cacheDir, key);
        // Only a verdict-shaped object counts as a hit; any other JSON is
        // treated as a miss and overwritten below.
        const isVerdict = cached !== null &&
            typeof cached === 'object' &&
            typeof cached.passed === 'boolean';
        if (isVerdict) {
            return { ...cached, scoringPath: 'cache' };
        }
    }
    // ── Stage 2: LLM-as-judge ──
    const apiKey = resolveApiKey(options?.apiKey);
    const systemPrompt = buildJudgeSystemPrompt();
    const userMessage = buildJudgeUserMessage(question.questionText ?? question.expected, question.expected, candidate);
    const { passed, reason, tokensIn, tokensOut } = await callJudge(systemPrompt, userMessage, judgeModel, apiKey);
    // NOTE: Sonnet rates are applied whatever `judgeModel` is; the figure is
    // only accurate for Sonnet-priced models.
    const costUsd = (tokensIn / 1_000_000) * SONNET_INPUT_COST_PER_M +
        (tokensOut / 1_000_000) * SONNET_OUTPUT_COST_PER_M;
    const result = {
        questionId: question.id,
        passed,
        scoringPath: 'llm-judge',
        candidateAnswer: candidate,
        groundTruth: question.expected,
        judgeReason: reason,
        judgeModel,
        judgeTokensIn: tokensIn,
        judgeTokensOut: tokensOut,
        judgeCostUsd: costUsd,
    };
    // Persist to cache (even failures — avoids repeated LLM calls on re-run)
    cacheWrite(cacheDir, key, result);
    return result;
}

// ---------------------------------------------------------------------------
// Smoke runner
// ---------------------------------------------------------------------------
/**
 * Self-contained smoke test. Run with:
 *   npx tsx src/benchmarks/gaia-judge.ts
 *
 * Does NOT require an ANTHROPIC_API_KEY for the exact-match cases.
 * The LLM-judge cases require a live key and cost ~$0.001 total.
 *
 * Expected cost: ≤ 2 Sonnet judge calls × ~300 tokens ≈ $0.001
 *
 * Exits the process with code 1 when any assertion fails.
 */
async function runSmoke() {
    const PASS = '\x1b[32mPASS\x1b[0m';
    const FAIL = '\x1b[31mFAIL\x1b[0m';
    let failures = 0;
    const check = (label, condition) => {
        if (condition) {
            console.log(` ${PASS} ${label}`);
        }
        else {
            console.log(` ${FAIL} ${label}`);
            failures++;
        }
    };
    // Use a temp cache dir so smoke runs are isolated
    const tmpCacheDir = path.join(os.tmpdir(), `gaia-judge-smoke-${Date.now()}`);
    const baseOpts = { cacheDir: tmpCacheDir };
    try {
        console.log('\n=== gaia-judge smoke ===\n');
        // ── Stage 1a: normaliseAnswer unit tests (no API call, no judgeAnswer) ──
        console.log('-- Stage 1a: normaliseAnswer --');
        check('normalise("346") === "346"', normaliseAnswer('346') === '346');
        check('normalise(" YES ") === "yes"', normaliseAnswer('  YES  ') === 'yes');
        check('normalise(\'"Paris"\') === "paris"', normaliseAnswer('"Paris"') === 'paris');
        // The input carries a whitespace run so the collapse step is exercised.
        check('normalise("hello world") === "hello world"', normaliseAnswer('hello   world') === 'hello world');
        check('normalise(null) === ""', normaliseAnswer(null) === '');
        check('"346" !== "347" after normalise', normaliseAnswer('346') !== normaliseAnswer('347'));
        // ── Stage 1b: exact-match hit and null-candidate cases (no API call) ──
        console.log('\n-- Stage 1b: exact-match path --');
        const r1 = await judgeAnswer({ id: 'em-1', expected: '346' }, '346', baseOpts);
        check('exact match "346" vs "346" → pass, exact-match path', r1.passed && r1.scoringPath === 'exact-match');
        const r3 = await judgeAnswer({ id: 'em-3', expected: 'yes' }, '  YES  ', baseOpts);
        check('normalised "yes" vs " YES " → pass, exact-match path', r3.passed && r3.scoringPath === 'exact-match');
        const r4 = await judgeAnswer({ id: 'em-4', expected: 'Paris' }, '"Paris"', baseOpts);
        check('quote-stripped "Paris" vs \'"Paris"\' → pass, exact-match path', r4.passed && r4.scoringPath === 'exact-match');
        const r5 = await judgeAnswer({ id: 'em-5', expected: 'hello world' }, 'hello   world', baseOpts);
        check('whitespace-collapsed "hello world" → pass, exact-match path', r5.passed && r5.scoringPath === 'exact-match');
        const rNull = await judgeAnswer({ id: 'em-null', expected: '346' }, null, baseOpts);
        check('null candidate → fail, exact-match path', !rNull.passed && rNull.scoringPath === 'exact-match');
        // ── LLM-judge cases (requires ANTHROPIC_API_KEY) ──
        const hasKey = !!(process.env.ANTHROPIC_API_KEY?.trim());
        if (!hasKey) {
            console.log('\n-- Stage 2: llm-judge (SKIPPED — no ANTHROPIC_API_KEY) --');
        }
        else {
            console.log('\n-- Stage 2: llm-judge --');
            // Case 1: semantically equivalent (should pass)
            const r6 = await judgeAnswer({ id: 'llm-1', expected: 'Paris' }, 'The capital of France is Paris', baseOpts);
            check(`llm-judge "Paris" vs "The capital of France is Paris" → pass (path=${r6.scoringPath})`, r6.passed);
            // Case 2: numerically wrong (should fail)
            const r7 = await judgeAnswer({ id: 'llm-2', expected: '3.14159' }, 'approximately three', baseOpts);
            check(`llm-judge "3.14159" vs "approximately three" → fail (path=${r7.scoringPath})`, !r7.passed);
            const llmCost = (r6.judgeCostUsd ?? 0) + (r7.judgeCostUsd ?? 0);
            console.log(` cost: $${llmCost.toFixed(5)} (${(r6.judgeTokensIn ?? 0) + (r7.judgeTokensIn ?? 0)} in, ${(r6.judgeTokensOut ?? 0) + (r7.judgeTokensOut ?? 0)} out)`);
            // ── Cache hit verification ──
            console.log('\n-- Stage 3: cache hit --');
            // Re-run case 1 — must return from cache
            const r8 = await judgeAnswer({ id: 'llm-1', expected: 'Paris' }, 'The capital of France is Paris', baseOpts);
            check('second run of llm-1 → cache hit', r8.scoringPath === 'cache');
            check('cache hit preserves original verdict', r8.passed === r6.passed);
            // skipCache forces an LLM call even if cached
            const r9 = await judgeAnswer({ id: 'llm-1', expected: 'Paris' }, 'The capital of France is Paris', { ...baseOpts, skipCache: true });
            check('skipCache=true bypasses cache → llm-judge', r9.scoringPath === 'llm-judge');
        }
    }
    finally {
        // Cleanup temp cache dir — runs even when a judge call throws.
        try {
            fs.rmSync(tmpCacheDir, { recursive: true, force: true });
        }
        catch {
            /* ignore */
        }
    }
    console.log(`\n=== smoke ${failures === 0 ? 'PASSED' : `FAILED (${failures} assertion(s))`} ===\n`);
    if (failures > 0)
        process.exit(1);
}

// Run smoke when executed directly
const isMain = process.argv[1] &&
    (process.argv[1].endsWith('gaia-judge.ts') || process.argv[1].endsWith('gaia-judge.js'));
if (isMain) {
    runSmoke().catch((err) => {
        console.error('Smoke failed:', err);
        process.exit(1);
    });
}
//# sourceMappingURL=gaia-judge.js.map