UNPKG

claude-flow

Version:

Ruflo - Enterprise AI agent orchestration for Claude Code. Deploy 60+ specialized agents in coordinated swarms with self-learning, fault-tolerant consensus, vector memory, and MCP integration.

88 lines 3.67 kB
/**
 * GAIA Judge — ADR-133-PR6
 *
 * Two-stage answer scorer for the GAIA benchmark:
 *
 * Stage 1 — Fast path: normalised exact-match.
 *   Normalise = lowercase + strip surrounding whitespace + strip surrounding
 *   single/double quotes + collapse internal whitespace runs to one space.
 *   Roughly 30 % of GAIA Level-1 answers satisfy this; no API call required.
 *   Per the `judgeAnswer` documentation below, Stage 1 also uses the question
 *   text for unit-aware numeric matching (see `unitAwareNumberMatch`).
 *
 * Stage 2 — LLM-as-judge: when exact-match fails, ask Claude Sonnet whether
 *   the candidate answer is semantically equivalent to the ground truth.
 *   The prompt embeds GAIA's official evaluation guideline (see
 *   https://huggingface.co/datasets/gaia-benchmark/GAIA for full spec).
 *
 * Caching: judgment results are persisted under
 *   ~/.cache/ruflo/gaia/judgments/<hash>.json
 * keyed on (question_id, candidate_answer, model_id, JUDGE_PROMPT_VERSION).
 * Re-running the same pair hits the cache and returns instantly.
 *
 * API pattern: raw fetch() against https://api.anthropic.com/v1/messages —
 * mirrors the pattern established in gaia-agent.ts (ADR-133-PR3).
 *
 * Refs: ADR-133, #2156
 */

/**
 * Outcome of judging one candidate answer against its ground truth.
 *
 * NOTE(review): this is a declaration file, so the member notes below are
 * derived from the names and from the module header; confirm against the
 * implementation in gaia-judge.ts.
 */
export interface JudgeResult {
    /** Identifier of the judged question (presumably the `id` passed to `judgeAnswer` — confirm). */
    questionId: string;
    /** Pass/fail verdict for the candidate answer. */
    passed: boolean;
    /**
     * Which path produced the verdict: the Stage 1 fast path, the Stage 2 LLM
     * judge, or (presumably) a previously persisted judgment — confirm.
     */
    scoringPath: 'exact-match' | 'llm-judge' | 'cache';
    /**
     * The answer that was scored.
     * NOTE(review): `judgeAnswer` accepts `null`, but this field is a plain
     * `string`; how a `null` candidate is represented here is not visible.
     */
    candidateAnswer: string;
    /** The expected (ground-truth) answer the candidate was compared against. */
    groundTruth: string;
    /** Explanation for the verdict; presumably set only when an LLM judgment exists — confirm. */
    judgeReason?: string;
    /** Model id of the judge; presumably set only when an LLM judgment exists — confirm. */
    judgeModel?: string;
    /** Judge-call input token count (assumed from the name — confirm). */
    judgeTokensIn?: number;
    /** Judge-call output token count (assumed from the name — confirm). */
    judgeTokensOut?: number;
    /** Cost of the judge call, in US dollars per the name (pricing basis not visible here). */
    judgeCostUsd?: number;
}

/** Optional overrides accepted by `judgeAnswer` (model, cache dir, API key, etc.). */
export interface JudgeOptions {
    /** Default: 'claude-sonnet-4-6' */
    judgeModel?: string;
    /** Default: '~/.cache/ruflo/gaia/judgments/' */
    cacheDir?: string;
    /**
     * Presumably bypasses the judgment cache when true — confirm whether this
     * disables reads, writes, or both.
     */
    skipCache?: boolean;
    /**
     * API key for the judge request (the module header says the judge calls
     * https://api.anthropic.com/v1/messages). Fallback when omitted is not
     * visible here — TODO confirm.
     */
    apiKey?: string;
}

/**
 * GAIA normalisation as specified in the dataset paper:
 *   - strip surrounding whitespace
 *   - lowercase
 *   - strip a single pair of surrounding quotes (single or double)
 *   - collapse internal whitespace runs to one space
 *
 * @param raw - The answer text to normalise. `null` and `undefined` are
 *   accepted by the signature; the value returned for them is not visible in
 *   this declaration (presumably the empty string — confirm).
 * @returns The normalised answer string.
 */
export declare function normaliseAnswer(raw: string | null | undefined): string;

/**
 * Attempt to match a candidate numeric answer to an expected answer where the
 * question implies a unit scale.
 *
 * Examples that this catches:
 *   candidate="17000", expected="17", question contains "thousand"
 *     → candidate / 1000 ≈ expected → MATCH
 *   candidate="17", expected="17000", question contains "thousand"
 *     → candidate × 1000 ≈ expected → MATCH (reverse direction)
 *
 * Returns true only when a numeric match is found under one of the scale
 * multipliers mentioned in the question text. Returns false for non-numeric
 * inputs or when no multiplier matches.
 *
 * @param candidate - The raw string from the model (may include commas/spaces).
 * @param expected - The raw ground-truth string.
 * @param questionText - The original question (used to detect multiplier words).
 *   Optional; with no question text there are no multiplier words to detect,
 *   so presumably the result is false — confirm.
 * @returns `true` when the two numbers agree under a detected scale
 *   multiplier, otherwise `false`.
 */
export declare function unitAwareNumberMatch(candidate: string, expected: string, questionText?: string): boolean;

/**
 * Judge a single GAIA answer.
 *
 * @param question - Object with `id` (task_id), `expected` (ground truth),
 *   and optional `questionText` (the full question string,
 *   used for unit-aware numeric matching in Stage 1).
 * @param candidateAnswer - The answer produced by the agent; `null` counts as a miss.
 * @param options - Optional overrides (model, cache dir, API key, etc.).
 * @returns A promise resolving to a JudgeResult with pass/fail, scoring path,
 *   and cost metrics.
 */
export declare function judgeAnswer(question: {
    id: string;
    expected: string;
    questionText?: string;
}, candidateAnswer: string | null, options?: JudgeOptions): Promise<JudgeResult>;
//# sourceMappingURL=gaia-judge.d.ts.map