claude-flow
Version:
Ruflo - Enterprise AI agent orchestration for Claude Code. Deploy 60+ specialized agents in coordinated swarms with self-learning, fault-tolerant consensus, vector memory, and MCP integration
88 lines • 3.67 kB
TypeScript
/**
* GAIA Judge — ADR-133-PR6
*
* Two-stage answer scorer for the GAIA benchmark:
*
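* Stage 1 — Fast path: normalised exact-match.
* Normalise = lowercase + strip surrounding whitespace + strip surrounding
* single/double quotes + collapse internal whitespace runs to one space.
* Roughly 30 % of GAIA Level-1 answers satisfy this; no API call required.
* When the question text is supplied to judgeAnswer, Stage 1 also covers
* unit-aware numeric matching (e.g. "17" vs "17000" for a "thousand" question).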
*
* Stage 2 — LLM-as-judge: when exact-match fails, ask Claude Sonnet whether
* the candidate answer is semantically equivalent to the ground truth.
* The prompt embeds GAIA's official evaluation guideline (see
* https://huggingface.co/datasets/gaia-benchmark/GAIA for full spec).
*
* Caching: judgment results are persisted under
* ~/.cache/ruflo/gaia/judgments/<hash>.json
* keyed on (question_id, candidate_answer, model_id, JUDGE_PROMPT_VERSION).
* Re-running with the same key tuple hits the cache and returns instantly.
*
* API pattern: raw fetch() against https://api.anthropic.com/v1/messages —
* mirrors the pattern established in gaia-agent.ts (ADR-133-PR3).
*
* Refs: ADR-133, #2156
*/
/**
* Outcome of judging one candidate answer against its ground truth.
*
* The `judge*` fields are optional; presumably they are populated only when
* the LLM judge was involved in the verdict — confirm against the implementation.
*/
export interface JudgeResult {
/** Identifier of the judged question; presumably the `id` (task_id) given to `judgeAnswer` — confirm. */
questionId: string;
/** Whether the candidate answer was accepted as matching the ground truth. */
passed: boolean;
/**
* Which path produced the verdict: the Stage 1 exact-match fast path,
* the Stage 2 LLM judge, or a previously cached judgment.
*/
scoringPath: 'exact-match' | 'llm-judge' | 'cache';
/**
* The candidate answer that was scored.
* NOTE(review): `judgeAnswer` accepts a `null` candidate but this field is a
* plain `string`; how a null candidate is represented here is not visible — confirm.
*/
candidateAnswer: string;
/** The ground-truth answer the candidate was compared against. */
groundTruth: string;
/** Explanation attached to the judge's verdict, when available. */
judgeReason?: string;
/** Identifier of the judge model, when available. */
judgeModel?: string;
/** Presumably the number of input (prompt) tokens used by the judge call — confirm. */
judgeTokensIn?: number;
/** Presumably the number of output (completion) tokens used by the judge call — confirm. */
judgeTokensOut?: number;
/** Presumably the cost of the judge call in US dollars — confirm how it is computed. */
judgeCostUsd?: number;
}
/**
* Optional overrides accepted by `judgeAnswer`. Every field may be omitted.
*/
export interface JudgeOptions {
/** Model used for the LLM-as-judge stage. Default: 'claude-sonnet-4-6' */
judgeModel?: string;
/** Directory where judgment results are cached. Default: '~/.cache/ruflo/gaia/judgments/' */
cacheDir?: string;
/**
* Presumably bypasses the judgment cache when true.
* NOTE(review): whether this skips reads, writes, or both is not visible here — confirm.
*/
skipCache?: boolean;
/**
* API key for the judge request.
* NOTE(review): the fallback used when this is omitted (e.g. an environment
* variable) is not visible in this declaration file — confirm.
*/
apiKey?: string;
}
/**
* Normalise an answer string for exact-match comparison.
*
* GAIA normalisation as specified in the dataset paper:
* - strip surrounding whitespace
* - lowercase
* - strip a single pair of surrounding quotes (single or double)
* - collapse internal whitespace runs to one space
*
* NOTE(review): the signature accepts `null`/`undefined` and always returns a
* string; presumably those inputs map to an empty string — confirm against the
* implementation.
*
* @param raw - The raw answer text; may be `null` or `undefined`.
* @returns The normalised answer string.
*/
export declare function normaliseAnswer(raw: string | null | undefined): string;
/**
* Attempt to match a candidate numeric answer to an expected answer where the
* question implies a unit scale.
*
* Examples that this catches:
* candidate="17000", expected="17", question contains "thousand"
* → candidate / 1000 ≈ expected → MATCH
* candidate="17", expected="17000", question contains "thousand"
* → candidate × 1000 ≈ expected → MATCH (reverse direction)
*
* Returns true only when a numeric match is found under one of the scale
* multipliers mentioned in the question text. Returns false for non-numeric
* inputs or when no multiplier matches.
*
* NOTE(review): the tolerance behind "≈" and the full list of recognised
* multiplier words are not visible in this declaration file — confirm against
* the implementation. `questionText` is optional; with no question text there
* are no multiplier words to detect, so presumably the result is false — confirm.
*
* @param candidate - The raw string from the model (may include commas/spaces).
* @param expected - The raw ground-truth string.
* @param questionText - The original question (used to detect multiplier words).
* @returns `true` when the two numbers agree under a multiplier named in the
* question; `false` otherwise.
*/
export declare function unitAwareNumberMatch(candidate: string, expected: string, questionText?: string): boolean;
/**
* Judge a single GAIA answer.
*
* Scoring follows the two stages described in the file header: a Stage 1 fast
* path (normalised exact-match, plus unit-aware numeric matching when
* `questionText` is given) and a Stage 2 LLM-as-judge call when Stage 1 fails.
* Judgments are cached, so a repeated call may be answered from the cache.
*
* NOTE(review): how API or cache failures surface (rejected promise vs. a
* failed `JudgeResult`) is not visible in this declaration file — confirm.
*
* @param question - Object with `id` (task_id), `expected` (ground truth),
* and optional `questionText` (the full question string,
* used for unit-aware numeric matching in Stage 1).
* @param candidateAnswer - The answer produced by the agent; `null` counts as a miss.
* @param options - Optional overrides (model, cache dir, API key, etc.).
* @returns - Promise resolving to a JudgeResult with pass/fail, scoring path,
* and cost metrics.
*/
export declare function judgeAnswer(question: {
id: string;
expected: string;
questionText?: string;
}, candidateAnswer: string | null, options?: JudgeOptions): Promise<JudgeResult>;
//# sourceMappingURL=gaia-judge.d.ts.map