claude-flow
Version:
Ruflo - Enterprise AI agent orchestration for Claude Code. Deploy 60+ specialized agents in coordinated swarms with self-learning, fault-tolerant consensus, vector memory, and MCP integration
103 lines • 3.9 kB
TypeScript
/**
* GAIA Ensemble Runner — ADR-139
*
* Runs a GAIA question through N models in parallel, aggregates answers via
* majority vote, with a judge-model tiebreak when no consensus is reached.
*
* Architecture:
* 1. Each model runs independently using the full tool harness.
* 2. Answers are normalised (via normaliseAnswer from gaia-judge.ts).
* 3. Majority vote: if ≥2 models agree on normalised answer → that wins.
* 4. Tiebreak: when all answers differ (or N=2 with disagreement), the judge
* model picks the best answer with a brief rationale.
* 5. Abstain: if all models return null/timedOut → failed question.
*
* Supported models:
* - Claude (claude-sonnet-4-6, etc.) via Anthropic API (gaia-agent.ts)
* - Gemini (gemini-2.5-pro, etc.) via Google AI API (gaia-agent-gemini.js compiled)
* - OpenRouter (gpt-5, deepseek-v3.2, kimi-k2, etc.) via OpenAI-compatible API
* NOTE: OpenRouter requires a funded account — requests return 402 once credits are exhausted.
*
* CLI integration:
* gaia-bench run --mode=ensemble --models=claude-sonnet-4-6,gemini-2.5-pro,openai/gpt-5
*
* Cost model (per question, typical L1 ~10k input + ~3k output tokens):
* claude-sonnet-4-6: ~$0.075
* gemini-2.5-pro: ~$0.043
* openai/gpt-5 (OR): ~$0.043
* 3-model total: ~$0.161 (53-Q: ~$8.54, 300-Q: ~$48.30)
*
* Refs: ADR-139, ADR-133, ADR-135, #2156
*/
import { GaiaQuestion } from './gaia-loader.js';
/**
 * Outcome of running one question through a single model of the ensemble.
 *
 * NOTE(review): this is a declaration file, so the code that fills these
 * fields is not visible here. Field notes below are read off the names and
 * the file header and should be confirmed against the implementation.
 */
export interface ModelRunResult {
/** Identifier of the model that produced this result. */
model: string;
/** The model's answer; the type allows null (presumably "no answer produced" — confirm). */
finalAnswer: string | null;
/**
 * Presumably `finalAnswer` after normalisation (the file header mentions
 * normaliseAnswer from gaia-judge.ts). TODO confirm what this holds when
 * `finalAnswer` is null.
 */
normalisedAnswer: string;
/** Presumably the number of agent turns this model used — confirm. */
turns: number;
/** Input token count for this model's run (presumably summed over turns — confirm). */
totalInputTokens: number;
/** Output token count for this model's run (presumably summed over turns — confirm). */
totalOutputTokens: number;
/** Wall-clock duration of this model's run; the `Ms` suffix suggests milliseconds. */
wallMs: number;
/** Estimated cost of this model's run in US dollars; the pricing source is not visible here. */
estimatedCostUsd: number;
/** Optional flag; presumably set when the run hit a timeout — confirm which timeout. */
timedOut?: boolean;
/** Optional error text; presumably set when the run failed — confirm format. */
error?: string;
}
/**
 * How an ensemble's final answer was decided. The literals appear to line up
 * with the aggregation steps described in the file header:
 * - 'majority': at least two models agree on the normalised answer.
 * - 'judge-tiebreak': no consensus, so the judge model picks the answer.
 * - 'abstain': all models returned null or timed out.
 */
export type AggregationMethod = 'majority' | 'judge-tiebreak' | 'abstain';
/**
 * Aggregated outcome of running one question through the ensemble.
 *
 * NOTE(review): the aggregation code is not visible in this declaration
 * file; notes marked "presumably" should be confirmed against it.
 */
export interface EnsembleResult {
/** Identifier of the question this result is for (presumably taken from the GaiaQuestion — confirm). */
questionId: string;
/** The ensemble's chosen answer; the type allows null (presumably the abstain case — confirm). */
finalAnswer: string | null;
/** Which aggregation path produced `finalAnswer`. */
aggregationMethod: AggregationMethod;
/** Rationale from the judge tiebreak (only set when method is 'judge-tiebreak'). */
judgeRationale?: string;
/** Per-model results that fed the aggregation. */
models: ModelRunResult[];
/** Total input tokens; presumably summed across `models` — whether judge usage is included is not visible here. */
totalInputTokens: number;
/** Total output tokens; presumably summed across `models` — whether judge usage is included is not visible here. */
totalOutputTokens: number;
/** Estimated total cost in US dollars; composition (models only vs. models + judge) to be confirmed. */
estimatedCostUsd: number;
/** Wall-clock duration for the whole question; the `Ms` suffix suggests milliseconds. */
wallMs: number;
}
/**
 * Optional configuration accepted by the ensemble entry points declared in
 * this file. Every field is optional; the defaults quoted below come from the
 * per-field comments, since the implementation that applies them is not
 * visible in this declaration file.
 */
export interface EnsembleOptions {
/** Models to use (provider inferred from model ID). The default list is not stated here. */
models?: string[];
/** Judge model for tiebreak (default: claude-sonnet-4-6). */
judgeModel?: string;
/** Anthropic API key (resolved from env/gcloud if not supplied). */
anthropicApiKey?: string;
/** Google AI API key (resolved from env/gcloud if not supplied). */
geminiApiKey?: string;
/** OpenRouter API key (resolved from env/gcloud if not supplied). */
openrouterApiKey?: string;
/** Per-model max turns (default: 8). */
maxTurns?: number;
/** Per-model max tokens per turn (default: 2048). */
maxTokensPerTurn?: number;
/** Per-turn timeout in ms (default: 60 000). */
perTurnTimeoutMs?: number;
}
/**
 * Runs a single GAIA question through the ensemble.
 *
 * Per the file header: each model runs independently, answers are normalised,
 * a majority vote decides when at least two models agree, a judge model
 * breaks ties, and the question counts as failed when every model returns
 * null or times out.
 *
 * @param question - The GAIA question to answer.
 * @param options - Optional ensemble configuration; see {@link EnsembleOptions}.
 * @returns A promise resolving to the aggregated result for this question.
 *
 * NOTE(review): rejection behaviour (e.g. on missing API keys) is not visible
 * in this declaration file — confirm against the implementation.
 */
export declare function runEnsembleQuestion(question: GaiaQuestion, options?: EnsembleOptions): Promise<EnsembleResult>;
/**
 * Summary of a pilot run of the ensemble over a batch of questions.
 *
 * NOTE(review): the scoring and projection code is not visible in this
 * declaration file; notes marked "presumably" should be confirmed against it.
 */
export interface EnsemblePilotResult {
/** Count of questions scored as correct. */
correct: number;
/** Count of questions in the pilot (presumably the number attempted — confirm). */
total: number;
/** Presumably `correct / total` — confirm the scale (fraction vs. percent) and the total = 0 case. */
accuracy: number;
/** One entry per question. */
perQuestion: Array<{
/** Task identifier of the question. */
taskId: string;
/** Question text (possibly truncated for reporting — not visible here). */
question: string;
/** Expected (reference) answer. */
expected: string;
/** Answer the ensemble produced; the type allows null. */
got: string | null;
/** Whether `got` was judged to match `expected`; the matching rule is not visible here. */
correct: boolean;
/** Which aggregation path produced the answer. */
aggregationMethod: AggregationMethod;
/** Judge rationale; presumably only present for 'judge-tiebreak' — confirm. */
judgeRationale?: string;
/** Cost attributed to this question in US dollars. */
costUsd: number;
/** Wall-clock duration for this question; the `Ms` suffix suggests milliseconds. */
wallMs: number;
/** Per-model breakdown for this question. */
perModel: Array<{
/** Model identifier. */
model: string;
/** That model's answer; the type allows null. */
answer: string | null;
/** Cost attributed to that model in US dollars. */
costUsd: number;
}>;
}>;
/** Total cost of the pilot in US dollars. */
totalCostUsd: number;
/** Presumably the observed cost extrapolated to a 53-question run (cf. the cost model in the file header) — confirm formula. */
projectedCost53Q: number;
/** Presumably the observed cost extrapolated to a 300-question run (cf. the cost model in the file header) — confirm formula. */
projectedCost300Q: number;
/** Mean wall-clock duration per question; the `Ms` suffix suggests milliseconds. */
meanWallMs: number;
}
/**
 * Runs the ensemble over a batch of GAIA questions and reports accuracy,
 * cost, and timing figures.
 *
 * @param questions - The questions to include in the pilot.
 * @param options - Optional ensemble configuration; see {@link EnsembleOptions}.
 * @returns A promise resolving to the pilot summary.
 *
 * NOTE(review): whether questions run sequentially or concurrently, and how
 * an empty `questions` array is handled, is not visible in this declaration
 * file — confirm against the implementation.
 */
export declare function runEnsemblePilot(questions: GaiaQuestion[], options?: EnsembleOptions): Promise<EnsemblePilotResult>;
//# sourceMappingURL=gaia-ensemble.d.ts.map