claude-flow
Version:
Ruflo - Enterprise AI agent orchestration for Claude Code. Deploy 60+ specialized agents in coordinated swarms with self-learning, fault-tolerant consensus, vector memory, and MCP integration
198 lines • 8.22 kB
TypeScript
/**
* GAIA Agent — ADR-133-PR3 / ADR-135 (planning interval)
*
* Multi-turn Anthropic Messages API loop that drives Claude through the
* GAIA benchmark questions using a tool-use agent pattern.
*
* Loop algorithm:
* 1. Build initial message with the question and a system prompt that
* instructs Claude to output `FINAL_ANSWER: <value>` when done.
* 2. Call Anthropic Messages API with the registered tool definitions.
* 3. On `stop_reason === 'tool_use'`: execute all tool_use blocks in
* parallel, append results as a `user` turn, and repeat.
* Every PLANNING_INTERVAL turns, inject a planning-checkpoint text
* alongside the tool results to force strategy re-evaluation.
* 4. On `stop_reason === 'end_turn'`: scan content for the final answer
* pattern and return the result.
* 5. On timeout (maxTurns exceeded): return `{ timedOut: true }`.
*
* API key resolution order (mirrors resolveHfToken from gaia-loader.ts):
* 1. `options.apiKey` (caller-supplied)
* 2. `ANTHROPIC_API_KEY` env var
* 3. `gcloud secrets versions access latest --secret=ANTHROPIC_API_KEY`
*
* Cost discipline: smoke runs use `claude-haiku-4-5` only. The smoke
* runner at the bottom of this file enforces that model.
*
* Planning interval (iter 30 finding #3):
* smolagents CodeAgent uses planning_interval=4 — replans every 4 steps
* to prevent tunnel-vision on bad strategies. Adds ~80 tokens per
* replan event (~$0.0001 each), negligible cost.
*
* Iter 53a T2 narrowing:
* Two changes and one deliberate carry-over relative to iter 52 T2 (which netted -1 question: +6 recoveries, -7 regressions):
* 1. extractFinalAnswer uses Stage 1 only (no Stage 2/3 prose fallback).
* Stage 2/3 fired too aggressively: overwriting correct Stage 1 answers and
* extracting wrong prose fragments. Now Stage 1 is the only extraction path.
* 2. System prompt removes surrender instruction ("FINAL_ANSWER: unknown / I don't know").
* That instruction caused the agent to give up on questions it would have figured out.
* Replaced with: "When you reach a final answer, output FINAL_ANSWER: <value>."
* 3. Reversed-text preprocessor is preserved (iter 52 T2 finding: 2d83110e has reversed text).
*
* Refs: ADR-133, ADR-135, iter 30, iter 52, iter 53a, #2156
*/
import { GaiaQuestion } from './gaia-loader.js';
import { GaiaToolCatalogue, ContentBlock } from './gaia-tools/index.js';
/**
* Default number of tool_use turns between planning checkpoints.
*
* Every PLANNING_INTERVAL tool_use turns, a planning-checkpoint message is
* injected to force the agent to reassess its strategy. This is the
* documented default of `GaiaAgentOptions.planningInterval`; that option can
* be set to 0 to disable checkpoints.
*
* Based on iter 30 research: smolagents CodeAgent uses planning_interval=4.
* HAL reliability analysis showed agents fail when they exhaust step
* budgets without recalibrating.
*/
export declare const PLANNING_INTERVAL = 4;
/**
* Build the planning-checkpoint text injected every PLANNING_INTERVAL turns.
* Exported so tests can snapshot the exact wording.
*
* @param turn - Presumably the turn number at which the checkpoint is
* injected — TODO confirm against the implementation (not visible here).
* @param maxTurns - Presumably the run's turn budget (see
* `GaiaAgentOptions.maxTurns`) — TODO confirm.
* @returns The checkpoint text.
*/
export declare function buildPlanningCheckpoint(turn: number, maxTurns: number): string;
/**
* Outcome of a single `runGaiaAgent` call: the final answer plus turn,
* tool-call, token and timing accounting.
*/
export interface GaiaAgentResult {
/** Identifier of the question this result belongs to (presumably copied from the GaiaQuestion — confirm). */
questionId: string;
/** The final answer, or null when none was produced (e.g. the run timed out). */
finalAnswer: string | null;
/** Turn count for the run. */
turns: number;
/** Per-tool call counts, keyed by tool name. */
toolCallsByName: Record<string, number>;
/** Input-token total for the run (presumably summed from each response's `usage.input_tokens` — confirm). */
totalInputTokens: number;
/** Output-token total for the run (presumably summed from each response's `usage.output_tokens` — confirm). */
totalOutputTokens: number;
/** Elapsed time in milliseconds (presumably wall-clock for the whole run — confirm). */
wallMs: number;
/** Number of planning-checkpoint injections during this run (0 when planning is disabled). */
replanCount?: number;
/** Set to true when the run timed out, i.e. maxTurns was exceeded. */
timedOut?: boolean;
/** Set when the convergence layer fired and committed the final answer. */
convergenceTrigger?: string;
/** True when the convergence layer recovered the answer from prior message history. */
convergenceUsedFallback?: boolean;
/** Error description; presumably set only when the run failed — confirm against the implementation. */
error?: string;
}
/**
* Optional settings accepted by `runGaiaAgent`. Every field may be omitted;
* the documented default then applies.
*/
export interface GaiaAgentOptions {
/** Model to use (default: 'claude-haiku-4-5'). */
model?: string;
/** Maximum number of agent turns before giving up (default: 8). */
maxTurns?: number;
/** Maximum tokens per Anthropic API call (default: 2048). */
maxTokensPerTurn?: number;
/** Per-turn HTTP timeout in milliseconds (default: 60 000). */
perTurnTimeoutMs?: number;
/**
* Inject a planning-checkpoint every N tool_use turns (default: PLANNING_INTERVAL = 4).
* Set to 0 to disable planning checkpoints.
*/
planningInterval?: number;
/**
* Anthropic API key. Resolved automatically via env var + gcloud fallback
* if omitted (see the resolution order documented on `resolveAnthropicApiKey`).
*/
apiKey?: string;
/**
* Pre-built tool catalogue. Defaults to `createDefaultToolCatalogue()`.
* Exposed so callers can inject mocks for testing.
*/
catalogue?: GaiaToolCatalogue;
/**
* Enable the convergence layer (default: true).
*
* When enabled, the convergence layer monitors for three failure modes:
* 1. max_turns hit without FINAL_ANSWER
* 2. Loop (same tool+args 3× in a 5-turn window)
* 3. Token overflow (>120k input tokens)
*
* On detection, a forced-commit phase is run: one API call with a
* directive prompt, no tools, then a fallback scan of prior messages.
* Set to false to disable (e.g. for ablation testing).
*/
enableConvergence?: boolean;
}
/**
* Resolve the Anthropic API key.
*
* Resolution order:
* 1. Caller-supplied `apiKey`
* 2. `ANTHROPIC_API_KEY` env var
* 3. `gcloud secrets versions access latest --secret=ANTHROPIC_API_KEY`
*
* @param apiKey - Optional caller-supplied key; tried first.
* @returns The resolved API key.
* @throws With a clear message if none of the sources above is available.
*/
export declare function resolveAnthropicApiKey(apiKey?: string): string;
/**
* If the question text appears to be reversed English, prepend a de-reversed
* version so the agent sees both the original and the decoded form.
*
* Iter 52 T2 — gate 1 finding: task 2d83110e has a reversed sentence.
* Kept in iter 53a (this is not the source of the iter 52 regressions).
*
* Not exported under its own name; tests reach it through the
* `_buildUserMessageForTest` alias exported at the bottom of this file.
*
* @param question - The question text.
* @returns The user-message text. Presumably the question is returned
* unchanged when it does not look reversed — TODO confirm against the
* implementation.
*/
declare function buildUserMessage(question: string): string;
/**
* Anthropic image content block for vision API.
* This is the shape returned by `parseImageMarker`.
*/
interface ImageContentBlock {
/** Literal tag marking this block as an image. */
type: 'image';
/** The image payload. */
source: {
/** Literal tag: the only source kind declared here is base64. */
type: 'base64';
/** Media type of the image, e.g. "image/png" as in the IMAGE_BASE64 marker example. */
media_type: string;
/** Presumably the base64-encoded image bytes — confirm against parseImageMarker. */
data: string;
};
}
/**
* Parse an IMAGE_BASE64 marker returned by file_read's extractImage().
*
* Marker format: [IMAGE_BASE64:{"mediaType":"image/png","base64":"...","path":"..."}]
*
* @param marker - The marker string to parse.
* @returns An Anthropic image content block, or null if the marker is invalid.
*/
export declare function parseImageMarker(marker: string): ImageContentBlock | null;
/**
* Minimal types for the Anthropic Messages API response.
* Only the fields declared here are typed; this is not the full API schema.
*/
interface AnthropicResponse {
/** Response identifier. */
id: string;
/** Model name reported in the response. */
model: string;
/**
* Why the model stopped. Per the loop description at the top of this file,
* 'tool_use' leads to tool execution and 'end_turn' to the final-answer scan.
* The trailing `| string` keeps the type open to values not listed here.
*/
stop_reason: 'end_turn' | 'tool_use' | 'max_tokens' | string;
/** Content blocks of the response. */
content: ContentBlock[];
/** Token accounting for this single API call. */
usage: {
input_tokens: number;
output_tokens: number;
};
}
/**
* Extract the final answer from a Messages API response.
*
* Per the iter 53a notes at the top of this file, extraction uses Stage 1
* only: the explicit `FINAL_ANSWER: <value>` pattern, with no Stage 2/3
* prose fallback.
*
* Not exported under its own name; tests reach it through the
* `_extractFinalAnswerForTest` alias exported at the bottom of this file.
*
* @param resp - The response whose content is scanned.
* @returns The extracted answer, or null (presumably when no FINAL_ANSWER
* pattern is present — TODO confirm against the implementation).
*/
declare function extractFinalAnswer(resp: AnthropicResponse): string | null;
/**
* Run a GAIA question through Claude with tool use.
*
* @param question - The GAIA question to answer.
* @param options - Optional overrides (model, turn and token limits, API key,
* tool catalogue, planning interval, convergence layer); see GaiaAgentOptions
* for the defaults.
* @returns GaiaAgentResult with the final answer (or null if timed out),
* turn count, token totals, and per-tool call counts.
*/
export declare function runGaiaAgent(question: GaiaQuestion, options?: GaiaAgentOptions): Promise<GaiaAgentResult>;
/**
* Check whether a model answer matches the expected ground-truth answer.
*
* Matching rules (mirrors GAIA evaluation):
* - Normalise: trim whitespace, lowercase.
* - Substring match: expected is contained in model answer (handles "Paris" vs "Paris, France").
* - Direct equality after normalisation.
* - Numeric: parse as floats and compare with ±1% tolerance.
*
* @param modelAnswer - The answer produced by the agent.
* @param expected - The ground-truth answer to compare against.
* @returns True when the model answer matches the expected answer under the
* rules above.
*/
export declare function isAnswerCorrect(modelAnswer: string, expected: string): boolean;
/**
* Run all 5 SMOKE_FIXTURE questions and report results to stdout.
*
* Pass criteria: ≥3/5 correct (60% pass rate).
*
* Cost estimate is printed at the end using Haiku pricing.
*
* This function is exported so tests can call it directly and capture output;
* it also runs when this file is executed directly via `node gaia-agent.js --smoke`.
*
* @param opts - Optional settings.
* `verbose` presumably enables more detailed stdout output — TODO confirm.
* `apiKey` is an Anthropic API key (presumably resolved like
* `GaiaAgentOptions.apiKey` when omitted — TODO confirm).
* @returns `passed` and `total` question counts plus `passRate`
* (presumably passed / total — TODO confirm whether it is a fraction or a percentage).
*/
export declare function runSmokeTest(opts?: {
verbose?: boolean;
apiKey?: string;
}): Promise<{
passRate: number;
passed: number;
total: number;
}>;
/**
* Test-only aliases: extractFinalAnswer and buildUserMessage are otherwise
* module-private, so they are re-exported under underscore-prefixed
* `...ForTest` names.
*/
export { extractFinalAnswer as _extractFinalAnswerForTest, buildUserMessage as _buildUserMessageForTest, };
//# sourceMappingURL=gaia-agent.d.ts.map