// claude-flow
// Version:
// Ruflo - Enterprise AI agent orchestration for Claude Code. Deploy 60+ specialized agents in coordinated swarms with self-learning, fault-tolerant consensus, vector memory, and MCP integration
// 129 lines • 5.52 kB
// TypeScript
/**
* GAIA CodeAgent — ADR-138 iter 54 (FINAL design)
*
* smolagents-style CodeAgent harness for the GAIA L1 benchmark.
* NOT the smolagents library — this is the PATTERN implemented natively in ruflo TS.
*
* Architecture (HAL replication):
* Instead of Anthropic native tool_use blocks (JSON → execute → repeat),
* the agent writes Python code that calls tools as functions. The code is
* parsed from markdown code blocks, executed in gaia-codeagent-runner.py,
* and the stdout is fed back as the next user turn. The agent commits its
* answer by calling final_answer("value") in Python.
*
* Why this beats ToolCallingAgent on GAIA (from HAL-DEEP-STUDY.md):
* - 30% fewer steps for the same task (Python is more expressive than JSON)
* - Variables persist across steps in the agent's mental model
* - Complex control flow (loops, try/except) is native
* - final_answer() is deterministic — no regex extraction fragility
*
* Loop algorithm:
* 1. Build system prompt with tool signatures and GAIA instruction template.
* 2. Call Anthropic Messages API (text-in / text-out — NO tools array).
* 3. Parse the response for a ```python ... ``` code block.
* 4. If code block found: run gaia-codeagent-runner.py subprocess.
* - Runner pre-defines tool functions (web_search, visit_webpage, etc.)
* - If final_answer("X") is called in the code: extract X, return result.
* - Otherwise: feed stdout back as user turn, continue.
* 5. If no code block: prompt agent to produce one (max 3 retries).
* 6. If maxTurns exceeded: return timedOut=true.
* 7. Every planningInterval turns: inject a planning checkpoint.
*
* Tool routing (via gaia-codeagent-runner.py):
* web_search(query) → claude -p with WebSearch (best web coverage)
* visit_webpage(url) → requests + bs4 HTML extraction
* grounded_query(query) → Gemini with Google Search grounding (ruflo unique)
* read_file(path) → Python direct (text/csv/json/xlsx) or subprocess
* describe_image(path) → claude -p with vision
* final_answer(x) → writes sentinel JSON and exits runner
*
* Key parameters:
* model: claude-sonnet-4-6 (default; ADR-138 targets Sonnet 4.5+)
* maxTurns: 20 (HAL uses 200; 20 is cost-controlled for L1)
* planningInterval: 4 (match HAL's planning_interval=4)
* maxTokensPerTurn: 4096 (code generation needs more space than ToolCalling)
*
* Refs: ADR-138, HAL-DEEP-STUDY.md, smolagents CodeAgent, #2156, iter 54
*/
import { GaiaQuestion } from './gaia-loader.js';
import { isAnswerCorrect } from './gaia-agent.js';
export { isAnswerCorrect };
/**
 * Result record returned for one GAIA question run through the CodeAgent harness.
 */
export interface CodeAgentResult {
    /** Identifier of the question this result belongs to. */
    questionId: string;
    /**
     * Answer committed by the agent, or null when none was produced.
     * NOTE(review): presumably the value passed to final_answer() — confirm against the implementation.
     */
    finalAnswer: string | null;
    /** Number of agent turns consumed. */
    turns: number;
    /** Per-tool counters keyed by tool name (presumably invocation counts — confirm). */
    toolCallsByName: Record<string, number>;
    /** Input tokens summed over the run. */
    totalInputTokens: number;
    /** Output tokens summed over the run. */
    totalOutputTokens: number;
    /** Wall-clock duration in milliseconds. */
    wallMs: number;
    /** Replan counter (presumably the number of planning checkpoints injected — confirm). */
    replanCount: number;
    /** Set to true when maxTurns was exceeded before an answer was committed. */
    timedOut?: boolean;
    /** Error description, when the run failed. */
    error?: string;
    /** Steps log for debugging — each entry is one turn's code + output. */
    steps?: {
        code: string;
        output: string;
    }[];
}
/**
* Optional settings for a CodeAgent run. Every member may be omitted.
* Defaults quoted below are the ones listed under "Key parameters" in the
* file header; the implementation is not visible from this declaration.
*/
export interface CodeAgentOptions {
/** Model identifier (header default: claude-sonnet-4-6). */
model?: string;
/** Maximum number of agent turns before the run returns timedOut=true (header default: 20). */
maxTurns?: number;
/** Token budget for each model response (header default: 4096). */
maxTokensPerTurn?: number;
/** NOTE(review): presumably the timeout for each model API call, in ms — confirm; no default is documented here. */
perTurnTimeoutMs?: number;
/** Timeout for each Python step execution (default: 30s). */
perStepTimeoutMs?: number;
/** A planning checkpoint is injected every this many turns (header default: 4). */
planningInterval?: number;
/** API key (presumably the Anthropic key also forwarded to the runner — confirm). */
apiKey?: string;
/** If true, include step-by-step code/output log in the result. */
verbose?: boolean;
/**
* Optional tool catalogue override — used by unit tests to inject mock tools.
* In production (Python runner mode) this is ignored; the runner defines its own tools.
*/
catalogue?: unknown[];
}
/**
* Extract the first ```python ... ``` code block from assistant output.
*
* Exported for use in unit tests (gaia-codeagent.smoke.ts T1).
*
* @param text Assistant output to scan for a fenced Python code block.
* @returns The extracted code, or null if no code block is found.
* NOTE(review): whether the result is trimmed or excludes the fence markers
* is not visible from this declaration — confirm against the implementation.
*/
export declare function extractCodeBlock(text: string): string | null;
/**
* Execute a single Python code step via the gaia-codeagent-runner.py subprocess.
*
* This is a thin public wrapper around `executeAgentCodeStep` that:
* - Exposes a clean typed signature for unit tests (gaia-codeagent.smoke.ts T2-T4)
* - Renames `output` → `observation` to match the smolagents naming convention
*
* The declared return type is a plain object, not a Promise, so callers
* receive the result synchronously.
*
* @param code Python code to execute
* @param attachmentPath Path to attached file, or null
* @param timeoutMs Subprocess timeout in ms (default: 30s)
* @param apiKey Anthropic API key (passed to runner for claude -p tool calls)
* @returns The step's `observation` together with `finalAnswer`, which is
* null when no answer was committed (presumably when the code did not call
* final_answer() — confirm against the implementation).
*/
export declare function runCodeAgentStep(code: string, attachmentPath: string | null, timeoutMs?: number, apiKey?: string): {
/** Step output, renamed from the wrapped function's `output` field. */
observation: string;
/** Committed answer, or null when the step did not produce one. */
finalAnswer: string | null;
};
/**
* Run a GAIA question through the smolagents-style CodeAgent harness.
*
* The agent writes Python code, we execute it with tool stubs, and feed
* the output back. The loop continues until final_answer() is called
* or maxTurns is exhausted.
*
* @param question GAIA question to solve.
* @param options Optional run settings; see CodeAgentOptions.
* @returns Promise resolving to the run's CodeAgentResult. Per the file
* header, exceeding maxTurns is reported as timedOut=true on the result.
*/
export declare function runGaiaCodeAgent(question: GaiaQuestion, options?: CodeAgentOptions): Promise<CodeAgentResult>;
/**
* Run all 5 SMOKE_FIXTURE questions through the CodeAgent harness.
* Pass criteria: ≥3/5 correct (60%).
*
* @param opts Optional overrides: `verbose`, `apiKey` and `model`
* (presumably forwarded to each question's run — confirm).
* @returns Promise resolving to the smoke-test tally.
*/
export declare function runCodeAgentSmokeTest(opts?: {
verbose?: boolean;
apiKey?: string;
model?: string;
}): Promise<{
/** NOTE(review): presumably passed / total; whether it is a 0–1 fraction or a percentage is not visible here — confirm. */
passRate: number;
/** Number of questions that passed. */
passed: number;
/** Number of questions run. */
total: number;
}>;
//# sourceMappingURL=gaia-codeagent.d.ts.map