claude-flow
Version:
Ruflo - Enterprise AI agent orchestration for Claude Code. Deploy 60+ specialized agents in coordinated swarms with self-learning, fault-tolerant consensus, vector memory, and MCP integration
105 lines • 4.71 kB
TypeScript
/**
* GAIA Claude-p Wrapper — iter 54 (#2156)
*
* Delegates each GAIA question to `claude -p` (Claude Code headless mode),
* which gives us WebSearch, WebFetch, Read (multimodal incl. PDF/DOCX/images),
* and Bash (Python execution) for free — the same tools HAL uses.
*
* Why this approach over a native TS CodeAgent:
* - The capabilities HAL had that ruflo lacked were: visit_webpage, file reading
* (PDF/DOCX/XLSX/images), and Python execution. Claude Code's built-in tools solve ALL of these.
* - No wheel-reinvention: battle-tested tool infra, native multimodal, proper
* tool-budget management, Anthropic WebSearch API.
* - Baseline: 24/53 (45.3%). Target: ≥45/53 to surpass HAL's 82.07%.
*
* SECURITY NOTE on --dangerously-skip-permissions:
* This flag is ONLY used inside the GAIA harness context, which is a sandboxed
* benchmark evaluation environment. GAIA questions have no real-world
* side effects — they are read-only research questions. The flag lets Claude Code
* use its tools (WebSearch, WebFetch, Read, Bash) without per-tool permission
* prompts, which is required for unattended benchmark execution. It MUST NOT
* be used in production workflows where Claude Code could affect real systems.
*
* JSON output format from `claude -p --output-format json`:
* {
* type: "result",
* subtype: "success" | "error_max_budget_usd" | ...,
* is_error: boolean,
* result: string, // final assistant message text
* total_cost_usd: number,
* duration_ms: number,
* num_turns: number,
* ...
* }
*
* Refs: ADR-138 (reference, NOT implemented), iter 54, #2156
*/
import type { GaiaQuestion } from './gaia-loader.js';
/**
* Default model for claude -p GAIA runs. Sonnet for quality parity with HAL.
* Used when `ClaudePOptions.model` is not supplied.
*/
export declare const CLAUDE_P_DEFAULT_MODEL = "claude-sonnet-4-6";
/**
* Per-question budget cap (USD). HAL uses Sonnet 4.5 so $0.30 headroom is safe.
* Used when `ClaudePOptions.budgetUsd` is not supplied.
*/
export declare const CLAUDE_P_PER_QUESTION_BUDGET_USD = 0.3;
/**
* Subprocess timeout: 5 minutes per question.
* Used when `ClaudePOptions.timeoutMs` is not supplied.
* NOTE(review): only the `number` type is visible in this declaration — the
* 5-minute figure (presumably 300000 ms) should be confirmed against the implementation.
*/
export declare const CLAUDE_P_TIMEOUT_MS: number;
/**
* Outcome of running one GAIA question through `claude -p`.
*
* This is the resolved value of `runGaiaQuestionViaClaudeP` and the element
* type of the array resolved by `runGaiaQuestionsBatchViaClaudeP`.
* NOTE(review): several fields presumably mirror the JSON output documented in
* the file header (`result`, `is_error`, `total_cost_usd`, `num_turns`) —
* confirm the exact mapping against the implementation.
*/
export interface ClaudePResult {
/** The extracted answer, or null if extraction failed. */
finalAnswer: string | null;
/** Raw result text from claude -p. */
rawResult: string;
/** Whether claude -p exited with an error. */
isError: boolean;
/** claude -p's reported error message (if any). Optional: may be absent. */
errorMessage?: string;
/** Actual cost reported by claude -p, in USD. */
costUsd: number;
/** Wall-clock time in ms. */
wallMs: number;
/** Number of turns claude -p used. */
numTurns: number;
/** claude -p stop reason. Optional: may be absent. */
stopReason?: string;
}
/**
* Optional overrides for a `claude -p` invocation.
*
* Every field is optional; an omitted field falls back to the default named
* in its own comment.
*/
export interface ClaudePOptions {
/** Model ID (default: CLAUDE_P_DEFAULT_MODEL). */
model?: string;
/** Per-question budget cap in USD (default: CLAUDE_P_PER_QUESTION_BUDGET_USD). */
budgetUsd?: number;
/** Timeout in ms (default: CLAUDE_P_TIMEOUT_MS). */
timeoutMs?: number;
/** Absolute path to the claude binary (default: resolved from $PATH). */
claudeBin?: string;
}
/**
* Build the prompt sent to claude -p for a GAIA question.
*
* Includes the question text, optional attachment path, and precise instructions
* for using available tools and producing FINAL_ANSWER: in the expected format.
*
* @param question - The GAIA question to render into a prompt.
* @returns The complete prompt text as a single string.
*/
export declare function buildClaudePPrompt(question: GaiaQuestion): string;
/**
* Run a single GAIA question via `claude -p` headless mode.
*
* Spawns a subprocess, captures JSON output, extracts the final answer.
*
* @param question - The GAIA question to run.
* @param options - Optional overrides (model, budget, timeout, binary path);
* omitted fields use the defaults documented on `ClaudePOptions`.
* @returns A promise resolving to the `ClaudePResult` for this question.
*
* NOTE(review): this declaration does not show whether a subprocess failure or
* timeout rejects the promise or resolves with `isError: true` — confirm
* against the implementation before relying on either.
*/
export declare function runGaiaQuestionViaClaudeP(question: GaiaQuestion, options?: ClaudePOptions): Promise<ClaudePResult>;
/**
* Extract the FINAL_ANSWER value from claude -p's result text.
*
* Primary: regex match on `FINAL_ANSWER: <value>`
* Fallback: last non-empty line if no FINAL_ANSWER marker found.
*
* @param text - Result text produced by claude -p.
* @returns The extracted answer, or `null` when nothing could be extracted.
*
* NOTE(review): given the fallback, `null` presumably only occurs when `text`
* has no non-empty line — confirm against the implementation. Whether the
* returned value is trimmed or otherwise normalized is also not visible here.
*/
export declare function extractFinalAnswer(text: string): string | null;
/**
* Options for a batch run: everything in `ClaudePOptions` plus batch-level
* controls for parallelism and progress reporting.
*/
export interface ClaudePBatchOptions extends ClaudePOptions {
/** Max parallel questions (default: 2 — claude -p uses significant local resources). */
concurrency?: number;
/**
* Callback for per-question progress logging.
*
* Receives a question index, the total question count, the question's ID,
* the extracted answer (or null), and the cost in USD for that question.
* NOTE(review): whether `idx` is zero-based, one-based, or a completion
* counter is not visible in this declaration — confirm against the implementation.
*/
onProgress?: (idx: number, total: number, questionId: string, answer: string | null, costUsd: number) => void;
}
/**
* Run a batch of GAIA questions through the claude -p wrapper.
*
* Concurrency is limited (default 2) because each claude -p subprocess
* is heavyweight — it starts a full Claude Code session with LSP etc.
*
* @param questions - The GAIA questions to run.
* @param options - Optional per-run overrides plus `concurrency` and
* `onProgress`; see `ClaudePBatchOptions`.
* @returns A promise resolving to one `ClaudePResult` per question.
*
* NOTE(review): presumably the result array is in the same order as
* `questions` regardless of completion order — confirm against the
* implementation. It is also not visible here whether one failing question
* rejects the whole batch.
*/
export declare function runGaiaQuestionsBatchViaClaudeP(questions: GaiaQuestion[], options?: ClaudePBatchOptions): Promise<ClaudePResult[]>;
//# sourceMappingURL=gaia-claude-p.d.ts.map