UNPKG

claude-flow

Version:

Ruflo - Enterprise AI agent orchestration for Claude Code. Deploy 60+ specialized agents in coordinated swarms with self-learning, fault-tolerant consensus, vector memory, and MCP integration

658 lines 29.4 kB
/**
 * GAIA DAG Harness — Co-Sight Architecture Port (ADR-139 Addendum)
 *
 * Ports the ZTE-AICloud/Co-Sight DAG orchestration pattern (Apache 2.0,
 * arXiv 2510.21557) into the ruflo GAIA harness.
 *
 * Architecture:
 *   1. PLAN     — Claude Sonnet 4.6 reads the question and emits a DAG of 3-7
 *                 steps: {id, description, depends_on: [], suggested_tool}.
 *                 Claude-aware prompt: "3-5 steps max, direct answer when clear."
 *   2. EXECUTE  — Loop while ready steps exist (deps satisfied).
 *                 Run all ready steps in PARALLEL (Promise.all, cap ≤5).
 *                 Each step = a Gemini 2.5 Pro actor with the full tool suite.
 *                 Actor marks step completed/blocked + writes step_notes.
 *                 Blocked steps trigger planner re_plan before next cycle.
 *   3. FINALIZE — Planner reads all step_notes → produces final answer
 *                 using T2 extraction cascade from gaia-agent.ts.
 *   4. CAMV     — Async credibility labeling per step (stubbed, iter 65).
 *
 * Role assignment (env-configurable):
 *   PLAN_MODEL   = claude-sonnet-4-6 (default)
 *   ACT_MODEL    = gemini-2.5-pro (default)
 *   VISION_MODEL = gemini-2.5-pro (default, same as act)
 *
 * CLI:
 *   gaia-bench run --mode=dag --model claude-sonnet-4-6
 *
 * Cost: planner ~$0.02/Q + actors ~$0.03/Q = ~$0.05/Q (vs single-Sonnet ~$0.075)
 *
 * Refs: ADR-139, github.com/ZTE-AICloud/Co-Sight, arXiv 2510.21557, #2156
 */
import { execSync } from 'node:child_process';
import { createDefaultToolCatalogue, } from './gaia-tools/index.js';
import { normaliseAnswer } from './gaia-judge.js';
import { resolveAnthropicApiKey } from './gaia-agent.js';
// ---------------------------------------------------------------------------
// Constants
// ---------------------------------------------------------------------------
const ANTHROPIC_API_URL = 'https://api.anthropic.com/v1/messages';
const ANTHROPIC_API_VERSION = '2023-06-01';
const GEMINI_API_BASE = 'https://generativelanguage.googleapis.com/v1beta/models';
const DEFAULT_PLAN_MODEL = process.env['PLAN_MODEL'] ?? 'claude-sonnet-4-6';
const DEFAULT_ACT_MODEL = process.env['ACT_MODEL'] ?? 'gemini-2.5-pro';
const MAX_PLAN_STEPS = 7;
const MIN_PLAN_STEPS = 1;
const MAX_CONCURRENT_ACTORS = 5;
const MAX_ACTOR_TURNS = 8;
const MAX_REPLAN_CYCLES = 3;
const ACTOR_TIMEOUT_MS = 90_000;
const PLANNER_TIMEOUT_MS = 60_000;
// Upper bound on characters of a single tool result forwarded to the actor
// (mirrors Co-Sight MAX_TOOL_CONTENT_LENGTH).
const MAX_TOOL_CONTENT_LENGTH = 10_000;
// Upper bound on characters of step notes kept from an actor's answer.
const MAX_STEP_NOTES_LENGTH = 2000;
const FINAL_ANSWER_RE = /FINAL_ANSWER:\s*(.+)/i;
const STEP_RESULT_RE = /STEP_RESULT:\s*([\s\S]+)/i;
const STEP_BLOCKED_RE = /STEP_BLOCKED:\s*(.+)/i;
// Per-million-token rates (USD) used by estimateCostUsd.
const PLANNER_PRICING = Object.freeze({ inputPerM: 3.0, outputPerM: 15.0 });
const ACTOR_PRICING = Object.freeze({ inputPerM: 1.25, outputPerM: 10.0 });
// ---------------------------------------------------------------------------
// API key resolution
// ---------------------------------------------------------------------------
/**
 * Resolve the Gemini API key.
 *
 * Lookup order: the explicitly supplied value, then the GOOGLE_AI_API_KEY /
 * GEMINI_API_KEY environment variables, then the same secret names via the
 * `gcloud secrets versions access` CLI.
 *
 * @param {string} [supplied] - Key passed in by the caller, if any.
 * @returns {string} The trimmed API key.
 * @throws {Error} When no key is found in any of the sources.
 */
function resolveGeminiApiKey(supplied) {
    if (supplied?.trim())
        return supplied.trim();
    const env = process.env['GOOGLE_AI_API_KEY'] ?? process.env['GEMINI_API_KEY'];
    if (env?.trim())
        return env.trim();
    for (const secret of ['GOOGLE_AI_API_KEY', 'GEMINI_API_KEY']) {
        try {
            const out = execSync(`gcloud secrets versions access latest --secret=${secret} 2>/dev/null`, { encoding: 'utf-8', timeout: 10_000 }).trim();
            if (out)
                return out;
        }
        catch {
            // Best effort: gcloud missing / secret absent — try the next name.
        }
    }
    throw new Error('GOOGLE_AI_API_KEY / GEMINI_API_KEY not found in env or GCP Secret Manager.');
}
/**
 * Get all steps whose dependencies are fully satisfied (completed).
 * Mirrors Co-Sight's Plan.get_ready_steps().
 *
 * A step is ready when it is still `not_started` and every id in its
 * `depends_on` refers to a step in the same plan whose status is `completed`.
 * A dependency on an id that is not present in the plan is never satisfied.
 *
 * @param {{steps: Array<{id: number, status: string, depends_on: number[]}>}} plan
 * @returns {Array} The subset of `plan.steps` that can run now.
 */
export function getReadySteps(plan) {
    // Build the id → status lookup once instead of scanning per dependency.
    // First occurrence wins, matching Array.prototype.find on duplicate ids.
    const statusById = new Map();
    for (const step of plan.steps) {
        if (!statusById.has(step.id))
            statusById.set(step.id, step.status);
    }
    return plan.steps.filter((step) => step.status === 'not_started' &&
        step.depends_on.every((depId) => statusById.get(depId) === 'completed'));
}
// ---------------------------------------------------------------------------
// Planner system prompt (Claude-aware, ported from planner_prompt.py)
// ---------------------------------------------------------------------------
/** Build the system prompt used when creating the initial plan. */
function buildPlannerSystemPrompt() {
    return [
        '# Role and Objective',
        'You are a planning assistant for a question-answering system. Your task is to create',
        'a small, focused plan as a Directed Acyclic Graph (DAG) to answer the given question.',
        '',
        '# Plan Creation Rules (Claude model — simplified)',
        '1. When the answer is clear and direct, create a SINGLE step: just answer the question.',
        '2. Otherwise, create 3-5 high-level steps (NEVER more than 7).',
        '3. Each step must be a concrete, actionable description.',
        '4. Specify dependencies ONLY when a step genuinely requires output from a prior step.',
        '5. Steps without dependencies run in parallel — prefer parallelism.',
        '',
        '# Output Format (JSON ONLY, no markdown fences)',
        '{',
        ' "title": "brief plan title",',
        ' "steps": [',
        ' { "id": 0, "description": "step description", "depends_on": [], "suggested_tool": "web_search" },',
        ' { "id": 1, "description": "step description", "depends_on": [0], "suggested_tool": "python_exec" }',
        ' ]',
        '}',
        '',
        '# Suggested Tools: web_search, grounded_query, file_read, python_exec',
        '# Rules:',
        '- ids must be sequential integers starting from 0',
        '- depends_on contains only valid step ids from the same plan',
        '- NO markdown, NO explanation — output ONLY the JSON object',
    ].join('\n');
}
/** Build the system prompt used when asking the planner to work around blocked steps. */
function buildReplannerSystemPrompt() {
    return [
        '# Role and Objective',
        'You are a planning assistant. Some steps in the current plan are BLOCKED.',
        'Your task is to update the plan to work around the blocked steps.',
        '',
        '# Replan Rules',
        '1. Preserve ALL completed steps — do not modify them.',
        '2. For blocked steps: either try an alternative approach or skip if non-critical.',
        '3. If the plan has enough information to answer already, output FINAL_ANSWER: <answer>',
        '4. Keep the total step count ≤ 7.',
        '',
        '# Output',
        'Either:',
        ' FINAL_ANSWER: <the answer>',
        'Or updated JSON plan (same format as plan creation, include all steps with their current status):',
        '{',
        ' "title": "...",',
        ' "steps": [...]',
        '}',
    ].join('\n');
}
/** Build the system prompt used for the final answer synthesis call. */
function buildFinalizerSystemPrompt() {
    return [
        'You are a precise question-answering agent finalizing a multi-step research task.',
        'You have all the gathered evidence in step notes. Produce the final answer.',
        '',
        'RULES:',
        '1. Synthesize ONLY from the provided step notes — do not invent facts.',
        '2. Keep answers concise: just the value, name, or number unless context demands more.',
        '3. Do NOT include units unless the question asks for them.',
        '4. You MUST end with: FINAL_ANSWER: <your answer>',
        '5. NEVER end without a FINAL_ANSWER line.',
    ].join('\n');
}
// ---------------------------------------------------------------------------
// HTTP helpers
// ---------------------------------------------------------------------------
/**
 * POST a JSON payload and return the parsed JSON response.
 *
 * The abort timer stays armed until the response body has been fully read, so
 * a stalled body is cut off by `timeoutMs` just like a stalled connection.
 *
 * @param {string} url - Endpoint to call.
 * @param {Record<string, string>} headers - Extra request headers (auth etc.).
 * @param {object} payload - Request body, serialised with JSON.stringify.
 * @param {number} timeoutMs - Abort the request after this many milliseconds.
 * @param {string} apiLabel - Provider name used as the error message prefix.
 * @returns {Promise<any>} Parsed JSON body of a successful response.
 * @throws {Error} On a non-2xx status (message carries status + body excerpt),
 *   on abort/timeout, or on a network / JSON parse failure.
 */
async function postJson(url, headers, payload, timeoutMs, apiLabel) {
    const controller = new AbortController();
    const timer = setTimeout(() => controller.abort(), timeoutMs);
    try {
        const res = await fetch(url, {
            method: 'POST',
            headers: { ...headers, 'content-type': 'application/json' },
            body: JSON.stringify(payload),
            signal: controller.signal,
        });
        if (!res.ok) {
            const errText = await res.text().catch(() => '<unreadable>');
            throw new Error(`${apiLabel} API error ${res.status}: ${errText.slice(0, 400)}`);
        }
        // `return await` keeps the timer alive while the body is being read.
        return await res.json();
    }
    finally {
        clearTimeout(timer);
    }
}
/**
 * Call the Anthropic Messages API.
 *
 * @param {string} apiKey
 * @param {string} model
 * @param {string} systemPrompt
 * @param {Array<{role: string, content: string}>} messages
 * @param {number} maxTokens
 * @param {number} timeoutMs
 * @returns {Promise<any>} The parsed response body.
 * @throws {Error} `Anthropic API error <status>: …` on a non-2xx response.
 */
async function callAnthropic(apiKey, model, systemPrompt, messages, maxTokens, timeoutMs) {
    return postJson(ANTHROPIC_API_URL, { 'x-api-key': apiKey, 'anthropic-version': ANTHROPIC_API_VERSION }, { model, max_tokens: maxTokens, system: systemPrompt, messages }, timeoutMs, 'Anthropic');
}
/**
 * Call the Gemini `generateContent` endpoint.
 *
 * The API key is sent in the `x-goog-api-key` header rather than the URL
 * query string so it cannot leak through logged or echoed URLs.
 *
 * @param {string} apiKey
 * @param {string} model
 * @param {string} systemInstruction
 * @param {Array<object>} contents - Conversation turns in Gemini format.
 * @param {Array<object>} tools - Function declarations; omitted when empty.
 * @param {number} maxTokens
 * @param {number} timeoutMs
 * @returns {Promise<any>} The parsed response body.
 * @throws {Error} `Gemini API error <status>: …` on a non-2xx response.
 */
async function callGemini(apiKey, model, systemInstruction, contents, tools, maxTokens, timeoutMs) {
    const url = `${GEMINI_API_BASE}/${encodeURIComponent(model)}:generateContent`;
    return postJson(url, { 'x-goog-api-key': apiKey }, {
        system_instruction: { parts: [{ text: systemInstruction }] },
        contents,
        tools: tools.length > 0 ? [{ function_declarations: tools }] : undefined,
        generationConfig: { maxOutputTokens: maxTokens, temperature: 0 },
    }, timeoutMs, 'Gemini');
}
/** Concatenate the text blocks of an Anthropic response, one per line. */
function extractAnthropicText(resp) {
    return resp.content
        .filter((block) => block.type === 'text')
        .map((block) => block.text)
        .join('\n');
}
// ---------------------------------------------------------------------------
// Parse planner JSON output (robust)
// ---------------------------------------------------------------------------
/**
 * Parse the planner's reply into a list of raw steps.
 *
 * Markdown fences are stripped and the outermost `{ … }` span is parsed as
 * JSON. Missing ids fall back to the array index, missing descriptions to
 * `Step <i>`, and a non-array `depends_on` to `[]`.
 *
 * @param {string} text - Raw planner output.
 * @returns {{steps: Array<{id: number, description: string, depends_on: number[], suggested_tool: any}>} | null}
 *   The parsed steps, or null when no non-empty `steps` array can be extracted.
 */
function parsePlanJson(text) {
    // Strip markdown fences if present
    const cleaned = text.replace(/```(?:json)?\n?/g, '').trim();
    // Find the first '{' ... last '}'
    const start = cleaned.indexOf('{');
    const end = cleaned.lastIndexOf('}');
    if (start === -1 || end <= start)
        return null;
    try {
        const parsed = JSON.parse(cleaned.slice(start, end + 1));
        if (!Array.isArray(parsed.steps) || parsed.steps.length === 0)
            return null;
        return {
            steps: parsed.steps.map((s, i) => ({
                id: typeof s.id === 'number' ? s.id : i,
                description: String(s.description ?? `Step ${i}`),
                depends_on: Array.isArray(s.depends_on) ? s.depends_on.map(Number) : [],
                suggested_tool: s.suggested_tool,
            })),
        };
    }
    catch {
        // Malformed JSON or unexpected step shape — caller falls back.
        return null;
    }
}
/**
 * Keep only dependencies that point at another step which actually exists.
 *
 * A dependency on an unknown id can never be satisfied (see getReadySteps),
 * which would leave the step permanently unready, so such ids are dropped
 * along with self-references and duplicates.
 *
 * @param {number[]} dependsOn - Raw dependency ids from the planner.
 * @param {number} ownId - Id of the step that owns the list.
 * @param {Set<number>} validIds - Ids of the steps retained in the plan.
 * @returns {number[]}
 */
function sanitiseDependencies(dependsOn, ownId, validIds) {
    return [...new Set(dependsOn)].filter((depId) => depId !== ownId && validIds.has(depId));
}
// ---------------------------------------------------------------------------
// PLAN phase
// ---------------------------------------------------------------------------
/**
 * Ask the planner model for a DAG plan for `question`.
 *
 * Falls back to a single "answer directly" step when the reply cannot be
 * parsed. Plans longer than MAX_PLAN_STEPS are truncated.
 *
 * @param {{task_id: string, question: string, file_path?: string}} question
 * @param {string} anthropicKey
 * @param {string} planModel
 * @returns {Promise<{plan: object, inputTokens: number, outputTokens: number}>}
 * @throws {Error} When the planner API call fails.
 */
async function createPlan(question, anthropicKey, planModel) {
    const attachmentHint = question.file_path
        ? `\nThis question has an attached file at: ${question.file_path}`
        : '';
    const resp = await callAnthropic(anthropicKey, planModel, buildPlannerSystemPrompt(), [{ role: 'user', content: question.question + attachmentHint }], 1024, PLANNER_TIMEOUT_MS);
    const parsed = parsePlanJson(extractAnthropicText(resp));
    let steps;
    if (parsed && parsed.steps.length >= MIN_PLAN_STEPS) {
        // Cap steps
        const rawSteps = parsed.steps.slice(0, MAX_PLAN_STEPS);
        // Validate dependencies against the ids that survived the cap, not
        // against the step count: ids are not guaranteed to be 0..n-1.
        const validIds = new Set(rawSteps.map((s) => s.id));
        steps = rawSteps.map((s) => ({
            id: s.id,
            description: s.description,
            depends_on: sanitiseDependencies(s.depends_on, s.id, validIds),
            suggested_tool: s.suggested_tool,
            status: 'not_started',
            step_notes: '',
        }));
    }
    else {
        // Fallback: single-step plan (treat as direct answer task)
        steps = [{
                id: 0,
                description: `Answer the question directly: ${question.question.slice(0, 100)}`,
                depends_on: [],
                status: 'not_started',
                step_notes: '',
            }];
    }
    return {
        plan: { title: question.task_id, question: question.question, steps },
        inputTokens: resp.usage.input_tokens,
        outputTokens: resp.usage.output_tokens,
    };
}
// ---------------------------------------------------------------------------
// ACTOR phase — Gemini 2.5 Pro executes a single step
// ---------------------------------------------------------------------------
/** Build the system prompt for an actor executing one step of the plan. */
function buildActorSystemPrompt(question, planSummary, stepDesc) {
    return [
        'You are a precise research agent executing one step of a multi-step plan.',
        '',
        `ORIGINAL QUESTION: ${question}`,
        '',
        `CURRENT PLAN STATE:\n${planSummary}`,
        '',
        `YOUR STEP: ${stepDesc}`,
        '',
        'RULES:',
        '1. Use the available tools to gather information for your step.',
        '2. When you have completed your step, output your findings as:',
        ' STEP_RESULT: <your findings and evidence>',
        '3. If you cannot complete your step (blocked), output:',
        ' STEP_BLOCKED: <reason why blocked>',
        '4. Keep responses focused on completing this specific step.',
        '5. MANDATORY: Always end with either STEP_RESULT or STEP_BLOCKED.',
    ].join('\n');
}
/**
 * Render the plan as one line per step: status icon, id, description and the
 * first 200 characters of any step notes.
 */
function buildPlanSummary(plan) {
    return plan.steps.map((s) => {
        const statusIcon = { not_started: '[ ]', in_progress: '[→]', completed: '[✓]', blocked: '[!]' }[s.status];
        const notes = s.step_notes ? ` → ${s.step_notes.slice(0, 200)}` : '';
        return `${statusIcon} Step ${s.id}: ${s.description}${notes}`;
    }).join('\n');
}
/** Convert ruflo tool catalogue to Gemini function declarations. */
function toGeminiFunctionDeclarations(catalogue) {
    return catalogue.map((t) => ({
        name: t.definition.name,
        description: t.definition.description,
        parameters: t.definition.input_schema,
    }));
}
/** Coerce an arbitrary tool return value to a string for the model. */
function stringifyToolResult(result) {
    if (typeof result === 'string')
        return result;
    try {
        // JSON.stringify yields undefined for undefined / functions.
        return JSON.stringify(result) ?? String(result);
    }
    catch {
        // Cyclic structures, BigInt, etc.
        return String(result);
    }
}
/**
 * Execute one Gemini function call against the tool catalogue.
 *
 * Never throws: unknown tools and tool failures are reported as text so the
 * actor can react to them.
 *
 * @param {Array<object>} catalogue - Tool catalogue passed to the actor.
 * @param {{name: string, args: any}} fc - The model's function call.
 * @returns {Promise<object>} A Gemini `functionResponse` part.
 */
async function runToolCall(catalogue, fc) {
    // Tools are advertised by `definition.name` (see
    // toGeminiFunctionDeclarations), so accept either property on lookup.
    const tool = catalogue.find((t) => t.name === fc.name || t.definition?.name === fc.name);
    let result;
    if (!tool) {
        result = `Unknown tool: "${fc.name}"`;
    }
    else {
        try {
            result = await tool.execute(fc.args);
        }
        catch (err) {
            result = `Tool error: ${err instanceof Error ? err.message : String(err)}`;
        }
    }
    // Truncate large outputs (mirrors Co-Sight MAX_TOOL_CONTENT_LENGTH)
    const content = stringifyToolResult(result).slice(0, MAX_TOOL_CONTENT_LENGTH);
    return { functionResponse: { name: fc.name, response: { content } } };
}
/**
 * Run a single plan step with the actor model, looping over tool calls for at
 * most MAX_ACTOR_TURNS turns.
 *
 * Never throws for API failures: they are reported as a `blocked` result.
 *
 * @param {{question: string, file_path?: string}} question
 * @param {{description: string}} step - The step to execute.
 * @param {object} plan - Whole plan, rendered into the actor's system prompt.
 * @param {string} geminiKey
 * @param {string} actModel
 * @param {Array<object>} catalogue - Tools exposed to the actor.
 * @returns {Promise<{notes: string, status: 'completed'|'blocked', inputTokens: number, outputTokens: number}>}
 */
async function executeActorStep(question, step, plan, geminiKey, actModel, catalogue) {
    const systemPrompt = buildActorSystemPrompt(question.question, buildPlanSummary(plan), step.description);
    const tools = toGeminiFunctionDeclarations(catalogue);
    let inputTokens = 0;
    let outputTokens = 0;
    let kickoff = `Execute your assigned step. Use tools as needed, then output STEP_RESULT or STEP_BLOCKED.`;
    // Include attachment hint on first user message
    if (question.file_path) {
        kickoff += `\nNote: There is an attached file at "${question.file_path}" — call file_read if needed.`;
    }
    const contents = [{ role: 'user', parts: [{ text: kickoff }] }];
    for (let turn = 0; turn < MAX_ACTOR_TURNS; turn++) {
        let resp;
        try {
            resp = await callGemini(geminiKey, actModel, systemPrompt, contents, tools, 2048, ACTOR_TIMEOUT_MS);
        }
        catch (err) {
            return {
                notes: `Actor error: ${err instanceof Error ? err.message : String(err)}`,
                status: 'blocked',
                inputTokens,
                outputTokens,
            };
        }
        inputTokens += resp.usageMetadata?.promptTokenCount ?? 0;
        outputTokens += resp.usageMetadata?.candidatesTokenCount ?? 0;
        // `candidates` can be absent altogether (e.g. a blocked prompt).
        const candidate = resp.candidates?.[0];
        if (!candidate)
            break;
        const parts = candidate.content?.parts ?? [];
        const funcCalls = parts.filter((p) => p.functionCall);
        // Check for terminal signals in text
        const fullText = parts.filter((p) => p.text).map((p) => p.text).join('\n');
        const resultMatch = STEP_RESULT_RE.exec(fullText);
        if (resultMatch) {
            return { notes: resultMatch[1].trim().slice(0, MAX_STEP_NOTES_LENGTH), status: 'completed', inputTokens, outputTokens };
        }
        const blockedMatch = STEP_BLOCKED_RE.exec(fullText);
        if (blockedMatch) {
            return { notes: blockedMatch[1].trim(), status: 'blocked', inputTokens, outputTokens };
        }
        // No function calls and finish — treat text as the result
        if (funcCalls.length === 0) {
            const answerText = fullText.trim();
            if (answerText) {
                return { notes: answerText.slice(0, MAX_STEP_NOTES_LENGTH), status: 'completed', inputTokens, outputTokens };
            }
            break;
        }
        // Append model turn, then the tool responses in the same order as the
        // model's calls (Promise.all preserves input order).
        contents.push({ role: 'model', parts });
        const toolResultParts = await Promise.all(funcCalls.map((part) => runToolCall(catalogue, part.functionCall)));
        contents.push({ role: 'user', parts: toolResultParts });
    }
    return { notes: 'No result after max turns', status: 'blocked', inputTokens, outputTokens };
}
// ---------------------------------------------------------------------------
// REPLAN phase
// ---------------------------------------------------------------------------
/**
 * Ask the planner to work around blocked steps.
 *
 * @param {{question: string}} question
 * @param {object} plan - Current plan; not mutated.
 * @param {string} anthropicKey
 * @param {string} planModel
 * @returns {Promise<{earlyAnswer: string|null, updatedSteps: Array|null, inputTokens: number, outputTokens: number}>}
 *   `earlyAnswer` is set when the planner replied with FINAL_ANSWER;
 *   `updatedSteps` is set when it replied with a parseable plan; both are
 *   null when the reply was neither.
 * @throws {Error} When the planner API call fails.
 */
async function replan(question, plan, anthropicKey, planModel) {
    const planSummary = buildPlanSummary(plan);
    const prompt = [
        `QUESTION: ${question.question}`,
        '',
        `CURRENT PLAN:\n${planSummary}`,
        '',
        'Some steps are BLOCKED. Update the plan or provide the final answer if enough info is gathered.',
    ].join('\n');
    const resp = await callAnthropic(anthropicKey, planModel, buildReplannerSystemPrompt(), [{ role: 'user', content: prompt }], 1024, PLANNER_TIMEOUT_MS);
    const text = extractAnthropicText(resp);
    const usage = { inputTokens: resp.usage.input_tokens, outputTokens: resp.usage.output_tokens };
    // Check for early final answer
    const faMatch = FINAL_ANSWER_RE.exec(text);
    if (faMatch) {
        return { earlyAnswer: faMatch[1].trim(), updatedSteps: null, ...usage };
    }
    // Try to parse updated plan
    const parsed = parsePlanJson(text);
    if (!parsed) {
        return { earlyAnswer: null, updatedSteps: null, ...usage };
    }
    const rawSteps = parsed.steps.slice(0, MAX_PLAN_STEPS);
    // Only ids that survive the cap are valid dependency targets.
    const validIds = new Set(rawSteps.map((s) => s.id));
    const updatedSteps = rawSteps.map((s) => {
        // Preserve completed/blocked status for existing steps
        // NOTE(review): a blocked step re-emitted under the same id keeps its
        // blocked state and is not retried — confirm this is intended.
        const existing = plan.steps.find((es) => es.id === s.id);
        if (existing && existing.status !== 'not_started') {
            return existing;
        }
        return {
            id: s.id,
            description: s.description,
            depends_on: sanitiseDependencies(s.depends_on, s.id, validIds),
            suggested_tool: s.suggested_tool,
            status: 'not_started',
            step_notes: '',
        };
    });
    return { earlyAnswer: null, updatedSteps, ...usage };
}
// ---------------------------------------------------------------------------
// FINALIZE phase
// ---------------------------------------------------------------------------
/**
 * Ask the planner model to synthesise the final answer from the step notes.
 *
 * @param {{question: string}} question
 * @param {object} plan
 * @param {string} anthropicKey
 * @param {string} planModel
 * @returns {Promise<{finalAnswer: string|null, inputTokens: number, outputTokens: number}>}
 *   `finalAnswer` is null when the reply contains no FINAL_ANSWER line.
 * @throws {Error} When the planner API call fails.
 */
async function finalizePlan(question, plan, anthropicKey, planModel) {
    const stepNotes = plan.steps
        .filter((s) => s.step_notes)
        .map((s) => `Step ${s.id} (${s.status}): ${s.description}\nFindings: ${s.step_notes}`)
        .join('\n\n');
    const prompt = [
        `QUESTION: ${question.question}`,
        '',
        stepNotes ? `GATHERED EVIDENCE:\n${stepNotes}` : '(No step findings were gathered.)',
        '',
        'Based on the evidence above, provide the final answer.',
        'End with: FINAL_ANSWER: <your answer>',
    ].join('\n');
    const resp = await callAnthropic(anthropicKey, planModel, buildFinalizerSystemPrompt(), [{ role: 'user', content: prompt }], 1024, PLANNER_TIMEOUT_MS);
    const match = FINAL_ANSWER_RE.exec(extractAnthropicText(resp));
    return {
        finalAnswer: match ? match[1].trim() : null,
        inputTokens: resp.usage.input_tokens,
        outputTokens: resp.usage.output_tokens,
    };
}
// ---------------------------------------------------------------------------
// Pricing
// ---------------------------------------------------------------------------
/**
 * Estimate the USD cost of a run from token counts.
 *
 * The same rates are applied regardless of the model names: only one price
 * table per role is defined, and it doubles as the fallback for other models.
 *
 * @param {string} planModel - Planner model name (currently does not change the rate).
 * @param {string} actModel - Actor model name (currently does not change the rate).
 * @param {number} planIn - Planner input tokens.
 * @param {number} planOut - Planner output tokens.
 * @param {number} actIn - Actor input tokens.
 * @param {number} actOut - Actor output tokens.
 * @returns {number} Estimated cost in USD.
 */
function estimateCostUsd(planModel, actModel, planIn, planOut, actIn, actOut) {
    const planPrice = PLANNER_PRICING;
    const actPrice = ACTOR_PRICING;
    return (planIn / 1_000_000) * planPrice.inputPerM +
        (planOut / 1_000_000) * planPrice.outputPerM +
        (actIn / 1_000_000) * actPrice.inputPerM +
        (actOut / 1_000_000) * actPrice.outputPerM;
}
// ---------------------------------------------------------------------------
// Main entry point: runGaiaDAG
// ---------------------------------------------------------------------------
/**
 * Run a GAIA question through the Co-Sight DAG harness.
 *
 * Steps:
 *   1. Planner (Claude Sonnet) creates a DAG plan.
 *   2. Execute loop: parallel actors (Gemini 2.5 Pro) run ready steps.
 *   3. Blocked steps trigger replan (up to MAX_REPLAN_CYCLES).
 *   4. Finalizer (Claude Sonnet) reads all step notes → final answer.
 *
 * Planner, replanner and finalizer API failures are caught: a planning
 * failure is reported through the `error` field of the result, replan and
 * finalize failures are logged to stderr.
 *
 * @param {{task_id: string, question: string, file_path?: string}} question
 * @param {{planModel?: string, actModel?: string, anthropicApiKey?: string, geminiApiKey?: string, catalogue?: Array<object>}} [options]
 * @returns {Promise<object>} Run summary: final answer, plan, step counts,
 *   token totals, estimated cost and wall time.
 * @throws {Error} When an API key cannot be resolved.
 */
export async function runGaiaDAG(question, options = {}) {
    const wallStart = Date.now();
    const planModel = options.planModel ?? DEFAULT_PLAN_MODEL;
    const actModel = options.actModel ?? DEFAULT_ACT_MODEL;
    const anthropicKey = resolveAnthropicApiKey(options.anthropicApiKey);
    const geminiKey = resolveGeminiApiKey(options.geminiApiKey);
    const catalogue = options.catalogue ?? createDefaultToolCatalogue();
    let plannerInputTokens = 0;
    let plannerOutputTokens = 0;
    let actorInputTokens = 0;
    let actorOutputTokens = 0;
    // PHASE 1: PLAN
    process.stderr.write(`[dag] ${question.task_id} — planning with ${planModel}\n`);
    let planResult;
    try {
        planResult = await createPlan(question, anthropicKey, planModel);
    }
    catch (err) {
        return {
            questionId: question.task_id,
            finalAnswer: null,
            normalisedAnswer: '',
            plan: { title: question.task_id, question: question.question, steps: [] },
            totalSteps: 0,
            completedSteps: 0,
            blockedSteps: 0,
            plannerCycles: 0,
            totalInputTokens: 0,
            totalOutputTokens: 0,
            estimatedCostUsd: 0,
            wallMs: Date.now() - wallStart,
            error: `Plan creation failed: ${err instanceof Error ? err.message : String(err)}`,
        };
    }
    const plan = planResult.plan;
    plannerInputTokens += planResult.inputTokens;
    plannerOutputTokens += planResult.outputTokens;
    process.stderr.write(`[dag] ${question.task_id} — plan: ${plan.steps.length} steps\n` +
        plan.steps.map((s) => ` [${s.id}] ${s.description} (deps: [${s.depends_on.join(',')}])`).join('\n') + '\n');
    // PHASE 2: EXECUTE + REPLAN loop
    let plannerCycles = 1; // count initial plan creation
    let replansUsed = 0;
    let earlyAnswer = null;
    // The replan budget is tracked separately from the execute cycles so that
    // a dependency chain deeper than MAX_REPLAN_CYCLES + 1 still runs to the
    // end. Every cycle consumes at least one not_started step and each replan
    // can introduce at most MAX_PLAN_STEPS new ones, so this cap is only a
    // defensive bound.
    const maxCycles = MAX_PLAN_STEPS * (MAX_REPLAN_CYCLES + 1);
    for (let cycle = 0; cycle < maxCycles; cycle++) {
        const readySteps = getReadySteps(plan);
        if (readySteps.length === 0)
            break;
        process.stderr.write(`[dag] ${question.task_id} — cycle ${cycle}: ${readySteps.length} ready steps\n`);
        // Mark ready steps as in_progress
        for (const step of readySteps)
            step.status = 'in_progress';
        // Run all ready steps in parallel (cap at MAX_CONCURRENT_ACTORS)
        for (let offset = 0; offset < readySteps.length; offset += MAX_CONCURRENT_ACTORS) {
            const batch = readySteps.slice(offset, offset + MAX_CONCURRENT_ACTORS);
            const results = await Promise.all(batch.map((step) => executeActorStep(question, step, plan, geminiKey, actModel, catalogue)));
            batch.forEach((step, i) => {
                const res = results[i];
                step.step_notes = res.notes;
                step.status = res.status;
                actorInputTokens += res.inputTokens;
                actorOutputTokens += res.outputTokens;
                process.stderr.write(`[dag] ${question.task_id} — step ${step.id} → ${step.status}\n`);
            });
        }
        // Check for blocked steps
        const blockedNow = plan.steps.filter((s) => s.status === 'blocked');
        if (blockedNow.length === 0)
            continue;
        // Replan only while there is still work left and budget remaining
        const hasPendingSteps = plan.steps.some((s) => s.status === 'not_started');
        if (!hasPendingSteps || replansUsed >= MAX_REPLAN_CYCLES)
            break;
        process.stderr.write(`[dag] ${question.task_id} — ${blockedNow.length} blocked, replanning\n`);
        replansUsed++;
        plannerCycles++;
        try {
            const replanResult = await replan(question, plan, anthropicKey, planModel);
            plannerInputTokens += replanResult.inputTokens;
            plannerOutputTokens += replanResult.outputTokens;
            if (replanResult.earlyAnswer) {
                earlyAnswer = replanResult.earlyAnswer;
                break;
            }
            if (replanResult.updatedSteps) {
                plan.steps = replanResult.updatedSteps;
            }
        }
        catch (err) {
            process.stderr.write(`[dag] replan error: ${err instanceof Error ? err.message : String(err)}\n`);
            break;
        }
    }
    // PHASE 3: FINALIZE
    let finalAnswer = earlyAnswer;
    if (!finalAnswer) {
        process.stderr.write(`[dag] ${question.task_id} — finalizing\n`);
        try {
            const fin = await finalizePlan(question, plan, anthropicKey, planModel);
            plannerInputTokens += fin.inputTokens;
            plannerOutputTokens += fin.outputTokens;
            finalAnswer = fin.finalAnswer;
        }
        catch (err) {
            process.stderr.write(`[dag] finalize error: ${err instanceof Error ? err.message : String(err)}\n`);
        }
    }
    const completedSteps = plan.steps.filter((s) => s.status === 'completed').length;
    const blockedSteps = plan.steps.filter((s) => s.status === 'blocked').length;
    return {
        questionId: question.task_id,
        finalAnswer,
        normalisedAnswer: normaliseAnswer(finalAnswer),
        plan,
        totalSteps: plan.steps.length,
        completedSteps,
        blockedSteps,
        plannerCycles,
        totalInputTokens: plannerInputTokens + actorInputTokens,
        totalOutputTokens: plannerOutputTokens + actorOutputTokens,
        estimatedCostUsd: estimateCostUsd(planModel, actModel, plannerInputTokens, plannerOutputTokens, actorInputTokens, actorOutputTokens),
        wallMs: Date.now() - wallStart,
    };
}
/**
 * Run a list of GAIA questions sequentially through runGaiaDAG and aggregate
 * accuracy, cost and timing.
 *
 * An answer counts as correct when its normalised form is non-empty and equals
 * the normalised expected answer (`q.final_answer`).
 *
 * @param {Array<{task_id: string, question: string, final_answer?: string}>} questions
 * @param {object} [options] - Forwarded unchanged to runGaiaDAG.
 * @returns {Promise<object>} Aggregate stats plus a per-question breakdown.
 */
export async function runDagPilot(questions, options = {}) {
    const perQuestion = [];
    let correct = 0;
    for (const q of questions) {
        const result = await runGaiaDAG(q, options);
        const isCorrect = result.normalisedAnswer !== '' &&
            result.normalisedAnswer === normaliseAnswer(q.final_answer);
        if (isCorrect)
            correct++;
        perQuestion.push({
            taskId: q.task_id,
            question: q.question.slice(0, 80),
            expected: q.final_answer ?? '',
            got: result.finalAnswer,
            correct: isCorrect,
            steps: result.totalSteps,
            completedSteps: result.completedSteps,
            blockedSteps: result.blockedSteps,
            plannerCycles: result.plannerCycles,
            costUsd: result.estimatedCostUsd,
            wallMs: result.wallMs,
        });
        process.stderr.write(`[dag-pilot] ${q.task_id} → ${result.finalAnswer ?? 'null'} ` +
            `(${isCorrect ? 'CORRECT' : 'WRONG'}, steps=${result.totalSteps}, ` +
            `completed=${result.completedSteps}, $${result.estimatedCostUsd.toFixed(4)})\n`);
    }
    // Guard the divisor so an empty question list yields zeros, not NaN.
    const divisor = Math.max(perQuestion.length, 1);
    const totalCostUsd = perQuestion.reduce((sum, r) => sum + r.costUsd, 0);
    const avgCostPerQ = totalCostUsd / divisor;
    const meanWallMs = perQuestion.reduce((sum, r) => sum + r.wallMs, 0) / divisor;
    const avgSteps = perQuestion.reduce((sum, r) => sum + r.steps, 0) / divisor;
    return {
        correct,
        total: perQuestion.length,
        accuracy: correct / divisor,
        avgStepsPerQuestion: avgSteps,
        perQuestion,
        totalCostUsd,
        projectedCost53Q: avgCostPerQ * 53,
        meanWallMs,
    };
}
//# sourceMappingURL=gaia-dag.js.map