claude-flow
Version:
Ruflo - Enterprise AI agent orchestration for Claude Code. Deploy 60+ specialized agents in coordinated swarms with self-learning, fault-tolerant consensus, vector memory, and MCP integration
158 lines • 7.81 kB
JavaScript
/**
* GAIA DAG 5-Question Pilot Runner — iter 64
*
* Runs 5 specific GAIA L1 questions through the Co-Sight DAG harness
* and compares results against single-Sonnet baseline (from iter 63b).
*
* Pilot question mix (selected from iter 63b failures):
* 1. 5d0080cb — calculation (fish bag volume from academic paper, multi-hop)
* 2. cffe0e32 — reasoning puzzle (Secret Santa assignment chain)
* 3. ec09fa32 — riddle (game show, requires careful reasoning)
* 4. 46719c30 — retrieval (paper authors -> their other publications)
* 5. b816bfce — retrieval+reasoning (journal name from Norse mythology)
*
* Cost cap: $2.00
*
* Usage:
* node dist/src/benchmarks/gaia-dag-pilot.js
*
* Refs: ADR-139, iter 64, #2156
*/
import { loadGaia } from './gaia-loader.js';
import { runDagPilot } from './gaia-dag.js';
// ---------------------------------------------------------------------------
// Pilot configuration
// ---------------------------------------------------------------------------

/** GAIA task IDs of the 5 pilot questions, all taken from iter 63b failures. */
const PILOT_TASK_IDS = [
    // calculation: fish bag volume
    '5d0080cb-90d7-4712-bc33-848150e917d3',
    // reasoning: Secret Santa
    'cffe0e32-c9a6-4c52-9877-78ceb4aaa9fb',
    // riddle: game show bees
    'ec09fa32-d03f-4bf8-84b0-1f16922c3ae4',
    // retrieval: paper authors + their work
    '46719c30-f4c3-4cad-be07-d5cb21eee6bb',
    // retrieval: journal from Norse mythology
    'b816bfce-3d80-4913-a07d-69b752ce6377',
];

/**
 * Single-Sonnet baseline from iter 63b, keyed by task ID.
 * Every pilot question is recorded as incorrect in that run.
 */
const BASELINE = Object.fromEntries(
    PILOT_TASK_IDS.map((taskId) => [taskId, { correct: false, answer: null }]),
);
// The only entry with a recorded answer: close but wrong.
BASELINE['b816bfce-3d80-4913-a07d-69b752ce6377'].answer = 'cuddly';

/** Spend threshold (USD) the pilot run is compared against. */
const COST_CAP_USD = 2.00;
/**
 * Runs the iter-64 DAG pilot end to end.
 *
 * Steps: load the GAIA L1 dataset, keep only the questions listed in
 * PILOT_TASK_IDS, run them through runDagPilot, print a per-question
 * comparison against the iter-63b BASELINE table, print a gate verdict,
 * and write a JSON summary to disk.
 *
 * Environment:
 * - PLAN_MODEL / ACT_MODEL: model names echoed in the banner and the JSON
 *   output (defaults: 'claude-sonnet-4-6' / 'gemini-2.5-pro').
 * - GAIA_DAG_PILOT_OUT: optional override for the JSON output path; when
 *   unset the original hard-coded path is used.
 *
 * Exits the process with code 1 when the dataset cannot be loaded or none of
 * the pilot questions are present in it.
 *
 * @returns {Promise<void>}
 */
async function main() {
    // Read the model names once so the banner and the JSON output always agree.
    const planModel = process.env['PLAN_MODEL'] ?? 'claude-sonnet-4-6';
    const actModel = process.env['ACT_MODEL'] ?? 'gemini-2.5-pro';
    console.log('\n=== GAIA DAG 5-Question Pilot (iter 64) ===\n');
    console.log(`Questions: ${PILOT_TASK_IDS.length} (from iter 63b failures)`);
    console.log(`Planner: ${planModel}`);
    console.log(`Actor: ${actModel}`);
    console.log(`Cost cap: $${COST_CAP_USD}\n`);
    // Load full GAIA L1 dataset and filter to pilot questions
    console.log('Loading GAIA L1 dataset...');
    let allQuestions;
    try {
        allQuestions = await loadGaia({ level: 1 });
    }
    catch (err) {
        console.error('Failed to load GAIA dataset:', err instanceof Error ? err.message : String(err));
        process.exit(1);
    }
    // Keep the pilot order; a missing ID only warns, so fewer than
    // PILOT_TASK_IDS.length questions may actually be run.
    const pilotQuestions = [];
    for (const id of PILOT_TASK_IDS) {
        const match = allQuestions.find((q) => q.task_id === id);
        if (match) {
            pilotQuestions.push(match);
        }
        else {
            console.warn(`WARNING: Question ${id} not found in dataset`);
        }
    }
    if (pilotQuestions.length === 0) {
        console.error('No pilot questions found. Check HF token and dataset availability.');
        process.exit(1);
    }
    console.log(`Loaded ${pilotQuestions.length} pilot questions\n`);
    for (const q of pilotQuestions) {
        console.log(` ${q.task_id.slice(0, 8)}: ${q.question.slice(0, 70)}...`);
        console.log(` expected: "${q.final_answer}"`);
        if (q.file_path) {
            console.log(` file: ${q.file_path}`);
        }
    }
    console.log('');
    // Run DAG pilot
    const result = await runDagPilot(pilotQuestions, {});
    // The cap is only checked after the run has finished, so this is a
    // report of overspend rather than a limit enforced here.
    if (result.totalCostUsd > COST_CAP_USD) {
        console.warn(`\nWARNING: Cost $${result.totalCostUsd.toFixed(4)} exceeded cap $${COST_CAP_USD}\n`);
    }
    // Report results
    console.log('\n=== DAG Pilot Results ===\n');
    console.log(`Score: ${result.correct}/${result.total} (${(result.accuracy * 100).toFixed(1)}%)`);
    console.log(`Avg steps/question: ${result.avgStepsPerQuestion.toFixed(1)}`);
    console.log(`Total cost: $${result.totalCostUsd.toFixed(4)}`);
    console.log(`Projected 53Q cost: $${result.projectedCost53Q.toFixed(2)}`);
    console.log(`Mean wall time: ${(result.meanWallMs / 1000).toFixed(1)}s\n`);
    console.log('Per-question breakdown:');
    // Baseline figures come from the BASELINE table rather than literal 5s,
    // so they stay correct if the table is edited.
    const baselineTotal = Object.keys(BASELINE).length;
    const baselineCorrect = Object.values(BASELINE).filter((b) => b.correct).length;
    const baselinePct = baselineTotal > 0 ? (baselineCorrect / baselineTotal) * 100 : 0;
    let dagBetter = 0;
    let dagWorse = 0;
    for (const q of result.perQuestion) {
        // A task ID without a baseline entry is treated as a baseline FAIL.
        const baselinePassed = Boolean(BASELINE[q.taskId]?.correct);
        const recovered = !baselinePassed && q.correct;
        const regressed = baselinePassed && !q.correct;
        const baselineResult = baselinePassed ? 'PASS' : 'FAIL';
        const dagResult = q.correct ? 'PASS' : 'FAIL';
        let change = '';
        if (recovered) {
            change = ' (+RECOVERED)';
            dagBetter++;
        }
        else if (regressed) {
            change = ' (-REGRESSED)';
            dagWorse++;
        }
        console.log(` ${dagResult} [was ${baselineResult}]${change} ${q.taskId.slice(0, 8)}: got="${q.got?.slice(0, 40) ?? 'null'}" expected="${q.expected.slice(0, 40)}"`);
        console.log(` steps=${q.steps} (completed=${q.completedSteps}, blocked=${q.blockedSteps}) plannerCycles=${q.plannerCycles} cost=$${q.costUsd.toFixed(4)} wall=${(q.wallMs / 1000).toFixed(1)}s`);
    }
    console.log(`\nComparison vs single-Sonnet (iter 63b):`);
    console.log(` Baseline: ${baselineCorrect}/${baselineTotal} (${baselinePct.toFixed(0)}%)`);
    // Use the number of questions actually run as the DAG denominator.
    console.log(` DAG: ${result.correct}/${result.total} (${(result.accuracy * 100).toFixed(1)}%)`);
    console.log(` Recovered: ${dagBetter} questions (single-Sonnet failed, DAG solved)`);
    console.log(` Regressed: ${dagWorse} questions (single-Sonnet passed, DAG failed)`);
    console.log('\n=== Gate Verdict ===');
    const multiStepExecuted = result.perQuestion.some((q) => q.steps > 1);
    const avgSteps = result.avgStepsPerQuestion;
    if (result.correct >= 4 && multiStepExecuted) {
        console.log(`PROCEED to iter 65 full 53Q run.`);
        console.log(` Reason: DAG ${result.correct}/${result.total}, avg ${avgSteps.toFixed(1)} steps/Q, multi-step plans executing.`);
    }
    else if (result.correct >= 3 && avgSteps >= 2) {
        console.log(`PROCEED with CAUTION to iter 65 — diagnose blocking first.`);
        console.log(` Reason: DAG ${result.correct}/${result.total}, avg ${avgSteps.toFixed(1)} steps/Q.`);
    }
    else {
        console.log(`DIAGNOSE before iter 65.`);
        console.log(` Reason: DAG ${result.correct}/${result.total}, avg ${avgSteps.toFixed(1)} steps/Q.`);
        if (!multiStepExecuted) {
            console.log(` Issue: Plans collapsed to single step — check planner prompt.`);
        }
    }
    // Write JSON result
    const jsonOut = {
        run_type: 'dag-5q-pilot',
        iter: 64,
        timestamp: new Date().toISOString(),
        planModel,
        actModel,
        summary: {
            total: result.total,
            correct: result.correct,
            accuracy: result.accuracy,
            avgStepsPerQuestion: result.avgStepsPerQuestion,
            totalCostUsd: result.totalCostUsd,
            projectedCost53Q: result.projectedCost53Q,
            meanWallMs: result.meanWallMs,
        },
        baseline: { correct: baselineCorrect, total: baselineTotal, source: 'iter-63b' },
        perQuestion: result.perQuestion,
    };
    // The default is an absolute path on one developer machine; the env
    // override lets the pilot run elsewhere without editing this file.
    const outPath = process.env['GAIA_DAG_PILOT_OUT']
        ?? `/Users/cohen/Projects/ruflo/docs/benchmarks/runs/gaia-l1-iter64-dag-5q-pilot.json`;
    const fs = await import('node:fs');
    const path = await import('node:path');
    const serialized = JSON.stringify(jsonOut, null, 2);
    try {
        // Create the parent directory so a fresh checkout does not fail with ENOENT.
        fs.mkdirSync(path.dirname(outPath), { recursive: true });
        fs.writeFileSync(outPath, serialized);
    }
    catch (err) {
        // The run has already been paid for: dump the results before failing
        // so they are not lost, then let the caller handle the error as before.
        console.error(`Failed to write results to ${outPath}; dumping JSON to stdout instead.`);
        console.log(serialized);
        throw err;
    }
    console.log(`\nResults written to: ${outPath}\n`);
}
// Script entry point: any rejection escaping main() is logged and turned
// into a non-zero exit code.
main().catch((failure) => {
    console.error('Pilot failed:', failure);
    process.exit(1);
});
//# sourceMappingURL=gaia-dag-pilot.js.map