UNPKG

claude-flow

Version:

Ruflo - Enterprise AI agent orchestration for Claude Code. Deploy 60+ specialized agents in coordinated swarms with self-learning, fault-tolerant consensus, vector memory, and MCP integration

349 lines (348 loc) 15.5 kB
/**
 * V3 CLI Performance Capability Benchmark
 *
 * Runs a small verifiable-answer corpus through the Anthropic API and reports
 * pass-rate, latency, and cost. Closes the capability-evaluation gap that
 * `performance benchmark --suite agent` does NOT cover — that suite measures
 * the agent control plane (router, memory, hooks) without LLM calls; this
 * subcommand measures the actual model's ability to solve agent-style tasks.
 *
 * Features:
 *   - Parallel execution with configurable concurrency
 *   - Multi-model comparison in a single run (`--models a,b,c`)
 *   - Per-task max-tokens overrides (declared in the fixture)
 *   - Configurable corpus via `--questions <path>`
 *
 * Inspired by GAIA / SWE-bench / GSM8K but text-only and scoreable via
 * substring / exact match — no web browsing, no file attachments, no
 * Hugging Face dataset download.
 *
 * API key resolution (in order):
 *   1. $ANTHROPIC_API_KEY env var
 *   2. `gcloud secrets versions access latest --secret=ANTHROPIC_API_KEY`
 *   3. Fail with a clear error
 *
 * Refs: #2156 (Dream Cycle 2026-05-27 capabilities scan)
 */
import { execSync } from 'node:child_process';
import * as fs from 'node:fs';
import * as path from 'node:path';
import { output } from '../output.js';
import { BUILTIN_CAPABILITY_TASKS } from '../benchmarks/capability-tasks.js';

// Anthropic pricing (per 1M tokens, USD)
const PRICING = {
    'claude-haiku-4-5': { in: 1.0, out: 5.0 },
    'claude-haiku-4-5-20251001': { in: 1.0, out: 5.0 },
    'claude-sonnet-4-6': { in: 3.0, out: 15.0 },
    'claude-opus-4-7': { in: 15.0, out: 75.0 },
};

// Price used by summarizeModel when a model id is missing from PRICING.
const FALLBACK_PRICE = { in: 3.0, out: 15.0 };

const DEFAULT_MAX_TOKENS = 256;

/**
 * Parse an integer CLI flag, falling back when it is absent or not a number.
 *
 * @param {unknown} raw - Flag value as delivered by the CLI (string, number, or nullish).
 * @param {number} fallback - Value used when `raw` is nullish or does not parse.
 * @param {number} min - Lower bound applied to the result.
 * @returns {number} An integer that is never NaN and never below `min`.
 */
function parseIntFlag(raw, fallback, min) {
    const parsed = Number.parseInt(String(raw ?? fallback), 10);
    return Math.max(min, Number.isNaN(parsed) ? fallback : parsed);
}

/**
 * Resolve the Anthropic API key from the environment, then from gcloud secrets.
 *
 * @returns {string} The trimmed API key.
 * @throws {Error} When neither source yields a non-empty key.
 */
function resolveApiKey() {
    const envKey = process.env.ANTHROPIC_API_KEY?.trim();
    if (envKey) return envKey;
    try {
        // stderr is discarded through `stdio` rather than a `2>/dev/null` shell
        // redirect, which only works in POSIX shells.
        const secret = execSync('gcloud secrets versions access latest --secret=ANTHROPIC_API_KEY', {
            encoding: 'utf-8',
            timeout: 10_000,
            stdio: ['pipe', 'pipe', 'ignore'],
        }).trim();
        if (secret) return secret;
    } catch {
        // Deliberate best-effort: gcloud missing, unauthenticated, or timed out.
        // Fall through to the explicit error below.
    }
    throw new Error('ANTHROPIC_API_KEY not found. Set the env var or store it as a gcloud secret named ANTHROPIC_API_KEY (e.g. `echo -n "$KEY" | gcloud secrets versions add ANTHROPIC_API_KEY --data-file=-`).');
}

/**
 * Load the task corpus: a custom JSON file when a path is given, otherwise
 * the built-in fixture.
 *
 * @param {string | undefined} custom - Path to a custom tasks JSON file.
 * @returns {{ tasks: object[] }} The corpus; callers read its `tasks` array.
 * @throws {Error} When the file is missing, is not valid JSON, or has no `tasks` array.
 */
function loadTaskFile(custom) {
    if (!custom) return BUILTIN_CAPABILITY_TASKS;
    const resolved = path.resolve(custom);
    if (!fs.existsSync(resolved)) throw new Error(`questions file not found: ${resolved}`);
    const raw = fs.readFileSync(resolved, 'utf-8');
    let parsed;
    try {
        parsed = JSON.parse(raw);
    } catch (err) {
        throw new Error(`questions file is not valid JSON: ${resolved} (${err.message})`, { cause: err });
    }
    // Validate here so a malformed file is reported cleanly instead of
    // crashing later when the caller slices `tasks`.
    if (!Array.isArray(parsed?.tasks)) {
        throw new Error(`questions file must be a JSON object with a "tasks" array: ${resolved}`);
    }
    return parsed;
}

/**
 * Build the user prompt sent to the model for one task.
 *
 * @param {{ prompt: string }} task - Task whose `prompt` is appended as the question.
 * @returns {string} The full prompt text.
 */
function buildPrompt(task) {
    return `You are answering an agent-capability benchmark question. Show only the key reasoning steps (one or two lines), then wrap your final answer in <answer>...</answer> tags. Be exact — the harness compares the tag contents to a ground-truth string. Question: ${task.prompt}`;
}

/**
 * Pull the final answer out of a model response.
 *
 * Prefers the contents of the first `<answer>...</answer>` pair. Otherwise
 * falls back to the last non-empty line with list/quote/heading prefixes,
 * bold markers, and sentence-ending punctuation removed.
 *
 * @param {string} text - Raw model output.
 * @returns {string} The extracted answer, trimmed (may be empty).
 */
function extractAnswer(text) {
    const tagged = text.match(/<answer>([\s\S]*?)<\/answer>/i);
    if (tagged && tagged[1] !== undefined) return tagged[1].trim();
    // Fallback: models sometimes give the bare answer on the final line
    // without the <answer> tags, often prefixed with "- " or "* " from a list.
    const lines = text.split(/\r?\n/).map((line) => line.trim()).filter(Boolean);
    const last = lines[lines.length - 1] || '';
    return last
        .replace(/^[-*>#\s]+/, '') // leading bullet / quote / heading / bold opener
        .replace(/[.,!?]+$/, '') // punctuation outside a closing bold marker ("**42**.")
        .replace(/\*\*$/, '') // closing bold marker
        .replace(/[.,!?]+$/, '') // punctuation inside the bold marker ("**42.**")
        .trim();
}

/**
 * Score an extracted answer against a task's expected value.
 *
 * @param {string} answer - Answer produced by `extractAnswer`.
 * @param {{ expected: unknown, matchMode: string }} task - `matchMode` is one of
 *   `exact`, `substring`, or `regex`; any other value scores as a failure.
 * @returns {boolean} True when the answer matches under the task's match mode.
 */
function check(answer, task) {
    // Coerce so a numeric `expected` in a custom JSON file does not throw.
    const expectedRaw = String(task.expected ?? '');
    const got = answer.trim().toLowerCase();
    const want = expectedRaw.trim().toLowerCase();
    switch (task.matchMode) {
        case 'exact':
            return got === want;
        case 'substring':
            return got.includes(want);
        case 'regex':
            try {
                return new RegExp(expectedRaw, 'i').test(answer);
            } catch {
                // An invalid pattern in the fixture counts as a failed task.
                return false;
            }
        default:
            return false;
    }
}

/**
 * Send one prompt to the Anthropic Messages API.
 *
 * @param {string} apiKey - Value for the `x-api-key` header.
 * @param {string} model - Model id.
 * @param {string} prompt - User message content.
 * @param {number} maxTokens - `max_tokens` cap for the response.
 * @param {number} timeoutMs - Abort the request after this many milliseconds.
 * @returns {Promise<{ text: string, inputTokens: number, outputTokens: number }>}
 * @throws {Error} On a non-2xx response (`HTTP <status>: <body prefix>`), on
 *   timeout, or on any network error raised by `fetch`.
 */
async function callAnthropic(apiKey, model, prompt, maxTokens, timeoutMs) {
    const ac = new AbortController();
    const timer = setTimeout(() => ac.abort(), timeoutMs);
    try {
        const resp = await fetch('https://api.anthropic.com/v1/messages', {
            method: 'POST',
            headers: {
                'x-api-key': apiKey,
                'anthropic-version': '2023-06-01',
                'Content-Type': 'application/json',
            },
            signal: ac.signal,
            body: JSON.stringify({
                model,
                max_tokens: maxTokens,
                messages: [{ role: 'user', content: prompt }],
            }),
        });
        if (!resp.ok) {
            const errBody = await resp.text().catch(() => '');
            throw new Error(`HTTP ${resp.status}: ${errBody.slice(0, 200)}`);
        }
        const body = await resp.json();
        // Join every block that carries text instead of reading only the first
        // block, so text placed after another block is not dropped.
        const blocks = Array.isArray(body.content) ? body.content : [];
        const text = blocks
            .filter((block) => typeof block?.text === 'string')
            .map((block) => block.text)
            .join('');
        return {
            text,
            inputTokens: body.usage?.input_tokens ?? 0,
            outputTokens: body.usage?.output_tokens ?? 0,
        };
    } catch (err) {
        // Report our own timeout explicitly rather than a generic abort message.
        if (err?.name === 'AbortError') {
            throw new Error(`request timed out after ${timeoutMs}ms`, { cause: err });
        }
        throw err;
    } finally {
        clearTimeout(timer);
    }
}

/**
 * Concurrency-limited parallel mapper. Avoids a p-limit dep; rate-limits via
 * a sliding window of in-flight promises. Anthropic Haiku tier-1 has 50 RPM
 * + 50K TPM headroom — concurrency 4 keeps us well under both.
 *
 * @param {Array} items - Work items.
 * @param {number} concurrency - Maximum number of `fn` calls in flight.
 * @param {(item: any, index: number) => Promise<any>} fn - Async mapper.
 * @returns {Promise<Array>} Results in the same order as `items`.
 */
async function parallelMap(items, concurrency, fn) {
    const results = new Array(items.length);
    // Clamp to at least one worker: a NaN or non-positive concurrency would
    // otherwise spawn none and leave every result slot empty.
    const requested = Number.isFinite(concurrency) ? Math.floor(concurrency) : 1;
    const workerCount = Math.max(1, Math.min(requested, items.length));
    let cursor = 0;
    async function worker() {
        while (true) {
            const i = cursor++;
            if (i >= items.length) return;
            results[i] = await fn(items[i], i);
        }
    }
    await Promise.all(Array.from({ length: workerCount }, () => worker()));
    return results;
}

/**
 * Run a single task against a single model and score it.
 *
 * Never rejects: any failure is captured in the returned record's `error`
 * field with `correct: false`.
 *
 * @param {object} task - Task with `id`, `category`, `prompt`, `expected`,
 *   `matchMode`, and optional `maxTokens`.
 * @param {string} model - Model id.
 * @param {string} apiKey - Anthropic API key.
 * @param {number} defaultMaxTokens - Cap used when the task declares no `maxTokens`.
 * @param {number} timeoutMs - Per-request timeout in milliseconds.
 * @returns {Promise<object>} Result record for this (task, model) pair.
 */
async function runOne(task, model, apiKey, defaultMaxTokens, timeoutMs) {
    const maxTokens = task.maxTokens ?? defaultMaxTokens;
    const start = performance.now();
    try {
        const { text, inputTokens, outputTokens } = await callAnthropic(apiKey, model, buildPrompt(task), maxTokens, timeoutMs);
        const answer = extractAnswer(text);
        return {
            id: task.id,
            category: task.category,
            model,
            correct: check(answer, task),
            answer,
            expected: task.expected,
            latencyMs: performance.now() - start,
            inputTokens,
            outputTokens,
        };
    } catch (err) {
        // Coerce defensively: a thrown non-Error has no `message`, and a
        // throw inside this handler would reject the whole run.
        const message = String(err?.message ?? err);
        return {
            id: task.id,
            category: task.category,
            model,
            correct: false,
            answer: '',
            expected: task.expected,
            latencyMs: performance.now() - start,
            inputTokens: 0,
            outputTokens: 0,
            error: message.slice(0, 120),
        };
    }
}

/**
 * Aggregate the result records of one model.
 *
 * @param {object[]} results - Records from `runOne`, all for the same model.
 * @returns {{ model: string, passed: number, total: number, passRate: number,
 *   meanLatencyMs: number, totalInputTokens: number, totalOutputTokens: number,
 *   estCostUsd: number }} Summary; rates and means are 0 for empty input.
 */
function summarizeModel(results) {
    const total = results.length;
    const model = results[0]?.model ?? '';
    const passed = results.filter((r) => r.correct).length;
    const latencySum = results.reduce((sum, r) => sum + r.latencyMs, 0);
    const totalInputTokens = results.reduce((sum, r) => sum + r.inputTokens, 0);
    const totalOutputTokens = results.reduce((sum, r) => sum + r.outputTokens, 0);
    const price = PRICING[model] ?? FALLBACK_PRICE;
    const estCostUsd = (totalInputTokens / 1_000_000) * price.in + (totalOutputTokens / 1_000_000) * price.out;
    return {
        model,
        passed,
        total,
        // Guard the divisions so an empty result set yields 0, not NaN.
        passRate: total > 0 ? passed / total : 0,
        meanLatencyMs: total > 0 ? latencySum / total : 0,
        totalInputTokens,
        totalOutputTokens,
        estCostUsd,
    };
}

/**
 * `performance capability` subcommand definition.
 *
 * The action returns `success: true` only when every model passes at least
 * half of the questions.
 */
const capabilityCommand = {
    name: 'capability',
    description: 'Run a real LLM-driven agent-capability benchmark against the Anthropic API',
    options: [
        { name: 'model', short: 'm', type: 'string', description: 'Single model id (default: claude-haiku-4-5). Overridden by --models.', default: 'claude-haiku-4-5' },
        { name: 'models', short: 'M', type: 'string', description: 'Comma-separated list of models for cross-model comparison (e.g. claude-haiku-4-5,claude-sonnet-4-6)' },
        { name: 'questions', short: 'q', type: 'string', description: 'Path to a custom tasks JSON file (default: built-in fixture)' },
        { name: 'concurrency', short: 'c', type: 'number', description: 'Parallel in-flight requests', default: '4' },
        { name: 'max-tokens', type: 'number', description: 'Default max_tokens cap (per-task overrides in fixture take precedence)', default: String(DEFAULT_MAX_TOKENS) },
        { name: 'timeout', short: 't', type: 'number', description: 'Per-question timeout (ms)', default: '30000' },
        { name: 'limit', short: 'l', type: 'number', description: 'Run only the first N questions' },
        { name: 'output', short: 'o', type: 'string', description: 'Output format: text, json', default: 'text' },
    ],
    examples: [
        { command: 'claude-flow performance capability', description: 'Run the built-in fixture against Haiku (parallel, default)' },
        { command: 'claude-flow performance capability -M claude-haiku-4-5,claude-sonnet-4-6', description: 'Compare Haiku vs Sonnet on every question' },
        { command: 'claude-flow performance capability -c 8 -o json', description: 'Higher concurrency, emit JSON' },
        { command: 'claude-flow performance capability -q ./my-eval.json -l 3', description: 'Custom dataset, first 3 only' },
    ],
    action: async (ctx) => {
        const modelsFlag = ctx.flags.models;
        const singleModel = ctx.flags.model || 'claude-haiku-4-5';
        const listedModels = modelsFlag
            ? String(modelsFlag).split(',').map((m) => m.trim()).filter(Boolean)
            : [];
        // A `--models` value with no usable ids (e.g. ",") falls back to the
        // single model instead of silently running zero requests.
        const models = listedModels.length > 0 ? listedModels : [singleModel];
        const customPath = ctx.flags.questions;
        const outputFormat = ctx.flags.output || 'text';
        // Non-numeric flag values fall back to the defaults rather than
        // propagating NaN into timers and worker counts.
        const timeoutMs = parseIntFlag(ctx.flags.timeout, 30000, 1);
        const concurrency = parseIntFlag(ctx.flags.concurrency, 4, 1);
        const defaultMaxTokens = parseIntFlag(ctx.flags['max-tokens'], DEFAULT_MAX_TOKENS, 32);
        const parsedLimit = Number.parseInt(String(ctx.flags.limit ?? ''), 10);
        // Absent, non-numeric, or non-positive limits mean "run everything".
        const limit = Number.isNaN(parsedLimit) || parsedLimit <= 0 ? undefined : parsedLimit;

        output.writeln();
        output.writeln(output.bold('Agent Capability Benchmark (Anthropic API)'));
        output.writeln(output.dim('─'.repeat(60)));

        let apiKey;
        try {
            apiKey = resolveApiKey();
        } catch (err) {
            output.writeln(output.error(err.message));
            return { success: false, message: err.message, exitCode: 1 };
        }

        let file;
        try {
            file = loadTaskFile(customPath);
        } catch (err) {
            output.writeln(output.error(err.message));
            return { success: false, message: err.message, exitCode: 1 };
        }

        const tasks = limit ? file.tasks.slice(0, limit) : file.tasks;
        output.writeln(`Models: ${models.join(', ')}`);
        output.writeln(`Questions: ${tasks.length}${customPath ? ` (custom: ${customPath})` : ' (built-in fixture)'}`);
        output.writeln(`Concurrency: ${concurrency}`);
        output.writeln(`Default cap: ${defaultMaxTokens} tokens (per-task override allowed)`);
        output.writeln();

        const startWall = performance.now();
        const spinner = output.createSpinner({ text: `Running ${models.length * tasks.length} requests...`, spinner: 'dots' });
        spinner.start();

        // Build flat list of (task, model) pairs, then parallel-execute with concurrency limiter.
        const work = [];
        for (const model of models) {
            for (const task of tasks) work.push({ task, model });
        }
        const results = await parallelMap(work, concurrency, async ({ task, model }) => {
            return runOne(task, model, apiKey, defaultMaxTokens, timeoutMs);
        });
        const wallMs = performance.now() - startWall;
        spinner.succeed(`Completed ${results.length} requests in ${(wallMs / 1000).toFixed(2)}s`);

        // Group by model for per-model summary
        const byModel = new Map();
        for (const r of results) {
            const arr = byModel.get(r.model) ?? [];
            arr.push(r);
            byModel.set(r.model, arr);
        }
        const summaries = [...byModel.values()].map((arr) => summarizeModel(arr));
        const overallPass = summaries.every((s) => s.passRate >= 0.5);

        if (outputFormat === 'json') {
            output.printJson({
                models,
                questions: tasks.length,
                concurrency,
                wallMs,
                summaries,
                results,
            });
            return { success: overallPass, data: { summaries, results } };
        }

        // Per-model detail tables
        for (const [model, arr] of byModel) {
            output.writeln();
            output.writeln(output.bold(`${model}`));
            output.printTable({
                columns: [
                    { key: 'id', header: 'Question', width: 22 },
                    { key: 'category', header: 'Category', width: 24 },
                    { key: 'correct', header: 'Pass', width: 6 },
                    { key: 'latency', header: 'Latency', width: 10 },
                    { key: 'answer', header: 'Answer (got vs expected)', width: 36 },
                ],
                data: arr.map((r) => ({
                    id: r.id,
                    category: r.category,
                    correct: r.correct ? output.success('✓') : output.error('✗'),
                    latency: `${r.latencyMs.toFixed(0)}ms`,
                    answer: r.error
                        ? output.dim(`error: ${r.error}`)
                        : r.correct
                            ? r.answer.slice(0, 34)
                            // Separate got from expected; run together the
                            // two values are unreadable in the table.
                            : `${r.answer.slice(0, 14)} vs ${String(r.expected ?? '').slice(0, 14)}`,
                })),
            });
        }

        // Cross-model summary table
        output.writeln();
        output.writeln(output.bold('Summary'));
        output.printTable({
            columns: [
                { key: 'model', header: 'Model', width: 26 },
                { key: 'pass', header: 'Pass', width: 14 },
                { key: 'mean', header: 'Mean Lat', width: 12 },
                { key: 'tokens', header: 'Tokens (in/out)', width: 18 },
                { key: 'cost', header: 'Est. Cost', width: 12 },
            ],
            data: summaries.map((s) => ({
                model: s.model,
                pass: `${(s.passRate * 100).toFixed(1)}% (${s.passed}/${s.total})`,
                mean: `${s.meanLatencyMs.toFixed(0)}ms`,
                tokens: `${s.totalInputTokens} / ${s.totalOutputTokens}`,
                cost: `$${s.estCostUsd.toFixed(4)}`,
            })),
        });
        output.writeln();
        output.writeln(output.dim(`Wall time: ${(wallMs / 1000).toFixed(2)}s (concurrency=${concurrency})`));
        return { success: overallPass, data: { summaries, wallMs, results } };
    },
};

export default capabilityCommand;
//# sourceMappingURL=performance-capability.js.map