claude-flow
Version:
Ruflo - Enterprise AI agent orchestration for Claude Code. Deploy 60+ specialized agents in coordinated swarms with self-learning, fault-tolerant consensus, vector memory, and MCP integration
168 lines • 7.37 kB
JavaScript
/**
* Smoke tests for gaia-claude-p.ts — iter 54 (#2156)
*
* Three minimal cases to verify the claude -p wrapper end-to-end:
* 1. Simple arithmetic: "what is 2+2" → expect "4"
* 2. Current fact: population of Tokyo → expect a large number
* 3. Answer format: capital of France → expect an answer containing "Paris"
*    (an attachment-path case is not implemented in this file)
*
* Cost cap: ~$0.25 × 3 = $0.75 worst case at Sonnet rates, but using Haiku
* for smoke to keep it under $0.30 total.
*
* Run:
* npx ts-node src/benchmarks/gaia-claude-p.smoke.ts
* # or after build:
* node dist/src/benchmarks/gaia-claude-p.smoke.js
*
* Refs: iter 54, #2156
*/
import { runGaiaQuestionViaClaudeP, extractFinalAnswer, buildClaudePPrompt, CLAUDE_P_DEFAULT_MODEL, } from './gaia-claude-p.js';
// ---------------------------------------------------------------------------
// Test helpers
// ---------------------------------------------------------------------------
// Number of checks that have passed so far; incremented by assert().
let passed = 0;
// Number of checks that have failed so far; incremented by assert().
// A non-zero value makes main() exit with status 1.
let failed = 0;
/**
 * Record the outcome of a single check.
 *
 * Prints a PASS line to stdout or a FAIL line to stderr and bumps the
 * matching module-level counter (`passed` / `failed`). Never throws on a
 * failed check, so later checks still run.
 *
 * @param {*} condition - Truthy means the check passed.
 * @param {string} label - Short description printed after PASS/FAIL.
 * @param {*} [detail] - Optional text appended to the FAIL line after ": ";
 *   ignored when falsy and never printed on PASS.
 */
function assert(condition, label, detail) {
    if (!condition) {
        const suffix = detail ? ': ' + detail : '';
        console.error(` FAIL ${label}${suffix}`);
        failed++;
        return;
    }
    console.log(` PASS ${label}`);
    passed++;
}
// ---------------------------------------------------------------------------
// Unit tests (no claude -p invocation)
// ---------------------------------------------------------------------------
/**
 * Unit checks for extractFinalAnswer(); makes no claude -p invocation.
 *
 * The first four cases compare against an exact expected value. The last
 * case only requires a non-null result for text without a marker (its label
 * describes this as the last-line fallback).
 */
function testExtractFinalAnswer() {
    console.log('\n-- Unit: extractFinalAnswer --');
    // Each row: [input text, exact expected result, check label].
    const exactCases = [
        ['FINAL_ANSWER: 4', '4', 'extracts plain number'],
        ['Some reasoning...\nFINAL_ANSWER: Paris', 'Paris', 'extracts after reasoning text'],
        ['final_answer: 42', '42', 'case-insensitive extraction'],
        ['', null, 'returns null for empty string'],
    ];
    for (const [input, expected, label] of exactCases) {
        assert(extractFinalAnswer(input) === expected, label);
    }
    const unmarked = extractFinalAnswer('No answer marker here');
    assert(unmarked !== null, 'fallback to last line when no marker');
}
/**
 * Unit checks for buildClaudePPrompt(); makes no claude -p invocation.
 *
 * Builds one prompt for a question without an attachment and one for the
 * same question with `file_path` set, then checks for expected substrings.
 */
function testBuildClaudePPrompt() {
    console.log('\n-- Unit: buildClaudePPrompt --');
    const baseQuestion = {
        task_id: 'test-001',
        level: 1,
        question: 'What is 2+2?',
        final_answer: '4',
        file_name: null,
        file_path: null,
    };

    // Prompt for a question with no attachment.
    const plainPrompt = buildClaudePPrompt(baseQuestion);
    const plainExpectations = [
        ['What is 2+2?', 'prompt contains question text'],
        ['FINAL_ANSWER:', 'prompt instructs FINAL_ANSWER format'],
    ];
    for (const [needle, label] of plainExpectations) {
        assert(plainPrompt.includes(needle), label);
    }
    assert(!plainPrompt.includes('Attachment'), 'no attachment section for null file_path');

    // Same question, but with an attachment path set.
    const attachmentPrompt = buildClaudePPrompt({ ...baseQuestion, file_path: '/tmp/test.pdf' });
    const attachmentExpectations = [
        ['/tmp/test.pdf', 'attachment path included when present'],
        ['Read tool', 'mentions Read tool for attachments'],
    ];
    for (const [needle, label] of attachmentExpectations) {
        assert(attachmentPrompt.includes(needle), label);
    }
}
// ---------------------------------------------------------------------------
// Integration tests (invoke claude -p)
// ---------------------------------------------------------------------------
/**
 * Integration check: asks "2 + 2" through runGaiaQuestionViaClaudeP and
 * expects the extracted final answer to be exactly "4".
 *
 * Also logs the raw result (first 100 characters), cost, turn count and
 * error flag, and checks that cost and wall time are positive.
 *
 * @returns {Promise<void>} Resolves once all checks have been recorded.
 */
async function testArithmetic() {
    console.log('\n-- Integration: arithmetic (2+2) --');
    const question = {
        task_id: 'smoke-arithmetic',
        level: 1,
        question: 'What is 2 + 2? Provide only the numeric answer.',
        final_answer: '4',
        file_name: null,
        file_path: null,
    };
    const runOptions = {
        model: 'claude-haiku-4-5',
        budgetUsd: 0.25,
        timeoutMs: 120_000,
    };
    console.log(' Spawning claude -p ...');
    const outcome = await runGaiaQuestionViaClaudeP(question, runOptions);
    const { rawResult, finalAnswer, costUsd, numTurns, isError, errorMessage, wallMs } = outcome;

    // Diagnostics first, so a failing check below can be interpreted.
    console.log(` raw result: ${rawResult.slice(0, 100)}`);
    console.log(` finalAnswer: ${finalAnswer}`);
    console.log(` costUsd: $${costUsd.toFixed(4)}`);
    console.log(` numTurns: ${numTurns}`);
    console.log(` isError: ${isError}`);

    assert(!isError, 'no error', errorMessage);
    assert(finalAnswer !== null, 'finalAnswer is not null');
    assert(finalAnswer === '4', 'answer is "4"', `got: "${finalAnswer}"`);
    assert(costUsd > 0, 'cost is positive');
    assert(wallMs > 0, 'wallMs is positive');
}
/**
 * Integration check: asks for Tokyo's city-proper population in millions
 * through runGaiaQuestionViaClaudeP.
 *
 * The check is deliberately loose: it only requires that a positive number
 * can be parsed from the final answer, not that it matches the reference
 * value of 14.
 *
 * @returns {Promise<void>} Resolves once all checks have been recorded.
 */
async function testCurrentFact() {
    console.log('\n-- Integration: current fact (Tokyo population) --');
    const question = {
        task_id: 'smoke-tokyo-pop',
        level: 1,
        question: 'What is the approximate population of Tokyo (the city proper)? Give only a number in millions, rounded to the nearest million.',
        final_answer: '14',
        file_name: null,
        file_path: null,
    };
    const runOptions = {
        model: 'claude-haiku-4-5',
        budgetUsd: 0.25,
        timeoutMs: 120_000,
    };
    console.log(' Spawning claude -p ...');
    const outcome = await runGaiaQuestionViaClaudeP(question, runOptions);
    const { rawResult, finalAnswer, costUsd, isError, errorMessage } = outcome;

    console.log(` raw result excerpt: ${rawResult.slice(0, 150)}`);
    console.log(` finalAnswer: ${finalAnswer}`);
    console.log(` costUsd: $${costUsd.toFixed(4)}`);

    assert(!isError, 'no error', errorMessage);
    assert(finalAnswer !== null, 'finalAnswer is not null');

    // Tokyo metro area is ~37M, city proper is ~13-14M. Strip everything but
    // digits and dots, then accept any positive number that parses.
    let parsedAnswer = NaN;
    if (finalAnswer) {
        parsedAnswer = parseFloat(finalAnswer.replace(/[^0-9.]/g, ''));
    }
    const isPositiveNumber = !Number.isNaN(parsedAnswer) && parsedAnswer > 0;
    assert(isPositiveNumber, 'answer is a non-empty number', `got: "${finalAnswer}"`);
}
/**
 * Integration check: asks for the capital of France through
 * runGaiaQuestionViaClaudeP and expects the final answer to contain
 * "paris" (case-insensitive).
 *
 * @returns {Promise<void>} Resolves once all checks have been recorded.
 */
async function testAnswerFormat() {
    console.log('\n-- Integration: answer format discipline --');
    const question = {
        task_id: 'smoke-format',
        level: 1,
        question: 'What is the capital of France? Give only the city name, nothing else.',
        final_answer: 'Paris',
        file_name: null,
        file_path: null,
    };
    const runOptions = {
        model: 'claude-haiku-4-5',
        budgetUsd: 0.25,
        timeoutMs: 120_000,
    };
    console.log(' Spawning claude -p ...');
    const outcome = await runGaiaQuestionViaClaudeP(question, runOptions);
    const { rawResult, finalAnswer, costUsd, isError, errorMessage } = outcome;

    console.log(` raw result excerpt: ${rawResult.slice(0, 150)}`);
    console.log(` finalAnswer: ${finalAnswer}`);
    console.log(` costUsd: $${costUsd.toFixed(4)}`);

    assert(!isError, 'no error', errorMessage);
    assert(finalAnswer !== null, 'finalAnswer is not null');
    // A null answer counts as a failed match rather than raising.
    const mentionsParis = finalAnswer === null
        ? false
        : finalAnswer.toLowerCase().includes('paris');
    assert(mentionsParis, 'answer contains "paris"', `got: "${finalAnswer}"`);
}
// ---------------------------------------------------------------------------
// Main
// ---------------------------------------------------------------------------
/**
 * Entry point: runs the unit checks, then the integration checks one after
 * another, prints a pass/total summary, and exits with status 1 if any
 * check failed.
 *
 * @returns {Promise<void>} Resolves when every check passed.
 */
async function main() {
    console.log('=== gaia-claude-p smoke tests ===');
    console.log('Model: claude-haiku-4-5 (smoke uses Haiku to minimize cost)');
    console.log(`Default production model: ${CLAUDE_P_DEFAULT_MODEL}`);

    // Unit tests (no API calls)
    for (const unitTest of [testExtractFinalAnswer, testBuildClaudePPrompt]) {
        unitTest();
    }

    // Integration tests (invoke claude -p); awaited one at a time, in order.
    for (const integrationTest of [testArithmetic, testCurrentFact, testAnswerFormat]) {
        await integrationTest();
    }

    // Summary
    console.log(`\n=== Results: ${passed}/${passed + failed} passed ===`);
    if (failed === 0) {
        return;
    }
    process.exit(1);
}
// Start the runner. A rejection from main() (as opposed to a failed check
// recorded by assert) is logged and exits with status 1.
main().catch((runnerError) => {
    console.error('Smoke test runner error:', runnerError);
    process.exit(1);
});
//# sourceMappingURL=gaia-claude-p.smoke.js.map