claude-flow
Version:
Ruflo - Enterprise AI agent orchestration for Claude Code. Deploy 60+ specialized agents in coordinated swarms with self-learning, fault-tolerant consensus, vector memory, and MCP integration
300 lines • 16 kB
JavaScript
/**
* Smoke tests — ADR-135 Track I: Causal failure-avoidance edges
*
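* Nothing is mocked in this file: the tests read and write real files, but only
* inside a throwaway directory under os.tmpdir(). The file itself makes no
* network calls.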
*
* Tests:
* 1. Record one failure → retrieve same-signature question → returns hint
* 2. Record 3 failures → retrieve unrelated question → returns empty hint
* 3. Same edge recorded twice → occurrenceCount=2, not duplicated
* 4. File doesn't exist → graceful empty result on retrieve
* 5. Corrupted JSONL line → skipped, doesn't crash
* 6. maxEdgesPerSignature cap respected (no new edge after cap)
* 7. Signature computation is deterministic (same input → same output)
* 8. Correct answer trajectory → no edges recorded
*
* Usage:
* npx tsx src/benchmarks/gaia-causal-memory.smoke.ts
*
* Refs: ADR-135, #2156
*/
import * as fs from 'node:fs';
import * as os from 'node:os';
import * as path from 'node:path';
import { computeQuestionSignature, inferFailureType, recordCausalFailures, retrieveCausalHints, } from './gaia-causal-memory.js';
// ---------------------------------------------------------------------------
// Mini test runner (zero external deps)
// ---------------------------------------------------------------------------
// Runner state shared by every test() call and read by the final summary.
let passed = 0;
let failed = 0;
const results = [];
/**
 * Executes a single named test case and records its outcome.
 *
 * A rejection or synchronous throw from `fn` is caught and counted as a
 * failure, so the promise returned here resolves even when the test fails.
 *
 * @param {string} name - Label printed next to PASS/FAIL and stored in `results`.
 * @param {() => (void | Promise<void>)} fn - Test body; throwing marks the test as failed.
 * @returns {Promise<void>}
 */
async function test(name, fn) {
    let threw = false;
    let caught;
    try {
        await fn();
    }
    catch (err) {
        threw = true;
        caught = err;
    }
    if (!threw) {
        passed += 1;
        results.push({ name, ok: true });
        console.log(` PASS ${name}`);
        return;
    }
    failed += 1;
    // Non-Error throwables (strings, numbers, ...) are reported via String().
    const reason = caught instanceof Error ? caught.message : String(caught);
    results.push({ name, ok: false, error: reason });
    console.error(` FAIL ${name}`);
    console.error(` ${reason}`);
}
/**
 * Minimal truthiness assertion for the smoke runner.
 *
 * @param {unknown} condition - Value expected to be truthy.
 * @param {string} message - Text appended after "Assertion failed: ".
 * @throws {Error} When `condition` is falsy.
 */
function assert(condition, message) {
    if (condition) {
        return;
    }
    throw new Error(`Assertion failed: ${message}`);
}
/**
 * Asserts that two values are strictly equal (`===`).
 *
 * @param {unknown} actual - Value produced by the code under test.
 * @param {unknown} expected - Value it must be identical to.
 * @param {string} label - Prefix of the error message identifying the check.
 * @throws {Error} When the values differ; both are rendered with JSON.stringify.
 */
function assertEqual(actual, expected, label) {
    if (actual === expected) {
        return;
    }
    const wanted = JSON.stringify(expected);
    const received = JSON.stringify(actual);
    throw new Error(`${label}: expected ${wanted}, got ${received}`);
}
// ---------------------------------------------------------------------------
// Fixtures
// ---------------------------------------------------------------------------
/**
 * Builds a level-1 question fixture with no attached file.
 *
 * @param {string} text - Question text; its first 8 characters also form the task id suffix.
 * @returns {object} Question record with `task_id`, `level`, `question`,
 *   `final_answer`, `file_name` and `file_path` fields.
 */
function makeQuestion(text) {
    const idSuffix = text.slice(0, 8);
    return {
        task_id: `smoke-${idSuffix}`,
        level: 1,
        question: text,
        final_answer: 'ignored-in-smoke',
        file_name: null,
        file_path: null,
    };
}
/**
 * Builds a run-result fixture describing a wrong answer after two
 * `web_search` calls; any field can be replaced or added via `overrides`.
 *
 * @param {object} [overrides={}] - Properties copied over the defaults (shallow).
 * @returns {object} A fresh result record on every call.
 */
function makeResult(overrides = {}) {
    const defaults = {
        questionId: 'smoke-q1',
        finalAnswer: 'wrong answer',
        turns: 3,
        toolCallsByName: { web_search: 2 },
        totalInputTokens: 100,
        totalOutputTokens: 50,
        wallMs: 1500,
    };
    // Later spread wins, so caller-supplied keys take precedence.
    return { ...defaults, ...overrides };
}
// ---------------------------------------------------------------------------
// Test suite
// ---------------------------------------------------------------------------
/**
 * Runs every smoke test sequentially, removes the temporary directory, prints
 * a summary and terminates the process.
 *
 * Exit code is 0 when all tests passed and 1 when at least one failed.
 * Individual test failures are captured by `test()`; anything thrown outside
 * a test body (e.g. the temp directory cannot be created) rejects the
 * returned promise after cleanup has been attempted.
 *
 * @returns {Promise<void>} In practice never resolves normally because the
 *   function ends with `process.exit()`.
 */
async function runAll() {
    console.log('\nADR-135 Track I — causal failure-avoidance edges (smoke)\n');
    // mkdtempSync appends a random suffix and creates the directory, so a
    // previous run that crashed before cleanup (or a recycled PID) can never
    // leak stale edge files into this run.
    const tmpBase = fs.mkdtempSync(path.join(os.tmpdir(), 'ruflo-causal-smoke-'));
    // Creates <tmpBase>/<label>/ and returns the path of its edge store file
    // (the file itself is NOT created). One sub-directory per test isolates state.
    const makeStorePath = (label) => {
        const dir = path.join(tmpBase, label);
        fs.mkdirSync(dir, { recursive: true });
        return path.join(dir, 'edges.jsonl');
    };
    // Parses every non-blank line of a JSONL store into an object.
    const readStoredEdges = (storePath) => fs
        .readFileSync(storePath, 'utf8')
        .trim()
        .split('\n')
        .filter((line) => line.trim() !== '')
        .map((line) => JSON.parse(line));
    try {
        // ---------------------------------------------------------------------
        // Test 1: Record one failure → retrieve same-signature question → hint returned
        // ---------------------------------------------------------------------
        await test('1. record one failure → retrieve same question → hint returned', async () => {
            const storePath = makeStorePath('t1');
            const question = makeQuestion('What is the capital of France?');
            const result = makeResult({ finalAnswer: 'London' });
            const recRes = await recordCausalFailures(question, result, false, { storePath });
            assert(recRes.edgesRecorded > 0, 'should have recorded at least one edge');
            const retRes = await retrieveCausalHints(question, { storePath });
            assert(retRes.edgesMatched > 0, 'should match the stored edge');
            assert(retRes.hint.length > 0, 'hint should be non-empty');
            assert(retRes.hint.includes('[PRIOR FAILURES]'), 'hint should contain [PRIOR FAILURES] header');
            assert(retRes.hint.includes('web_search'), 'hint should mention the failed tool');
        });
        // ---------------------------------------------------------------------
        // Test 2: Record 3 failures → retrieve unrelated question → empty hint
        // ---------------------------------------------------------------------
        await test('2. record 3 failures → retrieve unrelated question → empty hint', async () => {
            const storePath = makeStorePath('t2');
            const questions = [
                makeQuestion('What is the capital of France?'),
                makeQuestion('Who wrote Hamlet?'),
                makeQuestion('What is 2 + 2?'),
            ];
            // Sequential on purpose: all three calls write to the same store file.
            for (const q of questions) {
                await recordCausalFailures(q, makeResult(), false, { storePath });
            }
            const unrelated = makeQuestion('Completely different and unrelated question about astrophysics and quantum phenomena XYZ123');
            const retRes = await retrieveCausalHints(unrelated, { storePath });
            assertEqual(retRes.edgesMatched, 0, 'edgesMatched for unrelated question');
            assertEqual(retRes.hint, '', 'hint for unrelated question');
        });
        // ---------------------------------------------------------------------
        // Test 3: Same edge recorded twice → occurrenceCount=2, not duplicated
        // ---------------------------------------------------------------------
        await test('3. same edge recorded twice → occurrenceCount=2, not duplicated', async () => {
            const storePath = makeStorePath('t3');
            const question = makeQuestion('What year did World War II end?');
            const result = makeResult({ toolCallsByName: { web_search: 1 } });
            // Record same failure twice
            await recordCausalFailures(question, result, false, { storePath });
            await recordCausalFailures(question, result, false, { storePath });
            // Inspect the JSONL directly rather than going through retrieveCausalHints.
            const edges = readStoredEdges(storePath);
            // Should have exactly one edge for this signature+tool+step
            const sig = computeQuestionSignature(question.question);
            const sigEdges = edges.filter((e) => e.questionSignature === sig && e.failedTool === 'web_search');
            assertEqual(sigEdges.length, 1, 'should have exactly one edge per signature+tool+step');
            assertEqual(sigEdges[0].occurrenceCount, 2, 'occurrenceCount after two recordings');
        });
        // ---------------------------------------------------------------------
        // Test 4: File doesn't exist → graceful empty result on retrieve
        // ---------------------------------------------------------------------
        await test('4. file does not exist → graceful empty result on retrieve', async () => {
            // Neither the 'nonexistent' directory nor the file is ever created.
            const storePath = path.join(tmpBase, 'nonexistent', 'edges.jsonl');
            const question = makeQuestion('How many legs does a spider have?');
            const retRes = await retrieveCausalHints(question, { storePath });
            assertEqual(retRes.edgesMatched, 0, 'edgesMatched when file absent');
            assertEqual(retRes.hint, '', 'hint when file absent');
        });
        // ---------------------------------------------------------------------
        // Test 5: Corrupted JSONL line → skipped, doesn't crash
        // ---------------------------------------------------------------------
        await test('5. corrupted JSONL line → skipped, does not crash', async () => {
            const storePath = makeStorePath('t5');
            const goodQuestion = makeQuestion('Name the largest planet in the solar system.');
            const goodEdge = {
                questionSignature: computeQuestionSignature(goodQuestion.question),
                failedTool: 'python_exec',
                failedTrajectoryStep: 'python_exec called once; failure type=wrong_answer',
                observedFailureType: 'wrong_answer',
                createdAt: new Date().toISOString(),
                occurrenceCount: 1,
            };
            // One valid edge followed by two lines that are not parseable JSON.
            fs.writeFileSync(storePath, JSON.stringify(goodEdge) + '\n' +
                '{"this": is not valid JSON!!!\n' +
                '{"partial":true\n', 'utf8');
            // retrieve should work and not throw
            const retRes = await retrieveCausalHints(goodQuestion, { storePath });
            assertEqual(retRes.edgesMatched, 1, 'should still find the one valid edge');
            assert(retRes.hint.includes('[PRIOR FAILURES]'), 'hint should be returned for valid edge');
            assert(retRes.hint.includes('python_exec'), 'hint should reference python_exec');
        });
        // ---------------------------------------------------------------------
        // Test 6: maxEdgesPerSignature cap respected
        // ---------------------------------------------------------------------
        await test('6. maxEdgesPerSignature cap respected', async () => {
            const storePath = makeStorePath('t6');
            const maxEdgesPerSignature = 2;
            const question = makeQuestion('Explain the theory of relativity in simple terms.');
            // Three failures with different tools against a cap of two.
            for (const tool of ['web_search', 'file_read', 'python_exec']) {
                const result = makeResult({ toolCallsByName: { [tool]: 1 } });
                await recordCausalFailures(question, result, false, {
                    storePath,
                    maxEdgesPerSignature,
                });
            }
            const sig = computeQuestionSignature(question.question);
            const sigEdges = readStoredEdges(storePath).filter((e) => e.questionSignature === sig);
            assert(sigEdges.length <= maxEdgesPerSignature, `should have at most ${maxEdgesPerSignature} edges, got ${sigEdges.length}`);
        });
        // ---------------------------------------------------------------------
        // Test 7: Signature computation is deterministic
        // ---------------------------------------------------------------------
        await test('7. signature computation is deterministic', async () => {
            const text = 'What is the boiling point of water at sea level?';
            const sig1 = computeQuestionSignature(text);
            const sig2 = computeQuestionSignature(text);
            const sig3 = computeQuestionSignature(text.trim()); // trim should not change result
            const sig4 = computeQuestionSignature(text.toUpperCase().toLowerCase()); // normalisation
            assertEqual(sig1, sig2, 'same input produces same signature (call 1 vs 2)');
            assertEqual(sig1, sig3, 'trimmed input produces same signature');
            assertEqual(sig1, sig4, 'case-normalised input produces same signature');
            assertEqual(sig1.length, 16, 'signature is 16 hex characters');
            // Different text should produce different signature (with overwhelming probability)
            const sigOther = computeQuestionSignature('Completely different question about astronomy.');
            assert(sig1 !== sigOther, 'different inputs produce different signatures');
        });
        // ---------------------------------------------------------------------
        // Test 8: Correct answer trajectory → no edges recorded
        // ---------------------------------------------------------------------
        await test('8. correct answer trajectory → no edges recorded', async () => {
            const storePath = makeStorePath('t8');
            const question = makeQuestion('What color is the sky on a clear day?');
            const result = makeResult({ finalAnswer: 'blue' });
            // wasCorrect = true
            const recRes = await recordCausalFailures(question, result, true, { storePath });
            assertEqual(recRes.edgesRecorded, 0, 'no edges for correct trajectory');
            assert(!fs.existsSync(storePath), 'store file should not be created for correct answer');
        });
        // ---------------------------------------------------------------------
        // Bonus: inferFailureType unit checks
        // ---------------------------------------------------------------------
        await test('inferFailureType: null for correct answers', async () => {
            const result = makeResult({ finalAnswer: 'blue' });
            const ft = inferFailureType(result, true);
            assertEqual(ft, null, 'inferFailureType returns null for wasCorrect=true');
        });
        await test('inferFailureType: timeout when timedOut=true', async () => {
            const result = makeResult({ timedOut: true });
            const ft = inferFailureType(result, false);
            assertEqual(ft, 'timeout', 'inferFailureType=timeout when timedOut flag set');
        });
        await test('inferFailureType: tool_error when error present', async () => {
            const result = makeResult({ error: 'network timeout' });
            const ft = inferFailureType(result, false);
            assertEqual(ft, 'tool_error', 'inferFailureType=tool_error when error field set');
        });
        await test('inferFailureType: empty_result when finalAnswer is null', async () => {
            const result = makeResult({ finalAnswer: null });
            const ft = inferFailureType(result, false);
            assertEqual(ft, 'empty_result', 'inferFailureType=empty_result when finalAnswer=null');
        });
        await test('inferFailureType: wrong_answer for normal failure', async () => {
            const result = makeResult({ finalAnswer: 'incorrect value' });
            const ft = inferFailureType(result, false);
            assertEqual(ft, 'wrong_answer', 'inferFailureType=wrong_answer for normal wrong answer');
        });
    }
    finally {
        // ---------------------------------------------------------------------
        // Cleanup (runs even if something outside a test body throws)
        // ---------------------------------------------------------------------
        try {
            fs.rmSync(tmpBase, { recursive: true, force: true });
        }
        catch (err) {
            // Best-effort only: a leftover temp directory must not fail the run.
            const reason = err instanceof Error ? err.message : String(err);
            console.warn(`Warning: could not remove temp dir ${tmpBase}: ${reason}`);
        }
    }
    // -------------------------------------------------------------------------
    // Summary
    // -------------------------------------------------------------------------
    console.log(`\n${'─'.repeat(60)}`);
    console.log(`Results: ${passed} passed, ${failed} failed (${passed + failed} total)`);
    if (failed > 0) {
        console.error('\nFailed tests:');
        for (const r of results.filter((r) => !r.ok)) {
            console.error(` ${r.name}: ${r.error}`);
        }
        process.exit(1);
    }
    else {
        console.log('\nAll smoke tests passed.');
        console.log('\nNote: Track I lift is compound (+0pp first run, +2-5pp after 5+ runs).');
        console.log('Wiring into gaia-bench.ts is a follow-up PR (in-flight conflict avoidance).');
        process.exit(0);
    }
}
// Entry point. A rejection from runAll() is reported here and mapped to a
// non-zero exit code.
runAll().catch((reason) => {
    console.error('Unhandled error in smoke runner:', reason);
    process.exit(1);
});
//# sourceMappingURL=gaia-causal-memory.smoke.js.map