UNPKG

claude-flow

Version:

Ruflo - Enterprise AI agent orchestration for Claude Code. Deploy 60+ specialized agents in coordinated swarms with self-learning, fault-tolerant consensus, vector memory, and MCP integration

235 lines 12.3 kB
/**
 * Smoke tests for gaia-hardness predictor — ADR-136 Track Q
 *
 * All tests are mock-based (no live API calls, no HF token, $0 cost).
 *
 * Test matrix:
 *   1. Feature extraction: math question fires has_numeric and at least one of
 *      qword_calc_compute / requires_math
 *   2. Feature extraction: simple factual question has low complexity signal
 *   3. Feature extraction: question with file attachment fires has_file_attachment
 *   4. Feature extraction: multi-hop question fires multi_hop_signal
 *   5. Cold-start fallback: untrained predictor returns medium with confidence=0.5
 *   6. Train on mock data: predictor becomes trained after ≥ 10 examples
 *   7. Trained predictor: math-heavy question predicts at least as hard as a trivial question
 *   8. Export/import: exported weights restore an equivalent predictor
 *
 * Refs: ADR-136, #2156
 */
import assert from 'node:assert/strict';
import { extractFeatures, FEATURE_LABELS } from './features.js';
import { HardnessPredictor } from './predictor.js';

// ---------------------------------------------------------------------------
// Fixture helpers
// ---------------------------------------------------------------------------

/**
 * Build a question fixture with fixed placeholder metadata.
 *
 * @param {string} text - Question text, stored under `question`.
 * @param {object} [overrides] - Fields spread last, so they replace the defaults.
 * @returns {object} Question record with task_id, level, question, final_answer,
 *   file_name and file_path.
 */
function makeQuestion(text, overrides = {}) {
    return {
        task_id: 'smoke-test',
        level: 1,
        question: text,
        final_answer: 'X',
        file_name: null,
        file_path: null,
        ...overrides,
    };
}

/**
 * Position of a feature label inside FEATURE_LABELS.
 *
 * @param {string} label - Feature label to look up.
 * @returns {number} Index of the label, or -1 when it is not present.
 */
function labelIndex(label) {
    return FEATURE_LABELS.indexOf(label);
}

/**
 * Like labelIndex(), but fails the current test when the label is unknown.
 *
 * Without this guard a missing label yields -1, `fv.values[-1]` is undefined,
 * and the test fails with a message about the feature value instead of the
 * real cause (the label is not in FEATURE_LABELS).
 *
 * @param {string} label - Feature label to look up.
 * @returns {number} Non-negative index of the label.
 * @throws {assert.AssertionError} When the label is not in FEATURE_LABELS.
 */
function requireLabelIndex(label) {
    const idx = labelIndex(label);
    assert.ok(idx >= 0, `${label} feature not found`);
    return idx;
}

// ---------------------------------------------------------------------------
// Test 1: Math question fires expected features
// ---------------------------------------------------------------------------

/** A question containing numbers and "Calculate" must set the numeric and math signals. */
function testMathQuestionFeatures() {
    const q = makeQuestion('What is 15 multiplied by 4? Calculate the exact product.');
    const fv = extractFeatures(q);
    const calcIdx = requireLabelIndex('qword_calc_compute');
    const mathIdx = requireLabelIndex('requires_math');
    const numIdx = requireLabelIndex('has_numeric');
    assert.equal(fv.values[numIdx], 1, 'has_numeric should be 1 for "15 multiplied by 4"');
    // Either signal is accepted; which one fires is decided by the extractor in features.js.
    const eitherMath = fv.values[calcIdx] === 1 || fv.values[mathIdx] === 1;
    assert.ok(eitherMath, 'math question should fire qword_calc_compute or requires_math');
    console.log(' [PASS] Test 1: math question fires expected features');
}

// ---------------------------------------------------------------------------
// Test 2: Simple factual question has low complexity
// ---------------------------------------------------------------------------

/** A short factual question must leave the multi-hop, tool and file signals at 0. */
function testSimpleFactualQuestion() {
    const q = makeQuestion('What is the capital of France?');
    const fv = extractFeatures(q);
    // NOTE(review): 17 is hard-coded here; presumably it equals FEATURE_LABELS.length — confirm.
    assert.equal(fv.values.length, 17, 'feature vector should have 17 dims');
    const multiHopIdx = requireLabelIndex('multi_hop_signal');
    const toolIdx = requireLabelIndex('tool_implication_norm');
    const fileIdx = requireLabelIndex('has_file_attachment');
    assert.equal(fv.values[multiHopIdx], 0, 'simple question should have no multi-hop signal');
    assert.equal(fv.values[toolIdx], 0, 'simple question should have no tool implication');
    assert.equal(fv.values[fileIdx], 0, 'simple question should have no file attachment');
    // Word count should be normalised well below 1
    const wordNormIdx = requireLabelIndex('len_words_norm');
    assert.ok(fv.values[wordNormIdx] < 0.3, 'short question should have normalised word count < 0.3');
    console.log(' [PASS] Test 2: simple factual question has low complexity signal');
}

// ---------------------------------------------------------------------------
// Test 3: File attachment fires has_file_attachment
// ---------------------------------------------------------------------------

/** A question carrying file_name/file_path must set the file and tool signals. */
function testFileAttachmentFeature() {
    const q = makeQuestion('Analyse the attached PDF document and extract the table.', {
        file_name: 'document.pdf',
        file_path: '/tmp/document.pdf',
    });
    const fv = extractFeatures(q);
    const fileIdx = requireLabelIndex('has_file_attachment');
    const toolIdx = requireLabelIndex('tool_implication_norm');
    assert.equal(fv.values[fileIdx], 1, 'question with file_name should have has_file_attachment=1');
    assert.ok(fv.values[toolIdx] > 0, 'PDF question should have tool_implication > 0');
    console.log(' [PASS] Test 3: file attachment fires has_file_attachment');
}

// ---------------------------------------------------------------------------
// Test 4: Multi-hop question fires multi_hop_signal
// ---------------------------------------------------------------------------

/** A question with several chained relative clauses must set multi_hop_signal. */
function testMultiHopFeature() {
    const q = makeQuestion('Who was the president of the country that won the most gold medals at the Olympics that was held in the city that hosted the 1936 Games?');
    const fv = extractFeatures(q);
    const multiHopIdx = requireLabelIndex('multi_hop_signal');
    assert.equal(fv.values[multiHopIdx], 1, 'multi-hop question should fire multi_hop_signal');
    console.log(' [PASS] Test 4: multi-hop question fires multi_hop_signal');
}

// ---------------------------------------------------------------------------
// Test 5: Cold-start fallback returns medium with confidence=0.5
// ---------------------------------------------------------------------------

/** A freshly constructed predictor must answer with the fixed "medium" fallback and budget. */
function testColdStartFallback() {
    const predictor = new HardnessPredictor();
    assert.equal(predictor.isTrained, false, 'new predictor should not be trained');
    const q = makeQuestion('What is the capital of France?');
    const result = predictor.predict(q);
    assert.equal(result.difficulty, 'medium', 'cold-start should return medium');
    assert.equal(result.confidence, 0.5, 'cold-start should return confidence=0.5');
    assert.equal(result.budget.model, 'sonnet', 'medium budget should use sonnet');
    assert.equal(result.budget.maxTurns, 8, 'medium budget should have maxTurns=8');
    assert.equal(result.budget.votingAttempts, 1, 'medium budget should have votingAttempts=1');
    console.log(' [PASS] Test 5: cold-start fallback returns medium with confidence=0.5');
}

// ---------------------------------------------------------------------------
// Test 6: Becomes trained after ≥ 10 labelled examples
// ---------------------------------------------------------------------------

/** Training on 15 mock examples must flip isTrained from false to true. */
function testBecomesTrainedWith10Examples() {
    const predictor = new HardnessPredictor();
    // Generate 15 mock examples (mix of correct/incorrect)
    const examples = [];
    for (let i = 0; i < 15; i++) {
        examples.push({
            question: makeQuestion(`Mock question number ${i} about topic ${i % 3}`),
            wasCorrect: i % 3 !== 0,
            turns: 3 + (i % 6),
        });
    }
    assert.equal(predictor.isTrained, false, 'should not be trained before training');
    predictor.train(examples);
    assert.equal(predictor.isTrained, true, 'should be trained after ≥10 examples');
    console.log(' [PASS] Test 6: predictor becomes trained after ≥ 10 examples');
}

// ---------------------------------------------------------------------------
// Test 7: Trained predictor scores math question harder than trivial question
// ---------------------------------------------------------------------------

/** After training on a clear easy/hard split, a math question must not rank below a trivial one. */
function testTrainedRelativeHardness() {
    const predictor = new HardnessPredictor({ conservativeMode: false });
    // Train on 20 examples with clear pattern:
    //   - Short simple questions → correct (easy)
    //   - Long multi-step questions → incorrect (hard)
    // Easy and hard examples are interleaved; keep this order when editing.
    const examples = [];
    for (let i = 0; i < 10; i++) {
        examples.push({
            question: makeQuestion(`What is the capital of country ${i}?`),
            wasCorrect: true,
            turns: 2,
        });
        examples.push({
            question: makeQuestion(`Calculate the exact percentage difference between ${i * 100} and ${i * 100 + 50}, ` +
                `then multiply by the number of years between 1990 and 2023. ` +
                `Express as a decimal rounded to 3 places.`),
            wasCorrect: false,
            turns: 12,
        });
    }
    predictor.train(examples);
    const trivialQ = makeQuestion('What is the capital of France?');
    const mathQ = makeQuestion('Calculate the exact percentage difference between 750 and 1250, ' +
        'then multiply by the number of years between 1990 and 2023. ' +
        'Express as a decimal rounded to 3 places.');
    const trivialResult = predictor.predict(trivialQ);
    const mathResult = predictor.predict(mathQ);
    // The difficulty ordering should not put math as easier than trivial.
    // An unrecognised difficulty maps to undefined, which makes the >= comparison false.
    const ORDER = { easy: 0, medium: 1, hard: 2 };
    assert.ok(ORDER[mathResult.difficulty] >= ORDER[trivialResult.difficulty], `math question (${mathResult.difficulty}) should be ≥ trivial (${trivialResult.difficulty})`);
    console.log(` [PASS] Test 7: math="${mathResult.difficulty}" (conf=${mathResult.confidence.toFixed(2)}) ≥ trivial="${trivialResult.difficulty}" (conf=${trivialResult.confidence.toFixed(2)})`);
}

// ---------------------------------------------------------------------------
// Test 8: Weights round-trip through JSON export/import
// ---------------------------------------------------------------------------

/**
 * A second predictor restored from export() must predict the same as the original.
 *
 * NOTE(review): the exported state is handed to import() directly; it is not
 * serialised with JSON.stringify/JSON.parse here, so actual JSON-safety of the
 * exported state is not exercised by this test.
 */
function testWeightRoundTrip() {
    const predictor = new HardnessPredictor();
    const examples = [];
    for (let i = 0; i < 12; i++) {
        examples.push({
            question: makeQuestion(`Question ${i}`),
            wasCorrect: i % 2 === 0,
            turns: 4 + i,
        });
    }
    predictor.train(examples);
    const exported = predictor.export();
    assert.ok(exported !== null, 'exported state should not be null after training');
    const predictor2 = new HardnessPredictor();
    predictor2.import(exported);
    assert.equal(predictor2.isTrained, true, 'imported predictor should be trained');
    // Both predictors should give the same result for the same question.
    const q = makeQuestion('How many sides does a hexagon have?');
    const r1 = predictor.predict(q);
    const r2 = predictor2.predict(q);
    assert.equal(r1.difficulty, r2.difficulty, 'exported/imported predictor should give same difficulty');
    // Confidence is compared with a tolerance rather than exact equality.
    assert.ok(Math.abs(r1.confidence - r2.confidence) < 0.001, 'exported/imported predictor should give same confidence');
    console.log(' [PASS] Test 8: weights round-trip through export/import without loss');
}

// ---------------------------------------------------------------------------
// Runner
// ---------------------------------------------------------------------------

/**
 * Run every smoke test in order, print a per-test line and a summary.
 *
 * A failing test is reported on stderr and does not stop the remaining tests.
 * Calls process.exit(1) when at least one test failed.
 *
 * @returns {Promise<void>} Resolves once all tests have run and all passed.
 */
async function runSmoke() {
    console.log('\n=== gaia-hardness predictor.smoke.ts — ADR-136 Track Q ===');
    console.log('(mock-based, no live API, $0 cost)\n');
    const cases = [
        ['Feature extraction: math question', testMathQuestionFeatures],
        ['Feature extraction: simple factual', testSimpleFactualQuestion],
        ['Feature extraction: file attachment', testFileAttachmentFeature],
        ['Feature extraction: multi-hop signal', testMultiHopFeature],
        ['Cold-start fallback: medium + confidence=0.5', testColdStartFallback],
        ['Train: becomes trained with ≥10 examples', testBecomesTrainedWith10Examples],
        ['Trained: math harder than trivial', testTrainedRelativeHardness],
        ['Export/import: weights round-trip', testWeightRoundTrip],
    ];
    const total = cases.length;
    let passed = 0;
    for (const [name, fn] of cases) {
        try {
            // Awaited so that a test returning a promise is only counted as passed
            // once it settles, and a rejection is reported as a failure.
            await fn();
            passed++;
        }
        catch (err) {
            console.error(` [FAIL] ${name}: ${err instanceof Error ? err.message : String(err)}`);
        }
    }
    console.log(`\n=== Summary: ${passed}/${total} tests passed ===`);
    if (passed < total) {
        process.exit(1);
    }
}

// Run when invoked directly.
if (process.argv[1]?.endsWith('predictor.smoke.js') ||
    process.argv[1]?.endsWith('predictor.smoke.ts')) {
    runSmoke().catch((err) => {
        console.error('Smoke crashed:', err);
        process.exit(2);
    });
}

export { runSmoke };
//# sourceMappingURL=predictor.smoke.js.map