UNPKG

aiwg

Version:

Cognitive architecture for AI-augmented software development with structured memory, ensemble validation, and closed-loop correction. FAIR-aligned artifacts, 84% cost reduction via human-in-the-loop, standards adopted by 100+ organizations.

493 lines (411 loc) 14.2 kB
/**
 * Tests for EarlyStopping
 *
 * Run with: node tools/ralph-external/early-stopping.test.mjs
 */

import { EarlyStopping } from './early-stopping.mjs';
import { IterationAnalytics } from './iteration-analytics.mjs';
import { existsSync, mkdirSync, rmSync } from 'fs';
import assert from 'assert';

// Scratch directory handed to IterationAnalytics as its storage path.
const TEST_DIR = '.aiwg/ralph/early-stopping-test';

/**
 * Remove the scratch directory (and everything under it) if it exists.
 */
function cleanup() {
  if (existsSync(TEST_DIR)) {
    rmSync(TEST_DIR, { recursive: true, force: true });
  }
}

/**
 * Start from an empty scratch directory: delete any leftover from a
 * previous run, then create it again.
 */
function setup() {
  cleanup();
  mkdirSync(TEST_DIR, { recursive: true });
}

/**
 * Minimal synchronous test runner.
 *
 * Invokes `fn` immediately and prints a check mark on success. On failure
 * it prints the test name and the error message, then rethrows the error
 * so it propagates to the caller instead of being swallowed.
 *
 * @param {string} name - Human-readable test title used in the output.
 * @param {() => void} fn - Synchronous test body; signals failure by throwing.
 * @throws Whatever `fn` throws.
 */
function test(name, fn) {
  try {
    fn();
    console.log(`✓ ${name}`);
  } catch (error) {
    console.error(`✗ ${name}`);
    console.error(` ${error.message}`);
    throw error;
  }
}

// Test: Basic initialization
test('EarlyStopping initializes with defaults', () => {
  const earlyStopping = new EarlyStopping();
  const config = earlyStopping.getConfig();

  assert.strictEqual(config.highConfidenceThreshold, 0.95);
  assert.strictEqual(config.plateauConsecutiveCount, 3);
  assert.strictEqual(config.plateauImprovementThreshold, 0.02);
  assert.strictEqual(config.minQualityThreshold, 70);
  assert.strictEqual(config.requireVerification, true);
});

// Test: Custom configuration
test('EarlyStopping accepts custom config', () => {
  const earlyStopping = new EarlyStopping({
    highConfidenceThreshold: 0.90,
    plateauConsecutiveCount: 2,
    minQualityThreshold: 80,
  });
  const config = earlyStopping.getConfig();

  assert.strictEqual(config.highConfidenceThreshold, 0.90);
  assert.strictEqual(config.plateauConsecutiveCount, 2);
  assert.strictEqual(config.minQualityThreshold, 80);
});

// Test: No stop on first iteration
test('shouldStop() returns false on first iteration', () => {
  const earlyStopping = new EarlyStopping();

  earlyStopping.recordIterationResult(1, {
    quality_score: 75,
    confidence: 0.80,
    verification_status: 'passed',
    quality_delta: 0,
  });

  const decision = earlyStopping.shouldStop(1);

  assert.strictEqual(decision.stop, false);
  assert.strictEqual(decision.trigger, 'none');
});

// Test: High
// confidence + verification triggers stop
test('High confidence + verification passed triggers stop', () => {
  const earlyStopping = new EarlyStopping({
    highConfidenceThreshold: 0.95,
    requireVerification: true,
    minQualityThreshold: 70,
  });

  earlyStopping.recordIterationResult(1, {
    quality_score: 85,
    confidence: 0.96,
    verification_status: 'passed',
    quality_delta: 0,
  });

  const decision = earlyStopping.shouldStop(1);

  assert.strictEqual(decision.stop, true);
  assert.strictEqual(decision.trigger, 'high_confidence');
  assert(decision.reason.includes('High confidence'));
  assert(decision.reason.includes('verification passed'));
  assert.strictEqual(decision.confidence, 0.96);
});

// Test: High confidence but verification failed
test('High confidence without verification does not stop (when required)', () => {
  const earlyStopping = new EarlyStopping({
    highConfidenceThreshold: 0.95,
    requireVerification: true,
  });

  earlyStopping.recordIterationResult(1, {
    quality_score: 85,
    confidence: 0.96,
    verification_status: 'failed',
    quality_delta: 0,
  });

  const decision = earlyStopping.shouldStop(1);

  assert.strictEqual(decision.stop, false);
  // When not stopping, we get the generic continuation message
  assert(decision.reason.includes('Continuing iteration'));
});

// Test: High confidence without verification requirement
test('High confidence stops when verification not required', () => {
  const earlyStopping = new EarlyStopping({
    highConfidenceThreshold: 0.95,
    requireVerification: false,
  });

  earlyStopping.recordIterationResult(1, {
    quality_score: 85,
    confidence: 0.96,
    verification_status: 'failed',
    quality_delta: 0,
  });

  const decision = earlyStopping.shouldStop(1);

  assert.strictEqual(decision.stop, true);
  assert.strictEqual(decision.trigger, 'high_confidence');
});

// Test: Quality below minimum threshold
test('High confidence with low quality does not stop', () => {
  const earlyStopping = new EarlyStopping({
    highConfidenceThreshold: 0.95,
    minQualityThreshold: 70,
    requireVerification: false,
  });

  earlyStopping.recordIterationResult(1, {
    quality_score: 65, // Below threshold
    confidence: 0.96,
    verification_status: 'passed',
    quality_delta: 0,
  });

  const decision = earlyStopping.shouldStop(1);

  assert.strictEqual(decision.stop, false);
});

// Test: Quality plateau detection
test('Quality plateau triggers stop after consecutive low improvements', () => {
  const earlyStopping = new EarlyStopping({
    plateauConsecutiveCount: 3,
    plateauImprovementThreshold: 0.02, // 2%
    enablePlateauDetection: true,
  });

  // Iteration 1: quality 70
  earlyStopping.recordIterationResult(1, {
    quality_score: 70,
    confidence: 0.80,
    verification_status: 'passed',
    quality_delta: 0,
  });

  // Iteration 2: quality 71 (delta: +1, 1.43% improvement - below 2%)
  earlyStopping.recordIterationResult(2, {
    quality_score: 71,
    confidence: 0.82,
    verification_status: 'passed',
    quality_delta: 1,
  });

  // Iteration 3: quality 71.5 (delta: +0.5, 0.7% improvement - below 2%)
  earlyStopping.recordIterationResult(3, {
    quality_score: 71.5,
    confidence: 0.83,
    verification_status: 'passed',
    quality_delta: 0.5,
  });

  // Iteration 4: quality 72 (delta: +0.5, 0.7% improvement - below 2%)
  earlyStopping.recordIterationResult(4, {
    quality_score: 72,
    confidence: 0.84,
    verification_status: 'passed',
    quality_delta: 0.5,
  });

  const decision = earlyStopping.shouldStop(4);

  assert.strictEqual(decision.stop, true);
  assert.strictEqual(decision.trigger, 'quality_plateau');
  assert(decision.reason.includes('Quality plateau detected'));
  assert(decision.reason.includes('3 consecutive iterations'));
});

// Test: Quality plateau not detected with significant improvement
test('Quality plateau not detected when improvements exceed threshold', () => {
  const earlyStopping = new EarlyStopping({
    plateauConsecutiveCount: 3,
    plateauImprovementThreshold: 0.02, // 2%
  });

  // Iteration 1: 70
  earlyStopping.recordIterationResult(1, {
    quality_score: 70,
    confidence: 0.80,
    verification_status: 'passed',
    quality_delta: 0,
  });

  // Iteration 2: 73 (4.3% improvement - above threshold)
  earlyStopping.recordIterationResult(2, {
    quality_score: 73,
    confidence: 0.82,
    verification_status: 'passed',
    quality_delta: 3,
  });

  // Iteration 3: 75 (2.7% improvement - above threshold)
  earlyStopping.recordIterationResult(3, {
    quality_score: 75,
    confidence: 0.84,
    verification_status: 'passed',
    quality_delta: 2,
  });

  const decision = earlyStopping.shouldStop(3);

  assert.strictEqual(decision.stop, false);
  assert(
    decision.reason.includes('Continuing iteration') ||
      decision.reason.includes('Quality still improving')
  );
});

// Test: Integration with IterationAnalytics for diminishing returns
test('Diminishing returns detection via IterationAnalytics', () => {
  setup();

  // The scratch directory created by setup() must not outlive this test.
  // test() rethrows on failure, which would skip the script's final
  // cleanup() call, so the fixture is removed here in a finally block.
  try {
    const analytics = new IterationAnalytics('test-loop', 'Test task', {
      storagePath: TEST_DIR,
      diminishingReturnsThreshold: 0.05,
      consecutiveCountThreshold: 2,
    });

    const earlyStopping = new EarlyStopping(
      { enableDiminishingReturns: true },
      analytics
    );

    // Record iterations in both systems
    const iterations = [
      { iteration_number: 1, quality_score: 70, quality_delta: 0 },
      { iteration_number: 2, quality_score: 72, quality_delta: 2 }, // 2.86% < 5%
      { iteration_number: 3, quality_score: 73, quality_delta: 1 }, // 1.39% < 5%
    ];

    iterations.forEach(it => {
      analytics.recordIteration({
        ...it,
        tokens_used: 1000,
        token_cost_usd: 0.01,
        execution_time_ms: 5000,
        verification_status: 'passed',
        output_snapshot_path: `/path/${it.iteration_number}`,
      });

      earlyStopping.recordIterationResult(it.iteration_number, {
        quality_score: it.quality_score,
        confidence: 0.85,
        verification_status: 'passed',
        quality_delta: it.quality_delta,
      });
    });

    const decision = earlyStopping.shouldStop(3);

    assert.strictEqual(decision.stop, true);
    assert.strictEqual(decision.trigger, 'diminishing_returns');
    assert(decision.reason.includes('Diminishing returns detected'));
  } finally {
    cleanup();
  }
});

// Test: Configure method updates settings
test('configure() updates configuration', () => {
  const earlyStopping = new EarlyStopping();

  earlyStopping.configure({
    highConfidenceThreshold: 0.90,
    minQualityThreshold: 75,
  });

  const config = earlyStopping.getConfig();

  assert.strictEqual(config.highConfidenceThreshold, 0.90);
  assert.strictEqual(config.minQualityThreshold, 75);
  // Other settings preserved
  assert.strictEqual(config.plateauConsecutiveCount, 3);
});

// Test: Stopping reason tracking
test('getStoppingReason() returns reason after stop', () => {
  const earlyStopping = new EarlyStopping({
    requireVerification: false,
  });

  earlyStopping.recordIterationResult(1, {
    quality_score: 85,
    confidence: 0.96,
    verification_status: 'passed',
    quality_delta: 0,
  });

  const decision = earlyStopping.shouldStop(1);
  assert.strictEqual(decision.stop, true);

  const reason = earlyStopping.getStoppingReason();
  assert(reason !== null);
  assert(reason.includes('High confidence'));
});

// Test: Stopping details tracking
test('getStoppingDetails() returns details after stop', () => {
  const earlyStopping = new EarlyStopping({
    requireVerification: false,
  });

  earlyStopping.recordIterationResult(1, {
    quality_score: 85,
    confidence: 0.96,
    verification_status: 'passed',
    quality_delta: 0,
  });

  earlyStopping.shouldStop(1);

  const details = earlyStopping.getStoppingDetails();
  assert(details !== null);
  assert.strictEqual(details.confidence, 0.96);
  assert.strictEqual(details.quality_score, 85);
});

// Test: Reset functionality
test('reset() clears iteration history and stopping state', () => {
  const earlyStopping = new EarlyStopping();

  earlyStopping.recordIterationResult(1, {
    quality_score: 85,
    confidence: 0.96,
    verification_status: 'passed',
    quality_delta: 0,
  });

  earlyStopping.shouldStop(1);

  // Before reset
  assert(earlyStopping.getIterationHistory().length > 0);

  earlyStopping.reset();

  // After reset
  assert.strictEqual(earlyStopping.getIterationHistory().length, 0);
  assert.strictEqual(earlyStopping.getStoppingReason(), null);
  assert.strictEqual(earlyStopping.getStoppingDetails(), null);
});

// Test: Generate summary
// NOTE(review): shouldStop() is never called before generateSummary() here,
// yet a stop trigger is expected; presumably generateSummary() evaluates the
// stop criteria itself. Confirm against early-stopping.mjs.
test('generateSummary() produces comprehensive report', () => {
  const earlyStopping = new EarlyStopping({
    requireVerification: false,
  });

  earlyStopping.recordIterationResult(1, {
    quality_score: 85,
    confidence: 0.96,
    verification_status: 'passed',
    quality_delta: 0,
  });

  const summary = earlyStopping.generateSummary();

  assert.strictEqual(summary.total_iterations, 1);
  assert.strictEqual(summary.stopped_early, true);
  assert.strictEqual(summary.stopping_trigger, 'high_confidence');
  assert(summary.stopping_reason.includes('High confidence'));
  assert.strictEqual(summary.final_quality, 85);
  assert.strictEqual(summary.final_confidence, 0.96);
  assert(summary.config !== undefined);
});

// Test: Summary with no iterations
test('generateSummary() handles empty history', () => {
  const earlyStopping = new EarlyStopping();

  const summary = earlyStopping.generateSummary();

  assert.strictEqual(summary.total_iterations, 0);
  assert.strictEqual(summary.stopped_early, false);
  assert.strictEqual(summary.reason, 'No iterations completed');
});

// Test: Multiple stopping criteria priority
test('High confidence takes priority over plateau detection', () => {
  const earlyStopping = new EarlyStopping({
    highConfidenceThreshold: 0.95,
    requireVerification: false,
    plateauConsecutiveCount: 2,
  });

  // Set up plateau conditions
  earlyStopping.recordIterationResult(1, {
    quality_score: 70,
    confidence: 0.80,
    verification_status: 'passed',
    quality_delta: 0,
  });

  earlyStopping.recordIterationResult(2, {
    quality_score: 71,
    confidence: 0.82,
    verification_status: 'passed',
    quality_delta: 1,
  });

  // Final iteration has high confidence
  earlyStopping.recordIterationResult(3, {
    quality_score: 72,
    confidence: 0.96, // High confidence
    verification_status: 'passed',
    quality_delta: 1,
  });

  const decision = earlyStopping.shouldStop(3);

  assert.strictEqual(decision.stop, true);
  assert.strictEqual(decision.trigger, 'high_confidence'); // Not plateau
});

// Test: Iteration history tracking
test('getIterationHistory() returns all recorded iterations', () => {
  const earlyStopping = new EarlyStopping();

  earlyStopping.recordIterationResult(1, {
    quality_score: 70,
    confidence: 0.80,
    verification_status: 'passed',
    quality_delta: 0,
  });

  earlyStopping.recordIterationResult(2, {
    quality_score: 75,
    confidence: 0.85,
    verification_status: 'passed',
    quality_delta: 5,
  });

  const history = earlyStopping.getIterationHistory();

  assert.strictEqual(history.length, 2);
  assert.strictEqual(history[0].iteration_number, 1);
  assert.strictEqual(history[1].iteration_number, 2);
  assert.strictEqual(history[1].quality_score, 75);
});

// Test: Plateau detection with insufficient iterations
test('Plateau detection requires minimum consecutive iterations', () => {
  const earlyStopping = new EarlyStopping({
    plateauConsecutiveCount: 3,
  });

  earlyStopping.recordIterationResult(1, {
    quality_score: 70,
    confidence: 0.80,
    verification_status: 'passed',
    quality_delta: 0,
  });

  earlyStopping.recordIterationResult(2, {
    quality_score: 71,
    confidence: 0.81,
    verification_status: 'passed',
    quality_delta: 1,
  });

  const decision = earlyStopping.shouldStop(2);

  assert.strictEqual(decision.stop, false);
});

console.log('\n✅ All early stopping tests passed!');

// Final sweep of the scratch directory once every test has passed.
cleanup();