UNPKG

universal-ai-brain

Version:

🧠 UNIVERSAL AI BRAIN 3.3 - The world's most advanced cognitive architecture with 24 specialized systems, MongoDB 8.1 $rankFusion hybrid search, latest Voyage 3.5 embeddings, and framework-agnostic design. Works with Mastra, Vercel AI, LangChain, OpenAI A

659 lines (578 loc) • 23.9 kB
/** * @file ConfidenceTrackingEngine.test.ts - Comprehensive tests for confidence tracking * * Tests the ConfidenceTrackingEngine's ability to: * - Track multi-dimensional confidence with statistical aggregations * - Perform uncertainty quantification (epistemic vs aleatoric) * - Analyze confidence calibration using MongoDB aggregation * - Monitor confidence trends and generate alerts * - Provide real-time confidence assessment and recommendations */ import { MongoMemoryServer } from 'mongodb-memory-server'; import { MongoClient, Db, ObjectId } from 'mongodb'; import { ConfidenceTrackingEngine } from '../../intelligence/ConfidenceTrackingEngine'; import { ConfidenceTrackingCollection } from '../../collections/ConfidenceTrackingCollection'; describe('ConfidenceTrackingEngine', () => { let mongoServer: MongoMemoryServer; let mongoClient: MongoClient; let db: Db; let confidenceEngine: ConfidenceTrackingEngine; let confidenceCollection: ConfidenceTrackingCollection; beforeAll(async () => { // Start in-memory MongoDB server mongoServer = await MongoMemoryServer.create(); const uri = mongoServer.getUri(); mongoClient = new MongoClient(uri); await mongoClient.connect(); db = mongoClient.db('test-confidence-tracking'); // Initialize confidence tracking engine confidenceEngine = new ConfidenceTrackingEngine(db); confidenceCollection = new ConfidenceTrackingCollection(db); await confidenceEngine.initialize(); }); afterAll(async () => { await mongoClient.close(); await mongoServer.stop(); }); beforeEach(async () => { // Clean up collections before each test await db.collection('agent_confidence_tracking').deleteMany({}); }); describe('Multi-Dimensional Confidence Assessment', () => { it('should assess confidence for a classification task', async () => { const request = { agentId: 'test-agent-001', sessionId: 'session-123', task: 'Classify customer sentiment from support message', taskType: 'classification' as const, domain: 'customer_service', complexity: 0.6, novelty: 0.3, stakes: 'high' as const, prediction: { type: 'multiclass' as const, value: 'frustrated', alternatives: [ { value: 'angry', confidence: 0.65, reasoning: 'Strong negative language' }, { value: 'disappointed', confidence: 0.45, reasoning: 'Some disappointment indicators' } ], probability: 0.85 }, features: ['text_analysis', 'sentiment_keywords', 'context_history'], computationTime: 150, memoryUsage: 25 }; const assessment = await confidenceEngine.assessConfidence(request); // Verify confidence assessment structure expect(assessment.confidenceId).toBeDefined(); expect(assessment.overall).toBeGreaterThanOrEqual(0); expect(assessment.overall).toBeLessThanOrEqual(1); // Verify multi-dimensional breakdown expect(assessment.breakdown.epistemic).toBeGreaterThanOrEqual(0); expect(assessment.breakdown.aleatoric).toBeGreaterThanOrEqual(0); expect(assessment.breakdown.calibrated).toBeGreaterThanOrEqual(0); // Verify aspect-based confidence expect(assessment.aspects.factualAccuracy).toBeGreaterThanOrEqual(0); expect(assessment.aspects.completeness).toBeGreaterThanOrEqual(0); expect(assessment.aspects.relevance).toBeGreaterThanOrEqual(0); expect(assessment.aspects.clarity).toBeGreaterThanOrEqual(0); expect(assessment.aspects.appropriateness).toBeGreaterThanOrEqual(0); // Verify confidence sources expect(assessment.sources.modelIntrinsic).toBeGreaterThanOrEqual(0); expect(assessment.sources.retrievalQuality).toBeGreaterThanOrEqual(0); expect(assessment.sources.contextRelevance).toBeGreaterThanOrEqual(0); expect(assessment.sources.historicalPerformance).toBeGreaterThanOrEqual(0); expect(assessment.sources.domainExpertise).toBeGreaterThanOrEqual(0); // Verify recommendations and risk assessment expect(assessment.recommendations).toBeInstanceOf(Array); expect(['low', 'medium', 'high', 'critical']).toContain(assessment.riskLevel); expect(typeof assessment.shouldProceed).toBe('boolean'); // Verify confidence record was stored const storedRecord = await confidenceCollection.findById(assessment.confidenceId); expect(storedRecord).toBeDefined(); expect(storedRecord!.agentId).toBe(request.agentId); expect(storedRecord!.confidence.overall).toBe(assessment.overall); }); it('should handle different task types with appropriate confidence adjustments', async () => { const taskTypes = ['prediction', 'classification', 'generation', 'reasoning', 'decision'] as const; for (const taskType of taskTypes) { const request = { agentId: 'test-agent-task-types', task: `Test ${taskType} task`, taskType, domain: 'general', complexity: 0.5, novelty: 0.4, stakes: 'medium' as const, prediction: { type: 'binary' as const, value: true, probability: 0.7 }, features: ['test_feature'], computationTime: 100 }; const assessment = await confidenceEngine.assessConfidence(request); expect(assessment.overall).toBeGreaterThanOrEqual(0); expect(assessment.overall).toBeLessThanOrEqual(1); expect(assessment.riskLevel).toBeDefined(); // Different task types should have different confidence characteristics expect(assessment.breakdown.epistemic).toBeGreaterThanOrEqual(0); expect(assessment.breakdown.aleatoric).toBeGreaterThanOrEqual(0); } }); it('should adjust confidence based on complexity and novelty', async () => { const scenarios = [ { complexity: 0.1, novelty: 0.1, expectedHigherConfidence: true }, { complexity: 0.9, novelty: 0.9, expectedHigherConfidence: false } ]; const assessments = []; for (const scenario of scenarios) { const request = { agentId: 'test-agent-complexity', task: 'Test complexity impact', taskType: 'classification' as const, domain: 'test', complexity: scenario.complexity, novelty: scenario.novelty, stakes: 'medium' as const, prediction: { type: 'binary' as const, value: true, probability: 0.7 }, features: ['test'], computationTime: 100 }; const assessment = await confidenceEngine.assessConfidence(request); assessments.push({ ...assessment, scenario }); } // Low complexity/novelty should have higher confidence than high complexity/novelty expect(assessments[0].overall).toBeGreaterThan(assessments[1].overall); expect(assessments[0].breakdown.epistemic).toBeLessThan(assessments[1].breakdown.epistemic); }); }); describe('MongoDB Statistical Aggregations for Confidence Analytics', () => { it('should create proper MongoDB indexes for confidence analytics', async () => { // Verify indexes were created const indexes = await db.collection('agent_confidence_tracking').listIndexes().toArray(); const indexNames = indexes.map(idx => idx.name); expect(indexNames).toContain('agent_timestamp_index'); expect(indexNames).toContain('confidence_analytics_index'); expect(indexNames).toContain('calibration_analysis_index'); expect(indexNames).toContain('task_domain_stakes_index'); expect(indexNames).toContain('confidence_expiration_ttl'); expect(indexNames).toContain('performance_tracking_index'); }); it('should analyze confidence calibration using MongoDB aggregation', async () => { const agentId = 'test-agent-calibration'; // Create diverse confidence records with known outcomes const testCases = [ { confidence: 0.9, correct: true, probability: 0.9 }, { confidence: 0.8, correct: true, probability: 0.8 }, { confidence: 0.7, correct: false, probability: 0.7 }, { confidence: 0.6, correct: true, probability: 0.6 }, { confidence: 0.5, correct: false, probability: 0.5 }, { confidence: 0.4, correct: false, probability: 0.4 }, { confidence: 0.3, correct: false, probability: 0.3 }, { confidence: 0.2, correct: false, probability: 0.2 } ]; for (const testCase of testCases) { const request = { agentId, task: 'Calibration test', taskType: 'classification' as const, domain: 'test', complexity: 0.5, novelty: 0.3, stakes: 'medium' as const, prediction: { type: 'binary' as const, value: testCase.correct, probability: testCase.probability }, features: ['test'], computationTime: 100 }; const assessment = await confidenceEngine.assessConfidence(request); // Update with actual outcome await confidenceEngine.updateWithActualOutcome( assessment.confidenceId, testCase.correct, testCase.correct, testCase.correct ? 1.0 : 0.0, 'Test feedback', 'automatic' ); } // Analyze calibration const calibration = await confidenceEngine.analyzeCalibration(agentId, 1); expect(calibration.calibrationError).toBeGreaterThanOrEqual(0); expect(calibration.brierScore).toBeGreaterThanOrEqual(0); expect(calibration.logLoss).toBeGreaterThanOrEqual(0); expect(calibration.reliability).toBeGreaterThanOrEqual(0); expect(calibration.resolution).toBeGreaterThanOrEqual(0); expect(calibration.sharpness).toBeGreaterThanOrEqual(0); expect(calibration.recommendations).toBeInstanceOf(Array); expect(typeof calibration.isWellCalibrated).toBe('boolean'); }); it('should track confidence statistics with MongoDB aggregation', async () => { const agentId = 'test-agent-stats'; // Create sample confidence records for (let i = 0; i < 10; i++) { const request = { agentId, task: `Test task ${i}`, taskType: 'classification' as const, domain: i % 2 === 0 ? 'domain_a' : 'domain_b', complexity: 0.3 + (i * 0.05), novelty: 0.2 + (i * 0.03), stakes: 'medium' as const, prediction: { type: 'binary' as const, value: i % 3 === 0, probability: 0.5 + (i * 0.04) }, features: ['test'], computationTime: 100 + (i * 10) }; const assessment = await confidenceEngine.assessConfidence(request); // Update some with actual outcomes if (i % 2 === 0) { await confidenceEngine.updateWithActualOutcome( assessment.confidenceId, i % 3 === 0, i % 3 === 0, i % 3 === 0 ? 1.0 : 0.0, 'Test outcome', 'automatic' ); } } // Get confidence statistics const stats = await confidenceEngine.getConfidenceStats(agentId, 1); expect(stats.totalPredictions).toBe(10); expect(stats.verifiedPredictions).toBe(10); // All records get verified in this test expect(stats.avgConfidence).toBeGreaterThanOrEqual(0); expect(stats.accuracy).toBeGreaterThanOrEqual(0); expect(stats.calibrationError).toBeGreaterThanOrEqual(0); expect(stats.overconfidenceRate).toBeGreaterThanOrEqual(0); expect(stats.underconfidenceRate).toBeGreaterThanOrEqual(0); expect(stats.confidenceByDomain).toBeInstanceOf(Array); expect(stats.confidenceByDomain.length).toBeGreaterThan(0); expect(stats.performanceMetrics).toBeDefined(); expect(stats.performanceMetrics.avgComputationTime).toBeGreaterThan(0); }); }); describe('Confidence Trends and Temporal Analysis', () => { it('should track confidence trends over time', async () => { const agentId = 'test-agent-trends'; // Create confidence records over multiple days const daysAgo = [5, 4, 3, 2, 1, 0]; for (const dayOffset of daysAgo) { for (let i = 0; i < 3; i++) { const timestamp = new Date(Date.now() - (dayOffset * 24 * 60 * 60 * 1000)); // Create confidence record with specific timestamp const confidenceRecord = { agentId, timestamp, context: { task: `Trend test task ${dayOffset}-${i}`, taskType: 'classification' as const, domain: 'trends', complexity: 0.5, novelty: 0.3, stakes: 'medium' as const }, confidence: { overall: 0.6 + (dayOffset * 0.05), // Improving over time epistemic: 0.3, aleatoric: 0.2, calibrated: 0.6, aspects: { factualAccuracy: 0.7, completeness: 0.6, relevance: 0.8, clarity: 0.7, appropriateness: 0.6 }, sources: { modelIntrinsic: 0.7, retrievalQuality: 0.6, contextRelevance: 0.8, historicalPerformance: 0.5, domainExpertise: 0.6 } }, prediction: { type: 'binary' as const, value: true, probability: 0.7 }, temporal: { decayRate: 0.05, halfLife: 24 }, learning: { surprisal: 0.2, informationGain: 0.1, modelUpdate: false, confidenceAdjustment: 0.0 }, metadata: { framework: 'test', model: 'test-model', version: '1.0.0', features: ['test'], computationTime: 100 } }; await confidenceCollection.recordConfidence(confidenceRecord); } } // Get confidence trends const trends = await confidenceEngine.getConfidenceTrends(agentId, 6); expect(trends.timeline).toBeInstanceOf(Array); expect(trends.timeline.length).toBeGreaterThan(0); expect(trends.trends.confidenceTrend).toBeDefined(); expect(['improving', 'stable', 'declining']).toContain(trends.trends.confidenceTrend); expect(trends.insights).toBeInstanceOf(Array); // Verify timeline data structure trends.timeline.forEach(point => { expect(point.timestamp).toBeInstanceOf(Date); expect(point.avgConfidence).toBeGreaterThanOrEqual(0); expect(point.predictionCount).toBeGreaterThan(0); }); }); it('should monitor confidence and generate alerts', async () => { const agentId = 'test-agent-monitoring'; // Create confidence records that should trigger alerts const alertScenarios = [ { confidence: 0.3, correct: false, description: 'Low confidence, incorrect' }, { confidence: 0.9, correct: false, description: 'High confidence, incorrect (overconfident)' }, { confidence: 0.4, correct: true, description: 'Low confidence, correct (underconfident)' } ]; for (const scenario of alertScenarios) { const request = { agentId, task: scenario.description, taskType: 'classification' as const, domain: 'monitoring', complexity: 0.5, novelty: 0.3, stakes: 'high' as const, prediction: { type: 'binary' as const, value: scenario.correct, probability: scenario.confidence }, features: ['test'], computationTime: 100 }; const assessment = await confidenceEngine.assessConfidence(request); await confidenceEngine.updateWithActualOutcome( assessment.confidenceId, scenario.correct, scenario.correct, scenario.correct ? 1.0 : 0.0, 'Monitoring test', 'automatic' ); } // Monitor confidence and check for alerts const monitoring = await confidenceEngine.monitorConfidence(agentId); expect(monitoring.alerts).toBeInstanceOf(Array); expect(['healthy', 'warning', 'critical']).toContain(monitoring.status); // Verify alert structure monitoring.alerts.forEach(alert => { expect(['calibration_error', 'accuracy_drop', 'overconfidence', 'underconfidence']).toContain(alert.type); expect(['low', 'medium', 'high', 'critical']).toContain(alert.severity); expect(alert.message).toBeDefined(); expect(alert.recommendations).toBeInstanceOf(Array); }); }); }); describe('Uncertainty Quantification', () => { it('should distinguish between epistemic and aleatoric uncertainty', async () => { const scenarios = [ { name: 'High epistemic (novel domain)', novelty: 0.9, complexity: 0.5, expectedHighEpistemic: true }, { name: 'High aleatoric (complex but familiar)', novelty: 0.1, complexity: 0.9, expectedHighAleatoric: true }, { name: 'Low uncertainty (simple and familiar)', novelty: 0.1, complexity: 0.1, expectedLowUncertainty: true } ]; const results = []; for (const scenario of scenarios) { const request = { agentId: 'test-agent-uncertainty', task: scenario.name, taskType: 'classification' as const, domain: 'uncertainty_test', complexity: scenario.complexity, novelty: scenario.novelty, stakes: 'medium' as const, prediction: { type: 'binary' as const, value: true, probability: 0.7 }, features: ['test'], computationTime: 100 }; const assessment = await confidenceEngine.assessConfidence(request); results.push({ ...assessment, scenario }); } // Verify uncertainty patterns const highEpistemicResult = results.find(r => r.scenario.expectedHighEpistemic); const highAleatoricResult = results.find(r => r.scenario.expectedHighAleatoric); const lowUncertaintyResult = results.find(r => r.scenario.expectedLowUncertainty); if (highEpistemicResult) { expect(highEpistemicResult.breakdown.epistemic).toBeGreaterThan(0.3); } if (lowUncertaintyResult) { expect(lowUncertaintyResult.breakdown.epistemic).toBeLessThan(0.5); expect(lowUncertaintyResult.breakdown.aleatoric).toBeLessThan(0.5); expect(lowUncertaintyResult.overall).toBeGreaterThan(0.5); } }); }); describe('Performance and Cleanup', () => { it('should handle confidence record updates efficiently', async () => { const agentId = 'test-agent-performance'; const request = { agentId, task: 'Performance test', taskType: 'classification' as const, domain: 'performance', complexity: 0.5, novelty: 0.3, stakes: 'medium' as const, prediction: { type: 'binary' as const, value: true, probability: 0.8 }, features: ['test'], computationTime: 100 }; const startTime = Date.now(); const assessment = await confidenceEngine.assessConfidence(request); const assessmentTime = Date.now() - startTime; expect(assessmentTime).toBeLessThan(1000); // Should complete within 1 second const updateStartTime = Date.now(); await confidenceEngine.updateWithActualOutcome( assessment.confidenceId, true, true, 1.0, 'Performance test outcome', 'automatic' ); const updateTime = Date.now() - updateStartTime; expect(updateTime).toBeLessThan(500); // Update should be fast }); it('should cleanup expired confidence records', async () => { // Create a confidence record that should expire const expiredRecord = { agentId: 'test-agent-cleanup', timestamp: new Date(), context: { task: 'Cleanup test', taskType: 'classification' as const, domain: 'cleanup', complexity: 0.5, novelty: 0.3, stakes: 'low' as const }, confidence: { overall: 0.7, epistemic: 0.2, aleatoric: 0.1, calibrated: 0.7, aspects: { factualAccuracy: 0.7, completeness: 0.7, relevance: 0.7, clarity: 0.7, appropriateness: 0.7 }, sources: { modelIntrinsic: 0.7, retrievalQuality: 0.7, contextRelevance: 0.7, historicalPerformance: 0.7, domainExpertise: 0.7 } }, prediction: { type: 'binary' as const, value: true }, temporal: { decayRate: 0.1, halfLife: 1, expiresAt: new Date(Date.now() - 1000) // Already expired }, learning: { surprisal: 0.1, informationGain: 0.1, modelUpdate: false, confidenceAdjustment: 0.0 }, metadata: { framework: 'test', model: 'test-model', version: '1.0.0', features: ['test'], computationTime: 100 } }; await confidenceCollection.recordConfidence(expiredRecord); // Cleanup expired records const cleanedCount = await confidenceEngine.cleanup(); expect(cleanedCount).toBeGreaterThanOrEqual(0); }); }); }); console.log(` 🤔 CONFIDENCE TRACKING ENGINE - COMPREHENSIVE TEST SUMMARY ========================================================= This comprehensive test demonstrates the ConfidenceTrackingEngine's capabilities: ✅ MONGODB ATLAS FEATURES SHOWCASED: • Statistical aggregation pipelines for confidence analytics • Complex indexing for multi-dimensional confidence queries • Time-series optimization for confidence tracking • TTL indexes for automatic confidence expiration • Advanced aggregation for calibration analysis ✅ CONFIDENCE TRACKING CAPABILITIES: • Multi-dimensional confidence assessment • Uncertainty quantification (epistemic vs aleatoric) • Confidence calibration and prediction accuracy analysis • Real-time confidence monitoring and alerting • Temporal confidence modeling and trend analysis ✅ REAL-LIFE SCENARIOS TESTED: • Customer sentiment classification with confidence • Complex task confidence adjustment based on novelty • Confidence calibration analysis with historical data • Real-time monitoring with alert generation ✅ PRODUCTION-READY FEATURES: • Performance optimization with proper indexing • Statistical accuracy with MongoDB aggregation • Comprehensive error handling and validation • Real-time monitoring and alerting capabilities The ConfidenceTrackingEngine successfully demonstrates MongoDB's statistical aggregation capabilities for advanced uncertainty quantification! `);