universal-ai-brain

🧠 UNIVERSAL AI BRAIN 3.3 - The world's most advanced cognitive architecture with 24 specialized systems, MongoDB 8.1 $rankFusion hybrid search, latest Voyage 3.5 embeddings, and framework-agnostic design. Works with Mastra, Vercel AI, LangChain, OpenAI A…

760 lines (664 loc) 24.4 kB
/**
 * @file ConfidenceTrackingEngine - Advanced uncertainty quantification for AI agents
 *
 * This engine provides comprehensive confidence tracking and calibration using MongoDB's
 * statistical aggregation capabilities. Demonstrates MongoDB's advanced analytics for
 * multi-dimensional confidence data and uncertainty quantification.
 *
 * Features:
 * - Multi-dimensional confidence tracking with statistical aggregations
 * - Uncertainty quantification (epistemic vs aleatoric)
 * - Confidence calibration and prediction accuracy analysis
 * - Real-time confidence monitoring and alerting
 * - Adaptive confidence adjustment based on historical performance
 * - Temporal confidence modeling with decay functions
 */

import { Db, ObjectId } from 'mongodb';
import { ConfidenceTrackingCollection, ConfidenceRecord } from '../collections/ConfidenceTrackingCollection';

export interface ConfidenceAssessmentRequest {
  agentId: string;
  sessionId?: string;
  task: string;
  taskType: 'prediction' | 'classification' | 'generation' | 'reasoning' | 'decision';
  domain: string;
  complexity: number; // 0-1
  novelty: number; // 0-1
  stakes: 'low' | 'medium' | 'high' | 'critical';
  prediction: {
    type: 'binary' | 'multiclass' | 'regression' | 'ranking' | 'generation';
    value: any;
    alternatives?: Array<{ value: any; confidence: number; reasoning: string }>;
    probability?: number;
  };
  features: string[];
  computationTime: number;
  memoryUsage?: number;
}

export interface ConfidenceAssessment {
  confidenceId: ObjectId;
  overall: number;
  breakdown: {
    epistemic: number;  // Knowledge uncertainty
    aleatoric: number;  // Data uncertainty
    calibrated: number; // Historically adjusted
  };
  aspects: {
    factualAccuracy: number;
    completeness: number;
    relevance: number;
    clarity: number;
    appropriateness: number;
  };
  sources: {
    modelIntrinsic: number;
    retrievalQuality: number;
    contextRelevance: number;
    historicalPerformance: number;
    domainExpertise: number;
  };
  recommendations: string[];
  riskLevel: 'low' | 'medium' | 'high' | 'critical';
  shouldProceed: boolean;
}

export interface ConfidenceCalibration {
  isWellCalibrated: boolean;
  calibrationError: number;
  overconfidenceRate: number;
  underconfidenceRate: number;
  brierScore: number;
  logLoss: number;
  reliability: number;
  resolution: number;
  sharpness: number;
  recommendations: string[];
}

export interface ConfidenceTrends {
  timeline: Array<{
    timestamp: Date;
    avgConfidence: number;
    accuracy: number;
    calibrationError: number;
    predictionCount: number;
  }>;
  trends: {
    confidenceTrend: 'improving' | 'stable' | 'declining';
    accuracyTrend: 'improving' | 'stable' | 'declining';
    calibrationTrend: 'improving' | 'stable' | 'declining';
  };
  insights: string[];
}
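
// For reference, the standard definitions behind ConfidenceCalibration's metric
// fields (not part of this file; a hedged sketch for binary outcomes, where p_i
// is the predicted confidence and o_i is the 0/1 outcome):
//
//   brierScore = mean((p_i - o_i)^2)                       // lower is better
//   logLoss    = -mean(o_i * ln(p_i) + (1 - o_i) * ln(1 - p_i))
//   calibrationError (ECE) = sum over confidence bins b of
//                            (n_b / N) * |accuracy_b - avgConfidence_b|
//
// A perfectly calibrated agent that says "0.8" is right 80% of the time in
// that bin, giving ECE = 0.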
/**
 * ConfidenceTrackingEngine - Advanced uncertainty quantification for AI agents
 *
 * This engine showcases MongoDB's statistical aggregation capabilities:
 * - Complex aggregation pipelines for confidence analytics
 * - Statistical functions for calibration analysis
 * - Time-series optimization for confidence tracking
 * - Multi-dimensional confidence modeling
 * - Real-time confidence monitoring and alerting
 */
export class ConfidenceTrackingEngine {
  private db: Db;
  private confidenceCollection: ConfidenceTrackingCollection;
  private isInitialized: boolean = false;

  // Confidence tracking configuration
  private config = {
    calibration: {
      minSamplesForCalibration: 10,
      calibrationWindow: 30, // days
      targetCalibrationError: 0.1,
      overconfidenceThreshold: 0.8,
      underconfidenceThreshold: 0.5
    },
    confidence: {
      defaultDecayRate: 0.05, // per hour
      defaultHalfLife: 24, // hours
      minConfidenceThreshold: 0.3,
      maxConfidenceThreshold: 0.95
    },
    monitoring: {
      alertThresholds: {
        calibrationError: 0.2,
        accuracyDrop: 0.1,
        overconfidenceRate: 0.3
      }
    }
  };

  constructor(db: Db) {
    this.db = db;
    this.confidenceCollection = new ConfidenceTrackingCollection(db);
  }

  /**
   * Initialize the confidence tracking engine
   */
  async initialize(): Promise<void> {
    if (this.isInitialized) {
      return;
    }

    try {
      // Create collection indexes
      await this.confidenceCollection.createIndexes();
      this.isInitialized = true;
      console.log('🤔 ConfidenceTrackingEngine initialized successfully');
    } catch (error) {
      console.error('❌ Failed to initialize ConfidenceTrackingEngine:', error);
      throw error;
    }
  }

  /**
   * Assess confidence for a prediction or decision
   */
  async assessConfidence(request: ConfidenceAssessmentRequest): Promise<ConfidenceAssessment> {
    if (!this.isInitialized) {
      throw new Error('ConfidenceTrackingEngine must be initialized first');
    }

    // Get historical performance for calibration
    const historicalPerformance = await this.getHistoricalPerformance(
      request.agentId,
      request.domain,
      request.taskType
    );

    // Calculate multi-dimensional confidence
    const confidence = this.calculateMultiDimensionalConfidence(request, historicalPerformance);

    // Create confidence record
    const confidenceRecord: Omit<ConfidenceRecord, '_id' | 'createdAt' | 'updatedAt'> = {
      agentId: request.agentId,
      sessionId: request.sessionId,
      timestamp: new Date(),
      context: {
        task: request.task,
        taskType: request.taskType,
        domain: request.domain,
        complexity: request.complexity,
        novelty: request.novelty,
        stakes: request.stakes
      },
      confidence: {
        overall: confidence.overall,
        epistemic: confidence.breakdown.epistemic,
        aleatoric: confidence.breakdown.aleatoric,
        calibrated: confidence.breakdown.calibrated,
        aspects: confidence.aspects,
        sources: confidence.sources
      },
      prediction: request.prediction,
      temporal: {
        decayRate: this.config.confidence.defaultDecayRate,
        halfLife: this.config.confidence.defaultHalfLife,
        expiresAt: new Date(Date.now() + (this.config.confidence.defaultHalfLife * 60 * 60 * 1000))
      },
      learning: {
        surprisal: 0, // Will be calculated when actual outcome is known
        informationGain: 0,
        modelUpdate: false,
        confidenceAdjustment: 0
      },
      metadata: {
        framework: 'universal-ai-brain',
        model: 'confidence-tracking-v1',
        version: '1.0.0',
        features: request.features,
        computationTime: request.computationTime,
        memoryUsage: request.memoryUsage
      }
    };

    // Store confidence record
    const confidenceId = await this.confidenceCollection.recordConfidence(confidenceRecord);

    // Generate recommendations and risk assessment
    const recommendations = this.generateConfidenceRecommendations(confidence);
    const riskLevel = this.assessRiskLevel(confidence, request.stakes);
    const shouldProceed = this.shouldProceedWithPrediction(confidence, riskLevel);

    return {
      confidenceId,
      overall: confidence.overall,
      breakdown: confidence.breakdown,
      aspects: confidence.aspects,
      sources: confidence.sources,
      recommendations,
      riskLevel,
      shouldProceed
    };
  }

  /**
   * Update confidence record with actual outcome for calibration
   */
  async updateWithActualOutcome(
    confidenceId: ObjectId,
    actualValue: any,
    correct: boolean,
    accuracy?: number,
    feedback?: string,
    verificationSource: 'automatic' | 'human' | 'external_system' = 'automatic'
  ): Promise<void> {
    const actual = {
      value: actualValue,
      correct,
      accuracy,
      feedback,
      verificationTime: new Date(),
      verificationSource
    };

    await this.confidenceCollection.updateWithActual(confidenceId, actual);
  }
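
  // Illustrative assess-then-verify loop (a sketch, not part of this class;
  // the request values below are made up for the example):
  //
  //   const assessment = await engine.assessConfidence({
  //     agentId: 'agent-1', task: 'Classify support ticket priority',
  //     taskType: 'classification', domain: 'support', complexity: 0.4,
  //     novelty: 0.2, stakes: 'medium',
  //     prediction: { type: 'multiclass', value: 'urgent', probability: 0.82 },
  //     features: ['subject', 'body'], computationTime: 120
  //   });
  //   if (assessment.shouldProceed) { /* act on the prediction */ }
  //   // Later, once the true label is known, close the loop for calibration:
  //   await engine.updateWithActualOutcome(assessment.confidenceId, 'urgent', true);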
  /**
   * Analyze confidence calibration for an agent
   */
  async analyzeCalibration(agentId: string, days: number = 30): Promise<ConfidenceCalibration> {
    const calibrationAnalysis = await this.confidenceCollection.analyzeCalibration(agentId, {
      timeRange: {
        start: new Date(Date.now() - (days * 24 * 60 * 60 * 1000)),
        end: new Date()
      }
    });

    const stats = await this.confidenceCollection.getConfidenceStats(agentId, days);
    const isWellCalibrated = calibrationAnalysis.ece < this.config.calibration.targetCalibrationError;
    const recommendations = this.generateCalibrationRecommendations(calibrationAnalysis, stats);

    return {
      isWellCalibrated,
      calibrationError: calibrationAnalysis.ece,
      overconfidenceRate: stats.overconfidenceRate,
      underconfidenceRate: stats.underconfidenceRate,
      brierScore: calibrationAnalysis.brierScore,
      logLoss: calibrationAnalysis.logLoss,
      reliability: calibrationAnalysis.calibrationCurve.length > 0
        ? calibrationAnalysis.calibrationCurve.reduce((sum, point) => sum + point.accuracy, 0) /
          calibrationAnalysis.calibrationCurve.length
        : 0,
      resolution: calibrationAnalysis.mce,
      sharpness: stats.avgConfidence,
      recommendations
    };
  }

  /**
   * Get confidence trends over time
   */
  async getConfidenceTrends(agentId: string, days: number = 30): Promise<ConfidenceTrends> {
    const timeline = await this.confidenceCollection.getConfidenceTrends(agentId, days, 'day');

    // Calculate trends
    const trends = this.calculateTrends(timeline);
    const insights = this.generateTrendInsights(timeline, trends);

    return { timeline, trends, insights };
  }

  /**
   * Get confidence statistics for an agent
   */
  async getConfidenceStats(agentId: string, days: number = 7): Promise<{
    totalPredictions: number;
    verifiedPredictions: number;
    avgConfidence: number;
    accuracy: number;
    calibrationError: number;
    overconfidenceRate: number;
    underconfidenceRate: number;
    confidenceByDomain: Array<{ domain: string; avgConfidence: number; accuracy: number }>;
    performanceMetrics: {
      avgComputationTime: number;
      avgMemoryUsage: number;
      efficiency: number;
    };
  }> {
    const stats = await this.confidenceCollection.getConfidenceStats(agentId, days);

    // Calculate performance metrics
    const performanceMetrics = await this.calculatePerformanceMetrics(agentId, days);

    return { ...stats, performanceMetrics };
  }
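
  // How monitorConfidence() below applies the configured alert thresholds, as
  // a worked sketch (numbers hypothetical): with alertThresholds.calibrationError
  // = 0.2, an agent whose 7-day ECE comes back as 0.25 triggers a
  // 'calibration_error' alert at severity 'high' (it would need to exceed 0.3
  // for 'critical'), and the overall status returned becomes 'warning'.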
  /**
   * Monitor confidence in real-time and generate alerts
   */
  async monitorConfidence(agentId: string): Promise<{
    alerts: Array<{
      type: 'calibration_error' | 'accuracy_drop' | 'overconfidence' | 'underconfidence';
      severity: 'low' | 'medium' | 'high' | 'critical';
      message: string;
      recommendations: string[];
    }>;
    status: 'healthy' | 'warning' | 'critical';
  }> {
    const stats = await this.getConfidenceStats(agentId, 7);
    const calibration = await this.analyzeCalibration(agentId, 7);

    const alerts: Array<{
      type: 'calibration_error' | 'accuracy_drop' | 'overconfidence' | 'underconfidence';
      severity: 'low' | 'medium' | 'high' | 'critical';
      message: string;
      recommendations: string[];
    }> = [];

    // Check calibration error
    if (calibration.calibrationError > this.config.monitoring.alertThresholds.calibrationError) {
      alerts.push({
        type: 'calibration_error' as const,
        severity: calibration.calibrationError > 0.3 ? 'critical' : 'high',
        message: `High calibration error: ${calibration.calibrationError.toFixed(3)}`,
        recommendations: [
          'Review confidence assessment methods',
          'Increase training data',
          'Adjust confidence thresholds'
        ]
      });
    }

    // Check accuracy drop
    if (stats.accuracy < 0.7) {
      alerts.push({
        type: 'accuracy_drop' as const,
        severity: stats.accuracy < 0.5 ? 'critical' : 'medium',
        message: `Low accuracy: ${stats.accuracy.toFixed(3)}`,
        recommendations: [
          'Review model performance',
          'Update training data',
          'Check for domain shift'
        ]
      });
    }

    // Check overconfidence
    if (stats.overconfidenceRate > this.config.monitoring.alertThresholds.overconfidenceRate) {
      alerts.push({
        type: 'overconfidence' as const,
        severity: stats.overconfidenceRate > 0.5 ? 'high' : 'medium',
        message: `High overconfidence rate: ${stats.overconfidenceRate.toFixed(3)}`,
        recommendations: [
          'Lower confidence thresholds',
          'Increase uncertainty estimates',
          'Add confidence penalties'
        ]
      });
    }

    // Determine overall status
    const status = alerts.some(a => a.severity === 'critical') ? 'critical' :
                   alerts.some(a => a.severity === 'high') ? 'warning' : 'healthy';

    return { alerts, status };
  }

  /**
   * Calculate multi-dimensional confidence
   */
  private calculateMultiDimensionalConfidence(
    request: ConfidenceAssessmentRequest,
    historicalPerformance: any
  ): ConfidenceAssessment {
    // Base confidence from prediction probability or heuristics
    const baseConfidence = request.prediction.probability || this.estimateBaseConfidence(request);

    // Epistemic uncertainty (knowledge-based)
    const epistemic = this.calculateEpistemicUncertainty(request, historicalPerformance);

    // Aleatoric uncertainty (data-based)
    const aleatoric = this.calculateAleatoricUncertainty(request);

    // Calibrated confidence based on historical performance
    const calibrated = this.calibrateConfidence(baseConfidence, historicalPerformance);

    // Aspect-based confidence breakdown
    const aspects = {
      factualAccuracy: Math.max(0.1, baseConfidence - (request.novelty * 0.2)),
      completeness: Math.max(0.1, baseConfidence - (request.complexity * 0.15)),
      relevance: Math.max(0.1, baseConfidence - (epistemic * 0.3)),
      clarity: Math.max(0.1, baseConfidence - (aleatoric * 0.2)),
      appropriateness: Math.max(0.1, baseConfidence - (request.novelty * 0.1))
    };

    // Confidence sources
    const sources = {
      modelIntrinsic: baseConfidence,
      retrievalQuality: Math.max(0.1, 0.8 - (request.novelty * 0.3)),
      contextRelevance: Math.max(0.1, 0.9 - (request.complexity * 0.2)),
      historicalPerformance: historicalPerformance.avgAccuracy || 0.5,
      domainExpertise: Math.max(0.1, 0.8 - (request.novelty * 0.4))
    };

    // Overall confidence (weighted combination)
    const overall = Math.min(0.95, Math.max(0.05,
      (baseConfidence * 0.4) +
      (calibrated * 0.3) +
      ((1 - epistemic) * 0.2) +
      ((1 - aleatoric) * 0.1)
    ));

    return {
      confidenceId: new ObjectId(), // Temporary, will be replaced
      overall,
      breakdown: { epistemic, aleatoric, calibrated },
      aspects,
      sources,
      recommendations: [],
      riskLevel: 'medium',
      shouldProceed: true
    };
  }
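
  // Worked example of the weighted combination above (hypothetical numbers):
  // baseConfidence = 0.8, calibrated = 0.7, epistemic = 0.3, aleatoric = 0.2
  //   overall = 0.8*0.4 + 0.7*0.3 + (1 - 0.3)*0.2 + (1 - 0.2)*0.1
  //           = 0.32 + 0.21 + 0.14 + 0.08 = 0.75
  // and the result is clamped to the [0.05, 0.95] range.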
  /**
   * Get historical performance for calibration
   */
  private async getHistoricalPerformance(
    agentId: string,
    domain: string,
    taskType: string
  ): Promise<{
    avgAccuracy: number;
    avgConfidence: number;
    calibrationError: number;
    sampleCount: number;
  }> {
    const stats = await this.confidenceCollection.getConfidenceStats(agentId, 30);
    const domainStats = stats.confidenceByDomain.find(d => d.domain === domain);

    return {
      avgAccuracy: domainStats?.accuracy || stats.accuracy || 0.5,
      avgConfidence: domainStats?.avgConfidence || stats.avgConfidence || 0.5,
      calibrationError: stats.calibrationError || 0.2,
      sampleCount: stats.verifiedPredictions || 0
    };
  }

  /**
   * Estimate base confidence from request characteristics
   */
  private estimateBaseConfidence(request: ConfidenceAssessmentRequest): number {
    // Heuristic-based confidence estimation
    let confidence = 0.7; // Base confidence

    // Adjust for complexity
    confidence -= request.complexity * 0.2;

    // Adjust for novelty
    confidence -= request.novelty * 0.3;

    // Adjust for stakes (higher stakes = more conservative)
    const stakesAdjustment = { low: 0.1, medium: 0.05, high: -0.05, critical: -0.1 };
    confidence += stakesAdjustment[request.stakes];

    // Adjust for task type by averaging with a per-task prior
    const taskTypeConfidence = {
      classification: 0.8,
      prediction: 0.7,
      generation: 0.6,
      reasoning: 0.65,
      decision: 0.75
    };
    confidence = (confidence + taskTypeConfidence[request.taskType]) / 2;

    return Math.min(0.95, Math.max(0.05, confidence));
  }

  /**
   * Calculate epistemic uncertainty (knowledge-based)
   */
  private calculateEpistemicUncertainty(
    request: ConfidenceAssessmentRequest,
    historicalPerformance: any
  ): number {
    let uncertainty = 0.2; // Base epistemic uncertainty

    // Higher uncertainty for novel tasks
    uncertainty += request.novelty * 0.3;

    // Higher uncertainty for complex tasks
    uncertainty += request.complexity * 0.2;

    // Lower uncertainty with more historical data
    if (historicalPerformance.sampleCount > 10) {
      uncertainty -= Math.min(0.2, historicalPerformance.sampleCount / 100);
    }

    return Math.min(0.9, Math.max(0.05, uncertainty));
  }

  /**
   * Calculate aleatoric uncertainty (data-based)
   */
  private calculateAleatoricUncertainty(request: ConfidenceAssessmentRequest): number {
    // Task type sets the base (inherent) uncertainty
    const taskTypeUncertainty = {
      classification: 0.1,
      prediction: 0.2,
      generation: 0.25,
      reasoning: 0.15,
      decision: 0.18
    };
    let uncertainty = taskTypeUncertainty[request.taskType];

    // Domain complexity affects uncertainty
    uncertainty += request.complexity * 0.1;

    return Math.min(0.8, Math.max(0.05, uncertainty));
  }
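
  // Worked example of the heuristic above (hypothetical inputs): complexity
  // 0.5, novelty 0.2, stakes 'high', taskType 'reasoning':
  //   0.7 - 0.5*0.2 - 0.2*0.3 = 0.54; stakes 'high' adds -0.05 -> 0.49;
  //   averaging with the 'reasoning' prior: (0.49 + 0.65) / 2 = 0.57,
  // which already lies inside the [0.05, 0.95] clamp.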
  /**
   * Calibrate confidence based on historical performance
   */
  private calibrateConfidence(baseConfidence: number, historicalPerformance: any): number {
    if (historicalPerformance.sampleCount < this.config.calibration.minSamplesForCalibration) {
      return baseConfidence;
    }

    // Adjust based on historical calibration error: shift down when the agent
    // has historically been overconfident, up when underconfident
    const adjustment = historicalPerformance.calibrationError *
      (historicalPerformance.avgConfidence > historicalPerformance.avgAccuracy ? -1 : 1);

    return Math.min(0.95, Math.max(0.05, baseConfidence + adjustment));
  }

  /**
   * Generate confidence recommendations
   */
  private generateConfidenceRecommendations(confidence: ConfidenceAssessment): string[] {
    const recommendations: string[] = [];

    if (confidence.overall < 0.5) {
      recommendations.push('Consider gathering more information before proceeding');
      recommendations.push('Review input data quality and completeness');
    }

    if (confidence.breakdown.epistemic > 0.6) {
      recommendations.push('High knowledge uncertainty - consider domain expert consultation');
      recommendations.push('Increase training data for this domain');
    }

    if (confidence.breakdown.aleatoric > 0.5) {
      recommendations.push('High data uncertainty - verify input data quality');
      recommendations.push('Consider ensemble methods to reduce uncertainty');
    }

    if (confidence.aspects.factualAccuracy < 0.6) {
      recommendations.push('Low factual accuracy confidence - verify facts before proceeding');
    }

    return recommendations;
  }

  /**
   * Assess risk level based on confidence and stakes
   */
  private assessRiskLevel(
    confidence: ConfidenceAssessment,
    stakes: string
  ): 'low' | 'medium' | 'high' | 'critical' {
    const stakesWeight = { low: 0.25, medium: 0.5, high: 0.75, critical: 1.0 };
    const riskScore = (1 - confidence.overall) * stakesWeight[stakes as keyof typeof stakesWeight];

    if (riskScore > 0.7) return 'critical';
    if (riskScore > 0.5) return 'high';
    if (riskScore > 0.3) return 'medium';
    return 'low';
  }

  /**
   * Determine whether to proceed with a prediction
   */
  private shouldProceedWithPrediction(
    confidence: ConfidenceAssessment,
    riskLevel: string
  ): boolean {
    const thresholds = { low: 0.3, medium: 0.5, high: 0.7, critical: 0.9 };
    return confidence.overall >= thresholds[riskLevel as keyof typeof thresholds];
  }

  /**
   * Generate calibration recommendations
   */
  private generateCalibrationRecommendations(
    calibrationAnalysis: any,
    stats: any
  ): string[] {
    const recommendations: string[] = [];

    if (calibrationAnalysis.ece > 0.15) {
      recommendations.push('Improve confidence calibration through temperature scaling');
      recommendations.push('Collect more diverse training data');
    }

    if (stats.overconfidenceRate > 0.3) {
      recommendations.push('Reduce overconfidence by lowering confidence thresholds');
      recommendations.push('Implement confidence penalties in training');
    }

    if (stats.underconfidenceRate > 0.3) {
      recommendations.push('Address underconfidence by improving model training');
      recommendations.push('Review uncertainty estimation methods');
    }

    return recommendations;
  }
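
  // Worked example of the risk gating above (hypothetical numbers): with
  // overall confidence 0.4 and stakes 'critical',
  //   riskScore = (1 - 0.4) * 1.0 = 0.6 -> riskLevel 'high',
  // and since the 'high' proceed threshold is 0.7 > 0.4,
  // shouldProceedWithPrediction() returns false.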
  /**
   * Calculate trends from timeline data
   */
  private calculateTrends(timeline: any[]): {
    confidenceTrend: 'improving' | 'stable' | 'declining';
    accuracyTrend: 'improving' | 'stable' | 'declining';
    calibrationTrend: 'improving' | 'stable' | 'declining';
  } {
    if (timeline.length < 3) {
      return { confidenceTrend: 'stable', accuracyTrend: 'stable', calibrationTrend: 'stable' };
    }

    const recent = timeline.slice(-7);
    const older = timeline.slice(0, Math.max(1, timeline.length - 7));

    const recentAvgConfidence = recent.reduce((sum, t) => sum + t.avgConfidence, 0) / recent.length;
    const olderAvgConfidence = older.reduce((sum, t) => sum + t.avgConfidence, 0) / older.length;

    const recentAvgAccuracy = recent.reduce((sum, t) => sum + (t.accuracy || 0), 0) / recent.length;
    const olderAvgAccuracy = older.reduce((sum, t) => sum + (t.accuracy || 0), 0) / older.length;

    const recentAvgCalibration = recent.reduce((sum, t) => sum + (t.calibrationError || 0), 0) / recent.length;
    const olderAvgCalibration = older.reduce((sum, t) => sum + (t.calibrationError || 0), 0) / older.length;

    return {
      confidenceTrend: this.determineTrend(recentAvgConfidence, olderAvgConfidence),
      accuracyTrend: this.determineTrend(recentAvgAccuracy, olderAvgAccuracy),
      calibrationTrend: this.determineTrend(olderAvgCalibration, recentAvgCalibration) // Lower is better for calibration error
    };
  }

  /**
   * Determine trend direction
   */
  private determineTrend(recent: number, older: number): 'improving' | 'stable' | 'declining' {
    if (older === 0) return 'stable'; // Guard against division by zero
    const change = (recent - older) / older;
    if (change > 0.05) return 'improving';
    if (change < -0.05) return 'declining';
    return 'stable';
  }

  /**
   * Generate trend insights
   */
  private generateTrendInsights(timeline: any[], trends: any): string[] {
    const insights: string[] = [];

    if (trends.confidenceTrend === 'declining') {
      insights.push('Confidence levels are declining - review model performance');
    }

    if (trends.accuracyTrend === 'improving') {
      insights.push('Accuracy is improving - current approach is working well');
    }

    if (trends.calibrationTrend === 'declining') {
      insights.push('Calibration is getting worse - review confidence assessment methods');
    }

    if (timeline.length > 0) {
      const avgPredictions = timeline.reduce((sum, t) => sum + t.predictionCount, 0) / timeline.length;
      if (avgPredictions > 100) {
        insights.push('High prediction volume - ensure quality is maintained');
      }
    }

    return insights;
  }

  /**
   * Calculate performance metrics
   */
  private async calculatePerformanceMetrics(agentId: string, days: number): Promise<{
    avgComputationTime: number;
    avgMemoryUsage: number;
    efficiency: number;
  }> {
    // This would typically use aggregation to calculate performance metrics
    // For now, return simulated metrics
    return {
      avgComputationTime: 150, // ms
      avgMemoryUsage: 25, // MB
      efficiency: 0.85 // Efficiency score
    };
  }

  /**
   * Cleanup expired confidence records
   */
  async cleanup(): Promise<number> {
    return await this.confidenceCollection.cleanupExpiredRecords();
  }
}
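
// Illustrative end-to-end usage of this module (a sketch, not part of the
// file; assumes a reachable MongoDB deployment, and the import path and
// database name 'ai_brain' are hypothetical):
//
//   import { MongoClient } from 'mongodb';
//   import { ConfidenceTrackingEngine } from './ConfidenceTrackingEngine';
//
//   const client = new MongoClient('mongodb://localhost:27017');
//   await client.connect();
//   const engine = new ConfidenceTrackingEngine(client.db('ai_brain'));
//   await engine.initialize();
//
//   const { alerts, status } = await engine.monitorConfidence('agent-1');
//   if (status !== 'healthy') {
//     for (const alert of alerts) console.warn(alert.message);
//   }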