universal-ai-brain
Version:
🧠 UNIVERSAL AI BRAIN 3.3 - The world's most advanced cognitive architecture with 24 specialized systems, MongoDB 8.1 $rankFusion hybrid search, latest Voyage 3.5 embeddings, and framework-agnostic design. Works with Mastra, Vercel AI, LangChain, OpenAI A
578 lines (502 loc) • 16.4 kB
text/typescript
/**
* @file RealTimeMonitor - Enterprise real-time trace monitoring service
*
* This service provides comprehensive real-time monitoring of agent traces
* using MongoDB Change Streams, with alerting, metrics aggregation, and
* dashboard-ready data streams.
*
* Features:
* - Real-time trace monitoring and alerting
* - Performance metrics aggregation
* - Error pattern detection
* - Cost monitoring and budget alerts
* - Framework-specific monitoring
* - WebSocket/SSE ready data streams
*/
import { EventEmitter } from 'events';
import { TracingEngine } from './TracingEngine';
import { ChangeStreamManager, TraceChangeEvent, ChangeStreamSubscriber } from './ChangeStreamManager';
import { TracingCollection, AgentTrace } from '../collections/TracingCollection';
import { TracingUtils } from './index';
/**
 * An alert produced by RealTimeMonitor.
 *
 * Alerts are handed to every subscriber's `onAlert` callback, emitted as the
 * `alert` event and, when `MonitoringConfig.alertWebhook` is set, POSTed to
 * that URL as a JSON body.
 */
export interface MonitoringAlert {
/**
 * Alert identifier. Trace-level alerts raised by RealTimeMonitor use
 * `<kind>-<traceId>` (`perf-`, `error-`, `cost-`, `health-`); change-stream
 * failures use `system-error-<epoch milliseconds>`.
 */
id: string;
/** Category of the condition that triggered the alert. */
type: 'performance' | 'error' | 'cost' | 'health' | 'custom';
/** How urgent the alert is, from least (`low`) to most (`critical`) severe. */
severity: 'low' | 'medium' | 'high' | 'critical';
/** Short human-readable headline, e.g. "Slow Response Time". */
title: string;
/** Longer human-readable description of what was detected. */
message: string;
/** Id of the trace that triggered the alert; omitted for system-level alerts. */
traceId?: string;
/** Stringified agent id of the triggering trace; omitted for system-level alerts. */
agentId?: string;
/** Session id of the triggering trace; omitted for system-level alerts. */
sessionId?: string;
/** Time at which the alert object was created. */
timestamp: Date;
/**
 * Alert-specific details. RealTimeMonitor stores, for example, `{ duration }`
 * on performance alerts and `{ cost }` on cost alerts.
 */
metadata?: Record<string, any>;
}
/**
 * A point-in-time metrics snapshot produced by RealTimeMonitor on every
 * metrics-collection tick. Counter-based values cover the period since the
 * monitor last reset its in-memory counters.
 */
export interface PerformanceMetrics {
/** Time at which the snapshot was collected. */
timestamp: Date;
/** Number of traces reported as active by the tracing engine at collection time. */
activeTraces: number;
/**
 * Mean `duration` of the currently active traces, or 0 when there are none.
 * Presumably milliseconds, matching the trace duration unit (TODO confirm).
 */
avgResponseTime: number;
/**
 * Accumulated error count divided by completed-trace count, times 100. The
 * divisor is treated as 1 when no trace has completed yet.
 */
errorRate: number;
/** Sum of the `cost.totalCost` of completed traces since the last counter reset. */
totalCost: number;
/** Tokens of completed traces since the last counter reset, divided by the elapsed seconds. */
tokensPerSecond: number;
/** Completed traces started within the last hour, counted per `framework.frameworkName`. */
frameworkBreakdown: Record<string, number>;
/** Completed traces started within the last hour, counted per `operation.type`. */
operationBreakdown: Record<string, number>;
}
/**
 * Settings for RealTimeMonitor. The monitor's constructor accepts a partial
 * config and fills in the defaults quoted on each field below.
 */
export interface MonitoringConfig {
// Performance thresholds
/** A trace whose total duration exceeds this raises a performance alert. Default: 30000. */
maxResponseTime: number; // milliseconds
/** Highest acceptable error rate. Default: 5. */
maxErrorRate: number; // percentage (0-100)
/**
 * Hourly cost budget. The per-trace cost alert fires when a single trace
 * costs more than `maxCostPerHour / 3600`. Default: 10.
 */
maxCostPerHour: number; // USD
// Alerting
/** When false, no trace-level alert checks are performed. Default: true. */
enableAlerts: boolean;
/** Optional URL that receives every alert as an HTTP POST with a JSON body. */
alertWebhook?: string;
// Metrics collection
/** Delay between two metrics snapshots. Default: 60000. */
metricsInterval: number; // milliseconds
/** Snapshots older than this are dropped from the in-memory history. Default: 24. */
retentionPeriod: number; // hours
// Filtering
/** Presumably restricts monitoring to these agent ids — TODO confirm where this is applied. */
monitoredAgents?: string[];
/** Presumably restricts monitoring to these framework names — TODO confirm where this is applied. */
monitoredFrameworks?: string[];
}
/**
 * A consumer registered through `RealTimeMonitor.subscribe`. Every callback is
 * optional; an exception thrown by a callback is caught and logged by the
 * monitor, so it does not affect other subscribers.
 */
export interface MonitoringSubscriber {
/** Unique key of the subscriber. Subscribing again with the same id replaces the earlier entry. */
id: string;
/** Called for every alert the monitor sends. */
onAlert?: (alert: MonitoringAlert) => void;
/** Called with each metrics snapshot the monitor collects. */
onMetrics?: (metrics: PerformanceMetrics) => void;
/** Called for every trace change event the monitor receives from the change stream. */
onTraceUpdate?: (event: TraceChangeEvent) => void;
}
/** Trace document carried by a change event, with the "no document" case removed. */
type MonitoredTrace = NonNullable<TraceChangeEvent['fullDocument']>;

/**
 * RealTimeMonitor - Enterprise real-time trace monitoring service
 *
 * Subscribes to the ChangeStreamManager and, for every trace change event:
 * updates in-memory counters, evaluates the per-trace alert rules, forwards the
 * event to subscribers and re-emits it as `traceChange`. On a timer it also
 * builds a PerformanceMetrics snapshot, keeps it in an in-memory history and
 * publishes it to subscribers and as the `metrics` event. Alerts are published
 * to subscribers, as the `alert` event and, optionally, to a webhook.
 *
 * NOTE(review): `config.maxErrorRate`, `config.monitoredAgents` and
 * `config.monitoredFrameworks` are accepted but never consulted by this class.
 * NOTE(review): per-trace alert ids are `<kind>-<traceId>`, so several change
 * events for the same trace produce alerts with the same id.
 */
export class RealTimeMonitor extends EventEmitter {
  /** Id under which this monitor registers itself with the ChangeStreamManager. */
  private static readonly CHANGE_STREAM_SUBSCRIBER_ID = 'real-time-monitor';

  private tracingEngine: TracingEngine;
  private changeStreamManager: ChangeStreamManager;
  private tracingCollection: TracingCollection;
  private config: MonitoringConfig;
  private subscribers: Map<string, MonitoringSubscriber> = new Map();
  private metricsHistory: PerformanceMetrics[] = [];
  private metricsInterval?: NodeJS.Timeout;
  private isActive: boolean = false;

  // Real-time counters, accumulated since `lastResetTime`
  private currentMetrics = RealTimeMonitor.createCounters();

  /**
   * @param tracingEngine Source of the currently active traces.
   * @param changeStreamManager Change stream the monitor starts, subscribes to and stops.
   * @param tracingCollection Collection queried for the hourly breakdowns.
   * @param config Overrides for the default thresholds and intervals. Keys whose
   *   value is `undefined` are ignored so they cannot erase a default.
   */
  constructor(
    tracingEngine: TracingEngine,
    changeStreamManager: ChangeStreamManager,
    tracingCollection: TracingCollection,
    config: Partial<MonitoringConfig> = {}
  ) {
    super();
    this.tracingEngine = tracingEngine;
    this.changeStreamManager = changeStreamManager;
    this.tracingCollection = tracingCollection;

    const defaults: MonitoringConfig = {
      maxResponseTime: 30000, // 30 seconds
      maxErrorRate: 5, // 5%
      maxCostPerHour: 10, // $10/hour
      enableAlerts: true,
      metricsInterval: 60000, // 1 minute
      retentionPeriod: 24 // 24 hours
    };

    // A plain spread would let `{ metricsInterval: undefined }` replace the
    // default; setInterval then falls back to a ~1ms delay and thresholds
    // silently stop matching. Strip explicit undefined values first.
    const overrides: Partial<MonitoringConfig> = { ...config };
    (Object.keys(overrides) as Array<keyof MonitoringConfig>).forEach((key) => {
      if (overrides[key] === undefined) {
        delete overrides[key];
      }
    });

    this.config = { ...defaults, ...overrides };
  }

  /** Build a zeroed counter set stamped with the current time. */
  private static createCounters() {
    return {
      tracesStarted: 0,
      tracesCompleted: 0,
      totalErrors: 0,
      totalCost: 0,
      totalTokens: 0,
      lastResetTime: new Date()
    };
  }

  /** Return `value` when it is a finite number, otherwise 0 (keeps counters free of NaN). */
  private static finiteOrZero(value: unknown): number {
    return typeof value === 'number' && Number.isFinite(value) ? value : 0;
  }

  /**
   * Start real-time monitoring: starts the change stream, subscribes to it and
   * begins periodic metrics collection. A no-op (with a warning) when already active.
   *
   * @throws Re-throws whatever the change stream start/subscribe raised.
   */
  async start(): Promise<void> {
    if (this.isActive) {
      console.warn('⚠️ RealTimeMonitor is already active');
      return;
    }
    try {
      // Start change stream monitoring
      await this.changeStreamManager.start();

      // Subscribe to change events
      this.changeStreamManager.subscribe({
        id: RealTimeMonitor.CHANGE_STREAM_SUBSCRIBER_ID,
        onTraceChange: (event) => this.handleTraceChange(event),
        onError: (error) => this.handleChangeStreamError(error)
      });

      // Start metrics collection
      this.startMetricsCollection();
      this.isActive = true;
      console.log('📊 RealTimeMonitor started successfully');
    } catch (error) {
      console.error('❌ Failed to start RealTimeMonitor:', error);
      throw error;
    }
  }

  /**
   * Stop real-time monitoring. A no-op when the monitor is not active.
   *
   * The metrics timer is cleared before the change stream is shut down, so a
   * failing shutdown cannot leave the timer running.
   *
   * @throws Propagates any error raised while unsubscribing from or stopping the change stream.
   */
  async stop(): Promise<void> {
    if (!this.isActive) {
      return;
    }
    this.isActive = false;

    // Stop metrics collection first: it must not survive a failed shutdown below
    this.stopMetricsCollection();

    // Stop change stream
    this.changeStreamManager.unsubscribe(RealTimeMonitor.CHANGE_STREAM_SUBSCRIBER_ID);
    await this.changeStreamManager.stop();

    console.log('📊 RealTimeMonitor stopped');
  }

  /**
   * Subscribe to monitoring events. A subscriber with an id that is already
   * registered replaces the previous one.
   */
  subscribe(subscriber: MonitoringSubscriber): void {
    this.subscribers.set(subscriber.id, subscriber);
    console.log(`📡 Monitoring subscriber ${subscriber.id} added`);
  }

  /** Unsubscribe from monitoring events. Unknown ids are ignored. */
  unsubscribe(subscriberId: string): void {
    this.subscribers.delete(subscriberId);
    console.log(`📡 Monitoring subscriber ${subscriberId} removed`);
  }

  /**
   * Handle a trace change event from the change stream.
   *
   * Each stage runs in isolation: a failure while updating counters or
   * evaluating alerts is logged but must not stop the event from reaching
   * subscribers and `traceChange` listeners.
   */
  private handleTraceChange(event: TraceChangeEvent): void {
    const stages: Array<() => void> = [
      () => this.updateCounters(event),
      () => this.checkAlerts(event),
      () => this.notifySubscribers('traceUpdate', event),
      () => {
        this.emit('traceChange', event);
      }
    ];

    for (const runStage of stages) {
      try {
        runStage();
      } catch (error) {
        console.error('❌ Error handling trace change:', error);
      }
    }
  }

  /**
   * Update real-time counters based on a trace change.
   *
   * Inserts count as started traces; updates whose document is `completed`
   * count as completed traces and contribute cost, tokens and errors. Missing
   * or non-numeric fields contribute 0 rather than throwing or producing NaN.
   */
  private updateCounters(event: TraceChangeEvent): void {
    const trace = event.fullDocument;
    if (!trace) return;

    if (event.operationType === 'insert') {
      this.currentMetrics.tracesStarted++;
      return;
    }

    if (event.operationType === 'update' && trace.status === 'completed') {
      this.currentMetrics.tracesCompleted++;
      this.currentMetrics.totalCost += RealTimeMonitor.finiteOrZero(trace.cost?.totalCost);
      this.currentMetrics.totalTokens += RealTimeMonitor.finiteOrZero(trace.tokensUsed?.totalTokens);
      this.currentMetrics.totalErrors += (trace.errors ?? []).length;
    }
  }

  /**
   * Evaluate the per-trace alert rules (performance, error, cost, health) and
   * send every alert that fires, in that order.
   *
   * Rules are evaluated and sent one by one so that a rule that throws (for
   * instance on a partially populated document) cannot discard the alerts of
   * the rules that ran before it.
   */
  private checkAlerts(event: TraceChangeEvent): void {
    const trace = event.fullDocument;
    if (!this.config.enableAlerts || !trace) return;

    const rules: Array<(doc: MonitoredTrace) => MonitoringAlert | undefined> = [
      (doc) => this.buildPerformanceAlert(doc),
      (doc) => this.buildErrorAlert(doc),
      (doc) => this.buildCostAlert(doc),
      (doc) => this.buildHealthAlert(doc)
    ];

    for (const evaluate of rules) {
      try {
        const alert = evaluate(trace);
        if (alert) {
          this.sendAlert(alert);
        }
      } catch (error) {
        console.error('❌ Error handling trace change:', error);
      }
    }
  }

  /** Identification fields shared by every trace-level alert. */
  private traceAlertContext(trace: MonitoredTrace): Pick<MonitoringAlert, 'traceId' | 'agentId' | 'sessionId' | 'timestamp'> {
    return {
      traceId: trace.traceId,
      // agentId may be absent on partial documents; do not let that abort the alert
      agentId: trace.agentId?.toString(),
      sessionId: trace.sessionId,
      timestamp: new Date()
    };
  }

  /** Performance rule: total duration above `maxResponseTime` (critical above twice the limit). */
  private buildPerformanceAlert(trace: MonitoredTrace): MonitoringAlert | undefined {
    const duration = trace.performance?.totalDuration;
    if (typeof duration !== 'number' || !(duration > this.config.maxResponseTime)) {
      return undefined;
    }
    return {
      id: `perf-${trace.traceId}`,
      type: 'performance',
      severity: duration > this.config.maxResponseTime * 2 ? 'critical' : 'high',
      title: 'Slow Response Time',
      message: `Trace ${trace.traceId} took ${TracingUtils.formatDuration(duration)}`,
      ...this.traceAlertContext(trace),
      metadata: { duration }
    };
  }

  /** Error rule: at least one error that is not marked recoverable. */
  private buildErrorAlert(trace: MonitoredTrace): MonitoringAlert | undefined {
    const criticalErrors = (trace.errors ?? []).filter((err) => !err.recoverable);
    if (criticalErrors.length === 0) {
      return undefined;
    }
    return {
      id: `error-${trace.traceId}`,
      type: 'error',
      severity: 'critical',
      title: 'Critical Error Detected',
      message: `Trace ${trace.traceId} has ${criticalErrors.length} critical error(s)`,
      ...this.traceAlertContext(trace),
      metadata: { errors: criticalErrors }
    };
  }

  /** Cost rule: a single trace costing more than `maxCostPerHour / 3600`. */
  private buildCostAlert(trace: MonitoredTrace): MonitoringAlert | undefined {
    const cost = trace.cost?.totalCost;
    // Per-operation cost threshold
    if (typeof cost !== 'number' || !(cost > this.config.maxCostPerHour / 3600)) {
      return undefined;
    }
    return {
      id: `cost-${trace.traceId}`,
      type: 'cost',
      severity: 'medium',
      title: 'High Cost Operation',
      message: `Trace ${trace.traceId} cost ${TracingUtils.formatCost(cost)}`,
      ...this.traceAlertContext(trace),
      metadata: { cost }
    };
  }

  /** Health rule: health score below 50 (high severity below 25). */
  private buildHealthAlert(trace: MonitoredTrace): MonitoringAlert | undefined {
    const health = TracingUtils.calculateTraceHealth(trace);
    if (!(health.score < 50)) {
      return undefined;
    }
    return {
      id: `health-${trace.traceId}`,
      type: 'health',
      severity: health.score < 25 ? 'high' : 'medium',
      title: 'Poor Trace Health',
      message: `Trace ${trace.traceId} has health score of ${health.score}%`,
      ...this.traceAlertContext(trace),
      metadata: { healthScore: health.score, factors: health.factors }
    };
  }

  /** Send an alert to subscribers, `alert` listeners and, if configured, the webhook. */
  private sendAlert(alert: MonitoringAlert): void {
    console.log(`🚨 Alert: ${alert.severity.toUpperCase()} - ${alert.title}`);

    // Notify subscribers
    this.notifySubscribers('alert', alert);

    // Emit event
    this.emit('alert', alert);

    // Send to webhook if configured (fire-and-forget; delivery must not block monitoring)
    if (this.config.alertWebhook) {
      void this.sendWebhookAlert(alert).catch((error) => {
        console.error('❌ Failed to send webhook alert:', error);
      });
    }
  }

  /**
   * POST the alert as JSON to the configured webhook. Delivery failures,
   * including non-2xx responses, are logged and never rejected to the caller.
   */
  private async sendWebhookAlert(alert: MonitoringAlert): Promise<void> {
    const webhookUrl = this.config.alertWebhook;
    if (!webhookUrl) return;

    try {
      const response = await fetch(webhookUrl, {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json'
        },
        body: JSON.stringify(alert)
      });
      if (!response.ok) {
        throw new Error(`Webhook responded with status ${response.status}`);
      }
    } catch (error) {
      console.error('❌ Webhook alert failed:', error);
    }
  }

  /**
   * Start periodic metrics collection and collect one snapshot immediately.
   * Any timer left over from an earlier call is cleared first so it cannot leak.
   */
  private startMetricsCollection(): void {
    this.stopMetricsCollection();

    this.metricsInterval = setInterval(() => {
      this.collectMetrics().catch((error) => {
        console.error('❌ Error collecting metrics:', error);
      });
    }, this.config.metricsInterval);

    // Collect initial metrics
    this.collectMetrics().catch((error) => {
      console.error('❌ Error collecting initial metrics:', error);
    });
  }

  /** Clear the metrics timer, if any, and forget its handle. */
  private stopMetricsCollection(): void {
    if (this.metricsInterval !== undefined) {
      clearInterval(this.metricsInterval);
      this.metricsInterval = undefined;
    }
  }

  /**
   * Collect one metrics snapshot, store it, prune history older than the
   * retention period, publish it, and reset the counters once they are more
   * than an hour old. Errors are logged, never thrown.
   */
  private async collectMetrics(): Promise<void> {
    try {
      const now = new Date();
      const activeTraces = this.tracingEngine.getActiveTraces();

      // Error rate. NOTE(review): totalErrors counts individual errors, not
      // failed traces, so this value can exceed 100.
      const completedOrOne = this.currentMetrics.tracesCompleted || 1;
      const errorRate = (this.currentMetrics.totalErrors / completedOrOne) * 100;

      // Average response time (from active traces)
      const avgResponseTime = activeTraces.length > 0
        ? activeTraces.reduce((sum, trace) => sum + trace.duration, 0) / activeTraces.length
        : 0;

      // Tokens per second since the last counter reset
      const timeSinceReset = (now.getTime() - this.currentMetrics.lastResetTime.getTime()) / 1000;
      const tokensPerSecond = timeSinceReset > 0 ? this.currentMetrics.totalTokens / timeSinceReset : 0;

      // The two breakdown queries are independent, so run them concurrently
      const [frameworkBreakdown, operationBreakdown] = await Promise.all([
        this.getFrameworkBreakdown(),
        this.getOperationBreakdown()
      ]);

      const metrics: PerformanceMetrics = {
        timestamp: now,
        activeTraces: activeTraces.length,
        avgResponseTime,
        errorRate,
        totalCost: this.currentMetrics.totalCost,
        tokensPerSecond,
        frameworkBreakdown,
        operationBreakdown
      };

      // Store metrics
      this.metricsHistory.push(metrics);

      // Clean up old metrics
      const cutoffTime = new Date(now.getTime() - this.config.retentionPeriod * 60 * 60 * 1000);
      this.metricsHistory = this.metricsHistory.filter((m) => m.timestamp > cutoffTime);

      // Notify subscribers
      this.notifySubscribers('metrics', metrics);

      // Emit event
      this.emit('metrics', metrics);

      // Reset counters periodically (every hour)
      if (timeSinceReset > 3600) {
        this.resetCounters();
      }
    } catch (error) {
      console.error('❌ Error collecting metrics:', error);
    }
  }

  /**
   * Count traces that started within the last hour and are `completed`,
   * grouped by the given document field path.
   */
  private async countRecentCompletedBy(fieldPath: string): Promise<Record<string, number>> {
    const oneHourAgo = new Date(Date.now() - 60 * 60 * 1000);
    const pipeline = [
      {
        $match: {
          startTime: { $gte: oneHourAgo },
          status: 'completed'
        }
      },
      {
        $group: {
          _id: `$${fieldPath}`,
          count: { $sum: 1 }
        }
      }
    ];

    const results = await this.tracingCollection.aggregate(pipeline);
    const breakdown: Record<string, number> = {};
    results.forEach((result) => {
      breakdown[result._id] = result.count;
    });
    return breakdown;
  }

  /** Get framework usage breakdown (grouped by `framework.frameworkName`). */
  private async getFrameworkBreakdown(): Promise<Record<string, number>> {
    return this.countRecentCompletedBy('framework.frameworkName');
  }

  /** Get operation type breakdown (grouped by `operation.type`). */
  private async getOperationBreakdown(): Promise<Record<string, number>> {
    return this.countRecentCompletedBy('operation.type');
  }

  /** Reset performance counters and restart their measurement window. */
  private resetCounters(): void {
    this.currentMetrics = RealTimeMonitor.createCounters();
  }

  /** Handle change stream errors by logging them and raising a critical system alert. */
  private handleChangeStreamError(error: Error): void {
    console.error('❌ Change stream error in monitor:', error);
    const alert: MonitoringAlert = {
      id: `system-error-${Date.now()}`,
      type: 'error',
      severity: 'critical',
      title: 'Monitoring System Error',
      message: `Change stream error: ${error.message}`,
      timestamp: new Date(),
      metadata: { error: error.message }
    };
    this.sendAlert(alert);
  }

  /**
   * Notify all subscribers of an event. A subscriber whose callback throws is
   * logged and skipped; the remaining subscribers are still notified.
   */
  private notifySubscribers(eventType: 'alert', data: MonitoringAlert): void;
  private notifySubscribers(eventType: 'metrics', data: PerformanceMetrics): void;
  private notifySubscribers(eventType: 'traceUpdate', data: TraceChangeEvent): void;
  private notifySubscribers(eventType: 'alert' | 'metrics' | 'traceUpdate', data: unknown): void {
    for (const subscriber of this.subscribers.values()) {
      try {
        // The overloads above guarantee that `data` matches `eventType`
        switch (eventType) {
          case 'alert':
            subscriber.onAlert?.(data as MonitoringAlert);
            break;
          case 'metrics':
            subscriber.onMetrics?.(data as PerformanceMetrics);
            break;
          case 'traceUpdate':
            subscriber.onTraceUpdate?.(data as TraceChangeEvent);
            break;
        }
      } catch (error) {
        console.error(`❌ Error notifying subscriber ${subscriber.id}:`, error);
      }
    }
  }

  /**
   * Get current monitoring statistics.
   *
   * @returns Activity flag, subscriber and history sizes, a copy of the
   *   current counters, and the change stream manager's own statistics.
   */
  getStats(): {
    isActive: boolean;
    subscriberCount: number;
    metricsHistoryCount: number;
    currentMetrics: typeof this.currentMetrics;
    changeStreamStats: any;
  } {
    return {
      isActive: this.isActive,
      subscriberCount: this.subscribers.size,
      metricsHistoryCount: this.metricsHistory.length,
      currentMetrics: { ...this.currentMetrics },
      changeStreamStats: this.changeStreamManager.getStats()
    };
  }

  /**
   * Get recent metrics history.
   *
   * @param hours Size of the look-back window in hours (default 1).
   * @returns The stored snapshots whose timestamp is newer than `now - hours`.
   */
  getMetricsHistory(hours: number = 1): PerformanceMetrics[] {
    const cutoffTime = new Date(Date.now() - hours * 60 * 60 * 1000);
    return this.metricsHistory.filter((m) => m.timestamp > cutoffTime);
  }
}