jay-code
Version:
Streamlined AI CLI orchestration engine with mathematical rigor and enterprise-grade reliability
1,123 lines (954 loc) • 32.4 kB
text/typescript
/**
* Real-time monitoring system for swarm operations
*/
import { EventEmitter } from 'node:events';
import type { ILogger } from '../core/logger.js';
import type { IEventBus } from '../core/event-bus.js';
import type {
SystemMetrics,
Alert,
AlertLevel,
AlertType,
MonitoringConfig,
AgentMetrics,
SwarmMetrics,
AgentId,
} from '../swarm/types.js';
import type { DistributedMemorySystem } from '../memory/distributed-memory.js';
export interface MonitorConfig {
updateInterval: number;
retentionPeriod: number;
alertingEnabled: boolean;
alertThresholds: AlertThresholds;
metricsEnabled: boolean;
tracingEnabled: boolean;
dashboardEnabled: boolean;
exportEnabled: boolean;
exportFormat: 'json' | 'csv' | 'prometheus';
debugMode: boolean;
}
export interface AlertThresholds {
cpu: { warning: number; critical: number };
memory: { warning: number; critical: number };
disk: { warning: number; critical: number };
errorRate: { warning: number; critical: number };
responseTime: { warning: number; critical: number };
queueDepth: { warning: number; critical: number };
agentHealth: { warning: number; critical: number };
swarmUtilization: { warning: number; critical: number };
}
export interface MetricPoint {
timestamp: Date;
value: number;
tags: Record<string, string>;
metadata?: Record<string, any>;
}
export interface TimeSeries {
name: string;
points: MetricPoint[];
aggregations: {
min: number;
max: number;
avg: number;
sum: number;
count: number;
};
lastUpdated: Date;
}
export interface MonitoringDashboard {
title: string;
panels: DashboardPanel[];
refreshInterval: number;
timeRange: { start: Date; end: Date };
filters: Record<string, any>;
}
export interface DashboardPanel {
id: string;
title: string;
type: 'line' | 'bar' | 'gauge' | 'table' | 'heatmap' | 'stat';
metrics: string[];
config: {
width: number;
height: number;
position: { x: number; y: number };
visualization: Record<string, any>;
};
}
export interface AlertRule {
id: string;
name: string;
enabled: boolean;
metric: string;
condition: 'gt' | 'lt' | 'eq' | 'gte' | 'lte';
threshold: number;
duration: number; // How long condition must persist
severity: AlertLevel;
tags: Record<string, string>;
actions: AlertAction[];
suppressions: AlertSuppression[];
}
export interface AlertAction {
type: 'email' | 'webhook' | 'slack' | 'log' | 'auto-scale' | 'restart';
config: Record<string, any>;
enabled: boolean;
}
export interface AlertSuppression {
condition: string;
duration: number;
reason: string;
}
export interface HealthCheck {
name: string;
type: 'http' | 'tcp' | 'custom';
target: string;
interval: number;
timeout: number;
retries: number;
expectedResponse?: any;
customCheck?: () => Promise<boolean>;
}
/**
* Comprehensive real-time monitoring and alerting system
*/
export class RealTimeMonitor extends EventEmitter {
private logger: ILogger;
private eventBus: IEventBus;
private memory: DistributedMemorySystem;
private config: MonitorConfig;
// Metrics storage
private timeSeries = new Map<string, TimeSeries>();
private activeAlerts = new Map<string, Alert>();
private alertHistory: Alert[] = [];
// Monitoring state
private monitoringInterval?: NodeJS.Timeout;
private healthCheckInterval?: NodeJS.Timeout;
private alertRules = new Map<string, AlertRule>();
private healthChecks = new Map<string, HealthCheck>();
// System state tracking
private systemMetrics: SystemMetrics;
private agentMetrics = new Map<string, AgentMetrics>();
private swarmMetrics: SwarmMetrics;
// Dashboards
private dashboards = new Map<string, MonitoringDashboard>();
// Performance tracking
private lastMetricsUpdate = new Date();
private metricsBuffer: MetricPoint[] = [];
private alertProcessor?: NodeJS.Timeout;
constructor(
config: Partial<MonitorConfig>,
logger: ILogger,
eventBus: IEventBus,
memory: DistributedMemorySystem,
) {
super();
this.logger = logger;
this.eventBus = eventBus;
this.memory = memory;
this.config = {
updateInterval: 5000,
retentionPeriod: 86400000, // 24 hours
alertingEnabled: true,
alertThresholds: {
cpu: { warning: 70, critical: 90 },
memory: { warning: 80, critical: 95 },
disk: { warning: 85, critical: 95 },
errorRate: { warning: 5, critical: 10 },
responseTime: { warning: 5000, critical: 10000 },
queueDepth: { warning: 10, critical: 20 },
agentHealth: { warning: 0.7, critical: 0.5 },
swarmUtilization: { warning: 0.8, critical: 0.95 },
},
metricsEnabled: true,
tracingEnabled: true,
dashboardEnabled: true,
exportEnabled: false,
exportFormat: 'json',
debugMode: false,
...config,
};
this.systemMetrics = this.initializeSystemMetrics();
this.swarmMetrics = this.initializeSwarmMetrics();
this.setupEventHandlers();
this.initializeDefaultAlertRules();
this.initializeDefaultDashboards();
}
private setupEventHandlers(): void {
// Agent events
this.eventBus.on('agent:metrics-update', (data) => {
this.updateAgentMetrics(data.agentId, data.metrics);
});
this.eventBus.on('agent:status-changed', (data) => {
this.recordMetric('agent.status.change', 1, {
agentId: data.agentId,
from: data.from,
to: data.to,
});
});
// Task events
this.eventBus.on('task:started', (data) => {
this.recordMetric('task.started', 1, { taskId: data.taskId, agentId: data.agentId });
});
this.eventBus.on('task:completed', (data) => {
this.recordMetric('task.completed', 1, { taskId: data.taskId });
this.recordMetric('task.duration', data.duration, { taskId: data.taskId });
});
this.eventBus.on('task:failed', (data) => {
this.recordMetric('task.failed', 1, { taskId: data.taskId, error: data.error });
});
// System events
this.eventBus.on('system:resource-update', (data) => {
this.updateSystemMetrics(data);
});
this.eventBus.on('swarm:metrics-update', (data) => {
this.updateSwarmMetrics(data.metrics);
});
// Error events
this.eventBus.on('error', (data) => {
this.handleError(data);
});
}
async initialize(): Promise<void> {
this.logger.info('Initializing real-time monitor', {
updateInterval: this.config.updateInterval,
alerting: this.config.alertingEnabled,
dashboard: this.config.dashboardEnabled,
});
// Start monitoring loops
this.startMetricsCollection();
this.startHealthChecks();
if (this.config.alertingEnabled) {
this.startAlertProcessing();
}
// Initialize default health checks
this.initializeHealthChecks();
this.emit('monitor:initialized');
}
async shutdown(): Promise<void> {
this.logger.info('Shutting down real-time monitor');
// Stop all intervals
if (this.monitoringInterval) clearInterval(this.monitoringInterval);
if (this.healthCheckInterval) clearInterval(this.healthCheckInterval);
if (this.alertProcessor) clearInterval(this.alertProcessor);
// Flush any remaining metrics
await this.flushMetrics();
this.emit('monitor:shutdown');
}
// === METRICS COLLECTION ===
private startMetricsCollection(): void {
this.monitoringInterval = setInterval(() => {
this.collectSystemMetrics();
this.processMetricsBuffer();
this.cleanupOldMetrics();
}, this.config.updateInterval);
this.logger.info('Started metrics collection', {
interval: this.config.updateInterval,
});
}
private async collectSystemMetrics(): Promise<void> {
try {
// Update system metrics
this.systemMetrics = {
...this.systemMetrics,
timestamp: new Date(),
cpuUsage: await this.getCpuUsage(),
memoryUsage: await this.getMemoryUsage(),
diskUsage: await this.getDiskUsage(),
networkUsage: await this.getNetworkUsage(),
};
// Record as time series
this.recordMetric('system.cpu', this.systemMetrics.cpuUsage);
this.recordMetric('system.memory', this.systemMetrics.memoryUsage);
this.recordMetric('system.disk', this.systemMetrics.diskUsage);
this.recordMetric('system.network', this.systemMetrics.networkUsage);
// Update swarm-level metrics
await this.updateSwarmLevelMetrics();
} catch (error) {
this.logger.error('Failed to collect system metrics', error);
}
}
private async updateSwarmLevelMetrics(): Promise<void> {
const agents = Array.from(this.agentMetrics.values());
this.swarmMetrics = {
...this.swarmMetrics,
agentUtilization: this.calculateAgentUtilization(agents),
throughput: this.calculateSwarmThroughput(agents),
latency: this.calculateAverageLatency(agents),
efficiency: this.calculateSwarmEfficiency(agents),
reliability: this.calculateSwarmReliability(agents),
averageQuality: this.calculateAverageQuality(agents),
};
// Record swarm metrics
this.recordMetric('swarm.utilization', this.swarmMetrics.agentUtilization);
this.recordMetric('swarm.throughput', this.swarmMetrics.throughput);
this.recordMetric('swarm.latency', this.swarmMetrics.latency);
this.recordMetric('swarm.efficiency', this.swarmMetrics.efficiency);
this.recordMetric('swarm.reliability', this.swarmMetrics.reliability);
}
recordMetric(name: string, value: number, tags: Record<string, string> = {}): void {
const point: MetricPoint = {
timestamp: new Date(),
value,
tags,
};
// Add to buffer for batch processing
this.metricsBuffer.push({ ...point, tags: { ...tags, metric: name } });
// Immediate processing for critical metrics
if (this.isCriticalMetric(name)) {
this.processMetricPoint(name, point);
}
}
private processMetricsBuffer(): void {
if (this.metricsBuffer.length === 0) return;
// Group by metric name
const metricGroups = new Map<string, MetricPoint[]>();
for (const point of this.metricsBuffer) {
const metricName = point.tags.metric || 'unknown';
const group = metricGroups.get(metricName) || [];
group.push(point);
metricGroups.set(metricName, group);
}
// Process each metric group
for (const [metricName, points] of metricGroups) {
for (const point of points) {
this.processMetricPoint(metricName, point);
}
}
// Clear buffer
this.metricsBuffer = [];
}
private processMetricPoint(metricName: string, point: MetricPoint): void {
let series = this.timeSeries.get(metricName);
if (!series) {
series = {
name: metricName,
points: [],
aggregations: {
min: point.value,
max: point.value,
avg: point.value,
sum: point.value,
count: 1,
},
lastUpdated: point.timestamp,
};
this.timeSeries.set(metricName, series);
}
// Add point
series.points.push(point);
series.lastUpdated = point.timestamp;
// Update aggregations
series.aggregations.count++;
series.aggregations.sum += point.value;
series.aggregations.avg = series.aggregations.sum / series.aggregations.count;
series.aggregations.min = Math.min(series.aggregations.min, point.value);
series.aggregations.max = Math.max(series.aggregations.max, point.value);
// Trigger alert checking for this metric
if (this.config.alertingEnabled) {
this.checkAlertsForMetric(metricName, point);
}
}
// === ALERTING ===
private startAlertProcessing(): void {
this.alertProcessor = setInterval(() => {
this.processAlerts();
}, 1000); // Process alerts every second
this.logger.info('Started alert processing');
}
private processAlerts(): void {
const now = new Date();
// Check for alert resolution
for (const [alertId, alert] of this.activeAlerts) {
if (!alert.resolved) {
const rule = this.alertRules.get(alert.context.ruleId);
if (rule && this.isAlertResolved(rule, alert)) {
this.resolveAlert(alertId, 'condition_resolved');
}
}
}
// Clean up old resolved alerts
this.cleanupResolvedAlerts();
}
private checkAlertsForMetric(metricName: string, point: MetricPoint): void {
for (const [ruleId, rule] of this.alertRules) {
if (rule.enabled && rule.metric === metricName) {
this.evaluateAlertRule(rule, point);
}
}
}
private evaluateAlertRule(rule: AlertRule, point: MetricPoint): void {
const conditionMet = this.evaluateCondition(rule.condition, point.value, rule.threshold);
if (conditionMet) {
// Check if we already have an active alert for this rule
const existingAlert = Array.from(this.activeAlerts.values()).find(
(alert) => alert.context.ruleId === rule.id && !alert.resolved,
);
if (!existingAlert) {
this.createAlert(rule, point);
}
}
}
private createAlert(rule: AlertRule, triggeringPoint: MetricPoint): void {
const alertId = `alert-${Date.now()}-${Math.random().toString(36).slice(2)}`;
const alert: Alert = {
id: alertId,
timestamp: new Date(),
level: rule.severity,
type: this.getAlertTypeFromMetric(rule.metric),
message: `${rule.name}: ${rule.metric} ${rule.condition} ${rule.threshold} (current: ${triggeringPoint.value})`,
source: 'real-time-monitor',
context: {
ruleId: rule.id,
metric: rule.metric,
value: triggeringPoint.value,
threshold: rule.threshold,
tags: { ...rule.tags, ...triggeringPoint.tags },
},
acknowledged: false,
resolved: false,
escalationLevel: 0,
};
this.activeAlerts.set(alertId, alert);
this.alertHistory.push(alert);
this.logger.warn('Alert created', {
alertId,
rule: rule.name,
metric: rule.metric,
value: triggeringPoint.value,
threshold: rule.threshold,
});
this.emit('alert:created', { alert });
// Execute alert actions
this.executeAlertActions(rule, alert);
}
private executeAlertActions(rule: AlertRule, alert: Alert): void {
for (const action of rule.actions) {
if (!action.enabled) continue;
try {
switch (action.type) {
case 'log':
this.logger.warn(`ALERT: ${alert.message}`, alert.context);
break;
case 'email':
this.sendEmailAlert(alert, action.config);
break;
case 'webhook':
this.sendWebhookAlert(alert, action.config);
break;
case 'auto-scale':
this.triggerAutoScale(alert, action.config);
break;
case 'restart':
this.triggerRestart(alert, action.config);
break;
default:
this.logger.warn('Unknown alert action type', { type: action.type });
}
} catch (error) {
this.logger.error('Failed to execute alert action', {
alertId: alert.id,
actionType: action.type,
error,
});
}
}
}
private resolveAlert(alertId: string, reason: string): void {
const alert = this.activeAlerts.get(alertId);
if (!alert) return;
alert.resolved = true;
alert.context.resolutionReason = reason;
alert.context.resolvedAt = new Date();
this.logger.info('Alert resolved', { alertId, reason });
this.emit('alert:resolved', { alert, reason });
}
// === HEALTH CHECKS ===
private startHealthChecks(): void {
this.healthCheckInterval = setInterval(() => {
this.performHealthChecks();
}, 30000); // Every 30 seconds
this.logger.info('Started health checks');
}
private async performHealthChecks(): Promise<void> {
const checks = Array.from(this.healthChecks.values());
const promises = checks.map((check) => this.executeHealthCheck(check));
await Promise.allSettled(promises);
}
private async executeHealthCheck(check: HealthCheck): Promise<void> {
try {
let isHealthy = false;
switch (check.type) {
case 'http':
isHealthy = await this.performHttpHealthCheck(check);
break;
case 'tcp':
isHealthy = await this.performTcpHealthCheck(check);
break;
case 'custom':
if (check.customCheck) {
isHealthy = await check.customCheck();
}
break;
}
this.recordMetric(`healthcheck.${check.name}`, isHealthy ? 1 : 0, {
type: check.type,
target: check.target,
});
} catch (error) {
this.logger.error('Health check failed', { check: check.name, error });
this.recordMetric(`healthcheck.${check.name}`, 0, {
type: check.type,
target: check.target,
error: error instanceof Error ? error.message : String(error),
});
}
}
// === DASHBOARD MANAGEMENT ===
createDashboard(title: string, panels: DashboardPanel[]): string {
const dashboardId = `dashboard-${Date.now()}`;
const dashboard: MonitoringDashboard = {
title,
panels,
refreshInterval: 30000,
timeRange: {
start: new Date(Date.now() - 3600000), // Last hour
end: new Date(),
},
filters: {},
};
this.dashboards.set(dashboardId, dashboard);
this.emit('dashboard:created', { dashboardId, dashboard });
return dashboardId;
}
getDashboardData(dashboardId: string): any {
const dashboard = this.dashboards.get(dashboardId);
if (!dashboard) return null;
const data: any = {
dashboard,
panels: [],
};
for (const panel of dashboard.panels) {
const panelData = {
id: panel.id,
title: panel.title,
type: panel.type,
data: this.getPanelData(panel, dashboard.timeRange),
};
data.panels.push(panelData);
}
return data;
}
private getPanelData(panel: DashboardPanel, timeRange: { start: Date; end: Date }): any {
const data: any = {};
for (const metricName of panel.metrics) {
const series = this.timeSeries.get(metricName);
if (series) {
// Filter points by time range
const filteredPoints = series.points.filter(
(point) => point.timestamp >= timeRange.start && point.timestamp <= timeRange.end,
);
data[metricName] = {
points: filteredPoints,
aggregations: this.calculateAggregations(filteredPoints),
};
}
}
return data;
}
// === UTILITY METHODS ===
private async getCpuUsage(): Promise<number> {
// Placeholder - would use actual system APIs
return Math.random() * 100;
}
private async getMemoryUsage(): Promise<number> {
// Placeholder - would use actual system APIs
return Math.random() * 100;
}
private async getDiskUsage(): Promise<number> {
// Placeholder - would use actual system APIs
return Math.random() * 100;
}
private async getNetworkUsage(): Promise<number> {
// Placeholder - would use actual system APIs
return Math.random() * 1024 * 1024; // bytes
}
private updateAgentMetrics(agentId: string, metrics: AgentMetrics): void {
this.agentMetrics.set(agentId, metrics);
// Record individual agent metrics
this.recordMetric('agent.cpu', metrics.cpuUsage, { agentId });
this.recordMetric('agent.memory', metrics.memoryUsage, { agentId });
this.recordMetric('agent.tasks.completed', metrics.tasksCompleted, { agentId });
this.recordMetric('agent.tasks.failed', metrics.tasksFailed, { agentId });
this.recordMetric('agent.response.time', metrics.responseTime, { agentId });
}
private updateSystemMetrics(data: Partial<SystemMetrics>): void {
this.systemMetrics = { ...this.systemMetrics, ...data };
}
private updateSwarmMetrics(metrics: SwarmMetrics): void {
this.swarmMetrics = { ...this.swarmMetrics, ...metrics };
}
private handleError(data: any): void {
this.recordMetric('error.count', 1, {
type: data.type || 'unknown',
source: data.source || 'unknown',
});
// Create critical alert for errors
if (data.severity === 'critical') {
const alertId = `error-alert-${Date.now()}`;
const alert: Alert = {
id: alertId,
timestamp: new Date(),
level: 'critical',
type: 'system',
message: `Critical error: ${data.message}`,
source: data.source || 'unknown',
context: data,
acknowledged: false,
resolved: false,
escalationLevel: 0,
};
this.activeAlerts.set(alertId, alert);
this.emit('alert:created', { alert });
}
}
private isCriticalMetric(name: string): boolean {
const criticalMetrics = [
'system.cpu',
'system.memory',
'system.disk',
'agent.health',
'task.failed',
'error.count',
];
return criticalMetrics.includes(name);
}
private evaluateCondition(condition: string, value: number, threshold: number): boolean {
switch (condition) {
case 'gt':
return value > threshold;
case 'gte':
return value >= threshold;
case 'lt':
return value < threshold;
case 'lte':
return value <= threshold;
case 'eq':
return value === threshold;
default:
return false;
}
}
private isAlertResolved(rule: AlertRule, alert: Alert): boolean {
// Get recent metric values
const series = this.timeSeries.get(rule.metric);
if (!series || series.points.length === 0) return false;
// Check if condition is no longer met
const recentPoints = series.points.slice(-5); // Last 5 points
const allResolved = recentPoints.every(
(point) => !this.evaluateCondition(rule.condition, point.value, rule.threshold),
);
return allResolved;
}
private getAlertTypeFromMetric(metric: string): AlertType {
if (metric.includes('system')) return 'system';
if (metric.includes('agent')) return 'agent';
if (metric.includes('task')) return 'task';
if (metric.includes('swarm')) return 'swarm';
if (metric.includes('performance')) return 'performance';
if (metric.includes('resource')) return 'resource';
return 'custom';
}
private calculateAgentUtilization(agents: AgentMetrics[]): number {
if (agents.length === 0) return 0;
const totalUtilization = agents.reduce((sum, agent) => sum + agent.cpuUsage, 0);
return totalUtilization / agents.length;
}
private calculateSwarmThroughput(agents: AgentMetrics[]): number {
return agents.reduce((sum, agent) => sum + (agent.tasksCompleted || 0), 0);
}
private calculateAverageLatency(agents: AgentMetrics[]): number {
if (agents.length === 0) return 0;
const totalLatency = agents.reduce((sum, agent) => sum + agent.responseTime, 0);
return totalLatency / agents.length;
}
private calculateSwarmEfficiency(agents: AgentMetrics[]): number {
if (agents.length === 0) return 0;
const totalTasks = agents.reduce(
(sum, agent) => sum + (agent.tasksCompleted || 0) + (agent.tasksFailed || 0),
0,
);
const completedTasks = agents.reduce((sum, agent) => sum + (agent.tasksCompleted || 0), 0);
return totalTasks > 0 ? completedTasks / totalTasks : 1;
}
private calculateSwarmReliability(agents: AgentMetrics[]): number {
if (agents.length === 0) return 1;
const totalReliability = agents.reduce((sum, agent) => sum + (agent.successRate || 1), 0);
return totalReliability / agents.length;
}
private calculateAverageQuality(agents: AgentMetrics[]): number {
if (agents.length === 0) return 0.8;
const totalQuality = agents.reduce((sum, agent) => sum + (agent.codeQuality || 0.8), 0);
return totalQuality / agents.length;
}
private calculateAggregations(points: MetricPoint[]): any {
if (points.length === 0) {
return { min: 0, max: 0, avg: 0, sum: 0, count: 0 };
}
const values = points.map((p) => p.value);
return {
min: Math.min(...values),
max: Math.max(...values),
avg: values.reduce((sum, val) => sum + val, 0) / values.length,
sum: values.reduce((sum, val) => sum + val, 0),
count: values.length,
};
}
private cleanupOldMetrics(): void {
const cutoff = new Date(Date.now() - this.config.retentionPeriod);
for (const [name, series] of this.timeSeries) {
series.points = series.points.filter((point) => point.timestamp > cutoff);
if (series.points.length === 0) {
this.timeSeries.delete(name);
}
}
}
private cleanupResolvedAlerts(): void {
const cutoff = new Date(Date.now() - 86400000); // 24 hours
// Remove old resolved alerts from active alerts
for (const [alertId, alert] of this.activeAlerts) {
if (alert.resolved && alert.timestamp < cutoff) {
this.activeAlerts.delete(alertId);
}
}
// Trim alert history
this.alertHistory = this.alertHistory.filter((alert) => alert.timestamp > cutoff).slice(-1000); // Keep last 1000 alerts max
}
private async flushMetrics(): Promise<void> {
if (this.metricsBuffer.length > 0) {
this.processMetricsBuffer();
}
// Persist metrics to memory if enabled
if (this.config.exportEnabled) {
await this.exportMetrics();
}
}
private async exportMetrics(): Promise<void> {
try {
const exportData = {
timestamp: new Date(),
timeSeries: Array.from(this.timeSeries.entries()),
systemMetrics: this.systemMetrics,
swarmMetrics: this.swarmMetrics,
activeAlerts: Array.from(this.activeAlerts.values()),
};
await this.memory.store('monitoring:export', exportData, {
type: 'monitoring-export',
partition: 'metrics',
});
} catch (error) {
this.logger.error('Failed to export metrics', error);
}
}
private initializeDefaultAlertRules(): void {
const rules: AlertRule[] = [
{
id: 'cpu-warning',
name: 'High CPU Usage',
enabled: true,
metric: 'system.cpu',
condition: 'gt',
threshold: this.config.alertThresholds.cpu.warning,
duration: 60000,
severity: 'warning',
tags: { category: 'system' },
actions: [{ type: 'log', config: {}, enabled: true }],
suppressions: [],
},
{
id: 'memory-critical',
name: 'Critical Memory Usage',
enabled: true,
metric: 'system.memory',
condition: 'gt',
threshold: this.config.alertThresholds.memory.critical,
duration: 30000,
severity: 'critical',
tags: { category: 'system' },
actions: [
{ type: 'log', config: {}, enabled: true },
{ type: 'auto-scale', config: { action: 'scale-down' }, enabled: true },
],
suppressions: [],
},
];
rules.forEach((rule) => this.alertRules.set(rule.id, rule));
}
private initializeDefaultDashboards(): void {
const systemDashboard = this.createDashboard('System Overview', [
{
id: 'cpu-panel',
title: 'CPU Usage',
type: 'line',
metrics: ['system.cpu'],
config: {
width: 6,
height: 4,
position: { x: 0, y: 0 },
visualization: { yAxis: { max: 100 } },
},
},
{
id: 'memory-panel',
title: 'Memory Usage',
type: 'gauge',
metrics: ['system.memory'],
config: {
width: 6,
height: 4,
position: { x: 6, y: 0 },
visualization: { max: 100, threshold: [70, 90] },
},
},
]);
this.logger.info('Created default dashboard', { dashboardId: systemDashboard });
}
private initializeHealthChecks(): void {
// Add default health checks
this.healthChecks.set('system', {
name: 'system',
type: 'custom',
target: 'local',
interval: 30000,
timeout: 5000,
retries: 3,
customCheck: async () => {
// Basic system health check
return this.systemMetrics.cpuUsage < 95 && this.systemMetrics.memoryUsage < 95;
},
});
}
private async performHttpHealthCheck(check: HealthCheck): Promise<boolean> {
// Placeholder for HTTP health check
return true;
}
private async performTcpHealthCheck(check: HealthCheck): Promise<boolean> {
// Placeholder for TCP health check
return true;
}
private async sendEmailAlert(alert: Alert, config: any): Promise<void> {
// Placeholder for email alert
this.logger.info('Email alert sent', { alertId: alert.id });
}
private async sendWebhookAlert(alert: Alert, config: any): Promise<void> {
// Placeholder for webhook alert
this.logger.info('Webhook alert sent', { alertId: alert.id });
}
private async triggerAutoScale(alert: Alert, config: any): Promise<void> {
// Placeholder for auto-scaling
this.logger.info('Auto-scale triggered', { alertId: alert.id, action: config.action });
this.eventBus.emit('autoscale:triggered', { alert, config });
}
private async triggerRestart(alert: Alert, config: any): Promise<void> {
// Placeholder for restart action
this.logger.info('Restart triggered', { alertId: alert.id });
this.eventBus.emit('restart:triggered', { alert, config });
}
private initializeSystemMetrics(): SystemMetrics {
return {
timestamp: new Date(),
cpuUsage: 0,
memoryUsage: 0,
diskUsage: 0,
networkUsage: 0,
activeSwarms: 0,
totalAgents: 0,
activeAgents: 0,
totalTasks: 0,
runningTasks: 0,
throughput: 0,
latency: 0,
errorRate: 0,
successRate: 100,
resourceUtilization: {},
queueLengths: {},
};
}
private initializeSwarmMetrics(): SwarmMetrics {
return {
throughput: 0,
latency: 0,
efficiency: 1.0,
reliability: 1.0,
averageQuality: 0.8,
defectRate: 0,
reworkRate: 0,
resourceUtilization: {},
costEfficiency: 1.0,
agentUtilization: 0,
agentSatisfaction: 0.8,
collaborationEffectiveness: 0.8,
scheduleVariance: 0,
deadlineAdherence: 1.0,
};
}
// === PUBLIC API ===
getSystemMetrics(): SystemMetrics {
return { ...this.systemMetrics };
}
getSwarmMetrics(): SwarmMetrics {
return { ...this.swarmMetrics };
}
getActiveAlerts(): Alert[] {
return Array.from(this.activeAlerts.values());
}
getAlertHistory(limit: number = 100): Alert[] {
return this.alertHistory.slice(-limit);
}
getTimeSeries(metricName: string): TimeSeries | undefined {
return this.timeSeries.get(metricName);
}
getAllTimeSeries(): TimeSeries[] {
return Array.from(this.timeSeries.values());
}
acknowledgeAlert(alertId: string, acknowledgedBy: string): void {
const alert = this.activeAlerts.get(alertId);
if (alert) {
alert.acknowledged = true;
alert.assignedTo = acknowledgedBy;
this.emit('alert:acknowledged', { alert, acknowledgedBy });
}
}
createAlertRule(rule: Omit<AlertRule, 'id'>): string {
const ruleId = `rule-${Date.now()}`;
this.alertRules.set(ruleId, { ...rule, id: ruleId });
return ruleId;
}
updateAlertRule(ruleId: string, updates: Partial<AlertRule>): void {
const rule = this.alertRules.get(ruleId);
if (rule) {
this.alertRules.set(ruleId, { ...rule, ...updates });
}
}
deleteAlertRule(ruleId: string): void {
this.alertRules.delete(ruleId);
}
getAlertRules(): AlertRule[] {
return Array.from(this.alertRules.values());
}
getMonitoringStatistics(): {
metricsCount: number;
activeAlerts: number;
alertRules: number;
healthChecks: number;
dashboards: number;
uptime: number;
} {
return {
metricsCount: this.timeSeries.size,
activeAlerts: this.activeAlerts.size,
alertRules: this.alertRules.size,
healthChecks: this.healthChecks.size,
dashboards: this.dashboards.size,
uptime: Date.now() - this.lastMetricsUpdate.getTime(),
};
}
}