UNPKG

bc-code-intelligence-mcp

Version:

BC Code Intelligence MCP Server - Complete Specialist Bundle with AI-driven expert consultation, seamless handoffs, and context-preserving workflows

463 lines 18.8 kB
/** * Production Monitoring & Health Management System * * Comprehensive monitoring, health checks, metrics collection, and alerting * for production deployments with Docker, Kubernetes, and cloud integration. */ import { EventEmitter } from 'events'; import { readFile, writeFile } from 'fs/promises'; export class ProductionMonitor extends EventEmitter { performanceMonitor; securityManager; cacheManager; enableFileLogging; metricsRetentionHours; healthChecks = new Map(); activeAlerts = new Map(); alertHistory = []; alertCooldowns = new Map(); metricsHistory = []; startTime = Date.now(); healthCheckInterval; metricsCollectionInterval; defaultAlertRules = [ { id: 'high_memory_usage', name: 'High Memory Usage', condition: 'memory_usage_percentage > 85', severity: 'warning', cooldown_minutes: 15, enabled: true }, { id: 'critical_memory_usage', name: 'Critical Memory Usage', condition: 'memory_usage_percentage > 95', severity: 'critical', cooldown_minutes: 5, enabled: true }, { id: 'high_error_rate', name: 'High Error Rate', condition: 'error_rate_percentage > 5', severity: 'warning', cooldown_minutes: 10, enabled: true }, { id: 'slow_response_time', name: 'Slow Response Times', condition: 'p95_response_time_ms > 2000', severity: 'warning', cooldown_minutes: 20, enabled: true }, { id: 'cache_performance', name: 'Poor Cache Performance', condition: 'cache_hit_rate < 50', severity: 'info', cooldown_minutes: 30, enabled: true }, { id: 'security_incidents', name: 'Security Incidents', condition: 'failed_authentications_last_hour > 10', severity: 'critical', cooldown_minutes: 5, enabled: true } ]; alertRules = [...this.defaultAlertRules]; constructor(performanceMonitor, securityManager, cacheManager, enableFileLogging = true, metricsRetentionHours = 24) { super(); this.performanceMonitor = performanceMonitor; this.securityManager = securityManager; this.cacheManager = cacheManager; this.enableFileLogging = enableFileLogging; this.metricsRetentionHours = metricsRetentionHours; this.setupDefaultHealthChecks(); this.startMonitoring(); console.log('📊 Production monitor initialized'); } /** * Register a custom health check */ registerHealthCheck(name, checkFn) { this.healthChecks.set(name, checkFn); console.log(`✅ Health check registered: ${name}`); } /** * Run all health checks and return results */ async runHealthChecks() { const results = []; let overallStatus = 'healthy'; for (const [name, checkFn] of this.healthChecks.entries()) { try { const result = await checkFn(); results.push(result); // Update overall status if (result.status === 'unhealthy') { overallStatus = 'unhealthy'; } else if (result.status === 'degraded' && overallStatus === 'healthy') { overallStatus = 'degraded'; } } catch (error) { results.push({ service: name, status: 'unhealthy', message: `Health check failed: ${error instanceof Error ? error.message : String(error)}`, response_time_ms: 0, timestamp: Date.now() }); overallStatus = 'unhealthy'; } } return { status: overallStatus, checks: results }; } /** * Collect current system metrics */ async collectMetrics() { const memoryUsage = process.memoryUsage(); const totalMemoryMB = process.platform === 'linux' ? (await this.getSystemMemory()) / 1024 / 1024 : 1024; // Fallback estimate const performanceStats = this.performanceMonitor.getOverallSummary(); const healthStats = this.performanceMonitor.getSystemHealth(); const cacheStats = this.cacheManager?.getStats() || { hit_rate: 0 }; const securityStats = this.securityManager?.getSecurityStats() || { successful_authentications: 0, failed_authentications: 0, suspicious_activities: 0, rate_limited_users: 0 }; const metrics = { timestamp: Date.now(), system: { uptime_seconds: (Date.now() - this.startTime) / 1000, memory_usage_mb: memoryUsage.heapUsed / 1024 / 1024, memory_usage_percentage: (memoryUsage.heapUsed / (totalMemoryMB * 1024 * 1024)) * 100, cpu_usage_percentage: await this.getCPUUsage(), process_id: process.pid, node_version: process.version }, application: { active_layers: healthStats.active_layers, total_topics: healthStats.total_topics, cache_hit_rate: cacheStats.hit_rate || 0, average_response_time_ms: healthStats.average_response_time_ms, requests_per_minute: performanceStats.operations_per_second * 60, error_rate_percentage: healthStats.error_rate }, performance: { slowest_operations: performanceStats.slowest_operations.map(op => ({ operation: op.operation, duration_ms: op.duration_ms })).slice(0, 5), operations_per_second: performanceStats.operations_per_second, p95_response_time_ms: performanceStats.p95_duration_ms, p99_response_time_ms: performanceStats.p99_duration_ms }, security: { active_sessions: 0, // Would be provided by session manager failed_authentications_last_hour: this.getRecentFailedAuthentications(), rate_limited_requests: securityStats.rate_limited_users, suspicious_activities: securityStats.suspicious_activities } }; // Store metrics history this.metricsHistory.push(metrics); // Maintain retention window const cutoffTime = Date.now() - (this.metricsRetentionHours * 60 * 60 * 1000); this.metricsHistory = this.metricsHistory.filter(m => m.timestamp > cutoffTime); // Evaluate alert rules await this.evaluateAlerts(metrics); return metrics; } /** * Get metrics history for time range */ getMetricsHistory(hoursBack = 1) { const cutoffTime = Date.now() - (hoursBack * 60 * 60 * 1000); return this.metricsHistory.filter(m => m.timestamp >= cutoffTime); } /** * Get active alerts */ getActiveAlerts() { return Array.from(this.activeAlerts.values()); } /** * Get alert history */ getAlertHistory(limit = 100) { return this.alertHistory.slice(-limit); } /** * Resolve an active alert */ resolveAlert(alertId) { const alert = this.activeAlerts.get(alertId); if (alert) { alert.resolved = true; alert.resolved_at = Date.now(); this.activeAlerts.delete(alertId); this.emit('alert_resolved', alert); return true; } return false; } /** * Export comprehensive system status for monitoring dashboards */ async getSystemStatus() { const healthResults = await this.runHealthChecks(); const currentMetrics = await this.collectMetrics(); const performanceSummary = this.performanceMonitor.getOverallSummary(); return { overall_health: healthResults.status, health_checks: healthResults.checks, current_metrics: currentMetrics, active_alerts: this.getActiveAlerts(), performance_summary: performanceSummary, uptime_seconds: (Date.now() - this.startTime) / 1000 }; } /** * Export monitoring data for external systems (Prometheus, etc.) */ exportMetricsForPrometheus() { if (this.metricsHistory.length === 0) return ''; const latest = this.metricsHistory[this.metricsHistory.length - 1]; const lines = []; // System metrics lines.push(`# HELP bckb_memory_usage_bytes Memory usage in bytes`); lines.push(`# TYPE bckb_memory_usage_bytes gauge`); lines.push(`bckb_memory_usage_bytes ${latest.system.memory_usage_mb * 1024 * 1024}`); lines.push(`# HELP bckb_memory_usage_percentage Memory usage percentage`); lines.push(`# TYPE bckb_memory_usage_percentage gauge`); lines.push(`bckb_memory_usage_percentage ${latest.system.memory_usage_percentage}`); // Application metrics lines.push(`# HELP bckb_active_layers Number of active knowledge layers`); lines.push(`# TYPE bckb_active_layers gauge`); lines.push(`bckb_active_layers ${latest.application.active_layers}`); lines.push(`# HELP bckb_total_topics Total number of topics available`); lines.push(`# TYPE bckb_total_topics gauge`); lines.push(`bckb_total_topics ${latest.application.total_topics}`); lines.push(`# HELP bckb_cache_hit_rate Cache hit rate percentage`); lines.push(`# TYPE bckb_cache_hit_rate gauge`); lines.push(`bckb_cache_hit_rate ${latest.application.cache_hit_rate}`); lines.push(`# HELP bckb_response_time_ms Average response time in milliseconds`); lines.push(`# TYPE bckb_response_time_ms gauge`); lines.push(`bckb_response_time_ms ${latest.application.average_response_time_ms}`); lines.push(`# HELP bckb_error_rate_percentage Error rate percentage`); lines.push(`# TYPE bckb_error_rate_percentage gauge`); lines.push(`bckb_error_rate_percentage ${latest.application.error_rate_percentage}`); // Alert metrics lines.push(`# HELP bckb_active_alerts Number of active alerts`); lines.push(`# TYPE bckb_active_alerts gauge`); lines.push(`bckb_active_alerts ${this.activeAlerts.size}`); return lines.join('\n'); } /** * Shutdown monitoring and cleanup */ shutdown() { if (this.healthCheckInterval) { clearInterval(this.healthCheckInterval); } if (this.metricsCollectionInterval) { clearInterval(this.metricsCollectionInterval); } console.log('📊 Production monitor shutdown complete'); } // Private implementation methods setupDefaultHealthChecks() { // Basic system health this.registerHealthCheck('system', async () => { const start = Date.now(); const memUsage = process.memoryUsage(); const duration = Date.now() - start; const status = memUsage.heapUsed / (1024 * 1024 * 1024) > 1 ? 'degraded' : 'healthy'; return { service: 'system', status, message: status === 'healthy' ? 'System operating normally' : 'High memory usage detected', details: { memory_heap_mb: Math.round(memUsage.heapUsed / 1024 / 1024), memory_external_mb: Math.round(memUsage.external / 1024 / 1024) }, response_time_ms: duration, timestamp: Date.now() }; }); // Cache health if (this.cacheManager) { this.registerHealthCheck('cache', async () => { const start = Date.now(); const cacheStats = this.cacheManager.getStats(); const duration = Date.now() - start; const status = cacheStats.hit_rate > 60 ? 'healthy' : cacheStats.hit_rate > 30 ? 'degraded' : 'unhealthy'; return { service: 'cache', status, message: `Cache hit rate: ${cacheStats.hit_rate.toFixed(1)}%`, details: cacheStats, response_time_ms: duration, timestamp: Date.now() }; }); } // Security health if (this.securityManager) { this.registerHealthCheck('security', async () => { const start = Date.now(); const securityStats = this.securityManager.getSecurityStats(); const duration = Date.now() - start; const recentFailures = this.getRecentFailedAuthentications(); const status = recentFailures > 50 ? 'unhealthy' : recentFailures > 20 ? 'degraded' : 'healthy'; return { service: 'security', status, message: `${recentFailures} failed authentications in last hour`, details: securityStats, response_time_ms: duration, timestamp: Date.now() }; }); } } startMonitoring() { // Run health checks every minute this.healthCheckInterval = setInterval(async () => { try { await this.runHealthChecks(); } catch (error) { console.error('📊 Health check error:', error); } }, 60000); // Collect metrics every 30 seconds this.metricsCollectionInterval = setInterval(async () => { try { await this.collectMetrics(); } catch (error) { console.error('📊 Metrics collection error:', error); } }, 30000); } async evaluateAlerts(metrics) { for (const rule of this.alertRules.filter(r => r.enabled)) { try { if (this.alertCooldowns.has(rule.id) && Date.now() < this.alertCooldowns.get(rule.id)) { continue; // Still in cooldown } const shouldAlert = this.evaluateCondition(rule.condition, metrics); if (shouldAlert && !this.activeAlerts.has(rule.id)) { const alert = this.createAlert(rule, metrics); this.activeAlerts.set(alert.id, alert); this.alertHistory.push(alert); // Set cooldown this.alertCooldowns.set(rule.id, Date.now() + (rule.cooldown_minutes * 60 * 1000)); this.emit('alert_triggered', alert); if (this.enableFileLogging) { await this.logAlertToFile(alert); } console.warn(`🚨 ALERT [${rule.severity.toUpperCase()}]: ${rule.name}`); } } catch (error) { console.error(`Alert rule evaluation error for ${rule.id}:`, error); } } } evaluateCondition(condition, metrics) { // Simple condition evaluator - in production, would use a more robust parser const context = { memory_usage_percentage: metrics.system.memory_usage_percentage, error_rate_percentage: metrics.application.error_rate_percentage, p95_response_time_ms: metrics.performance.p95_response_time_ms, cache_hit_rate: metrics.application.cache_hit_rate, failed_authentications_last_hour: metrics.security.failed_authentications_last_hour }; try { // Replace variables in condition let evaluable = condition; for (const [key, value] of Object.entries(context)) { evaluable = evaluable.replace(new RegExp(key, 'g'), String(value)); } // Simple expression evaluation (would use safer parser in production) return eval(evaluable); } catch (error) { console.error('Condition evaluation error:', error); return false; } } createAlert(rule, metrics) { return { id: `${rule.id}_${Date.now()}`, rule_id: rule.id, message: `Alert: ${rule.name} - ${rule.condition}`, severity: rule.severity, timestamp: Date.now(), resolved: false, metadata: { metrics_snapshot: metrics, rule: rule } }; } async logAlertToFile(alert) { try { const logEntry = { timestamp: new Date(alert.timestamp).toISOString(), alert_id: alert.id, rule_id: alert.rule_id, message: alert.message, severity: alert.severity }; const logLine = JSON.stringify(logEntry) + '\n'; await writeFile('bckb-alerts.log', logLine, { flag: 'a' }); } catch (error) { console.error('Failed to write alert to log file:', error); } } getRecentFailedAuthentications() { if (!this.securityManager) return 0; const auditLog = this.securityManager.getAuditLog(1000); const oneHourAgo = Date.now() - (60 * 60 * 1000); return auditLog.filter(event => event.event_type === 'auth_failure' && event.timestamp > oneHourAgo).length; } async getSystemMemory() { try { if (process.platform === 'linux') { const meminfo = await readFile('/proc/meminfo', 'utf8'); const match = meminfo.match(/MemTotal:\s*(\d+)\s*kB/); return match ? parseInt(match[1]) * 1024 : 1024 * 1024 * 1024; // Fallback to 1GB } } catch (error) { // Fallback for non-Linux systems } return 1024 * 1024 * 1024; // 1GB default } async getCPUUsage() { // Simplified CPU usage calculation const usage = process.cpuUsage(); return ((usage.user + usage.system) / 1000000) * 100; // Convert to percentage (rough estimate) } } //# sourceMappingURL=production-monitor.js.map