UNPKG

task-master-neo-sdlc

Version:

Enhanced task management system with Neo SDLC agents and MCP tools for comprehensive, AI-driven software development lifecycle management.

364 lines (318 loc) 12.7 kB
import { readJSON, writeJSON } from '../../utils/file-utils.js';
import { log } from '../../utils/logging.js';

/**
 * Collects periodic metric snapshots about agents and workflows stored in a
 * knowledge graph, compares them against configurable thresholds, and records
 * alerts, maintenance alerts and audit events back into that graph.
 */
export class MonitoringSystem {
  /**
   * @param {object} knowledgeGraph - Graph store. This class calls its
   *   `addNode`, `findNodes` and `updateContext` methods.
   */
  constructor(knowledgeGraph) {
    this.knowledgeGraph = knowledgeGraph;
    this.thresholds = new Map();
    this.alerts = new Map();
    this.snapshots = [];
    this.maxSnapshots = 100;
    // Reference point for the `uptime` value reported by takeSnapshot().
    // Refreshed by initialize(); set here so uptime is a number even when a
    // snapshot is taken before initialization.
    this.startTime = Date.now();
  }

  /**
   * Installs the default thresholds and registers the `monitoring` system
   * node in the knowledge graph.
   * @returns {Promise<void>}
   */
  async initialize() {
    this.startTime = Date.now();

    // Set default thresholds
    this.thresholds.set('agent_load', { warning: 0.8, critical: 0.95 });
    this.thresholds.set('workflow_completion_rate', { warning: 0.7, critical: 0.5 });
    this.thresholds.set('error_rate', { warning: 0.1, critical: 0.2 });
    this.thresholds.set('quality_score', { warning: 0.7, critical: 0.5 });
    this.thresholds.set('response_time', { warning: 5000, critical: 10000 });

    await this.knowledgeGraph.addNode({
      id: 'monitoring',
      type: 'system',
      data: {
        thresholds: Object.fromEntries(this.thresholds),
        status: 'active',
        startTime: this.startTime
      }
    });

    log.info('Monitoring system initialized');
  }

  /**
   * Captures the current system, agent and workflow metrics, stores them in
   * the bounded snapshot history and evaluates the thresholds against them.
   * @returns {Promise<object>} The snapshot that was recorded.
   */
  async takeSnapshot() {
    const timestamp = Date.now();
    const metrics = {
      timestamp,
      system: {
        uptime: timestamp - this.startTime,
        memory: process.memoryUsage(),
        activeAlerts: this.alerts.size
      },
      agents: await this.collectAgentMetrics(),
      workflows: await this.collectWorkflowMetrics()
    };

    this.snapshots.push(metrics);
    // Keep only the most recent `maxSnapshots` entries.
    if (this.snapshots.length > this.maxSnapshots) {
      this.snapshots.shift();
    }

    await this.checkThresholds(metrics);
    return metrics;
  }

  /**
   * Summarizes all `agent` nodes: how many exist, how many have status
   * `busy`, and the averages of their quality/success/completion metrics.
   * @returns {Promise<object>} `{ total, active, metrics }`.
   */
  async collectAgentMetrics() {
    const agents = await this.knowledgeGraph.findNodes('agent');
    return {
      total: agents.length,
      active: agents.filter(a => a.data.status === 'busy').length,
      metrics: {
        avgQuality: this.calculateAverage(agents, 'metrics.qualityScore'),
        avgSuccessRate: this.calculateAverage(agents, 'metrics.successRate'),
        avgCompletionTime: this.calculateAverage(agents, 'metrics.avgCompletionTime')
      }
    };
  }

  /**
   * Summarizes all `workflow` nodes by status and computes the success rate
   * over finished (completed + failed) workflows.
   * @returns {Promise<object>} `{ total, active, completed, failed, metrics }`.
   */
  async collectWorkflowMetrics() {
    const workflows = await this.knowledgeGraph.findNodes('workflow');
    const completed = workflows.filter(w => w.data.status === 'completed');
    const failed = workflows.filter(w => w.data.status === 'failed');
    const finishedCount = completed.length + failed.length;

    return {
      total: workflows.length,
      active: workflows.filter(w => w.data.status === 'in_progress').length,
      completed: completed.length,
      failed: failed.length,
      metrics: {
        // Default to 1 only when nothing has finished yet. A genuine rate of
        // 0 (every finished workflow failed) must be reported as 0 so the
        // completion-rate threshold can fire.
        successRate: finishedCount > 0 ? completed.length / finishedCount : 1,
        avgQuality: this.calculateAverage(completed, 'metrics.qualityScore'),
        avgDuration: this.calculateAverage(completed, 'metrics.totalTime')
      }
    };
  }

  /**
   * Averages the value found at a dotted `path` inside each item's `data`.
   * Items where the path resolves to `undefined` or `null` are skipped.
   * @param {Array<object>} items - Nodes carrying a `data` object.
   * @param {string} path - Dotted property path relative to `item.data`.
   * @returns {number} The mean of the found values, or 0 when none exist.
   */
  calculateAverage(items, path) {
    const values = items
      .map(item => path.split('.').reduce((obj, key) => obj?.[key], item.data))
      .filter(v => v !== undefined && v !== null);
    return values.length
      ? values.reduce((sum, val) => sum + val, 0) / values.length
      : 0;
  }

  /**
   * Compares a snapshot against the `agent_load` and
   * `workflow_completion_rate` thresholds and creates an alert for each
   * violation. A threshold that has not been configured is skipped.
   * @param {object} metrics - A snapshot as produced by takeSnapshot().
   * @returns {Promise<Array<object>>} The violations that were found.
   */
  async checkThresholds(metrics) {
    const violations = [];

    // Check agent load (higher is worse). With no agents the load is 0
    // rather than the NaN that 0 / 0 would produce.
    const agentLoad = metrics.agents.total > 0
      ? metrics.agents.active / metrics.agents.total
      : 0;
    const loadThreshold = this.thresholds.get('agent_load');
    if (loadThreshold) {
      if (agentLoad > loadThreshold.critical) {
        violations.push({
          metric: 'agent_load',
          value: agentLoad,
          threshold: loadThreshold.critical,
          level: 'critical'
        });
      } else if (agentLoad > loadThreshold.warning) {
        violations.push({
          metric: 'agent_load',
          value: agentLoad,
          threshold: loadThreshold.warning,
          level: 'warning'
        });
      }
    }

    // Check workflow completion rate (lower is worse).
    const completionRate = metrics.workflows.metrics.successRate;
    const completionThreshold = this.thresholds.get('workflow_completion_rate');
    if (completionThreshold) {
      if (completionRate < completionThreshold.critical) {
        violations.push({
          metric: 'workflow_completion_rate',
          value: completionRate,
          threshold: completionThreshold.critical,
          level: 'critical'
        });
      } else if (completionRate < completionThreshold.warning) {
        violations.push({
          metric: 'workflow_completion_rate',
          value: completionRate,
          threshold: completionThreshold.warning,
          level: 'warning'
        });
      }
    }

    // Create alerts for violations
    for (const violation of violations) {
      await this.createAlert(violation);
    }

    return violations;
  }

  /**
   * Registers an active alert locally and as an `alert` node in the
   * knowledge graph.
   * @param {object} violation - `{ metric, value, threshold, level }`.
   * @returns {Promise<object>} The created alert.
   */
  async createAlert(violation) {
    const alertId = `${violation.metric}-${Date.now()}`;
    const alert = {
      id: alertId,
      ...violation,
      created: Date.now(),
      status: 'active'
    };

    this.alerts.set(alertId, alert);

    // Add to knowledge graph
    await this.knowledgeGraph.addNode({
      id: alertId,
      type: 'alert',
      data: alert
    });

    log.warn(`Alert created: ${violation.metric} (${violation.level})`);
    return alert;
  }

  /**
   * Marks an active alert as resolved, removes it from the active set and
   * reports the resolution to the knowledge graph.
   * @param {string} alertId - Id returned by createAlert().
   * @returns {Promise<object>} The resolved alert.
   * @throws {Error} When no active alert has the given id.
   */
  async resolveAlert(alertId) {
    const alert = this.alerts.get(alertId);
    if (!alert) {
      throw new Error('Alert not found');
    }

    alert.status = 'resolved';
    alert.resolvedAt = Date.now();
    this.alerts.delete(alertId);

    // Update knowledge graph
    await this.knowledgeGraph.updateContext({
      id: alertId,
      type: 'alert',
      status: 'resolved',
      resolvedAt: alert.resolvedAt
    });

    return alert;
  }

  /**
   * Checks for specific conditions that might indicate a need for maintenance.
   * This is different from immediate threshold violations and looks at trends or prolonged states.
   * @param {number} checkDurationMs - How far back to look for persistent issues.
   * @returns {Promise<Array<object>>} A list of potential maintenance alerts.
   */
  async checkForMaintenanceAlerts(checkDurationMs = 60 * 60 * 1000) { // Default: 1 hour
    log.info(`Checking for maintenance alerts over the last ${checkDurationMs / 1000}s`);
    const now = Date.now();
    const relevantSnapshots = this.snapshots.filter(s => now - s.timestamp <= checkDurationMs);
    const maintenanceAlerts = [];

    if (relevantSnapshots.length < 2) { // Need at least 2 snapshots for trend analysis
      log.info('Not enough data for maintenance check.');
      return [];
    }

    // Check 1: Prolonged high memory usage
    const memoryUsage = relevantSnapshots.map(s => s.system?.memory?.heapUsed || 0);
    const avgMemory = memoryUsage.reduce((sum, val) => sum + val, 0) / memoryUsage.length;
    // Hard-coded example limit; not derived from the actual process heap limit.
    const memoryLimitThreshold = 500 * 1024 * 1024; // Example: 500MB
    if (avgMemory > memoryLimitThreshold * 0.85 && memoryUsage.every(m => m > memoryLimitThreshold * 0.8)) {
      maintenanceAlerts.push({
        type: 'memory_leak_suspected',
        details: `Average heap usage (${(avgMemory / 1024 / 1024).toFixed(2)}MB) consistently high over the check period.`,
        severity: 'medium'
      });
    }

    // Check 2: Consistently low workflow success rate
    const workflowSuccessRates = relevantSnapshots.map(s => s.workflows?.metrics?.successRate ?? 1.0);
    const avgSuccessRate = workflowSuccessRates.reduce((sum, val) => sum + val, 0) / workflowSuccessRates.length;
    // `??` (not `||`) so an explicitly configured threshold of 0 is respected.
    const lowRateThreshold = this.thresholds.get('workflow_completion_rate')?.warning ?? 0.7;
    if (avgSuccessRate < lowRateThreshold && workflowSuccessRates.every(r => r < lowRateThreshold * 1.1)) { // Consistently below or near warning
      maintenanceAlerts.push({
        type: 'persistent_workflow_failures',
        details: `Average workflow success rate (${(avgSuccessRate * 100).toFixed(1)}%) remained low over the check period.`,
        severity: 'high'
      });
    }

    // Check 3: Gradually increasing error rate (even if below critical)
    // NOTE(review): takeSnapshot() does not write an `error_rate` field, so
    // with snapshots produced by this class every rate is 0 and this check
    // cannot fire. Confirm where `error_rate` is meant to come from.
    const errorRates = relevantSnapshots.map(s => s.error_rate || 0);
    const lastErrorRate = errorRates[errorRates.length - 1];
    const trend = lastErrorRate - errorRates[0];
    const errorWarning = this.thresholds.get('error_rate')?.warning ?? 0.1;
    if (trend > 0.05 && lastErrorRate > errorWarning) { // Increasing trend and above warning
      maintenanceAlerts.push({
        type: 'increasing_error_rate',
        details: `Error rate shows an increasing trend, ending at ${(lastErrorRate * 100).toFixed(1)}%.`,
        severity: 'medium'
      });
    }

    // Add generated maintenance alerts to KG (optional, could be handled differently)
    for (const alert of maintenanceAlerts) {
      const alertId = `maintAlert:${alert.type}_${Date.now()}`;
      await this.knowledgeGraph.addNode({
        id: alertId,
        type: 'maintenance_alert',
        data: { ...alert, timestamp: now }
      });
    }

    log.info(`Found ${maintenanceAlerts.length} potential maintenance issues.`);
    return maintenanceAlerts;
  }

  /**
   * Builds a trend report from the snapshots that fall inside `timeframe`.
   * @param {string} timeframe - Number followed by `m`, `h` or `d` (e.g. `'1h'`).
   * @returns {Promise<object>} The report, or a `{ timeframe, snapshots: 0, message }`
   *   object when no snapshot falls inside the timeframe.
   * @throws {Error} When `timeframe` is not in a recognized format.
   */
  async getMetricsReport(timeframe = '1h') {
    const now = Date.now();
    const timeframeMs = this.parseTimeframe(timeframe);
    const relevantSnapshots = this.snapshots.filter(s => now - s.timestamp <= timeframeMs);

    if (relevantSnapshots.length === 0) {
      return {
        timeframe,
        snapshots: 0,
        message: 'No data available for specified timeframe'
      };
    }

    const first = relevantSnapshots[0];
    const last = relevantSnapshots[relevantSnapshots.length - 1];
    const duration = last.timestamp - first.timestamp;

    const report = {
      timeframe,
      snapshots: relevantSnapshots.length,
      duration,
      system: {
        avgMemory: this.calculateTrend(relevantSnapshots, 'system.memory.heapUsed'),
        alertsCreated: this.countNewAlerts(first.timestamp)
      },
      agents: {
        avgLoad: this.calculateTrend(relevantSnapshots, 'agents.active', 'agents.total'),
        qualityTrend: this.calculateTrend(relevantSnapshots, 'agents.metrics.avgQuality')
      },
      workflows: {
        completionRate: this.calculateTrend(relevantSnapshots, 'workflows.completed', 'workflows.total'),
        qualityTrend: this.calculateTrend(relevantSnapshots, 'workflows.metrics.avgQuality')
      },
      activeAlerts: Array.from(this.alerts.values())
    };

    return report;
  }

  /**
   * Converts a timeframe such as `'30m'`, `'1h'` or `'7d'` to milliseconds.
   * @param {string} timeframe - Non-negative integer followed by `m`, `h` or `d`.
   * @returns {number} The timeframe in milliseconds.
   * @throws {Error} When the unit is unknown or the numeric part is missing
   *   or negative.
   */
  parseTimeframe(timeframe) {
    if (typeof timeframe !== 'string') {
      throw new Error('Invalid timeframe format');
    }

    const unit = timeframe.slice(-1);
    const value = Number.parseInt(timeframe.slice(0, -1), 10);
    // Reject inputs like 'h' or 'xh' instead of silently returning NaN.
    if (Number.isNaN(value) || value < 0) {
      throw new Error('Invalid timeframe format');
    }

    switch (unit) {
      case 'h': return value * 60 * 60 * 1000;
      case 'd': return value * 24 * 60 * 60 * 1000;
      case 'm': return value * 60 * 1000;
      default: throw new Error('Invalid timeframe format');
    }
  }

  /**
   * Computes start/end/min/max/avg/trend for a metric across snapshots.
   * When `totalPath` is given, each data point is `value / total`; a total
   * of 0 yields 0. Data points that are not finite numbers (missing path,
   * NaN) are ignored so they cannot poison the aggregate.
   * @param {Array<object>} snapshots - Snapshots in chronological order.
   * @param {string} valuePath - Dotted path to the value in each snapshot.
   * @param {?string} [totalPath=null] - Optional dotted path to a divisor.
   * @returns {{start: number, end: number, min: number, max: number, avg: number, trend: number}}
   *   All fields are 0 when no usable data point exists.
   */
  calculateTrend(snapshots, valuePath, totalPath = null) {
    const resolvePath = (source, path) =>
      path.split('.').reduce((obj, key) => obj?.[key], source);

    const values = snapshots
      .map(snapshot => {
        const value = resolvePath(snapshot, valuePath);
        if (!totalPath) {
          return value;
        }
        const total = resolvePath(snapshot, totalPath);
        return total === 0 ? 0 : value / total;
      })
      .filter(value => Number.isFinite(value));

    if (values.length === 0) {
      return { start: 0, end: 0, min: 0, max: 0, avg: 0, trend: 0 };
    }

    const start = values[0];
    const end = values[values.length - 1];
    return {
      start,
      end,
      min: Math.min(...values),
      max: Math.max(...values),
      avg: values.reduce((sum, val) => sum + val, 0) / values.length,
      trend: end - start
    };
  }

  /**
   * Counts currently active alerts created at or after `since`.
   * Resolved alerts are removed from `this.alerts` and are not counted.
   * @param {number} since - Epoch milliseconds.
   * @returns {number}
   */
  countNewAlerts(since) {
    return Array.from(this.alerts.values())
      .filter(alert => alert.created >= since).length;
  }

  /**
   * Logs a significant audit event to the Knowledge Graph.
   * @param {string} eventType - Type of event (e.g., 'agent_action', 'security_policy_change', 'deployment_start').
   * @param {string} description - Description of the event.
   * @param {object} details - Additional structured details about the event (e.g., { agentId, action, targetId, status }).
   * @returns {Promise<object>} The created audit log node data.
   */
  async logAuditEvent(eventType, description, details = {}) {
    const timestamp = Date.now();
    const eventId = `auditEvent:${eventType}_${timestamp}`;

    log.info(`Audit Event: [${eventType}] ${description}`, details); // Log locally as well

    const eventData = {
      id: eventId,
      timestamp,
      eventType,
      description,
      details
    };

    // Add audit event node to knowledge graph
    await this.knowledgeGraph.addNode({
      id: eventId,
      type: 'audit_log_event',
      data: eventData
    });

    return eventData;
  }
}