/*
 * task-master-neo-sdlc
 * Version:
 * Enhanced task management system with Neo SDLC agents and MCP tools for comprehensive, AI-driven software development lifecycle management.
 * 364 lines (318 loc) • 12.7 kB
 * JavaScript
 */
import { readJSON, writeJSON } from '../../utils/file-utils.js';
import { log } from '../../utils/logging.js';
/**
 * Collects periodic health snapshots (process memory, agent and workflow
 * statistics read from the knowledge graph), compares them against
 * configurable thresholds, raises alerts, and produces trend reports.
 *
 * State kept in memory:
 *  - `thresholds`: Map of metric name -> `{ warning, critical }`.
 *  - `alerts`: Map of alert id -> ACTIVE alert (resolved alerts are removed).
 *  - `snapshots`: rolling window of the latest `maxSnapshots` snapshots.
 */
export class MonitoringSystem {
  /**
   * @param {object} knowledgeGraph - Store this class calls `addNode`,
   *   `findNodes` and `updateContext` on.
   */
  constructor(knowledgeGraph) {
    this.knowledgeGraph = knowledgeGraph;
    this.thresholds = new Map();
    this.alerts = new Map();
    this.snapshots = [];
    this.maxSnapshots = 100;
    // Reference point for uptime. Set here so snapshots taken before
    // initialize() still report a numeric uptime; refreshed by initialize().
    this.startTime = Date.now();
  }

  /**
   * Installs the default thresholds and registers a `monitoring` system node
   * in the knowledge graph.
   * @returns {Promise<void>}
   */
  async initialize() {
    // Set default thresholds
    this.thresholds.set('agent_load', { warning: 0.8, critical: 0.95 });
    this.thresholds.set('workflow_completion_rate', { warning: 0.7, critical: 0.5 });
    this.thresholds.set('error_rate', { warning: 0.1, critical: 0.2 });
    this.thresholds.set('quality_score', { warning: 0.7, critical: 0.5 });
    this.thresholds.set('response_time', { warning: 5000, critical: 10000 });

    // Keep the in-memory start time and the one stored in the graph identical.
    this.startTime = Date.now();

    await this.knowledgeGraph.addNode({
      id: 'monitoring',
      type: 'system',
      data: {
        thresholds: Object.fromEntries(this.thresholds),
        status: 'active',
        startTime: this.startTime
      }
    });
    log.info('Monitoring system initialized');
  }

  /**
   * Captures a snapshot, stores it in the rolling window and evaluates
   * thresholds against it (which may create alerts).
   * @returns {Promise<object>} The snapshot: `{ timestamp, system, agents, workflows }`.
   */
  async takeSnapshot() {
    const timestamp = Date.now();
    const metrics = {
      timestamp,
      system: {
        uptime: timestamp - this.startTime,
        memory: process.memoryUsage(),
        activeAlerts: this.alerts.size
      },
      agents: await this.collectAgentMetrics(),
      workflows: await this.collectWorkflowMetrics()
    };

    this.snapshots.push(metrics);
    // `while` rather than `if` so the window also shrinks if maxSnapshots
    // was lowered after snapshots had accumulated.
    while (this.snapshots.length > this.maxSnapshots) {
      this.snapshots.shift();
    }

    await this.checkThresholds(metrics);
    return metrics;
  }

  /**
   * Summarizes all `agent` nodes in the knowledge graph.
   * An agent counts as active when `data.status === 'busy'`.
   * @returns {Promise<{total: number, active: number, metrics: object}>}
   */
  async collectAgentMetrics() {
    const agents = await this.knowledgeGraph.findNodes('agent');
    return {
      total: agents.length,
      active: agents.filter((agent) => agent.data?.status === 'busy').length,
      metrics: {
        avgQuality: this.calculateAverage(agents, 'metrics.qualityScore'),
        avgSuccessRate: this.calculateAverage(agents, 'metrics.successRate'),
        avgCompletionTime: this.calculateAverage(agents, 'metrics.avgCompletionTime')
      }
    };
  }

  /**
   * Summarizes all `workflow` nodes in the knowledge graph.
   * @returns {Promise<{total: number, active: number, completed: number, failed: number, metrics: object}>}
   */
  async collectWorkflowMetrics() {
    const workflows = await this.knowledgeGraph.findNodes('workflow');
    const completed = workflows.filter((w) => w.data?.status === 'completed');
    const failed = workflows.filter((w) => w.data?.status === 'failed');
    const finished = completed.length + failed.length;

    return {
      total: workflows.length,
      active: workflows.filter((w) => w.data?.status === 'in_progress').length,
      completed: completed.length,
      failed: failed.length,
      metrics: {
        // Default to 1 ONLY when nothing has finished yet. A plain `|| 1`
        // would also turn a genuine 0% success rate into 100%.
        successRate: finished > 0 ? completed.length / finished : 1,
        avgQuality: this.calculateAverage(completed, 'metrics.qualityScore'),
        avgDuration: this.calculateAverage(completed, 'metrics.totalTime')
      }
    };
  }

  /**
   * Averages a numeric property found at a dotted `path` under each item's `data`.
   * Items where the value is missing or not a finite number are ignored.
   * @param {Array<object>} items - Nodes carrying a `data` object.
   * @param {string} path - Dotted path inside `data`, e.g. `'metrics.qualityScore'`.
   * @returns {number} The mean, or 0 when no item has a usable value.
   */
  calculateAverage(items, path) {
    const keys = path.split('.');
    const values = items
      .map((item) => keys.reduce((obj, key) => obj?.[key], item.data))
      .filter((value) => typeof value === 'number' && Number.isFinite(value));

    if (values.length === 0) {
      return 0;
    }
    return values.reduce((sum, value) => sum + value, 0) / values.length;
  }

  /**
   * Evaluates agent load and workflow completion rate for a snapshot and
   * creates one alert per violation.
   * @param {object} metrics - A snapshot as produced by `takeSnapshot`.
   * @returns {Promise<Array<object>>} The violations found (possibly empty).
   */
  async checkThresholds(metrics) {
    const { active, total } = metrics.agents;
    // With no registered agents there is no load to report; avoid 0/0 = NaN.
    const agentLoad = total > 0 ? active / total : 0;
    const completionRate = metrics.workflows.metrics.successRate;

    const violations = [
      this.evaluateThreshold('agent_load', agentLoad, 'above'),
      this.evaluateThreshold('workflow_completion_rate', completionRate, 'below')
    ].filter(Boolean);

    // Create alerts for violations
    for (const violation of violations) {
      await this.createAlert(violation);
    }
    return violations;
  }

  /**
   * Compares a value with the configured limits of one metric.
   * The critical limit is tested first, so at most one violation is returned.
   * @param {string} metric - Key into `this.thresholds`.
   * @param {number} value - Current value of the metric.
   * @param {'above'|'below'} [direction='above'] - Whether a violation means
   *   the value is above or below the limit.
   * @returns {{metric: string, value: number, threshold: number, level: string}|null}
   *   The violation, or null when within limits or no threshold is configured.
   */
  evaluateThreshold(metric, value, direction = 'above') {
    const limits = this.thresholds.get(metric);
    if (!limits) {
      return null;
    }
    const isBreached = (limit) => (direction === 'below' ? value < limit : value > limit);

    for (const level of ['critical', 'warning']) {
      if (isBreached(limits[level])) {
        return { metric, value, threshold: limits[level], level };
      }
    }
    return null;
  }

  /**
   * Records an active alert in memory and as an `alert` node in the knowledge graph.
   * @param {object} violation - `{ metric, value, threshold, level }`.
   * @returns {Promise<object>} The stored alert (`violation` plus `id`, `created`, `status`).
   */
  async createAlert(violation) {
    const created = Date.now();
    const baseId = `${violation.metric}-${created}`;

    // Two alerts for the same metric within one millisecond would otherwise
    // share an id and overwrite each other in the map.
    let alertId = baseId;
    for (let suffix = 1; this.alerts.has(alertId); suffix += 1) {
      alertId = `${baseId}-${suffix}`;
    }

    const alert = {
      id: alertId,
      ...violation,
      created,
      status: 'active'
    };
    this.alerts.set(alertId, alert);

    // Add to knowledge graph
    await this.knowledgeGraph.addNode({
      id: alertId,
      type: 'alert',
      data: alert
    });
    log.warn(`Alert created: ${violation.metric} (${violation.level})`);
    return alert;
  }

  /**
   * Marks an active alert as resolved, removes it from the in-memory map and
   * pushes the new status to the knowledge graph.
   * @param {string} alertId - Id returned by `createAlert`.
   * @returns {Promise<object>} The resolved alert.
   * @throws {Error} 'Alert not found' when no active alert has this id.
   */
  async resolveAlert(alertId) {
    const alert = this.alerts.get(alertId);
    if (!alert) {
      throw new Error('Alert not found');
    }
    alert.status = 'resolved';
    alert.resolvedAt = Date.now();
    this.alerts.delete(alertId);

    // Update knowledge graph
    await this.knowledgeGraph.updateContext({
      id: alertId,
      type: 'alert',
      status: 'resolved',
      resolvedAt: alert.resolvedAt
    });
    return alert;
  }

  /**
   * Checks for specific conditions that might indicate a need for maintenance.
   * This is different from immediate threshold violations and looks at trends or prolonged states.
   * Every finding is also stored as a `maintenance_alert` node in the knowledge graph.
   * @param {number} checkDurationMs - How far back to look for persistent issues.
   * @returns {Promise<Array<object>>} A list of potential maintenance alerts.
   */
  async checkForMaintenanceAlerts(checkDurationMs = 60 * 60 * 1000) { // Default: 1 hour
    log.info(`Checking for maintenance alerts over the last ${checkDurationMs / 1000}s`);
    const now = Date.now();
    const relevantSnapshots = this.snapshots.filter((s) => now - s.timestamp <= checkDurationMs);
    const maintenanceAlerts = [];

    if (relevantSnapshots.length < 2) { // Need at least 2 snapshots for trend analysis
      log.info('Not enough data for maintenance check.');
      return [];
    }

    const mean = (numbers) => numbers.reduce((sum, val) => sum + val, 0) / numbers.length;

    // Example Check 1: Prolonged high memory usage
    const memoryUsage = relevantSnapshots.map((s) => s.system?.memory?.heapUsed || 0);
    const avgMemory = mean(memoryUsage);
    // Assuming a threshold (e.g., 85% of a hypothetical limit, or a significant increase)
    const memoryLimitThreshold = 500 * 1024 * 1024; // Example: 500MB
    if (avgMemory > memoryLimitThreshold * 0.85 && memoryUsage.every((m) => m > memoryLimitThreshold * 0.8)) {
      maintenanceAlerts.push({
        type: 'memory_leak_suspected',
        details: `Average heap usage (${(avgMemory / 1024 / 1024).toFixed(2)}MB) consistently high over the check period.`,
        severity: 'medium'
      });
    }

    // Example Check 2: Consistently low workflow success rate
    const workflowSuccessRates = relevantSnapshots.map((s) => s.workflows?.metrics?.successRate ?? 1.0);
    const avgSuccessRate = mean(workflowSuccessRates);
    // Use warning threshold; `??` so an explicitly configured 0 is respected.
    const lowRateThreshold = this.thresholds.get('workflow_completion_rate')?.warning ?? 0.7;
    // Consistently below or near warning
    if (avgSuccessRate < lowRateThreshold && workflowSuccessRates.every((r) => r < lowRateThreshold * 1.1)) {
      maintenanceAlerts.push({
        type: 'persistent_workflow_failures',
        details: `Average workflow success rate (${(avgSuccessRate * 100).toFixed(1)}%) remained low over the check period.`,
        severity: 'high'
      });
    }

    // Example Check 3: Gradually increasing error rate (even if below critical)
    // NOTE(review): snapshots built by takeSnapshot() carry no `error_rate`
    // field, so this check only fires for snapshots that had it added
    // elsewhere — confirm the intended source of this value.
    const errorRates = relevantSnapshots.map((s) => s.error_rate ?? 0);
    const latestErrorRate = errorRates[errorRates.length - 1];
    const trend = latestErrorRate - errorRates[0];
    const errorWarning = this.thresholds.get('error_rate')?.warning ?? 0.1;
    if (trend > 0.05 && latestErrorRate > errorWarning) { // Increasing trend and above warning
      maintenanceAlerts.push({
        type: 'increasing_error_rate',
        details: `Error rate shows an increasing trend, ending at ${(latestErrorRate * 100).toFixed(1)}%.`,
        severity: 'medium'
      });
    }

    // Add generated maintenance alerts to KG (optional, could be handled differently)
    for (const alert of maintenanceAlerts) {
      const alertId = `maintAlert:${alert.type}_${Date.now()}`;
      await this.knowledgeGraph.addNode({
        id: alertId,
        type: 'maintenance_alert',
        data: { ...alert, timestamp: now }
      });
    }

    log.info(`Found ${maintenanceAlerts.length} potential maintenance issues.`);
    return maintenanceAlerts;
  }

  /**
   * Builds a trend report from the snapshots inside the given timeframe.
   * @param {string} [timeframe='1h'] - Amount plus unit `m`, `h` or `d` (e.g. `'30m'`).
   * @returns {Promise<object>} The report, or a `{ timeframe, snapshots: 0, message }`
   *   placeholder when no snapshot falls inside the timeframe.
   * @throws {Error} 'Invalid timeframe format' for an unparsable timeframe.
   */
  async getMetricsReport(timeframe = '1h') {
    const now = Date.now();
    const timeframeMs = this.parseTimeframe(timeframe);
    const relevantSnapshots = this.snapshots.filter((s) => now - s.timestamp <= timeframeMs);

    if (relevantSnapshots.length === 0) {
      return {
        timeframe,
        snapshots: 0,
        message: 'No data available for specified timeframe'
      };
    }

    const first = relevantSnapshots[0];
    const last = relevantSnapshots[relevantSnapshots.length - 1];

    return {
      timeframe,
      snapshots: relevantSnapshots.length,
      duration: last.timestamp - first.timestamp,
      system: {
        avgMemory: this.calculateTrend(relevantSnapshots, 'system.memory.heapUsed'),
        alertsCreated: this.countNewAlerts(first.timestamp)
      },
      agents: {
        avgLoad: this.calculateTrend(relevantSnapshots, 'agents.active', 'agents.total'),
        qualityTrend: this.calculateTrend(relevantSnapshots, 'agents.metrics.avgQuality')
      },
      workflows: {
        completionRate: this.calculateTrend(relevantSnapshots, 'workflows.completed', 'workflows.total'),
        qualityTrend: this.calculateTrend(relevantSnapshots, 'workflows.metrics.avgQuality')
      },
      activeAlerts: Array.from(this.alerts.values())
    };
  }

  /**
   * Converts a timeframe such as `'15m'`, `'2h'` or `'1d'` to milliseconds.
   * @param {string} timeframe - Integer amount followed by `m`, `h` or `d`.
   * @returns {number} Duration in milliseconds.
   * @throws {Error} 'Invalid timeframe format' when the input is not a string,
   *   the amount is not a non-negative number, or the unit is unknown.
   */
  parseTimeframe(timeframe) {
    if (typeof timeframe !== 'string') {
      throw new Error('Invalid timeframe format');
    }
    const unit = timeframe.slice(-1);
    const value = Number.parseInt(timeframe.slice(0, -1), 10);
    // Without this guard a NaN amount would propagate and silently make
    // every snapshot fall outside the timeframe.
    if (Number.isNaN(value) || value < 0) {
      throw new Error('Invalid timeframe format');
    }

    switch (unit) {
      case 'h': return value * 60 * 60 * 1000;
      case 'd': return value * 24 * 60 * 60 * 1000;
      case 'm': return value * 60 * 1000;
      default: throw new Error('Invalid timeframe format');
    }
  }

  /**
   * Summarizes how a metric evolved across snapshots.
   * When `totalPath` is given, each sample is the ratio value/total; a total
   * that is missing or not positive yields a ratio of 0 instead of NaN/Infinity.
   * Samples that are not finite numbers are skipped.
   * @param {Array<object>} snapshots - Snapshots in chronological order.
   * @param {string} valuePath - Dotted path to the value in each snapshot.
   * @param {string|null} [totalPath=null] - Optional dotted path to a denominator.
   * @returns {{start: number, end: number, min: number, max: number, avg: number, trend: number}}
   *   All zeros when no usable sample exists.
   */
  calculateTrend(snapshots, valuePath, totalPath = null) {
    const readPath = (source, path) => path.split('.').reduce((obj, key) => obj?.[key], source);

    const values = snapshots
      .map((snapshot) => {
        const value = readPath(snapshot, valuePath);
        if (!totalPath) {
          return value;
        }
        const total = readPath(snapshot, totalPath);
        return total > 0 ? value / total : 0;
      })
      .filter((sample) => typeof sample === 'number' && Number.isFinite(sample));

    if (values.length === 0) {
      return { start: 0, end: 0, min: 0, max: 0, avg: 0, trend: 0 };
    }

    const start = values[0];
    const end = values[values.length - 1];
    return {
      start,
      end,
      min: Math.min(...values),
      max: Math.max(...values),
      avg: values.reduce((sum, val) => sum + val, 0) / values.length,
      trend: end - start
    };
  }

  /**
   * Counts alerts created at or after `since` that are still active
   * (resolved alerts are removed from `this.alerts` and are not counted).
   * @param {number} since - Epoch milliseconds.
   * @returns {number}
   */
  countNewAlerts(since) {
    return Array.from(this.alerts.values())
      .filter((alert) => alert.created >= since).length;
  }

  /**
   * Logs a significant audit event to the Knowledge Graph.
   * @param {string} eventType - Type of event (e.g., 'agent_action', 'security_policy_change', 'deployment_start').
   * @param {string} description - Description of the event.
   * @param {object} details - Additional structured details about the event (e.g., { agentId, action, targetId, status }).
   * @returns {Promise<object>} The created audit log node data.
   */
  async logAuditEvent(eventType, description, details = {}) {
    const timestamp = Date.now();
    const eventId = `auditEvent:${eventType}_${timestamp}`;
    log.info(`Audit Event: [${eventType}] ${description}`, details); // Log locally as well

    const eventData = {
      id: eventId,
      timestamp,
      eventType,
      description,
      details
      // Could include user/agent triggering the event if available in context
    };

    // Add audit event node to knowledge graph
    await this.knowledgeGraph.addNode({
      id: eventId,
      type: 'audit_log_event',
      data: eventData
      // Optionally add edges to related entities mentioned in details
    });
    return eventData;
  }
}