vibe-coder-mcp

Production-ready MCP server with complete agent integration, multi-transport support, and comprehensive development automation tools for AI-assisted workflows.

import { AgentOrchestrator } from './agent-orchestrator.js';
import { TaskStreamer } from './task-streamer.js';
import { FeedbackProcessor } from './feedback-processor.js';
import { AppError } from '../../../utils/errors.js';
import logger from '../../../logger.js';

// Singleton watchdog that monitors agent task execution, detects stalled or
// timed-out tasks, retries or escalates them, and tracks per-agent health.
export class ExecutionWatchdog {
    static instance = null;
    configs = new Map();
    monitors = new Map();
    agentHealth = new Map();
    agentOrchestrator;
    taskStreamer;
    feedbackProcessor;
    watchdogTimer;
    healthCheckTimer;
    stats;
    escalationProcedures;

    constructor() {
        this.agentOrchestrator = AgentOrchestrator.getInstance();
        this.taskStreamer = TaskStreamer.getInstance();
        this.feedbackProcessor = FeedbackProcessor.getInstance();
        this.stats = {
            totalTasksMonitored: 0,
            activeMonitors: 0,
            timeoutsDetected: 0,
            escalationsTriggered: 0,
            recoveredTasks: 0,
            averageTaskDuration: 0,
            agentsMonitored: 0,
            unhealthyAgents: 0,
            lastStatsUpdate: new Date()
        };
        // Escalation ladder applied once a task has exhausted its retries.
        this.escalationProcedures = [
            { level: 1, action: 'reassign_task', delayMinutes: 5, description: 'Reassign task to another agent', autoExecute: true },
            { level: 2, action: 'restart_agent', delayMinutes: 10, description: 'Request agent restart', autoExecute: false },
            { level: 3, action: 'human_intervention', delayMinutes: 15, description: 'Escalate to human operator', autoExecute: true }
        ];
        this.initializeDefaultConfigs();
        this.startWatchdog();
        logger.info('Execution watchdog initialized');
    }

    static getInstance() {
        if (!ExecutionWatchdog.instance) {
            ExecutionWatchdog.instance = new ExecutionWatchdog();
        }
        return ExecutionWatchdog.instance;
    }

    // Register a task for timeout monitoring and start tracking the assigned agent's health.
    async startMonitoring(taskId, agentId, task) {
        try {
            const config = this.getConfigForTaskType(task.type);
            const now = new Date();
            const monitor = {
                taskId,
                agentId,
                startTime: now,
                lastHeartbeat: now,
                timeoutAt: new Date(now.getTime() + config.timeoutMinutes * 60000),
                warningAt: new Date(now.getTime() + config.warningThresholdMinutes * 60000),
                status: 'monitoring',
                retryCount: 0,
                escalationLevel: 0,
                taskType: task.type,
                estimatedDuration: task.estimatedHours ? task.estimatedHours * 60 : undefined
            };
            this.monitors.set(taskId, monitor);
            this.stats.totalTasksMonitored++;
            this.stats.activeMonitors = this.monitors.size;
            if (!this.agentHealth.has(agentId)) {
                await this.initializeAgentHealth(agentId);
            }
            const agentHealthInfo = this.agentHealth.get(agentId);
            agentHealthInfo.totalTasksAssigned++;
            agentHealthInfo.lastSeen = now;
            logger.info({ taskId, agentId, timeoutAt: monitor.timeoutAt, taskType: task.type }, 'Task monitoring started');
        } catch (error) {
            logger.error({ err: error, taskId, agentId }, 'Failed to start task monitoring');
            throw new AppError('Task monitoring startup failed', { cause: error });
        }
    }

    // Stop monitoring a task and fold its outcome into agent health and watchdog stats.
    async stopMonitoring(taskId, completed = true) {
        try {
            const monitor = this.monitors.get(taskId);
            if (!monitor) {
                logger.warn({ taskId }, 'Attempted to stop monitoring non-existent task');
                return;
            }
            const duration = Date.now() - monitor.startTime.getTime();
            const agentHealthInfo = this.agentHealth.get(monitor.agentId);
            if (agentHealthInfo) {
                if (completed) {
                    agentHealthInfo.totalTasksCompleted++;
                    agentHealthInfo.averageCompletionTime = (agentHealthInfo.averageCompletionTime + duration) / 2;
                    if (monitor.status === 'timeout' || monitor.status === 'escalated') {
                        this.stats.recoveredTasks++;
                        monitor.status = 'recovered';
                    }
                } else {
                    agentHealthInfo.consecutiveTimeouts++;
                }
                agentHealthInfo.healthScore = this.calculateHealthScore(agentHealthInfo);
                agentHealthInfo.status = this.determineHealthStatus(agentHealthInfo);
            }
            this.stats.averageTaskDuration = (this.stats.averageTaskDuration + duration) / 2;
            this.monitors.delete(taskId);
            this.stats.activeMonitors = this.monitors.size;
            logger.info({ taskId, agentId: monitor.agentId, duration: Math.round(duration / 1000), completed }, 'Task monitoring stopped');
        } catch (error) {
            logger.error({ err: error, taskId }, 'Failed to stop task monitoring');
            throw new AppError('Task monitoring shutdown failed', { cause: error });
        }
    }

    // Record a heartbeat for an active task, clearing any pending warning state.
    updateTaskHeartbeat(taskId) {
        const monitor = this.monitors.get(taskId);
        if (monitor) {
            monitor.lastHeartbeat = new Date();
            if (monitor.status === 'warning') {
                monitor.status = 'monitoring';
            }
            const agentHealthInfo = this.agentHealth.get(monitor.agentId);
            if (agentHealthInfo) {
                agentHealthInfo.lastSeen = new Date();
                agentHealthInfo.consecutiveTimeouts = 0;
            }
            logger.debug({ taskId, agentId: monitor.agentId }, 'Task heartbeat updated');
        }
    }

    // Override the timeout/retry configuration for a task type.
    configureTaskType(config) {
        this.configs.set(config.taskType, config);
        logger.info({ taskType: config.taskType, config }, 'Watchdog configuration updated');
    }

    getStats() {
        this.stats.lastStatsUpdate = new Date();
        this.stats.agentsMonitored = this.agentHealth.size;
        this.stats.unhealthyAgents = Array.from(this.agentHealth.values())
            .filter(health => health.status === 'unhealthy' || health.status === 'offline').length;
        return { ...this.stats };
    }

    getAgentHealth(agentId) {
        if (agentId) {
            const health = this.agentHealth.get(agentId);
            return health ? [health] : [];
        }
        return Array.from(this.agentHealth.values());
    }

    getActiveMonitors() {
        return Array.from(this.monitors.values());
    }

    async forceTimeoutCheck() {
        await this.checkTimeouts();
    }

    // Seed per-task-type timeout, warning, and retry defaults.
    initializeDefaultConfigs() {
        const defaultConfigs = [
            { taskType: 'frontend', timeoutMinutes: 45, warningThresholdMinutes: 30, maxRetries: 2, escalationDelayMinutes: 5, healthCheckIntervalMinutes: 10 },
            { taskType: 'backend', timeoutMinutes: 60, warningThresholdMinutes: 40, maxRetries: 2, escalationDelayMinutes: 5, healthCheckIntervalMinutes: 10 },
            { taskType: 'database', timeoutMinutes: 30, warningThresholdMinutes: 20, maxRetries: 3, escalationDelayMinutes: 3, healthCheckIntervalMinutes: 5 },
            { taskType: 'testing', timeoutMinutes: 20, warningThresholdMinutes: 15, maxRetries: 1, escalationDelayMinutes: 2, healthCheckIntervalMinutes: 5 },
            { taskType: 'documentation', timeoutMinutes: 25, warningThresholdMinutes: 18, maxRetries: 1, escalationDelayMinutes: 3, healthCheckIntervalMinutes: 10 },
            { taskType: 'general', timeoutMinutes: 30, warningThresholdMinutes: 20, maxRetries: 2, escalationDelayMinutes: 5, healthCheckIntervalMinutes: 10 }
        ];
        defaultConfigs.forEach(config => {
            this.configs.set(config.taskType, config);
        });
        logger.debug({ configCount: defaultConfigs.length }, 'Default watchdog configurations initialized');
    }

    getConfigForTaskType(taskType) {
        return this.configs.get(taskType) || this.configs.get('general');
    }

    // Start the periodic timers: timeout checks every 30s, agent health checks every 2min.
    startWatchdog() {
        this.watchdogTimer = setInterval(() => {
            this.checkTimeouts().catch(error => {
                logger.error({ err: error }, 'Error in watchdog timeout check');
            });
        }, 30000);
        this.healthCheckTimer = setInterval(() => {
            this.performHealthChecks().catch(error => {
                logger.error({ err: error }, 'Error in agent health check');
            });
        }, 120000);
        logger.debug('Watchdog timers started');
    }

    // Flag tasks that are approaching or past their deadline and hand timeouts off for handling.
    async checkTimeouts() {
        const now = new Date();
        for (const monitor of this.monitors.values()) {
            try {
                if (monitor.status === 'monitoring' && now >= monitor.warningAt) {
                    monitor.status = 'warning';
                    logger.warn({ taskId: monitor.taskId, agentId: monitor.agentId, timeoutIn: Math.round((monitor.timeoutAt.getTime() - now.getTime()) / 60000) }, 'Task approaching timeout');
                }
                if (monitor.status !== 'timeout' && now >= monitor.timeoutAt) {
                    monitor.status = 'timeout';
                    this.stats.timeoutsDetected++;
                    logger.error({ taskId: monitor.taskId, agentId: monitor.agentId, duration: Math.round((now.getTime() - monitor.startTime.getTime()) / 60000) }, 'Task timeout detected');
                    await this.handleTimeout(monitor);
                }
            } catch (error) {
                logger.error({ err: error, taskId: monitor.taskId }, 'Error checking task timeout');
            }
        }
    }

    // Penalize the agent's health, then retry the task if retries remain, otherwise escalate.
    async handleTimeout(monitor) {
        try {
            const config = this.getConfigForTaskType(monitor.taskType);
            const agentHealthInfo = this.agentHealth.get(monitor.agentId);
            if (agentHealthInfo) {
                agentHealthInfo.consecutiveTimeouts++;
                agentHealthInfo.healthScore = this.calculateHealthScore(agentHealthInfo);
                agentHealthInfo.status = this.determineHealthStatus(agentHealthInfo);
            }
            if (monitor.retryCount < config.maxRetries) {
                monitor.retryCount++;
                await this.retryTask(monitor);
            } else {
                await this.escalateTask(monitor);
            }
        } catch (error) {
            logger.error({ err: error, taskId: monitor.taskId }, 'Failed to handle task timeout');
        }
    }

    // Release the task back to the streamer and reset the monitor's timers.
    async retryTask(monitor) {
        try {
            await this.taskStreamer.releaseTask(monitor.taskId, monitor.agentId);
            const config = this.getConfigForTaskType(monitor.taskType);
            const now = new Date();
            monitor.startTime = now;
            monitor.lastHeartbeat = now;
            monitor.timeoutAt = new Date(now.getTime() + config.timeoutMinutes * 60000);
            monitor.warningAt = new Date(now.getTime() + config.warningThresholdMinutes * 60000);
            monitor.status = 'monitoring';
            logger.info({ taskId: monitor.taskId, agentId: monitor.agentId, retryCount: monitor.retryCount }, 'Task retry initiated');
        } catch (error) {
            logger.error({ err: error, taskId: monitor.taskId }, 'Failed to retry task');
        }
    }

    // Move the task up the escalation ladder and execute the procedure if it is auto-executable.
    async escalateTask(monitor) {
        try {
            monitor.status = 'escalated';
            monitor.escalationLevel++;
            this.stats.escalationsTriggered++;
            const procedure = this.escalationProcedures[monitor.escalationLevel - 1];
            if (procedure) {
                logger.warn({ taskId: monitor.taskId, agentId: monitor.agentId, escalationLevel: monitor.escalationLevel, action: procedure.action }, 'Task escalation triggered');
                if (procedure.autoExecute) {
                    await this.executeEscalationAction(monitor, procedure);
                }
            } else {
                logger.error({ taskId: monitor.taskId, escalationLevel: monitor.escalationLevel }, 'No escalation procedure defined for level');
            }
        } catch (error) {
            logger.error({ err: error, taskId: monitor.taskId }, 'Failed to escalate task');
        }
    }

    async executeEscalationAction(monitor, procedure) {
        try {
            switch (procedure.action) {
                case 'reassign_task':
                    await this.taskStreamer.releaseTask(monitor.taskId, monitor.agentId);
                    logger.info({ taskId: monitor.taskId }, 'Task reassigned due to escalation');
                    break;
                case 'human_intervention':
                    logger.error({ taskId: monitor.taskId, agentId: monitor.agentId }, 'HUMAN INTERVENTION REQUIRED: Task escalated to maximum level');
                    break;
                case 'task_cancellation':
                    await this.stopMonitoring(monitor.taskId, false);
                    logger.warn({ taskId: monitor.taskId }, 'Task cancelled due to escalation');
                    break;
                default:
                    logger.warn({ action: procedure.action }, 'Escalation action not implemented');
            }
        } catch (error) {
            logger.error({ err: error, action: procedure.action }, 'Failed to execute escalation action');
        }
    }

    async initializeAgentHealth(agentId) {
        const health = {
            agentId,
            lastSeen: new Date(),
            consecutiveTimeouts: 0,
            totalTasksAssigned: 0,
            totalTasksCompleted: 0,
            averageCompletionTime: 0,
            healthScore: 1.0,
            status: 'healthy',
            lastHealthCheck: new Date()
        };
        this.agentHealth.set(agentId, health);
        logger.debug({ agentId }, 'Agent health tracking initialized');
    }

    // Mark agents offline after 10 minutes of silence; otherwise recompute their health score.
    async performHealthChecks() {
        const now = new Date();
        for (const health of this.agentHealth.values()) {
            try {
                const timeSinceLastSeen = now.getTime() - health.lastSeen.getTime();
                const maxIdleTime = 10 * 60 * 1000;
                if (timeSinceLastSeen > maxIdleTime) {
                    health.status = 'offline';
                } else {
                    health.healthScore = this.calculateHealthScore(health);
                    health.status = this.determineHealthStatus(health);
                }
                health.lastHealthCheck = now;
            } catch (error) {
                logger.error({ err: error, agentId: health.agentId }, 'Error in agent health check');
            }
        }
    }

    // Health score = completion rate minus 0.2 per consecutive timeout (capped at 0.8), clamped to [0, 1].
    calculateHealthScore(health) {
        const completionRate = health.totalTasksAssigned > 0 ? health.totalTasksCompleted / health.totalTasksAssigned : 1.0;
        const timeoutPenalty = Math.min(health.consecutiveTimeouts * 0.2, 0.8);
        const baseScore = completionRate - timeoutPenalty;
        return Math.max(0, Math.min(1, baseScore));
    }

    determineHealthStatus(health) {
        if (health.healthScore >= 0.8) return 'healthy';
        if (health.healthScore >= 0.5) return 'degraded';
        return 'unhealthy';
    }

    // Clear timers and state and release the singleton instance.
    destroy() {
        if (this.watchdogTimer) {
            clearInterval(this.watchdogTimer);
        }
        if (this.healthCheckTimer) {
            clearInterval(this.healthCheckTimer);
        }
        this.monitors.clear();
        this.agentHealth.clear();
        this.configs.clear();
        ExecutionWatchdog.instance = null;
        logger.info('Execution watchdog destroyed');
    }
}
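A minimal usage sketch of the class above, not part of the published file. It assumes the module resolves as './execution-watchdog.js' (the actual path inside the package is not shown here), and that task IDs, agent IDs, and the task object are supplied by the caller; the task only needs the `type` and optional `estimatedHours` fields that `startMonitoring` reads.

// Hypothetical usage sketch; import path and IDs are illustrative assumptions.
import { ExecutionWatchdog } from './execution-watchdog.js';

const watchdog = ExecutionWatchdog.getInstance();

// Optionally override the policy for a task type before assigning work.
watchdog.configureTaskType({
    taskType: 'backend',
    timeoutMinutes: 90,
    warningThresholdMinutes: 60,
    maxRetries: 2,
    escalationDelayMinutes: 5,
    healthCheckIntervalMinutes: 10
});

// Begin monitoring when a task is handed to an agent.
await watchdog.startMonitoring('task-123', 'agent-42', { type: 'backend', estimatedHours: 1 });

// The agent (or orchestrator) should report progress periodically to avoid false timeouts.
watchdog.updateTaskHeartbeat('task-123');

// Mark the task finished; pass false for the second argument if it failed.
await watchdog.stopMonitoring('task-123', true);

console.log(watchdog.getStats());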