vibe-coder-mcp
Production-ready MCP server with complete agent integration, multi-transport support, and comprehensive development automation tools for AI-assisted workflows.
import { AgentOrchestrator } from './agent-orchestrator.js';
import { TaskStreamer } from './task-streamer.js';
import { FeedbackProcessor } from './feedback-processor.js';
import { AppError } from '../../../utils/errors.js';
import logger from '../../../logger.js';
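/**
 * ExecutionWatchdog
 *
 * Singleton that monitors in-flight agent tasks, raises warnings as they
 * approach their per-task-type timeout, retries or escalates timed-out
 * tasks, and tracks per-agent health scores.
 *
 * Illustrative usage (a sketch only; in practice the orchestrator drives
 * these calls and supplies the taskId, agentId, and task object):
 *
 *   const watchdog = ExecutionWatchdog.getInstance();
 *   await watchdog.startMonitoring('task-1', 'agent-1', { type: 'backend', estimatedHours: 1 });
 *   watchdog.updateTaskHeartbeat('task-1');          // call on progress updates
 *   await watchdog.stopMonitoring('task-1', true);   // true = completed successfully
 */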
export class ExecutionWatchdog {
static instance = null;
configs = new Map();
monitors = new Map();
agentHealth = new Map();
agentOrchestrator;
taskStreamer;
feedbackProcessor;
watchdogTimer;
healthCheckTimer;
stats;
escalationProcedures;
constructor() {
this.agentOrchestrator = AgentOrchestrator.getInstance();
this.taskStreamer = TaskStreamer.getInstance();
this.feedbackProcessor = FeedbackProcessor.getInstance();
this.stats = {
totalTasksMonitored: 0,
activeMonitors: 0,
timeoutsDetected: 0,
escalationsTriggered: 0,
recoveredTasks: 0,
averageTaskDuration: 0,
agentsMonitored: 0,
unhealthyAgents: 0,
lastStatsUpdate: new Date()
};
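// Three-level escalation ladder; level 2 (restart_agent) is not
// auto-executed and requires operator approval.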
this.escalationProcedures = [
{
level: 1,
action: 'reassign_task',
delayMinutes: 5,
description: 'Reassign task to another agent',
autoExecute: true
},
{
level: 2,
action: 'restart_agent',
delayMinutes: 10,
description: 'Request agent restart',
autoExecute: false
},
{
level: 3,
action: 'human_intervention',
delayMinutes: 15,
description: 'Escalate to human operator',
autoExecute: true
}
];
this.initializeDefaultConfigs();
this.startWatchdog();
logger.info('Execution watchdog initialized');
}
static getInstance() {
if (!ExecutionWatchdog.instance) {
ExecutionWatchdog.instance = new ExecutionWatchdog();
}
return ExecutionWatchdog.instance;
}
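/**
 * Begin monitoring a task assigned to an agent. Deadlines come from the
 * task-type config: with the default 'backend' config (60 min timeout,
 * 40 min warning) a task started at t0 is flagged at t0 + 40 min and
 * times out at t0 + 60 min. Health tracking is initialized for agents
 * seen for the first time.
 */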
async startMonitoring(taskId, agentId, task) {
try {
const config = this.getConfigForTaskType(task.type);
const now = new Date();
const monitor = {
taskId,
agentId,
startTime: now,
lastHeartbeat: now,
timeoutAt: new Date(now.getTime() + config.timeoutMinutes * 60000),
warningAt: new Date(now.getTime() + config.warningThresholdMinutes * 60000),
status: 'monitoring',
retryCount: 0,
escalationLevel: 0,
taskType: task.type,
estimatedDuration: task.estimatedHours ? task.estimatedHours * 60 : undefined
};
this.monitors.set(taskId, monitor);
this.stats.totalTasksMonitored++;
this.stats.activeMonitors = this.monitors.size;
if (!this.agentHealth.has(agentId)) {
await this.initializeAgentHealth(agentId);
}
const agentHealthInfo = this.agentHealth.get(agentId);
agentHealthInfo.totalTasksAssigned++;
agentHealthInfo.lastSeen = now;
logger.info({
taskId,
agentId,
timeoutAt: monitor.timeoutAt,
taskType: task.type
}, 'Task monitoring started');
}
catch (error) {
logger.error({ err: error, taskId, agentId }, 'Failed to start task monitoring');
throw new AppError('Task monitoring startup failed', { cause: error });
}
}
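/**
 * Stop monitoring a task. When `completed` is true the agent's completion
 * count is updated and a task that had already timed out or escalated is
 * counted as recovered; when false the agent's consecutive-timeout count
 * is incremented. Note the "average" durations here are a simple running
 * blend ((previous + latest) / 2), not a true mean.
 */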
async stopMonitoring(taskId, completed = true) {
try {
const monitor = this.monitors.get(taskId);
if (!monitor) {
logger.warn({ taskId }, 'Attempted to stop monitoring non-existent task');
return;
}
const duration = Date.now() - monitor.startTime.getTime();
const agentHealthInfo = this.agentHealth.get(monitor.agentId);
if (agentHealthInfo) {
if (completed) {
agentHealthInfo.totalTasksCompleted++;
agentHealthInfo.averageCompletionTime =
(agentHealthInfo.averageCompletionTime + duration) / 2;
if (monitor.status === 'timeout' || monitor.status === 'escalated') {
this.stats.recoveredTasks++;
monitor.status = 'recovered';
}
}
else {
agentHealthInfo.consecutiveTimeouts++;
}
agentHealthInfo.healthScore = this.calculateHealthScore(agentHealthInfo);
agentHealthInfo.status = this.determineHealthStatus(agentHealthInfo);
}
this.stats.averageTaskDuration =
(this.stats.averageTaskDuration + duration) / 2;
this.monitors.delete(taskId);
this.stats.activeMonitors = this.monitors.size;
logger.info({
taskId,
agentId: monitor.agentId,
duration: Math.round(duration / 1000),
completed
}, 'Task monitoring stopped');
}
catch (error) {
logger.error({ err: error, taskId }, 'Failed to stop task monitoring');
throw new AppError('Task monitoring shutdown failed', { cause: error });
}
}
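/**
 * Record a heartbeat for a task. Clears a pending warning, refreshes the
 * agent's lastSeen timestamp, and resets its consecutive-timeout counter.
 */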
updateTaskHeartbeat(taskId) {
const monitor = this.monitors.get(taskId);
if (monitor) {
monitor.lastHeartbeat = new Date();
if (monitor.status === 'warning') {
monitor.status = 'monitoring';
}
const agentHealthInfo = this.agentHealth.get(monitor.agentId);
if (agentHealthInfo) {
agentHealthInfo.lastSeen = new Date();
agentHealthInfo.consecutiveTimeouts = 0;
}
logger.debug({ taskId, agentId: monitor.agentId }, 'Task heartbeat updated');
}
}
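/**
 * Register or override the watchdog config for a task type, for example
 * (values are illustrative):
 *
 *   watchdog.configureTaskType({
 *     taskType: 'backend',
 *     timeoutMinutes: 90,
 *     warningThresholdMinutes: 60,
 *     maxRetries: 3,
 *     escalationDelayMinutes: 5,
 *     healthCheckIntervalMinutes: 10
 *   });
 */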
configureTaskType(config) {
this.configs.set(config.taskType, config);
logger.info({ taskType: config.taskType, config }, 'Watchdog configuration updated');
}
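/** Refresh the agent counters and return a snapshot of watchdog statistics (a copy, not a live reference). */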
getStats() {
this.stats.lastStatsUpdate = new Date();
this.stats.agentsMonitored = this.agentHealth.size;
this.stats.unhealthyAgents = Array.from(this.agentHealth.values())
.filter(health => health.status === 'unhealthy' || health.status === 'offline').length;
return { ...this.stats };
}
getAgentHealth(agentId) {
if (agentId) {
const health = this.agentHealth.get(agentId);
return health ? [health] : [];
}
return Array.from(this.agentHealth.values());
}
getActiveMonitors() {
return Array.from(this.monitors.values());
}
async forceTimeoutCheck() {
await this.checkTimeouts();
}
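/**
 * Seed per-task-type defaults. Timeouts range from 20 minutes (testing)
 * to 60 minutes (backend); 'general' is the fallback for unknown types.
 */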
initializeDefaultConfigs() {
const defaultConfigs = [
{
taskType: 'frontend',
timeoutMinutes: 45,
warningThresholdMinutes: 30,
maxRetries: 2,
escalationDelayMinutes: 5,
healthCheckIntervalMinutes: 10
},
{
taskType: 'backend',
timeoutMinutes: 60,
warningThresholdMinutes: 40,
maxRetries: 2,
escalationDelayMinutes: 5,
healthCheckIntervalMinutes: 10
},
{
taskType: 'database',
timeoutMinutes: 30,
warningThresholdMinutes: 20,
maxRetries: 3,
escalationDelayMinutes: 3,
healthCheckIntervalMinutes: 5
},
{
taskType: 'testing',
timeoutMinutes: 20,
warningThresholdMinutes: 15,
maxRetries: 1,
escalationDelayMinutes: 2,
healthCheckIntervalMinutes: 5
},
{
taskType: 'documentation',
timeoutMinutes: 25,
warningThresholdMinutes: 18,
maxRetries: 1,
escalationDelayMinutes: 3,
healthCheckIntervalMinutes: 10
},
{
taskType: 'general',
timeoutMinutes: 30,
warningThresholdMinutes: 20,
maxRetries: 2,
escalationDelayMinutes: 5,
healthCheckIntervalMinutes: 10
}
];
defaultConfigs.forEach(config => {
this.configs.set(config.taskType, config);
});
logger.debug({ configCount: defaultConfigs.length }, 'Default watchdog configurations initialized');
}
getConfigForTaskType(taskType) {
return this.configs.get(taskType) || this.configs.get('general');
}
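/** Start the periodic timers: timeout scan every 30 s, agent health check every 2 min. */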
startWatchdog() {
this.watchdogTimer = setInterval(() => {
this.checkTimeouts().catch(error => {
logger.error({ err: error }, 'Error in watchdog timeout check');
});
}, 30000);
this.healthCheckTimer = setInterval(() => {
this.performHealthChecks().catch(error => {
logger.error({ err: error }, 'Error in agent health check');
});
}, 120000);
logger.debug('Watchdog timers started');
}
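/**
 * Scan active monitors: tasks past their warning deadline move from
 * 'monitoring' to 'warning', and tasks past their timeout deadline move to
 * 'timeout', which triggers retry/escalation handling. A monitor left past
 * its deadline is picked up again on later scans, advancing the escalation
 * ladder.
 */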
async checkTimeouts() {
const now = new Date();
for (const monitor of this.monitors.values()) {
try {
if (monitor.status === 'monitoring' && now >= monitor.warningAt) {
monitor.status = 'warning';
logger.warn({
taskId: monitor.taskId,
agentId: monitor.agentId,
timeoutIn: Math.round((monitor.timeoutAt.getTime() - now.getTime()) / 60000)
}, 'Task approaching timeout');
}
if (monitor.status !== 'timeout' && now >= monitor.timeoutAt) {
monitor.status = 'timeout';
this.stats.timeoutsDetected++;
logger.error({
taskId: monitor.taskId,
agentId: monitor.agentId,
duration: Math.round((now.getTime() - monitor.startTime.getTime()) / 60000)
}, 'Task timeout detected');
await this.handleTimeout(monitor);
}
}
catch (error) {
logger.error({ err: error, taskId: monitor.taskId }, 'Error checking task timeout');
}
}
}
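/** Penalize the agent's health, then retry the task while retries remain; otherwise escalate. */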
async handleTimeout(monitor) {
try {
const config = this.getConfigForTaskType(monitor.taskType);
const agentHealthInfo = this.agentHealth.get(monitor.agentId);
if (agentHealthInfo) {
agentHealthInfo.consecutiveTimeouts++;
agentHealthInfo.healthScore = this.calculateHealthScore(agentHealthInfo);
agentHealthInfo.status = this.determineHealthStatus(agentHealthInfo);
}
if (monitor.retryCount < config.maxRetries) {
monitor.retryCount++;
await this.retryTask(monitor);
}
else {
await this.escalateTask(monitor);
}
}
catch (error) {
logger.error({ err: error, taskId: monitor.taskId }, 'Failed to handle task timeout');
}
}
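/**
 * Release the timed-out task back to the task streamer and reset the
 * monitor's clock and deadlines. The retry count is preserved, so the next
 * timeout escalates once maxRetries is exhausted.
 */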
async retryTask(monitor) {
try {
await this.taskStreamer.releaseTask(monitor.taskId, monitor.agentId);
const config = this.getConfigForTaskType(monitor.taskType);
const now = new Date();
monitor.startTime = now;
monitor.lastHeartbeat = now;
monitor.timeoutAt = new Date(now.getTime() + config.timeoutMinutes * 60000);
monitor.warningAt = new Date(now.getTime() + config.warningThresholdMinutes * 60000);
monitor.status = 'monitoring';
logger.info({
taskId: monitor.taskId,
agentId: monitor.agentId,
retryCount: monitor.retryCount
}, 'Task retry initiated');
}
catch (error) {
logger.error({ err: error, taskId: monitor.taskId }, 'Failed to retry task');
}
}
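/**
 * Advance the escalation ladder (1: reassign, 2: request agent restart,
 * 3: human intervention) and execute the step automatically when its
 * procedure has autoExecute enabled. Levels beyond 3 only log an error.
 */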
async escalateTask(monitor) {
try {
monitor.status = 'escalated';
monitor.escalationLevel++;
this.stats.escalationsTriggered++;
const procedure = this.escalationProcedures[monitor.escalationLevel - 1];
if (procedure) {
logger.warn({
taskId: monitor.taskId,
agentId: monitor.agentId,
escalationLevel: monitor.escalationLevel,
action: procedure.action
}, 'Task escalation triggered');
if (procedure.autoExecute) {
await this.executeEscalationAction(monitor, procedure);
}
}
else {
logger.error({
taskId: monitor.taskId,
escalationLevel: monitor.escalationLevel
}, 'No escalation procedure defined for level');
}
}
catch (error) {
logger.error({ err: error, taskId: monitor.taskId }, 'Failed to escalate task');
}
}
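/**
 * Perform an escalation step. Only reassignment, human-intervention
 * logging, and task cancellation are handled here; 'restart_agent'
 * (level 2, autoExecute: false) would fall through to the default branch
 * if it were ever auto-executed.
 */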
async executeEscalationAction(monitor, procedure) {
try {
switch (procedure.action) {
case 'reassign_task':
await this.taskStreamer.releaseTask(monitor.taskId, monitor.agentId);
logger.info({ taskId: monitor.taskId }, 'Task reassigned due to escalation');
break;
case 'human_intervention':
logger.error({
taskId: monitor.taskId,
agentId: monitor.agentId
}, 'HUMAN INTERVENTION REQUIRED: Task escalated to maximum level');
break;
case 'task_cancellation':
await this.stopMonitoring(monitor.taskId, false);
logger.warn({ taskId: monitor.taskId }, 'Task cancelled due to escalation');
break;
default:
logger.warn({ action: procedure.action }, 'Escalation action not implemented');
}
}
catch (error) {
logger.error({ err: error, action: procedure.action }, 'Failed to execute escalation action');
}
}
async initializeAgentHealth(agentId) {
const health = {
agentId,
lastSeen: new Date(),
consecutiveTimeouts: 0,
totalTasksAssigned: 0,
totalTasksCompleted: 0,
averageCompletionTime: 0,
healthScore: 1.0,
status: 'healthy',
lastHealthCheck: new Date()
};
this.agentHealth.set(agentId, health);
logger.debug({ agentId }, 'Agent health tracking initialized');
}
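/**
 * Periodic agent health sweep: an agent idle for more than 10 minutes is
 * marked 'offline'; otherwise its health score and status are recomputed.
 */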
async performHealthChecks() {
const now = new Date();
for (const health of this.agentHealth.values()) {
try {
const timeSinceLastSeen = now.getTime() - health.lastSeen.getTime();
const maxIdleTime = 10 * 60 * 1000;
if (timeSinceLastSeen > maxIdleTime) {
health.status = 'offline';
}
else {
health.healthScore = this.calculateHealthScore(health);
health.status = this.determineHealthStatus(health);
}
health.lastHealthCheck = now;
}
catch (error) {
logger.error({ err: error, agentId: health.agentId }, 'Error in agent health check');
}
}
}
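/**
 * Health score = clamp(completionRate - min(consecutiveTimeouts * 0.2, 0.8), 0, 1).
 * Example: 8 of 10 tasks completed with 1 consecutive timeout gives
 * 0.8 - 0.2 = 0.6, which determineHealthStatus maps to 'degraded'.
 */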
calculateHealthScore(health) {
const completionRate = health.totalTasksAssigned > 0
? health.totalTasksCompleted / health.totalTasksAssigned
: 1.0;
const timeoutPenalty = Math.min(health.consecutiveTimeouts * 0.2, 0.8);
const baseScore = completionRate - timeoutPenalty;
return Math.max(0, Math.min(1, baseScore));
}
determineHealthStatus(health) {
if (health.healthScore >= 0.8)
return 'healthy';
if (health.healthScore >= 0.5)
return 'degraded';
return 'unhealthy';
}
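/** Stop both timers, clear all monitors, agent health, and configs, and reset the singleton. */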
destroy() {
if (this.watchdogTimer) {
clearInterval(this.watchdogTimer);
}
if (this.healthCheckTimer) {
clearInterval(this.healthCheckTimer);
}
this.monitors.clear();
this.agentHealth.clear();
this.configs.clear();
ExecutionWatchdog.instance = null;
logger.info('Execution watchdog destroyed');
}
}