UNPKG

vibe-coder-mcp

Version:

Production-ready MCP server with complete agent integration, multi-transport support, and comprehensive development automation tools for AI-assisted workflows.

434 lines (433 loc) 17.2 kB
import { AgentOrchestrator } from './agent-orchestrator.js';
import { TaskStreamer } from './task-streamer.js';
import { ExecutionWatchdog } from './execution-watchdog.js';
import { FeedbackProcessor } from './feedback-processor.js';
import { AppError, ValidationError } from '../../../utils/errors.js';
import logger from '../../../logger.js';

/**
 * Process-wide registry of reported task errors that selects and runs a
 * recovery strategy for each recoverable error, tracks recurring error
 * patterns, and keeps aggregate statistics.
 *
 * Obtain the shared instance through {@link ErrorRecovery.getInstance}.
 */
export class ErrorRecovery {
    static instance = null;
    agentOrchestrator;
    taskStreamer;
    executionWatchdog;
    feedbackProcessor;
    config;
    /** errorId -> error record created by reportError(). */
    errors = new Map();
    /** errorId -> array of recovery attempt records, oldest first. */
    recoveryAttempts = new Map();
    /** pattern key -> aggregated pattern record. */
    errorPatterns = new Map();
    stats;

    /**
     * @param {object} [config] Overrides merged over the built-in defaults below.
     */
    constructor(config) {
        this.config = {
            maxRetryAttempts: 3,
            retryDelayMinutes: 2,
            exponentialBackoff: true,
            maxBackoffMinutes: 30,
            enablePatternAnalysis: true,
            patternAnalysisWindowHours: 24,
            autoRecoveryEnabled: true,
            // NOTE(review): escalationThreshold is not read anywhere in this class.
            escalationThreshold: 5,
            ...config
        };
        this.agentOrchestrator = AgentOrchestrator.getInstance();
        this.taskStreamer = TaskStreamer.getInstance();
        this.executionWatchdog = ExecutionWatchdog.getInstance();
        this.feedbackProcessor = FeedbackProcessor.getInstance();
        this.stats = {
            totalErrors: 0,
            recoveredErrors: 0,
            failedRecoveries: 0,
            averageRecoveryTime: 0,
            mostCommonErrorType: 'unknown_error',
            // NOTE(review): mostEffectiveStrategy and patternsPrevented are
            // initialised here but never updated by this class.
            mostEffectiveStrategy: 'retry_same_agent',
            patternsPrevented: 0,
            lastStatsUpdate: new Date()
        };
        logger.info({ config: this.config }, 'Error recovery system initialized');
    }

    /**
     * Returns the shared instance, creating it on first use.
     *
     * @param {object} [config] Only honoured by the call that creates the instance.
     * @returns {ErrorRecovery}
     */
    static getInstance(config) {
        if (!ErrorRecovery.instance) {
            ErrorRecovery.instance = new ErrorRecovery(config);
        }
        return ErrorRecovery.instance;
    }

    /**
     * Records an error and, depending on configuration, updates pattern
     * statistics and starts the first recovery attempt.
     *
     * @param {string} taskId
     * @param {string} errorType
     * @param {string} message
     * @param {string} [agentId]
     * @param {object} [context]
     * @param {string} [stackTrace]
     * @returns {Promise<string>} The generated error id.
     * @throws {AppError} If recording, pattern analysis or the first recovery attempt throws.
     */
    async reportError(taskId, errorType, message, agentId, context, stackTrace) {
        try {
            const errorId = `error_${Date.now()}_${Math.random().toString(36).slice(2, 11)}`;
            const errorInfo = {
                id: errorId,
                taskId,
                agentId,
                errorType,
                message,
                stackTrace,
                timestamp: new Date(),
                context: context || {},
                severity: this.determineSeverity(errorType, message),
                recoverable: this.isRecoverable(errorType)
            };
            this.errors.set(errorId, errorInfo);
            this.stats.totalErrors++;
            logger.error({ errorId, taskId, agentId, errorType, severity: errorInfo.severity }, `Error reported: ${message}`);
            if (this.config.enablePatternAnalysis) {
                await this.analyzeErrorPattern(errorInfo);
            }
            if (this.config.autoRecoveryEnabled && errorInfo.recoverable) {
                await this.attemptRecovery(errorId);
            }
            return errorId;
        }
        catch (error) {
            logger.error({ err: error, taskId, errorType }, 'Failed to report error');
            throw new AppError('Error reporting failed', { cause: error });
        }
    }

    /**
     * Runs one recovery attempt for a previously reported error.
     *
     * On failure a follow-up attempt is scheduled one second later while
     * attempts remain; once the last allowed attempt has failed the error is
     * escalated. Attempts after the first wait for calculateRetryDelay()
     * before executing.
     *
     * @param {string} errorId
     * @returns {Promise<boolean>} true if this attempt succeeded.
     * @throws {AppError} If the error id is unknown or the attempt throws.
     */
    async attemptRecovery(errorId) {
        try {
            const errorInfo = this.errors.get(errorId);
            if (!errorInfo) {
                throw new ValidationError(`Error not found: ${errorId}`);
            }
            const attempts = this.recoveryAttempts.get(errorId) ?? [];
            if (attempts.length >= this.config.maxRetryAttempts) {
                logger.warn({ errorId, attempts: attempts.length }, 'Max recovery attempts exceeded');
                await this.escalateError(errorId);
                return false;
            }
            // `attempts` still holds only the *previous* attempts at this point.
            const strategy = this.selectRecoveryStrategy(errorInfo, attempts);
            const attemptId = `attempt_${Date.now()}_${Math.random().toString(36).slice(2, 11)}`;
            const attempt = {
                id: attemptId,
                errorId,
                strategy,
                attemptNumber: attempts.length + 1,
                startedAt: new Date(),
                success: false
            };
            // Register the very array we push into, so the local view and the
            // stored history can never diverge (a no-op when already stored).
            this.recoveryAttempts.set(errorId, attempts);
            attempts.push(attempt);
            logger.info({ errorId, attemptId, strategy, attemptNumber: attempt.attemptNumber }, 'Starting recovery attempt');
            // Only retries are delayed; the first attempt runs immediately.
            if (attempt.attemptNumber > 1) {
                const delay = this.calculateRetryDelay(attempt.attemptNumber);
                await this.sleep(delay);
            }
            const success = await this.executeRecoveryStrategy(errorInfo, attempt);
            attempt.completedAt = new Date();
            attempt.success = success;
            if (success) {
                this.stats.recoveredErrors++;
                const recoveryTime = attempt.completedAt.getTime() - attempt.startedAt.getTime();
                // Incremental arithmetic mean over all successful recoveries.
                this.stats.averageRecoveryTime +=
                    (recoveryTime - this.stats.averageRecoveryTime) / this.stats.recoveredErrors;
                logger.info({ errorId, attemptId, strategy, recoveryTime: Math.round(recoveryTime / 1000) }, 'Recovery successful');
            }
            else {
                this.stats.failedRecoveries++;
                logger.warn({ errorId, attemptId, strategy }, 'Recovery attempt failed');
                if (attempt.attemptNumber < this.config.maxRetryAttempts) {
                    setTimeout(() => {
                        this.attemptRecovery(errorId).catch(error => {
                            logger.error({ err: error, errorId }, 'Failed to retry recovery');
                        });
                    }, 1000);
                }
                else {
                    // Last allowed attempt failed: escalate instead of going silent.
                    await this.escalateError(errorId);
                }
            }
            return success;
        }
        catch (error) {
            logger.error({ err: error, errorId }, 'Failed to attempt recovery');
            throw new AppError('Recovery attempt failed', { cause: error });
        }
    }

    /**
     * @param {string} errorId
     * @returns {object|null} The stored error record, or null if unknown.
     */
    getError(errorId) {
        return this.errors.get(errorId) || null;
    }

    /**
     * @param {string} errorId
     * @returns {object[]} Recorded attempts for the error (empty if none).
     */
    getRecoveryAttempts(errorId) {
        return this.recoveryAttempts.get(errorId) || [];
    }

    /**
     * @returns {object[]} All tracked pattern records (the stored objects, not copies).
     */
    getErrorPatterns() {
        return Array.from(this.errorPatterns.values());
    }

    /**
     * Refreshes derived statistics and returns a shallow copy of them.
     *
     * @returns {object}
     */
    getStats() {
        this.updateStats();
        return { ...this.stats };
    }

    /**
     * Drops errors (with their attempts) and patterns older than the cutoff.
     *
     * @param {number} [olderThanHours=168] Age threshold in hours.
     */
    cleanup(olderThanHours = 168) {
        const cutoffTime = new Date(Date.now() - olderThanHours * 60 * 60 * 1000);
        for (const [errorId, errorInfo] of this.errors.entries()) {
            if (errorInfo.timestamp < cutoffTime) {
                this.errors.delete(errorId);
                this.recoveryAttempts.delete(errorId);
            }
        }
        for (const [patternKey, pattern] of this.errorPatterns.entries()) {
            if (pattern.lastOccurrence < cutoffTime) {
                this.errorPatterns.delete(patternKey);
            }
        }
        logger.debug({ cutoffTime, olderThanHours }, 'Error recovery cleanup completed');
    }

    /**
     * Maps an error to 'critical' | 'high' | 'medium' | 'low'. The error type
     * decides first; otherwise keywords in the message are used.
     *
     * @param {string} errorType
     * @param {string} message
     * @returns {string}
     */
    determineSeverity(errorType, message) {
        const criticalTypes = ['system_error', 'resource_exhaustion'];
        const highTypes = ['agent_failure', 'dependency_failure'];
        const mediumTypes = ['task_timeout', 'communication_failure'];
        if (criticalTypes.includes(errorType))
            return 'critical';
        if (highTypes.includes(errorType))
            return 'high';
        if (mediumTypes.includes(errorType))
            return 'medium';
        // Coerce so a missing or non-string message cannot throw here.
        const lowerMessage = String(message ?? '').toLowerCase();
        if (lowerMessage.includes('critical') || lowerMessage.includes('fatal'))
            return 'critical';
        if (lowerMessage.includes('error') || lowerMessage.includes('failed'))
            return 'high';
        if (lowerMessage.includes('warning') || lowerMessage.includes('timeout'))
            return 'medium';
        return 'low';
    }

    /**
     * @param {string} errorType
     * @returns {boolean} false for 'validation_error' and 'system_error', true otherwise.
     */
    isRecoverable(errorType) {
        const unrecoverableTypes = ['validation_error', 'system_error'];
        return !unrecoverableTypes.includes(errorType);
    }

    /**
     * Chooses the strategy for the next attempt.
     *
     * A matching pattern's suggested strategy is preferred, but only while it
     * has not already been tried for this error; otherwise the choice falls
     * back to the per-type rules, which switch strategy after a first attempt.
     *
     * @param {object} errorInfo
     * @param {object[]} previousAttempts Attempts already made for this error.
     * @returns {string} Strategy name.
     */
    selectRecoveryStrategy(errorInfo, previousAttempts) {
        const pattern = this.findMatchingPattern(errorInfo);
        if (pattern) {
            const suggested = pattern.suggestedStrategy;
            const alreadyTried = previousAttempts.some(previous => previous.strategy === suggested);
            if (!alreadyTried) {
                return suggested;
            }
        }
        switch (errorInfo.errorType) {
            case 'agent_failure':
                return previousAttempts.length === 0 ? 'restart_agent' : 'reassign_different_agent';
            case 'task_timeout':
                return previousAttempts.length === 0 ? 'retry_same_agent' : 'reassign_different_agent';
            case 'dependency_failure':
                return 'retry_same_agent';
            case 'communication_failure':
                return 'retry_same_agent';
            case 'resource_exhaustion':
                return 'reassign_different_agent';
            default:
                return previousAttempts.length === 0 ? 'retry_same_agent' : 'reassign_different_agent';
        }
    }

    /**
     * Dispatches to the handler for attempt.strategy. Unknown strategies and
     * handler exceptions are logged and reported as failure.
     *
     * @param {object} errorInfo
     * @param {object} attempt
     * @returns {Promise<boolean>}
     */
    async executeRecoveryStrategy(errorInfo, attempt) {
        try {
            switch (attempt.strategy) {
                case 'retry_same_agent':
                    return await this.retryWithSameAgent(errorInfo, attempt);
                case 'reassign_different_agent':
                    return await this.reassignToDifferentAgent(errorInfo, attempt);
                case 'restart_agent':
                    return await this.restartAgent(errorInfo, attempt);
                case 'split_task':
                    return await this.splitTask(errorInfo, attempt);
                case 'escalate_human':
                    return await this.escalateToHuman(errorInfo, attempt);
                case 'skip_task':
                    return await this.skipTask(errorInfo, attempt);
                default:
                    logger.warn({ strategy: attempt.strategy }, 'Unknown recovery strategy');
                    return false;
            }
        }
        catch (error) {
            logger.error({ err: error, strategy: attempt.strategy }, 'Recovery strategy execution failed');
            return false;
        }
    }

    /**
     * Calls taskStreamer.releaseTask for the task and its current agent.
     * Fails immediately when the error has no agentId.
     *
     * @returns {Promise<boolean>}
     */
    async retryWithSameAgent(errorInfo, attempt) {
        if (!errorInfo.agentId)
            return false;
        try {
            await this.taskStreamer.releaseTask(errorInfo.taskId, errorInfo.agentId);
            attempt.message = 'Task released for retry with same agent';
            return true;
        }
        catch (error) {
            attempt.message = `Failed to retry with same agent: ${error}`;
            return false;
        }
    }

    /**
     * Calls taskStreamer.releaseTask when an agentId is known; succeeds
     * without releasing anything when it is not.
     *
     * @returns {Promise<boolean>}
     */
    async reassignToDifferentAgent(errorInfo, attempt) {
        try {
            if (errorInfo.agentId) {
                await this.taskStreamer.releaseTask(errorInfo.taskId, errorInfo.agentId);
            }
            attempt.message = 'Task released for reassignment to different agent';
            return true;
        }
        catch (error) {
            attempt.message = `Failed to reassign task: ${error}`;
            return false;
        }
    }

    /**
     * Logs an agent restart request; no restart is performed by this method.
     *
     * @returns {Promise<boolean>} false when the error has no agentId.
     */
    async restartAgent(errorInfo, attempt) {
        if (!errorInfo.agentId)
            return false;
        try {
            logger.warn({ agentId: errorInfo.agentId }, 'Agent restart requested');
            attempt.message = 'Agent restart requested';
            return true;
        }
        catch (error) {
            attempt.message = `Failed to restart agent: ${error}`;
            return false;
        }
    }

    /**
     * Logs a task-splitting request; no splitting is performed by this method.
     *
     * @returns {Promise<boolean>}
     */
    async splitTask(errorInfo, attempt) {
        try {
            logger.info({ taskId: errorInfo.taskId }, 'Task splitting requested');
            attempt.message = 'Task splitting initiated';
            return true;
        }
        catch (error) {
            attempt.message = `Failed to split task: ${error}`;
            return false;
        }
    }

    /**
     * Logs that human intervention is required for the error.
     *
     * @returns {Promise<boolean>}
     */
    async escalateToHuman(errorInfo, attempt) {
        try {
            logger.error({ errorId: errorInfo.id, taskId: errorInfo.taskId, agentId: errorInfo.agentId }, 'HUMAN INTERVENTION REQUIRED: Error escalated');
            attempt.message = 'Escalated to human intervention';
            return true;
        }
        catch (error) {
            attempt.message = `Failed to escalate: ${error}`;
            return false;
        }
    }

    /**
     * Logs that the task is being skipped.
     *
     * @returns {Promise<boolean>}
     */
    async skipTask(errorInfo, attempt) {
        try {
            logger.warn({ taskId: errorInfo.taskId }, 'Task skipped due to recovery failure');
            attempt.message = 'Task skipped';
            return true;
        }
        catch (error) {
            attempt.message = `Failed to skip task: ${error}`;
            return false;
        }
    }

    /**
     * Creates or updates the pattern record keyed by error type plus leading
     * message words. Prevention measures are generated once the pattern has
     * occurred at least three times. Failures are logged, never thrown.
     *
     * @param {object} errorInfo
     * @returns {Promise<void>}
     */
    async analyzeErrorPattern(errorInfo) {
        try {
            const patternKey = `${errorInfo.errorType}_${this.extractPatternFromMessage(errorInfo.message)}`;
            let pattern = this.errorPatterns.get(patternKey);
            if (!pattern) {
                pattern = {
                    pattern: patternKey,
                    errorType: errorInfo.errorType,
                    frequency: 0,
                    lastOccurrence: errorInfo.timestamp,
                    affectedAgents: new Set(),
                    affectedTasks: new Set(),
                    // The pattern is not stored yet, so this resolves to the
                    // per-type first-attempt strategy.
                    suggestedStrategy: this.selectRecoveryStrategy(errorInfo, []),
                    preventionMeasures: []
                };
                this.errorPatterns.set(patternKey, pattern);
            }
            pattern.frequency++;
            pattern.lastOccurrence = errorInfo.timestamp;
            if (errorInfo.agentId)
                pattern.affectedAgents.add(errorInfo.agentId);
            pattern.affectedTasks.add(errorInfo.taskId);
            if (pattern.frequency >= 3) {
                pattern.preventionMeasures = this.generatePreventionMeasures(pattern);
            }
            logger.debug({ patternKey, frequency: pattern.frequency }, 'Error pattern updated');
        }
        catch (error) {
            logger.error({ err: error }, 'Failed to analyze error pattern');
        }
    }

    /**
     * Builds a pattern fragment from the first three "significant" words of
     * the message: lower-cased, longer than three characters, not a stop word.
     *
     * @param {string} message
     * @returns {string} Up to three words joined with '_' (may be empty).
     */
    extractPatternFromMessage(message) {
        const stopWords = ['the', 'and', 'for', 'with', 'from', 'this', 'that'];
        // Coerce so a missing or non-string message cannot abort recovery.
        const words = String(message ?? '').toLowerCase().split(/\s+/);
        const significantWords = words.filter(word => word.length > 3 && !stopWords.includes(word));
        return significantWords.slice(0, 3).join('_');
    }

    /**
     * @param {object} errorInfo
     * @returns {object|null} The pattern record with the same key as this error, or null.
     */
    findMatchingPattern(errorInfo) {
        const patternKey = `${errorInfo.errorType}_${this.extractPatternFromMessage(errorInfo.message)}`;
        return this.errorPatterns.get(patternKey) || null;
    }

    /**
     * @param {object} pattern
     * @returns {string[]} Fixed suggestions for the pattern's error type; empty for other types.
     */
    generatePreventionMeasures(pattern) {
        const measures = [];
        switch (pattern.errorType) {
            case 'agent_failure':
                measures.push('Implement agent health monitoring');
                measures.push('Add agent restart automation');
                break;
            case 'task_timeout':
                measures.push('Adjust timeout thresholds');
                measures.push('Implement task complexity analysis');
                break;
            case 'communication_failure':
                measures.push('Add communication retry logic');
                measures.push('Implement connection pooling');
                break;
        }
        return measures;
    }

    /**
     * @param {number} attemptNumber 1-based attempt number.
     * @returns {number} Delay in milliseconds: retryDelayMinutes, doubled per
     *   attempt and capped at maxBackoffMinutes when exponentialBackoff is on.
     */
    calculateRetryDelay(attemptNumber) {
        if (!this.config.exponentialBackoff) {
            return this.config.retryDelayMinutes * 60000;
        }
        const delay = this.config.retryDelayMinutes * Math.pow(2, attemptNumber - 1);
        return Math.min(delay, this.config.maxBackoffMinutes) * 60000;
    }

    /**
     * @param {number} ms
     * @returns {Promise<void>} Resolves after ms milliseconds.
     */
    sleep(ms) {
        return new Promise(resolve => setTimeout(resolve, ms));
    }

    /**
     * Logs that an error has exhausted its recovery attempts. Does nothing
     * for an unknown error id.
     *
     * @param {string} errorId
     * @returns {Promise<void>}
     */
    async escalateError(errorId) {
        const errorInfo = this.errors.get(errorId);
        if (errorInfo) {
            logger.error({ errorId, taskId: errorInfo.taskId, errorType: errorInfo.errorType }, 'Error escalated after max recovery attempts');
        }
    }

    /**
     * Recomputes mostCommonErrorType from the stored errors and stamps
     * lastStatsUpdate. Ties keep the type encountered first.
     */
    updateStats() {
        const errorTypeCounts = new Map();
        for (const error of this.errors.values()) {
            errorTypeCounts.set(error.errorType, (errorTypeCounts.get(error.errorType) || 0) + 1);
        }
        let maxCount = 0;
        let mostCommonType = 'unknown_error';
        for (const [type, count] of errorTypeCounts.entries()) {
            if (count > maxCount) {
                maxCount = count;
                mostCommonType = type;
            }
        }
        this.stats.mostCommonErrorType = mostCommonType;
        this.stats.lastStatsUpdate = new Date();
    }

    /**
     * Clears all stored state and resets the singleton so the next
     * getInstance() call builds a fresh instance.
     */
    destroy() {
        this.errors.clear();
        this.recoveryAttempts.clear();
        this.errorPatterns.clear();
        ErrorRecovery.instance = null;
        logger.info('Error recovery system destroyed');
    }
}