devflow-ai
Version: 
Enterprise-grade AI agent orchestration with swarm management UI dashboard
346 lines (290 loc) • 9.93 kB
text/typescript
/**
 * Recovery Manager for MCP
 * Orchestrates all recovery components for comprehensive connection stability
 */
import { EventEmitter } from 'node:events';
import type { ILogger } from '../../core/logger.js';
import type { MCPClient } from '../client.js';
import { ConnectionHealthMonitor, HealthStatus } from './connection-health-monitor.js';
import { ReconnectionManager } from './reconnection-manager.js';
import { FallbackCoordinator } from './fallback-coordinator.js';
import { ConnectionStateManager } from './connection-state-manager.js';
import type { MCPConfig, MCPRequest } from '../../utils/types.js';
/**
 * Options accepted by the RecoveryManager constructor.
 *
 * Each optional group below is handed, unmodified, to the constructor of the
 * component named in its comment. Defaults for omitted values are decided by
 * those components and are not visible in this file.
 */
export interface RecoveryConfig {
  // NOTE(review): RecoveryManager in this file never reads this flag —
  // confirm whether callers are expected to gate on it themselves.
  enableRecovery: boolean;
  // Forwarded to ConnectionHealthMonitor.
  healthMonitor?: {
    heartbeatInterval?: number; // presumably milliseconds — TODO confirm
    heartbeatTimeout?: number; // presumably milliseconds — TODO confirm
    maxMissedHeartbeats?: number;
  };
  // Forwarded to ReconnectionManager.
  reconnection?: {
    maxRetries?: number;
    initialDelay?: number; // presumably milliseconds — TODO confirm
    maxDelay?: number; // presumably milliseconds — TODO confirm
    backoffMultiplier?: number;
  };
  // Forwarded to FallbackCoordinator.
  fallback?: {
    enableFallback?: boolean;
    maxQueueSize?: number;
    cliPath?: string;
  };
  // Forwarded to ConnectionStateManager.
  state?: {
    enablePersistence?: boolean;
    stateDirectory?: string;
  };
}
/**
 * Point-in-time snapshot returned by RecoveryManager.getStatus().
 */
export interface RecoveryStatus {
  // True between the start of a recovery and its success/failure completion.
  isRecoveryActive: boolean;
  // Value reported by the health monitor's getHealthStatus().
  connectionHealth: HealthStatus;
  // Subset of the reconnection manager's getState() result.
  reconnectionState: {
    attempts: number;
    isReconnecting: boolean;
    nextDelay?: number; // presumably the wait before the next attempt — TODO confirm units
  };
  // Subset of the fallback coordinator's getState() result.
  fallbackState: {
    isFallbackActive: boolean;
    queuedOperations: number;
  };
  // Counters kept by RecoveryManager itself.
  metrics: {
    totalRecoveries: number; // incremented every time a recovery is started
    successfulRecoveries: number;
    failedRecoveries: number;
    averageRecoveryTime: number; // milliseconds; 0 until a recovery has succeeded
  };
}
/**
 * Coordinates health monitoring, reconnection, CLI fallback and connection
 * state tracking for a single MCP client.
 *
 * Events emitted by this class: 'started', 'stopped', 'healthChange',
 * 'recoveryStart', 'recoveryAttemptFailed', 'fallbackActivated',
 * 'recoveryComplete' and 'fallbackPermanent'.
 *
 * NOTE(review): `RecoveryConfig.enableRecovery` is not consulted here; only
 * the nested option groups are forwarded to the components.
 */
export class RecoveryManager extends EventEmitter {
  private readonly healthMonitor: ConnectionHealthMonitor;
  private readonly reconnectionManager: ReconnectionManager;
  private readonly fallbackCoordinator: FallbackCoordinator;
  private readonly stateManager: ConnectionStateManager;
  // Guards against overlapping recoveries; set in startRecovery, cleared in
  // completeRecovery and stop.
  private isRecoveryActive = false;
  private recoveryStartTime?: Date;
  private readonly metrics = {
    totalRecoveries: 0,
    successfulRecoveries: 0,
    failedRecoveries: 0,
    // Sum of durations (ms) of successful recoveries only.
    totalRecoveryTime: 0,
  };

  /**
   * @param client - MCP client whose connection is supervised.
   * @param mcpConfig - Stored alongside saved connection state when a recovery starts.
   * @param logger - Destination for diagnostic output.
   * @param config - Optional per-component tuning; each group is passed through as-is.
   */
  constructor(
    private readonly client: MCPClient,
    private readonly mcpConfig: MCPConfig,
    private readonly logger: ILogger,
    config?: RecoveryConfig,
  ) {
    super();
    // Initialize components
    this.healthMonitor = new ConnectionHealthMonitor(client, logger, config?.healthMonitor);
    this.reconnectionManager = new ReconnectionManager(client, logger, config?.reconnection);
    this.fallbackCoordinator = new FallbackCoordinator(logger, config?.fallback);
    this.stateManager = new ConnectionStateManager(logger, config?.state);
    // Set up component event handlers
    this.setupEventHandlers();
    this.logger.info('Recovery manager initialized');
  }

  /**
   * Start recovery management: begins health monitoring, then re-queues any
   * pending requests found in restored state as high-priority fallback
   * operations. Emits 'started' when done.
   */
  async start(): Promise<void> {
    this.logger.info('Starting recovery manager');
    // Start health monitoring
    await this.healthMonitor.start();
    // Restore any previous state
    const previousState = this.stateManager.restoreState();
    if (previousState && previousState.pendingRequests.length > 0) {
      this.logger.info('Restored previous connection state', {
        sessionId: previousState.sessionId,
        pendingRequests: previousState.pendingRequests.length,
      });
      // Queue pending requests for retry
      for (const request of previousState.pendingRequests) {
        this.fallbackCoordinator.queueOperation({
          type: 'tool',
          method: request.method,
          params: request.params,
          priority: 'high',
          retryable: true,
        });
      }
    }
    this.emit('started');
  }

  /**
   * Stop recovery management: stops every component, abandons any in-flight
   * recovery and emits 'stopped'.
   */
  async stop(): Promise<void> {
    this.logger.info('Stopping recovery manager');
    // Stop all components
    await this.healthMonitor.stop();
    this.reconnectionManager.stopReconnection();
    // Reconnection has been cancelled, so no completion event is expected to
    // clear the flag; reset it here or later recoveries would be refused.
    this.isRecoveryActive = false;
    this.recoveryStartTime = undefined;
    this.fallbackCoordinator.disableCLIFallback();
    await this.stateManager.cleanup();
    this.emit('stopped');
  }

  /**
   * Get current recovery status.
   *
   * @returns A snapshot combining component state with this manager's counters.
   */
  getStatus(): RecoveryStatus {
    const healthStatus = this.healthMonitor.getHealthStatus();
    const reconnectionState = this.reconnectionManager.getState();
    const fallbackState = this.fallbackCoordinator.getState();
    const { totalRecoveries, successfulRecoveries, failedRecoveries, totalRecoveryTime } =
      this.metrics;
    return {
      isRecoveryActive: this.isRecoveryActive,
      connectionHealth: healthStatus,
      reconnectionState: {
        attempts: reconnectionState.attempts,
        isReconnecting: reconnectionState.isReconnecting,
        nextDelay: reconnectionState.nextDelay,
      },
      fallbackState: {
        isFallbackActive: fallbackState.isFallbackActive,
        queuedOperations: fallbackState.queuedOperations,
      },
      metrics: {
        totalRecoveries,
        successfulRecoveries,
        failedRecoveries,
        // Time is only accumulated for successful recoveries, so the average
        // must be taken over that same population.
        averageRecoveryTime:
          successfulRecoveries > 0 ? totalRecoveryTime / successfulRecoveries : 0,
      },
    };
  }

  /**
   * Force a recovery attempt.
   *
   * @returns `true` if a recovery was started, `false` if one is already running.
   */
  async forceRecovery(): Promise<boolean> {
    this.logger.info('Forcing recovery attempt');
    // Check if already recovering
    if (this.isRecoveryActive) {
      this.logger.warn('Recovery already in progress');
      return false;
    }
    return this.startRecovery('manual');
  }

  /**
   * Handle a request that needs recovery consideration. When the client is
   * disconnected the request is recorded as pending and queued for fallback
   * execution; when connected this is a no-op.
   */
  async handleRequest(request: MCPRequest): Promise<void> {
    if (this.client.isConnected()) {
      return;
    }
    // Add to pending requests while disconnected
    this.stateManager.addPendingRequest(request);
    // Queue for fallback execution
    this.fallbackCoordinator.queueOperation({
      type: 'tool',
      method: request.method,
      params: request.params,
      priority: 'medium',
      retryable: true,
    });
  }

  /**
   * Wire component events to recovery actions. Async listeners catch their
   * own failures: EventEmitter does not await listeners, so an escaping
   * rejection would surface as an unhandled promise rejection.
   */
  private setupEventHandlers(): void {
    // Health monitor events
    this.healthMonitor.on('connectionLost', async ({ error }) => {
      this.logger.error('Connection lost, initiating recovery', error);
      try {
        await this.startRecovery('health-check');
      } catch (recoveryError) {
        this.logger.error('Failed to initiate recovery', recoveryError);
      }
    });
    this.healthMonitor.on('healthChange', (newStatus, oldStatus) => {
      this.emit('healthChange', newStatus, oldStatus);
      // Record state change
      // NOTE(review): a fresh session id is generated per event, so recorded
      // events cannot be correlated by sessionId — confirm this is intended.
      this.stateManager.recordEvent({
        type: newStatus.healthy ? 'connect' : 'disconnect',
        sessionId: this.generateSessionId(),
        details: { health: newStatus },
      });
    });
    // Reconnection manager events
    this.reconnectionManager.on('success', async ({ attempts, duration }) => {
      this.logger.info('Reconnection successful', { attempts, duration });
      try {
        await this.completeRecovery(true);
      } catch (completionError) {
        this.logger.error('Failed to finalize successful recovery', completionError);
      }
    });
    this.reconnectionManager.on('maxRetriesExceeded', async () => {
      this.logger.error('Max reconnection attempts exceeded');
      try {
        await this.completeRecovery(false);
      } catch (completionError) {
        this.logger.error('Failed to finalize failed recovery', completionError);
      }
    });
    this.reconnectionManager.on('attemptFailed', ({ attempt, error }) => {
      this.emit('recoveryAttemptFailed', { attempt, error });
    });
    // Fallback coordinator events
    this.fallbackCoordinator.on('fallbackEnabled', (state) => {
      this.logger.warn('Fallback mode activated', state);
      this.emit('fallbackActivated', state);
    });
    this.fallbackCoordinator.on('replayOperation', async (operation) => {
      // Replay operation through MCP client; skipped while still disconnected.
      if (!this.client.isConnected()) {
        return;
      }
      try {
        await this.client.request(operation.method, operation.params);
        // NOTE(review): operations are queued without the originating request
        // id, so operation.id may not match a pending request — verify.
        this.stateManager.removePendingRequest(operation.id);
      } catch (error) {
        this.logger.error('Failed to replay operation', { operation, error });
      }
    });
  }

  /**
   * Begin a recovery cycle: snapshot state, enable CLI fallback and start
   * automatic reconnection.
   *
   * @param trigger - Label describing what initiated the recovery (logged and stored).
   * @returns `true` if a recovery was started, `false` if one was already active.
   * @throws Rethrows any error raised while starting; the active flag is
   *   rolled back first so a later attempt is not blocked.
   */
  private async startRecovery(trigger: string): Promise<boolean> {
    if (this.isRecoveryActive) {
      return false;
    }
    this.isRecoveryActive = true;
    this.recoveryStartTime = new Date();
    this.metrics.totalRecoveries++;
    try {
      this.logger.info('Starting recovery process', { trigger });
      this.emit('recoveryStart', { trigger });
      // Save current state
      this.stateManager.saveState({
        sessionId: this.generateSessionId(),
        lastConnected: new Date(),
        pendingRequests: [],
        configuration: this.mcpConfig,
        metadata: { trigger },
      });
      // Enable fallback mode immediately
      this.fallbackCoordinator.enableCLIFallback();
      // Start reconnection attempts
      this.reconnectionManager.startAutoReconnect();
      return true;
    } catch (error) {
      // Only roll back if a completion handler has not already closed this cycle.
      if (this.isRecoveryActive) {
        this.isRecoveryActive = false;
        this.recoveryStartTime = undefined;
        this.metrics.failedRecoveries++;
      }
      throw error;
    }
  }

  /**
   * Close the active recovery cycle and publish its outcome.
   *
   * On success: disables fallback, drains the queued operations, resets the
   * health monitor, records a 'reconnect' event and emits 'recoveryComplete'.
   * On failure: emits 'recoveryComplete' followed by 'fallbackPermanent',
   * leaving fallback mode enabled. No-op when no recovery is active.
   *
   * @param success - Whether reconnection succeeded.
   */
  private async completeRecovery(success: boolean): Promise<void> {
    if (!this.isRecoveryActive) {
      return;
    }
    const duration = this.recoveryStartTime ? Date.now() - this.recoveryStartTime.getTime() : 0;
    this.isRecoveryActive = false;
    this.recoveryStartTime = undefined;
    if (success) {
      this.metrics.successfulRecoveries++;
      this.metrics.totalRecoveryTime += duration;
      // Disable fallback mode
      this.fallbackCoordinator.disableCLIFallback();
      // Process any queued operations. The connection is already restored, so
      // a drain failure is logged rather than aborting the remaining steps.
      try {
        await this.fallbackCoordinator.processQueue();
      } catch (error) {
        this.logger.error('Failed to process queued operations after recovery', error);
      }
      // Reset health monitor
      this.healthMonitor.reset();
      // Record reconnection
      this.stateManager.recordEvent({
        type: 'reconnect',
        sessionId: this.generateSessionId(),
        details: { duration },
      });
      this.logger.info('Recovery completed successfully', { duration });
      this.emit('recoveryComplete', { success: true, duration });
    } else {
      this.metrics.failedRecoveries++;
      this.logger.error('Recovery failed');
      this.emit('recoveryComplete', { success: false, duration });
      // Keep fallback active
      this.emit('fallbackPermanent');
    }
  }

  /**
   * Build an identifier of the form `recovery-<epoch ms>-<random base36>`.
   * Uses Math.random, so it is not suitable where unpredictability matters.
   */
  private generateSessionId(): string {
    return `recovery-${Date.now()}-${Math.random().toString(36).slice(2)}`;
  }

  /**
   * Clean up resources. Equivalent to calling stop().
   */
  async cleanup(): Promise<void> {
    await this.stop();
  }
}