UNPKG

shipdeck

Version:

Ship MVPs in 48 hours. Fix bugs in 30 seconds. The command deck for developers who ship.

563 lines (471 loc) • 14.4 kB
/**
 * Auto-Recovery System
 * Monitors for issues and automatically triggers rollbacks when needed
 */

const { CheckpointManager } = require('./checkpoint-manager');
const EventEmitter = require('events');
const { execSync } = require('child_process');

/**
 * Watches the current process and project (errors, test runs, memory,
 * `npm audit`, builds) and rolls back to a checkpoint through the
 * CheckpointManager when a rollback rule is met.
 *
 * Events emitted: `error:detected`, `metrics:collected`, `recovery:success`,
 * `recovery:failed`, `notification:recovery`, `alert:critical`.
 */
class AutoRecovery extends EventEmitter {
  /**
   * @param {object} [options]
   * @param {object} [options.checkpointManager] - Manager to use; a new
   *   CheckpointManager is created when omitted.
   * @param {boolean} [options.enabled=true] - Start monitoring on initialize().
   * @param {boolean} [options.monitorErrors=true]
   * @param {boolean} [options.monitorPerformance=true]
   * @param {boolean} [options.monitorSecurity=true]
   * @param {boolean} [options.monitorDeployment=true]
   * @param {number} [options.errorRateThreshold=0.1]
   * @param {number} [options.responseTimeThreshold=5000]
   * @param {number} [options.memoryThreshold=0.9] - Heap ratio above which a
   *   rollback is triggered.
   * @param {number} [options.cpuThreshold=0.9]
   * @param {number} [options.failureThreshold=3] - Consecutive failures needed
   *   before an error / test-failure rollback.
   * @param {number} [options.rollbackCooldownMs=60000] - Minimum time between
   *   two rule-driven rollbacks.
   */
  constructor(options = {}) {
    super();

    this.checkpointManager = options.checkpointManager || new CheckpointManager();
    this.enabled = options.enabled !== false;

    // Monitoring configuration: every monitor is on unless explicitly `false`.
    this.monitors = {
      errors: options.monitorErrors !== false,
      performance: options.monitorPerformance !== false,
      security: options.monitorSecurity !== false,
      deployment: options.monitorDeployment !== false,
    };

    // Thresholds for auto-rollback. `??` (not `||`) so an explicit 0 is kept.
    // NOTE(review): errorRate, responseTime and cpuUsage are stored but not
    // read anywhere in this class.
    this.thresholds = {
      errorRate: options.errorRateThreshold ?? 0.1, // 10% error rate
      responseTime: options.responseTimeThreshold ?? 5000, // 5 seconds
      memoryUsage: options.memoryThreshold ?? 0.9, // 90% memory
      cpuUsage: options.cpuThreshold ?? 0.9, // 90% CPU
      consecutiveFailures: options.failureThreshold ?? 3,
    };

    // Wait at least this long between rollbacks (default: 1 minute).
    this.rollbackCooldownMs = options.rollbackCooldownMs ?? 60000;

    // Tracking
    this.metrics = {
      errors: [],
      performance: [],
      rollbacks: [],
      consecutiveFailures: 0,
      // Counted separately so passing test runs (which reset
      // consecutiveFailures) cannot mask repeated build failures.
      consecutiveBuildFailures: 0,
    };

    // Recovery state
    this.isRecovering = false;
    this.lastRecovery = null;

    // Monitoring intervals, keyed by monitor name.
    this.intervals = new Map();

    // Process-level listeners installed by startErrorMonitoring(); kept so
    // stopMonitoring() can remove exactly what was added.
    this.processListeners = null;
  }

  /**
   * Initialize auto-recovery system
   *
   * Initializes the checkpoint manager and, when enabled, starts monitoring.
   *
   * @returns {Promise<AutoRecovery>} this instance.
   */
  async initialize() {
    await this.checkpointManager.initialize();

    if (this.enabled) {
      this.startMonitoring();
      console.log('🛡️ Auto-Recovery System initialized and monitoring');
    } else {
      console.log('🛡️ Auto-Recovery System initialized (disabled)');
    }

    return this;
  }

  /**
   * Start monitoring for issues
   *
   * Starts every monitor enabled in `this.monitors`. Safe to call repeatedly:
   * timers and process listeners are replaced / reused, never duplicated.
   */
  startMonitoring() {
    // Error monitoring
    if (this.monitors.errors) {
      this.startErrorMonitoring();
    }

    // Performance monitoring
    if (this.monitors.performance) {
      this.startPerformanceMonitoring();
    }

    // Security monitoring
    if (this.monitors.security) {
      this.startSecurityMonitoring();
    }

    // Deployment monitoring
    if (this.monitors.deployment) {
      this.startDeploymentMonitoring();
    }

    console.log('👁️ Monitoring started');
  }

  /**
   * Stop all monitoring
   *
   * Clears every timer and removes the process-level error listeners, so a
   * disabled instance no longer reacts to process errors.
   */
  stopMonitoring() {
    for (const timer of this.intervals.values()) {
      clearInterval(timer);
    }
    this.intervals.clear();

    if (this.processListeners) {
      process.removeListener('uncaughtException', this.processListeners.uncaughtException);
      process.removeListener('unhandledRejection', this.processListeners.unhandledRejection);
      this.processListeners = null;
    }

    console.log('👁️ Monitoring stopped');
  }

  /**
   * Register a repeating monitor task, replacing any timer already stored
   * under the same name so repeated starts never leak intervals.
   *
   * @param {string} name - Key in `this.intervals`.
   * @param {number} intervalMs - Period in milliseconds.
   * @param {Function} task - Callback run on every tick.
   */
  setMonitorInterval(name, intervalMs, task) {
    const previous = this.intervals.get(name);
    if (previous !== undefined) {
      clearInterval(previous);
    }
    this.intervals.set(name, setInterval(task, intervalMs));
  }

  /**
   * Monitor for errors
   *
   * Installs `uncaughtException` / `unhandledRejection` listeners (once) and
   * schedules a test run every 30 seconds.
   */
  startErrorMonitoring() {
    // Monitor process errors
    if (!this.processListeners) {
      // A rejection from handleError() must not surface as another
      // unhandledRejection, or it would feed straight back into this handler.
      const reportHandlerFailure = (failure) => {
        console.error('❌ Auto-recovery error handler failed:', failure?.message ?? failure);
      };

      this.processListeners = {
        uncaughtException: (error) => {
          this.handleError('uncaught', error).catch(reportHandlerFailure);
        },
        unhandledRejection: (reason) => {
          this.handleError('unhandled', reason).catch(reportHandlerFailure);
        },
      };

      process.on('uncaughtException', this.processListeners.uncaughtException);
      process.on('unhandledRejection', this.processListeners.unhandledRejection);
    }

    // Monitor test failures
    this.setMonitorInterval('error-monitor', 30000, () => {
      this.checkTestHealth();
    }); // Check every 30 seconds
  }

  /**
   * Monitor performance metrics
   *
   * Collects and analyzes process metrics every 10 seconds.
   */
  startPerformanceMonitoring() {
    this.setMonitorInterval('performance-monitor', 10000, () => {
      const metrics = this.collectPerformanceMetrics();
      this.analyzePerformance(metrics);
    }); // Check every 10 seconds
  }

  /**
   * Monitor security issues
   *
   * Runs the security audit check every minute.
   */
  startSecurityMonitoring() {
    this.setMonitorInterval('security-monitor', 60000, () => {
      this.checkSecurityIssues();
    }); // Check every minute
  }

  /**
   * Monitor deployment health
   *
   * Runs the build check every minute.
   */
  startDeploymentMonitoring() {
    this.setMonitorInterval('deployment-monitor', 60000, () => {
      this.checkDeploymentHealth();
    }); // Check every minute
  }

  /**
   * Handle detected error
   *
   * Records the error, bumps the consecutive-failure counter, triggers a
   * rollback when the rule is met, then emits `error:detected`.
   *
   * @param {string} type - Error source label (e.g. 'uncaught', 'unhandled').
   * @param {*} error - The error; non-Error values (a rejection reason can be
   *   anything) are wrapped in an Error.
   */
  async handleError(type, error) {
    // Keep genuine Errors intact (message + stack); wrap everything else.
    const normalized = error instanceof Error ? error : new Error(String(error));

    console.error(`❌ Error detected (${type}):`, normalized.message);

    // Track error
    this.metrics.errors.push({
      type,
      error: normalized.message,
      stack: normalized.stack,
      timestamp: Date.now(),
    });

    // Increment consecutive failures
    this.metrics.consecutiveFailures++;

    // Check if we should auto-rollback
    if (this.shouldAutoRollback('error')) {
      await this.triggerAutoRollback({
        reason: 'error',
        type,
        error: normalized.message,
      });
    }

    // Emit event
    this.emit('error:detected', { type, error: normalized });
  }

  /**
   * Check test health
   *
   * Runs `npm test` synchronously in the project directory. A pass resets the
   * consecutive-failure counter; a failure increments it and may roll back.
   */
  async checkTestHealth() {
    try {
      // NOTE: execSync blocks the event loop for the duration of the test run.
      execSync('npm test', {
        cwd: this.checkpointManager.projectPath,
        stdio: 'pipe',
      });

      // Tests passed, reset failure counter
      this.metrics.consecutiveFailures = 0;
    } catch (error) {
      console.warn('⚠️ Test failure detected');
      this.metrics.consecutiveFailures++;

      if (this.shouldAutoRollback('test-failure')) {
        await this.triggerAutoRollback({
          reason: 'test-failure',
          details: error.message,
        });
      }
    }
  }

  /**
   * Collect performance metrics
   *
   * @returns {{memory: object, cpu: object, timestamp: number}} Snapshot built
   *   from process.memoryUsage() and process.cpuUsage().
   */
  collectPerformanceMetrics() {
    const usage = process.memoryUsage();
    const cpuUsage = process.cpuUsage();

    return {
      memory: {
        heapUsed: usage.heapUsed,
        heapTotal: usage.heapTotal,
        rss: usage.rss,
        // NOTE(review): ratio against the currently allocated heap, not a hard
        // memory limit — confirm this is the intended "memory usage" signal.
        percentUsed: usage.heapUsed / usage.heapTotal,
      },
      cpu: {
        user: cpuUsage.user,
        system: cpuUsage.system,
      },
      timestamp: Date.now(),
    };
  }

  /**
   * Analyze performance metrics
   *
   * Stores the sample (keeping the most recent 100), triggers a rollback when
   * the memory ratio exceeds the threshold, and emits `metrics:collected`.
   *
   * @param {object} metrics - Snapshot from collectPerformanceMetrics().
   */
  analyzePerformance(metrics) {
    // Track metrics
    this.metrics.performance.push(metrics);

    // Keep only last 100 metrics
    if (this.metrics.performance.length > 100) {
      this.metrics.performance = this.metrics.performance.slice(-100);
    }

    // Check memory threshold
    if (metrics.memory.percentUsed > this.thresholds.memoryUsage) {
      console.warn(`⚠️ High memory usage: ${(metrics.memory.percentUsed * 100).toFixed(1)}%`);

      if (this.shouldAutoRollback('memory')) {
        // Deliberately not awaited so this method stays synchronous;
        // triggerAutoRollback() catches and reports its own failures.
        this.triggerAutoRollback({
          reason: 'high-memory',
          usage: metrics.memory.percentUsed,
        });
      }
    }

    // Emit metrics
    this.emit('metrics:collected', metrics);
  }

  /**
   * Check for security issues
   *
   * Runs `npm audit --json` and rolls back when critical vulnerabilities are
   * reported.
   */
  async checkSecurityIssues() {
    let rawReport = '';

    try {
      // Run security audit
      rawReport = execSync('npm audit --json', {
        cwd: this.checkpointManager.projectPath,
        stdio: 'pipe',
      }).toString();
    } catch (error) {
      // `npm audit` exits non-zero when it FINDS vulnerabilities, which makes
      // execSync throw; the JSON report is still available on the error's
      // stdout. Empty stdout means the audit itself could not run.
      rawReport = error.stdout ? error.stdout.toString() : '';
    }

    if (rawReport.trim() === '') {
      return;
    }

    let audit;
    try {
      audit = JSON.parse(rawReport);
    } catch (parseError) {
      console.warn(`⚠️ Could not parse npm audit output: ${parseError.message}`);
      return;
    }

    // Check for critical vulnerabilities
    const counts = audit?.metadata?.vulnerabilities;
    if (counts?.critical > 0) {
      console.warn(`⚠️ Critical security vulnerabilities detected: ${counts.critical}`);

      if (this.shouldAutoRollback('security')) {
        await this.triggerAutoRollback({
          reason: 'security-vulnerability',
          critical: counts.critical,
          high: counts.high,
        });
      }
    }
  }

  /**
   * Check deployment health
   *
   * This would check actual deployment health; for now it is simulated by
   * checking that `npm run build` succeeds.
   */
  async checkDeploymentHealth() {
    try {
      execSync('npm run build', {
        cwd: this.checkpointManager.projectPath,
        stdio: 'pipe',
      });

      // Build passed, reset the build failure streak
      this.metrics.consecutiveBuildFailures = 0;
    } catch (error) {
      console.warn('⚠️ Build failure detected');

      // Count the failure so the 'deployment' rule can actually be reached
      // from this monitor.
      this.metrics.consecutiveBuildFailures++;

      if (this.shouldAutoRollback('deployment')) {
        await this.triggerAutoRollback({
          reason: 'build-failure',
          error: error.message,
        });
      }
    }
  }

  /**
   * Determine if auto-rollback should be triggered
   *
   * @param {string} reason - 'error', 'test-failure', 'memory', 'security',
   *   'deployment' or 'build-failure'; anything else never rolls back.
   * @returns {boolean} true when a rollback should start now.
   */
  shouldAutoRollback(reason) {
    // Don't rollback if already recovering
    if (this.isRecovering) {
      return false;
    }

    // Don't rollback too frequently
    if (this.lastRecovery && Date.now() - this.lastRecovery < this.rollbackCooldownMs) {
      return false;
    }

    const { consecutiveFailures, consecutiveBuildFailures } = this.metrics;

    switch (reason) {
      // Check consecutive failures threshold
      case 'error':
      case 'test-failure':
        return consecutiveFailures >= this.thresholds.consecutiveFailures;

      case 'memory':
        return true; // Always rollback on high memory

      case 'security':
        return true; // Always rollback on critical security issues

      case 'deployment':
      case 'build-failure':
        // Either failure streak reaching 2 qualifies.
        return Math.max(consecutiveFailures, consecutiveBuildFailures) >= 2;

      default:
        return false;
    }
  }

  /**
   * Trigger automatic rollback
   *
   * Rolls back to the checkpoint chosen by findStableCheckpoint(). Never
   * rejects for rollback failures: they are recorded in `metrics.rollbacks`,
   * alerted via sendAlert() and emitted as `recovery:failed`.
   *
   * @param {object} context - Why the rollback was triggered; `context.reason`
   *   is logged and recorded.
   */
  async triggerAutoRollback(context) {
    if (this.isRecovering) {
      console.log('⏳ Recovery already in progress');
      return;
    }

    console.log(`🚨 Auto-rollback triggered: ${context.reason}`);
    this.isRecovering = true;

    try {
      // Find last stable checkpoint
      const stableCheckpoint = await this.findStableCheckpoint();

      if (!stableCheckpoint) {
        throw new Error('No stable checkpoint found');
      }

      console.log(`🔄 Rolling back to stable checkpoint: ${stableCheckpoint.id}`);

      // Perform rollback
      const result = await this.checkpointManager.rollback(stableCheckpoint.id, {
        skipRecoveryPoint: false,
        runHooks: true,
        attemptRecovery: true,
      });

      if (!result.success) {
        throw new Error(result.error);
      }

      console.log(`✅ Auto-recovery successful`);

      // Reset failure counters
      this.metrics.consecutiveFailures = 0;
      this.metrics.consecutiveBuildFailures = 0;

      // Track rollback
      this.metrics.rollbacks.push({
        timestamp: Date.now(),
        reason: context.reason,
        checkpoint: stableCheckpoint.id,
        success: true,
      });

      // Send notification
      this.notifyRecovery(context, stableCheckpoint);

      // Emit event
      this.emit('recovery:success', {
        context,
        checkpoint: stableCheckpoint,
      });
    } catch (error) {
      console.error(`❌ Auto-recovery failed: ${error.message}`);

      // Track failed rollback
      this.metrics.rollbacks.push({
        timestamp: Date.now(),
        reason: context.reason,
        success: false,
        error: error.message,
      });

      // Send alert
      this.sendAlert('Auto-recovery failed', context, error);

      // Emit event
      this.emit('recovery:failed', {
        context,
        error: error.message,
      });
    } finally {
      this.isRecovering = false;
      // Set on failure too, so the cooldown also throttles retries.
      this.lastRecovery = Date.now();
    }
  }

  /**
   * Find last stable checkpoint
   *
   * Scans up to 10 checkpoints and returns the first one whose metadata
   * reports a healthy status or that is tagged 'stable' or 'good'.
   *
   * @returns {Promise<object|undefined>} The chosen checkpoint, the last entry
   *   of the list as a fallback, or undefined when there are no checkpoints.
   */
  async findStableCheckpoint() {
    // `await` tolerates both a synchronous and a promise-returning manager.
    const checkpoints = (await this.checkpointManager.listCheckpoints({ limit: 10 })) ?? [];

    // Look for checkpoint with good health status
    for (const checkpoint of checkpoints) {
      if (checkpoint.metadata?.healthStatus?.healthy) {
        return checkpoint;
      }

      // `tags` may be absent on a checkpoint; treat that as "no tags".
      if (checkpoint.tags?.includes('stable') || checkpoint.tags?.includes('good')) {
        return checkpoint;
      }
    }

    // Fallback: last entry in the list (the oldest, presumably — assumes
    // listCheckpoints() returns newest first; TODO confirm).
    return checkpoints[checkpoints.length - 1];
  }

  /**
   * Send recovery notification
   *
   * Logs a summary and emits it as `notification:recovery`.
   *
   * @param {object} context - Rollback context passed to triggerAutoRollback().
   * @param {object} checkpoint - Checkpoint that was restored.
   */
  notifyRecovery(context, checkpoint) {
    const message = [
      '',
      '🔄 AUTO-RECOVERY COMPLETED',
      '========================',
      `Reason: ${context.reason}`,
      `Checkpoint: ${checkpoint.id}`,
      `Time: ${new Date().toISOString()}`,
      `Details: ${JSON.stringify(context, null, 2)}`,
      '',
    ].join('\n');

    console.log(message);

    // In production, this would send email/Slack notification
    this.emit('notification:recovery', message);
  }

  /**
   * Send alert for critical issues
   *
   * Logs the alert and emits `alert:critical`.
   *
   * @param {string} title - Short alert title.
   * @param {object} context - Rollback context.
   * @param {Error} [error] - Underlying error, if any.
   */
  sendAlert(title, context, error) {
    const message = [
      '',
      `🚨 CRITICAL ALERT: ${title}`,
      '===========================',
      `Context: ${JSON.stringify(context, null, 2)}`,
      `Error: ${error?.message || 'Unknown'}`,
      `Time: ${new Date().toISOString()}`,
      '',
    ].join('\n');

    console.error(message);

    // In production, this would page on-call engineer
    this.emit('alert:critical', { title, context, error });
  }

  /**
   * Get recovery statistics
   *
   * @returns {object} Counters, the 10 most recent errors, rollback totals and
   *   monitoring state. `successRate` is a percentage, 0 when no rollback has
   *   been attempted yet.
   */
  getStats() {
    const totalRollbacks = this.metrics.rollbacks.length;
    const successfulRollbacks = this.metrics.rollbacks.filter((r) => r.success).length;
    const failedRollbacks = totalRollbacks - successfulRollbacks;

    return {
      totalErrors: this.metrics.errors.length,
      recentErrors: this.metrics.errors.slice(-10),
      consecutiveFailures: this.metrics.consecutiveFailures,
      consecutiveBuildFailures: this.metrics.consecutiveBuildFailures,
      totalRollbacks,
      successfulRollbacks,
      failedRollbacks,
      // Guard the 0/0 case, which would otherwise yield NaN.
      successRate: totalRollbacks === 0 ? 0 : (successfulRollbacks / totalRollbacks) * 100,
      lastRecovery: this.lastRecovery,
      isRecovering: this.isRecovering,
      monitoring: {
        enabled: this.enabled,
        monitors: this.monitors,
      },
    };
  }

  /**
   * Enable auto-recovery
   */
  enable() {
    this.enabled = true;
    this.startMonitoring();
    console.log('✅ Auto-recovery enabled');
  }

  /**
   * Disable auto-recovery
   */
  disable() {
    this.enabled = false;
    this.stopMonitoring();
    console.log('⏸️ Auto-recovery disabled');
  }

  /**
   * Test auto-recovery system
   *
   * Creates a checkpoint tagged 'stable', simulates reaching the failure
   * threshold and triggers a rollback directly, bypassing shouldAutoRollback().
   */
  async testRecovery() {
    console.log('🧪 Testing auto-recovery system...');

    // Create a stable checkpoint
    await this.checkpointManager.createCheckpoint({
      type: 'manual',
      description: 'Test stable checkpoint',
      tags: ['stable', 'test'],
    });

    // Simulate an error
    this.metrics.consecutiveFailures = this.thresholds.consecutiveFailures;

    // Trigger rollback
    await this.triggerAutoRollback({
      reason: 'test',
      description: 'Testing auto-recovery system',
    });

    console.log('✅ Auto-recovery test complete');
  }
}

module.exports = { AutoRecovery };