shipdeck
Version:
Ship MVPs in 48 hours. Fix bugs in 30 seconds. The command deck for developers who ship.
563 lines (471 loc) • 14.4 kB
JavaScript
/**
* Auto-Recovery System
* Monitors for issues and automatically triggers rollbacks when needed
*/
const { CheckpointManager } = require('./checkpoint-manager');
const EventEmitter = require('events');
const { execSync } = require('child_process');
/**
 * Watches the running process and the project on disk for trouble (process
 * errors, failing tests, memory pressure, critical `npm audit` findings,
 * broken builds) and rolls the project back to a stable checkpoint through
 * the supplied checkpoint manager when a threshold is crossed.
 *
 * Events emitted: `error:detected`, `metrics:collected`, `recovery:success`,
 * `recovery:failed`, `notification:recovery`, `alert:critical`.
 */
class AutoRecovery extends EventEmitter {
  /**
   * @param {object} [options]
   * @param {object} [options.checkpointManager] - Object providing `initialize`,
   *   `listCheckpoints`, `rollback`, `createCheckpoint` and `projectPath`.
   *   A new `CheckpointManager` is created when omitted.
   * @param {boolean} [options.enabled=true] - Start monitoring on `initialize()`.
   * @param {boolean} [options.monitorErrors=true]
   * @param {boolean} [options.monitorPerformance=true]
   * @param {boolean} [options.monitorSecurity=true]
   * @param {boolean} [options.monitorDeployment=true]
   * @param {number} [options.errorRateThreshold=0.1]
   * @param {number} [options.responseTimeThreshold=5000]
   * @param {number} [options.memoryThreshold=0.9]
   * @param {number} [options.cpuThreshold=0.9]
   * @param {number} [options.failureThreshold=3] - Consecutive error/test
   *   failures required before a rollback is triggered.
   * @param {number} [options.recoveryCooldownMs=60000] - Minimum time between
   *   two automatic rollbacks.
   */
  constructor(options = {}) {
    super();
    this.checkpointManager = options.checkpointManager || new CheckpointManager();
    this.enabled = options.enabled !== false;

    // Each monitor is on unless explicitly switched off.
    this.monitors = {
      errors: options.monitorErrors !== false,
      performance: options.monitorPerformance !== false,
      security: options.monitorSecurity !== false,
      deployment: options.monitorDeployment !== false,
    };

    // `??` rather than `||` so that an explicit 0 is honoured.
    // Only `memoryUsage` and `consecutiveFailures` are read by the checks in
    // this class; the others are stored for consumers.
    this.thresholds = {
      errorRate: options.errorRateThreshold ?? 0.1, // 10% error rate
      responseTime: options.responseTimeThreshold ?? 5000, // 5 seconds
      memoryUsage: options.memoryThreshold ?? 0.9, // 90% memory
      cpuUsage: options.cpuThreshold ?? 0.9, // 90% CPU
      consecutiveFailures: options.failureThreshold ?? 3,
    };
    this.recoveryCooldownMs = options.recoveryCooldownMs ?? 60000;

    this.metrics = {
      errors: [],
      performance: [],
      rollbacks: [],
      consecutiveFailures: 0,
      // Counted separately: a passing test run resets `consecutiveFailures`,
      // which would otherwise hide repeated build failures.
      consecutiveBuildFailures: 0,
    };

    this.isRecovering = false;
    this.lastRecovery = null;

    // monitor name -> interval handle
    this.intervals = new Map();
    // process event name -> listener installed by this instance
    this.processListeners = new Map();
  }

  /**
   * Initialize the checkpoint manager and, when enabled, start monitoring.
   * @returns {Promise<AutoRecovery>} this instance
   */
  async initialize() {
    await this.checkpointManager.initialize();
    if (this.enabled) {
      this.startMonitoring();
      console.log('🛡️ Auto-Recovery System initialized and monitoring');
    } else {
      console.log('🛡️ Auto-Recovery System initialized (disabled)');
    }
    return this;
  }

  /**
   * Start every monitor enabled in `this.monitors`. Safe to call repeatedly:
   * timers and process listeners from a previous start are removed first.
   */
  startMonitoring() {
    this._teardownMonitors();

    if (this.monitors.errors) {
      this.startErrorMonitoring();
    }
    if (this.monitors.performance) {
      this.startPerformanceMonitoring();
    }
    if (this.monitors.security) {
      this.startSecurityMonitoring();
    }
    if (this.monitors.deployment) {
      this.startDeploymentMonitoring();
    }
    console.log('👁️ Monitoring started');
  }

  /**
   * Stop all monitors: clears every interval and removes the process
   * listeners this instance installed.
   */
  stopMonitoring() {
    this._teardownMonitors();
    console.log('👁️ Monitoring stopped');
  }

  /** Clear all timers and detach all process listeners owned by this instance. */
  _teardownMonitors() {
    for (const timer of this.intervals.values()) {
      clearInterval(timer);
    }
    this.intervals.clear();

    for (const [eventName, listener] of this.processListeners) {
      process.removeListener(eventName, listener);
    }
    this.processListeners.clear();
  }

  /**
   * Run `check` every `intervalMs` under `name`, replacing any timer already
   * registered under that name. Failures of `check` (sync or async) are
   * logged instead of surfacing as uncaught exceptions / unhandled rejections.
   */
  _schedule(name, intervalMs, check) {
    if (this.intervals.has(name)) {
      clearInterval(this.intervals.get(name));
    }
    const timer = setInterval(() => {
      Promise.resolve()
        .then(check)
        .catch((failure) => {
          console.error(`❌ Monitor "${name}" failed: ${failure?.message ?? failure}`);
        });
    }, intervalMs);
    this.intervals.set(name, timer);
  }

  /** Attach `listener` to a process event, replacing one this instance attached earlier. */
  _listen(eventName, listener) {
    const previous = this.processListeners.get(eventName);
    if (previous) {
      process.removeListener(eventName, previous);
    }
    process.on(eventName, listener);
    this.processListeners.set(eventName, listener);
  }

  /** Forward a process-level failure to `handleError` without ever rejecting. */
  _reportProcessError(type, value) {
    this.handleError(type, value).catch((failure) => {
      // A rejection here would itself be an unhandled rejection and re-enter
      // this handler, so it is only logged.
      console.error(`❌ Error handler failed: ${failure?.message ?? failure}`);
    });
  }

  /** Turn any thrown/rejected value into something with `message` (and usually `stack`). */
  _toError(value) {
    if (value instanceof Error) {
      return value;
    }
    if (value !== null && typeof value === 'object' && typeof value.message === 'string') {
      return value;
    }
    return new Error(String(value));
  }

  /**
   * Run a shell command synchronously in the project directory.
   * NOTE(review): this blocks the event loop for the duration of the command.
   * @param {string} command
   * @returns {Buffer} the command's stdout
   * @throws when the command exits non-zero (see `execSync`)
   */
  _runCommand(command) {
    return execSync(command, {
      cwd: this.checkpointManager.projectPath,
      stdio: 'pipe',
    });
  }

  /**
   * Listen for uncaught exceptions / unhandled rejections and run the test
   * suite every 30 seconds.
   */
  startErrorMonitoring() {
    this._listen('uncaughtException', (error) => {
      this._reportProcessError('uncaught', error);
    });
    this._listen('unhandledRejection', (reason) => {
      this._reportProcessError('unhandled', reason);
    });
    this._schedule('error-monitor', 30000, () => this.checkTestHealth());
  }

  /** Collect and analyze performance metrics every 10 seconds. */
  startPerformanceMonitoring() {
    this._schedule('performance-monitor', 10000, () => {
      this.analyzePerformance(this.collectPerformanceMetrics());
    });
  }

  /** Run the security audit every minute. */
  startSecurityMonitoring() {
    this._schedule('security-monitor', 60000, () => this.checkSecurityIssues());
  }

  /** Run the build check every minute. */
  startDeploymentMonitoring() {
    this._schedule('deployment-monitor', 60000, () => this.checkDeploymentHealth());
  }

  /**
   * Record a detected error, bump the consecutive-failure counter, roll back
   * if the threshold is reached, then emit `error:detected`.
   * @param {string} type - Origin label, e.g. 'uncaught' or 'unhandled'.
   * @param {*} error - The error; non-Error values are wrapped in an Error.
   */
  async handleError(type, error) {
    const detected = this._toError(error);
    console.error(`❌ Error detected (${type}):`, detected.message);

    this.metrics.errors.push({
      type,
      error: detected.message,
      stack: detected.stack,
      timestamp: Date.now(),
    });
    this.metrics.consecutiveFailures++;

    if (this.shouldAutoRollback('error')) {
      await this.triggerAutoRollback({
        reason: 'error',
        type,
        error: detected.message,
      });
    }

    this.emit('error:detected', { type, error: detected });
  }

  /**
   * Run `npm test`. A pass resets the consecutive-failure counter; a failure
   * increments it and may trigger a rollback.
   */
  async checkTestHealth() {
    try {
      this._runCommand('npm test');
      this.metrics.consecutiveFailures = 0;
    } catch (error) {
      console.warn('⚠️ Test failure detected');
      this.metrics.consecutiveFailures++;
      if (this.shouldAutoRollback('test-failure')) {
        await this.triggerAutoRollback({
          reason: 'test-failure',
          details: error.message,
        });
      }
    }
  }

  /**
   * Snapshot memory and CPU usage of the current process.
   * @returns {{memory: object, cpu: object, timestamp: number}}
   */
  collectPerformanceMetrics() {
    const { heapUsed, heapTotal, rss } = process.memoryUsage();
    const { user, system } = process.cpuUsage();
    return {
      memory: {
        heapUsed,
        heapTotal,
        rss,
        // NOTE(review): heapTotal is the heap V8 has currently allocated, not
        // its limit, so this ratio may sit close to 1 on a healthy process.
        // Consider v8.getHeapStatistics().heap_size_limit — confirm intent.
        percentUsed: heapUsed / heapTotal,
      },
      cpu: { user, system },
      timestamp: Date.now(),
    };
  }

  /**
   * Store a metrics sample (keeping the latest 100), roll back when memory
   * usage exceeds the threshold, and emit `metrics:collected`.
   * @param {object} metrics - A sample from `collectPerformanceMetrics()`.
   */
  analyzePerformance(metrics) {
    this.metrics.performance.push(metrics);
    if (this.metrics.performance.length > 100) {
      this.metrics.performance = this.metrics.performance.slice(-100);
    }

    const { percentUsed } = metrics.memory;
    if (percentUsed > this.thresholds.memoryUsage) {
      console.warn(`⚠️ High memory usage: ${(percentUsed * 100).toFixed(1)}%`);
      if (this.shouldAutoRollback('memory')) {
        // Not awaited (this method is synchronous), so attach a handler to
        // keep a failure from becoming an unhandled rejection.
        this.triggerAutoRollback({
          reason: 'high-memory',
          usage: percentUsed,
        }).catch((failure) => {
          console.error(`❌ Auto-rollback failed: ${failure?.message ?? failure}`);
        });
      }
    }

    this.emit('metrics:collected', metrics);
  }

  /**
   * Run `npm audit --json` and roll back when it reports critical
   * vulnerabilities.
   */
  async checkSecurityIssues() {
    let rawReport;
    try {
      rawReport = this._runCommand('npm audit --json').toString();
    } catch (error) {
      // `npm audit` exits non-zero when it finds vulnerabilities; the JSON
      // report is then carried on the thrown error's `stdout`.
      rawReport = error?.stdout ? error.stdout.toString() : '';
    }

    let audit;
    try {
      audit = JSON.parse(rawReport);
    } catch (parseError) {
      // No usable report (audit could not run): best effort, try again later.
      console.warn(`⚠️ Security audit unavailable: ${parseError.message}`);
      return;
    }

    const counts = audit?.metadata?.vulnerabilities;
    if (!(counts?.critical > 0)) {
      return;
    }

    console.warn(`⚠️ Critical security vulnerabilities detected: ${counts.critical}`);
    if (this.shouldAutoRollback('security')) {
      await this.triggerAutoRollback({
        reason: 'security-vulnerability',
        critical: counts.critical,
        high: counts.high,
      });
    }
  }

  /**
   * Check deployment health. Currently approximated by running
   * `npm run build`; repeated failures trigger a rollback.
   */
  async checkDeploymentHealth() {
    try {
      this._runCommand('npm run build');
      this.metrics.consecutiveBuildFailures = 0;
    } catch (error) {
      console.warn('⚠️ Build failure detected');
      this.metrics.consecutiveBuildFailures = (this.metrics.consecutiveBuildFailures ?? 0) + 1;
      if (this.shouldAutoRollback('deployment')) {
        await this.triggerAutoRollback({
          reason: 'build-failure',
          error: error.message,
        });
      }
    }
  }

  /**
   * Decide whether a rollback should be triggered for `reason`.
   * @param {string} reason - 'error', 'test-failure', 'memory', 'security',
   *   'deployment' or 'build-failure'; anything else yields false.
   * @returns {boolean}
   */
  shouldAutoRollback(reason) {
    if (this.isRecovering) {
      return false;
    }
    // Enforce the cooldown between rollbacks.
    if (this.lastRecovery && Date.now() - this.lastRecovery < this.recoveryCooldownMs) {
      return false;
    }

    switch (reason) {
      case 'error':
      case 'test-failure':
        return this.metrics.consecutiveFailures >= this.thresholds.consecutiveFailures;
      case 'memory': // always roll back on high memory
      case 'security': // always roll back on critical security issues
        return true;
      case 'deployment':
      case 'build-failure':
        // The shared counter is still honoured for backward compatibility.
        return (
          (this.metrics.consecutiveBuildFailures ?? 0) >= 2 ||
          this.metrics.consecutiveFailures >= 2
        );
      default:
        return false;
    }
  }

  /**
   * Roll back to the last stable checkpoint and record the outcome. Failures
   * are recorded, alerted and emitted as `recovery:failed` instead of thrown.
   * Does nothing when a recovery is already in progress.
   * @param {object} context - Describes the trigger; `context.reason` is required.
   */
  async triggerAutoRollback(context) {
    if (this.isRecovering) {
      console.log('⏳ Recovery already in progress');
      return;
    }

    console.log(`🚨 Auto-rollback triggered: ${context.reason}`);
    this.isRecovering = true;

    try {
      const stableCheckpoint = await this.findStableCheckpoint();
      if (!stableCheckpoint) {
        throw new Error('No stable checkpoint found');
      }
      console.log(`🔄 Rolling back to stable checkpoint: ${stableCheckpoint.id}`);

      const result = await this.checkpointManager.rollback(stableCheckpoint.id, {
        skipRecoveryPoint: false,
        runHooks: true,
        attemptRecovery: true,
      });
      if (!result.success) {
        throw new Error(result.error);
      }

      console.log('✅ Auto-recovery successful');
      this.metrics.consecutiveFailures = 0;
      this.metrics.consecutiveBuildFailures = 0;
      this.metrics.rollbacks.push({
        timestamp: Date.now(),
        reason: context.reason,
        checkpoint: stableCheckpoint.id,
        success: true,
      });
      this.notifyRecovery(context, stableCheckpoint);
      this.emit('recovery:success', {
        context,
        checkpoint: stableCheckpoint,
      });
    } catch (error) {
      console.error(`❌ Auto-recovery failed: ${error.message}`);
      this.metrics.rollbacks.push({
        timestamp: Date.now(),
        reason: context.reason,
        success: false,
        error: error.message,
      });
      this.sendAlert('Auto-recovery failed', context, error);
      this.emit('recovery:failed', {
        context,
        error: error.message,
      });
    } finally {
      // The cooldown starts whether or not the rollback succeeded.
      this.isRecovering = false;
      this.lastRecovery = Date.now();
    }
  }

  /**
   * Pick a checkpoint to roll back to from the 10 most recently listed: the
   * first one marked healthy or tagged 'stable'/'good', otherwise the last
   * entry of the list.
   * @returns {Promise<object|undefined>} undefined when no checkpoints exist
   */
  async findStableCheckpoint() {
    // `await` tolerates both synchronous and promise-returning managers.
    const listed = await this.checkpointManager.listCheckpoints({ limit: 10 });
    const checkpoints = listed ?? [];

    const stable = checkpoints.find(
      (checkpoint) =>
        checkpoint.metadata?.healthStatus?.healthy ||
        checkpoint.tags?.includes('stable') ||
        checkpoint.tags?.includes('good')
    );

    // Fallback: last listed checkpoint (assumed oldest — list order is
    // decided by the checkpoint manager).
    return stable ?? checkpoints[checkpoints.length - 1];
  }

  /**
   * Log a recovery summary and emit it as `notification:recovery`.
   * In production this would send an email/Slack notification.
   */
  notifyRecovery(context, checkpoint) {
    const message = [
      '',
      '🔄 AUTO-RECOVERY COMPLETED',
      '========================',
      `Reason: ${context.reason}`,
      `Checkpoint: ${checkpoint.id}`,
      `Time: ${new Date().toISOString()}`,
      `Details: ${JSON.stringify(context, null, 2)}`,
      '',
    ].join('\n');
    console.log(message);
    this.emit('notification:recovery', message);
  }

  /**
   * Log a critical alert and emit `alert:critical`.
   * In production this would page the on-call engineer.
   */
  sendAlert(title, context, error) {
    const message = [
      '',
      `🚨 CRITICAL ALERT: ${title}`,
      '===========================',
      `Context: ${JSON.stringify(context, null, 2)}`,
      `Error: ${error?.message || 'Unknown'}`,
      `Time: ${new Date().toISOString()}`,
      '',
    ].join('\n');
    console.error(message);
    this.emit('alert:critical', { title, context, error });
  }

  /**
   * Summarize errors, rollbacks and monitoring state.
   * @returns {object} `successRate` is a percentage, 0 when no rollback has
   *   been attempted yet.
   */
  getStats() {
    const { errors, rollbacks, consecutiveFailures } = this.metrics;
    const totalRollbacks = rollbacks.length;
    const successfulRollbacks = rollbacks.filter((entry) => entry.success).length;
    const failedRollbacks = totalRollbacks - successfulRollbacks;

    return {
      totalErrors: errors.length,
      recentErrors: errors.slice(-10),
      consecutiveFailures,
      totalRollbacks,
      successfulRollbacks,
      failedRollbacks,
      // Avoid 0 / 0 -> NaN before the first rollback.
      successRate: totalRollbacks === 0 ? 0 : (successfulRollbacks / totalRollbacks) * 100,
      lastRecovery: this.lastRecovery,
      isRecovering: this.isRecovering,
      monitoring: {
        enabled: this.enabled,
        monitors: this.monitors,
      },
    };
  }

  /** Enable auto-recovery and (re)start monitoring. */
  enable() {
    this.enabled = true;
    this.startMonitoring();
    console.log('✅ Auto-recovery enabled');
  }

  /** Disable auto-recovery and stop monitoring. */
  disable() {
    this.enabled = false;
    this.stopMonitoring();
    console.log('⏸️ Auto-recovery disabled');
  }

  /**
   * Exercise the recovery path end to end: create a checkpoint tagged
   * 'stable', simulate reaching the failure threshold, then roll back.
   */
  async testRecovery() {
    console.log('🧪 Testing auto-recovery system...');

    await this.checkpointManager.createCheckpoint({
      type: 'manual',
      description: 'Test stable checkpoint',
      tags: ['stable', 'test'],
    });

    this.metrics.consecutiveFailures = this.thresholds.consecutiveFailures;

    await this.triggerAutoRollback({
      reason: 'test',
      description: 'Testing auto-recovery system',
    });
    console.log('✅ Auto-recovery test complete');
  }
}
module.exports = { AutoRecovery };