aiwg
Version:
Deployment tool and support utility for AI context. Copies agents, skills, commands, rules, and behaviors into the paths each AI platform reads (Claude Code, Codex, Copilot, Cursor, Warp, OpenClaw, and 6 more) so one source of truth works across 10 platforms.
454 lines (380 loc) • 11.6 kB
text/typescript
/**
* @file error-recovery.ts
* @description Automated error recovery and resilience system
*
* Implements F-012/UC-012: Error Recovery
* - Automatic error detection and classification
* - Recovery strategy execution
* - Retry logic with exponential backoff
* - Circuit breaker pattern
* - Graceful degradation
* - Error reporting and logging
*
* @implements NFR-RECOV-001: Recovery time <30s for transient errors
* @implements NFR-RECOV-002: 95% automatic recovery success rate
* @implements NFR-RECOV-003: Zero data loss during recovery
*/
import { EventEmitter } from 'events';
// ============================================================================
// Types and Interfaces
// ============================================================================
/**
 * Severity bucket stored on every logged error.
 *
 * As assigned by `ErrorRecoverySystem.classifyError` in this file:
 * - `transient`: message mentions a timeout or network failure
 *   ("timeout", "network", "econnrefused", "enotfound").
 * - `critical`: message mentions "corrupt", "out of memory" or "segfault".
 * - `recoverable`: everything else.
 */
export type ErrorSeverity = 'transient' | 'recoverable' | 'critical';
/**
 * Name of the strategy a recovery attempt used.
 *
 * NOTE(review): within this file only `retry` and `fallback` are ever recorded on a
 * `RecoveryAttempt`; the remaining members are not produced by the visible code.
 */
export type RecoveryStrategy = 'retry' | 'fallback' | 'circuit-breaker' | 'restart' | 'manual';
/**
 * One entry of the error history kept by `ErrorRecoverySystem`; also the payload of
 * its `error` event.
 */
export interface RecoverableError {
// When the error was logged (not necessarily when it was thrown).
timestamp: Date;
// The error that was logged.
error: Error;
// Classification derived from the error message.
severity: ErrorSeverity;
// Caller-supplied context passed to `recover()`; an empty object when none was given.
context: Record<string, any>;
// Copy of `error.stack` taken at logging time, if the error had one.
stackTrace?: string;
}
/**
 * Record of a single try made while recovering: either one invocation of the main
 * operation (`retry`) or the fallback invocation (`fallback`).
 */
export interface RecoveryAttempt {
// When the attempt record was created (after the attempt settled).
timestamp: Date;
// Which strategy this attempt represents.
strategy: RecoveryStrategy;
// True when the attempt resolved, false when it rejected.
success: boolean;
// Elapsed time recorded for the attempt, in milliseconds.
duration: number; // ms
// Message of the error that made the attempt fail; absent on success.
error?: string;
}
/**
 * Summary of a `recover()` call, used as the payload of the `recovered` and `failed`
 * events.
 */
export interface RecoveryResult {
// True when a result was ultimately produced, false when recovery gave up.
recovered: boolean;
// Every attempt made, in chronological order.
attempts: RecoveryAttempt[];
// Strategy that produced the result; omitted from `failed` payloads.
finalStrategy?: RecoveryStrategy;
// Wall-clock time from the start of `recover()` to the event, in milliseconds.
totalDuration: number;
}
/**
 * Per-key circuit breaker bookkeeping.
 *
 * `closed` lets calls through, `open` rejects them until `nextAttemptTime`, and
 * `half-open` lets calls through again so that a success can close the breaker.
 */
export interface CircuitBreakerState {
// Current breaker position.
state: 'closed' | 'open' | 'half-open';
// Failures recorded since the last success (a success resets this to 0).
failures: number;
// When the most recent failure was recorded.
lastFailureTime?: Date;
// Earliest time at which an open breaker may move to half-open.
nextAttemptTime?: Date;
}
/**
 * Optional tuning knobs for `ErrorRecoverySystem`. Every field falls back to the
 * default noted below when omitted.
 */
export interface ErrorRecoveryConfig {
// Retries performed after the first attempt (total tries = maxRetries + 1). Default 3.
maxRetries?: number;
// Base delay between tries, in milliseconds. Default 1000.
retryDelay?: number; // ms
// When not explicitly false, the delay doubles after each failed try. Default true.
exponentialBackoff?: boolean;
// Recorded failure count at which a key's breaker opens. Default 5.
circuitBreakerThreshold?: number;
// How long a breaker stays open before a new attempt is allowed, in milliseconds. Default 60000.
circuitBreakerTimeout?: number; // ms
// When not explicitly false, a supplied fallback is invoked after the operation fails. Default true.
fallbackEnabled?: boolean;
}
// ============================================================================
// Error Recovery Class
// ============================================================================
/**
 * Wraps asynchronous operations with retry, circuit-breaker and fallback handling,
 * and keeps a bounded in-memory history of the errors it has seen.
 *
 * Events:
 * - `recovered` (RecoveryResult): the fallback produced the result.
 * - `failed` (RecoveryResult): neither the operation nor the fallback succeeded.
 * - `circuitOpened` (`{ key, failures, nextAttemptTime }`): a breaker tripped.
 * - `error` (RecoverableError): a failure was logged. Only emitted while at least one
 *   `error` listener is attached, because Node's EventEmitter throws on an unhandled
 *   `error` event and that would abort the recovery in progress.
 */
export class ErrorRecoverySystem extends EventEmitter {
  private config: Required<ErrorRecoveryConfig>;
  private errorHistory: RecoverableError[];
  private circuitBreakers: Map<string, CircuitBreakerState>;

  /**
   * @param config Optional overrides. Omitted (null/undefined) fields take their
   *   defaults; explicit values, including `0`, are honoured.
   */
  constructor(config: ErrorRecoveryConfig = {}) {
    super();
    this.config = {
      // `??` rather than `||` so that 0 (e.g. "no retries", "no delay") is respected.
      maxRetries: config.maxRetries ?? 3,
      retryDelay: config.retryDelay ?? 1000,
      exponentialBackoff: config.exponentialBackoff !== false,
      circuitBreakerThreshold: config.circuitBreakerThreshold ?? 5,
      circuitBreakerTimeout: config.circuitBreakerTimeout ?? 60000,
      fallbackEnabled: config.fallbackEnabled !== false
    };
    this.errorHistory = [];
    this.circuitBreakers = new Map();
  }

  // ========================================================================
  // Recovery Methods
  // ========================================================================

  /**
   * Run `operation` with retries, guarded by the circuit breaker selected through
   * `context.operation` (key `'default'` when absent). If the operation cannot be
   * completed and a fallback is supplied and enabled, the fallback is tried once.
   *
   * @param operation The work to perform; invoked again on each retry.
   * @param fallback Optional alternative used once the operation has failed or the
   *   breaker is open.
   * @param context Free-form data stored with the logged error; `operation` selects
   *   the breaker key.
   * @returns The operation's result, or the fallback's result.
   * @throws The last value the operation rejected with (or the "Circuit breaker is OPEN"
   *   error) when no fallback succeeded.
   */
  public async recover<T>(
    operation: () => Promise<T>,
    fallback?: () => Promise<T>,
    context?: Record<string, any>
  ): Promise<T> {
    const startTime = Date.now();
    const attempts: RecoveryAttempt[] = [];
    const circuitKey = context?.operation || 'default';
    // Distinguishes "operation really failed" from "breaker rejected the call".
    let operationAttempted = false;

    try {
      if (this.isCircuitOpen(circuitKey)) {
        throw new Error('Circuit breaker is OPEN');
      }

      operationAttempted = true;
      const result = await this.retryWithBackoff(operation, attempts);
      this.recordSuccess(circuitKey);
      return result;
    } catch (error) {
      // A call rejected by an open breaker never reached the operation. Counting it
      // as a failure would re-open the breaker and keep pushing nextAttemptTime
      // forward, so the breaker could never reach half-open under steady load.
      if (operationAttempted) {
        this.recordFailure(circuitKey);
      }

      this.logError(this.toError(error), context);

      if (fallback && this.config.fallbackEnabled) {
        const fallbackStart = Date.now();
        try {
          const fallbackResult = await fallback();
          attempts.push({
            timestamp: new Date(),
            strategy: 'fallback',
            success: true,
            duration: Date.now() - fallbackStart
          });
          this.emit('recovered', {
            recovered: true,
            attempts,
            finalStrategy: 'fallback',
            totalDuration: Date.now() - startTime
          });
          return fallbackResult;
        } catch (fallbackError) {
          attempts.push({
            timestamp: new Date(),
            strategy: 'fallback',
            success: false,
            duration: Date.now() - fallbackStart,
            error: this.toError(fallbackError).message
          });
        }
      }

      this.emit('failed', {
        recovered: false,
        attempts,
        totalDuration: Date.now() - startTime
      });
      // Rethrow exactly what was caught so callers see the original value.
      throw error;
    }
  }

  /**
   * Invoke `operation` until it resolves or the retry budget is spent, appending one
   * `retry` record to `attempts` per invocation.
   *
   * The operation is always invoked at least once, even for a non-positive
   * `maxRetries`, and the value it last rejected with is rethrown unchanged.
   */
  private async retryWithBackoff<T>(
    operation: () => Promise<T>,
    attempts: RecoveryAttempt[]
  ): Promise<T> {
    for (let attempt = 0; ; attempt++) {
      const attemptStart = Date.now();
      try {
        const result = await operation();
        attempts.push({
          timestamp: new Date(),
          strategy: 'retry',
          success: true,
          duration: Date.now() - attemptStart
        });
        return result;
      } catch (error) {
        attempts.push({
          timestamp: new Date(),
          strategy: 'retry',
          success: false,
          duration: Date.now() - attemptStart,
          error: this.toError(error).message
        });

        // Out of retries: surface the failure without sleeping first.
        if (!(attempt < this.config.maxRetries)) {
          throw error;
        }
        await this.sleep(this.calculateDelay(attempt));
      }
    }
  }

  /**
   * Delay before the retry that follows failed attempt number `attempt` (0-based):
   * the base delay, doubled per attempt when exponential backoff is enabled.
   */
  private calculateDelay(attempt: number): number {
    if (!this.config.exponentialBackoff) {
      return this.config.retryDelay;
    }
    return this.config.retryDelay * Math.pow(2, attempt);
  }

  /**
   * Resolve after `ms` milliseconds.
   */
  private sleep(ms: number): Promise<void> {
    return new Promise(resolve => setTimeout(resolve, ms));
  }

  /**
   * Coerce an arbitrary thrown value into an Error so that `.message` and `.stack`
   * can be read safely (promises may reject with strings, null, etc.).
   */
  private toError(value: unknown): Error {
    return value instanceof Error ? value : new Error(String(value));
  }

  // ========================================================================
  // Circuit Breaker Methods
  // ========================================================================

  /**
   * Report whether calls for `key` must be rejected right now. An open breaker whose
   * timeout has elapsed is moved to half-open as a side effect and lets the call pass.
   */
  private isCircuitOpen(key: string): boolean {
    const breaker = this.circuitBreakers.get(key);
    // Unknown, closed and half-open breakers all let the call through.
    if (!breaker || breaker.state !== 'open') {
      return false;
    }

    if (breaker.nextAttemptTime && Date.now() >= breaker.nextAttemptTime.getTime()) {
      this.setCircuitState(key, 'half-open');
      return false;
    }
    return true;
  }

  /**
   * Record a successful operation: closes a half-open breaker and clears the
   * failure count.
   */
  private recordSuccess(key: string): void {
    const breaker = this.circuitBreakers.get(key);
    if (!breaker) {
      return;
    }
    if (breaker.state === 'half-open') {
      this.setCircuitState(key, 'closed');
    }
    breaker.failures = 0;
  }

  /**
   * Record a failed operation and open the breaker once the failure count reaches
   * the configured threshold.
   */
  private recordFailure(key: string): void {
    let breaker = this.circuitBreakers.get(key);
    if (!breaker) {
      breaker = { state: 'closed', failures: 0 };
      this.circuitBreakers.set(key, breaker);
    }

    breaker.failures++;
    breaker.lastFailureTime = new Date();

    if (breaker.failures >= this.config.circuitBreakerThreshold) {
      this.openCircuit(key);
    }
  }

  /**
   * Open the breaker for `key` for `circuitBreakerTimeout` ms and emit `circuitOpened`.
   */
  private openCircuit(key: string): void {
    const nextAttemptTime = new Date(Date.now() + this.config.circuitBreakerTimeout);
    this.setCircuitState(key, 'open', nextAttemptTime);
    this.emit('circuitOpened', {
      key,
      failures: this.circuitBreakers.get(key)?.failures,
      nextAttemptTime
    });
  }

  /**
   * Set the breaker position for `key`, creating the entry when needed.
   * Closing a breaker also clears its now-meaningless `nextAttemptTime`.
   */
  private setCircuitState(key: string, state: 'closed' | 'open' | 'half-open', nextAttemptTime?: Date): void {
    const circuitState: CircuitBreakerState = this.circuitBreakers.get(key) ?? {
      state: 'closed',
      failures: 0
    };
    circuitState.state = state;
    if (nextAttemptTime) {
      circuitState.nextAttemptTime = nextAttemptTime;
    } else if (state === 'closed') {
      delete circuitState.nextAttemptTime;
    }
    this.circuitBreakers.set(key, circuitState);
  }

  /**
   * Get the live breaker state for `key`, or `undefined` when no failure has ever
   * been recorded for it (or it was reset).
   */
  public getCircuitState(key: string): CircuitBreakerState | undefined {
    return this.circuitBreakers.get(key);
  }

  /**
   * Forget all breaker state for `key`; the next call starts from a closed breaker.
   */
  public resetCircuit(key: string): void {
    this.circuitBreakers.delete(key);
  }

  // ========================================================================
  // Error Classification
  // ========================================================================

  /**
   * Classify an error by keywords in its (case-insensitive) message.
   *
   * @returns `transient` for timeout/network failures, `critical` for corruption,
   *   out-of-memory or segfault messages, otherwise `recoverable`.
   */
  public classifyError(error: Error): ErrorSeverity {
    // Tolerate error-like objects without a string message.
    const message = String(error?.message ?? '').toLowerCase();

    const transientMarkers = ['timeout', 'network', 'econnrefused', 'enotfound'];
    if (transientMarkers.some(marker => message.includes(marker))) {
      return 'transient';
    }

    const criticalMarkers = ['corrupt', 'out of memory', 'segfault'];
    if (criticalMarkers.some(marker => message.includes(marker))) {
      return 'critical';
    }

    return 'recoverable';
  }

  // ========================================================================
  // Logging and Monitoring
  // ========================================================================

  /**
   * Append the error to the history (pruned to the newest 500 once it exceeds 1000
   * entries) and notify `error` listeners, if any.
   */
  private logError(error: Error, context?: Record<string, any>): void {
    const recoverableError: RecoverableError = {
      timestamp: new Date(),
      error,
      severity: this.classifyError(error),
      context: context || {},
      stackTrace: error.stack
    };
    this.errorHistory.push(recoverableError);

    if (this.errorHistory.length > 1000) {
      this.errorHistory = this.errorHistory.slice(-500);
    }

    // EventEmitter throws when 'error' is emitted with no listener attached; that
    // would abort recover() before its fallback runs, so only emit when observed.
    if (this.listenerCount('error') > 0) {
      this.emit('error', recoverableError);
    }
  }

  /**
   * Get the most recent logged errors, oldest first.
   *
   * @param count Maximum number of entries to return (default 100). A non-positive
   *   or NaN count yields an empty array.
   */
  public getErrorHistory(count: number = 100): RecoverableError[] {
    // slice(-0) would return the whole array, so handle "nothing requested" explicitly.
    if (!(count > 0)) {
      return [];
    }
    return this.errorHistory.slice(-count);
  }

  /**
   * Summarise the retained error history by severity.
   *
   * `circuitBreakerTrips` is the number of breakers that are open at the time of the
   * call, not a historical count of trips.
   */
  public getStatistics(): {
    totalErrors: number;
    transientErrors: number;
    recoverableErrors: number;
    criticalErrors: number;
    circuitBreakerTrips: number;
  } {
    const stats = {
      totalErrors: this.errorHistory.length,
      transientErrors: 0,
      recoverableErrors: 0,
      criticalErrors: 0,
      circuitBreakerTrips: 0
    };

    for (const entry of this.errorHistory) {
      switch (entry.severity) {
        case 'transient':
          stats.transientErrors++;
          break;
        case 'recoverable':
          stats.recoverableErrors++;
          break;
        case 'critical':
          stats.criticalErrors++;
          break;
      }
    }

    for (const breaker of this.circuitBreakers.values()) {
      if (breaker.state === 'open') {
        stats.circuitBreakerTrips++;
      }
    }

    return stats;
  }

  /**
   * Discard all retained error history (breaker state is untouched).
   */
  public clearHistory(): void {
    this.errorHistory = [];
  }
}