UNPKG

quantum-cli-core

Version:

Quantum CLI Core - Multi-LLM Collaboration System

524 lines 20.4 kB
/**
 * @license
 * Copyright 2025 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */

/** Length of the "recent failures" window: one hour, in milliseconds. */
const RECENT_FAILURE_WINDOW_MS = 60 * 60 * 1000;

/** Maximum number of failure records retained per provider. */
const MAX_FAILURE_HISTORY = 100;

/** How long `requireDiversity` keeps a failed provider's circuit open (10 seconds). */
const DIVERSITY_COOLDOWN_MS = 10000;

/** Constructor names of errors that are never retried. */
const NON_RETRYABLE_ERRORS = [
  'AuthenticationError',
  'PermissionDeniedError',
  'InvalidInputError',
  'QuotaExceededError',
];

/**
 * Produce a loggable message for any thrown value, including non-Error
 * values such as `null` or a string.
 */
function describeError(error) {
  return error?.message ?? String(error);
}

/** Count the failure records whose timestamp falls within the last hour. */
function countRecentFailures(failures) {
  const cutoff = new Date(Date.now() - RECENT_FAILURE_WINDOW_MS);
  return failures.filter((failure) => failure.timestamp > cutoff).length;
}

/** Put a circuit breaker record back into its pristine closed state. */
function resetBreaker(circuitBreaker) {
  circuitBreaker.state = 'closed';
  circuitBreaker.failureCount = 0;
  circuitBreaker.lastFailureTime = undefined;
  circuitBreaker.nextAttemptTime = undefined;
}

/**
 * Build the terminal "everything failed" error, attaching the last
 * underlying error as `cause` when there is one.
 */
function buildExhaustedError(message, lastError) {
  return lastError === undefined
    ? new Error(message)
    : new Error(message, { cause: lastError });
}

/**
 * Tracks provider failures and wraps provider calls with retry, circuit
 * breaker and fallback logic.
 *
 * State kept per provider id:
 * - `providers`: the registered provider objects (each must expose `id`
 *   and an async `validateCredentials()` used for health checks),
 * - `circuitBreakers`: `{ state, failureCount, lastFailureTime, nextAttemptTime }`
 *   where `state` is `'closed'`, `'open'` or `'half-open'`,
 * - `failures`: the most recent failure records (capped at 100).
 *
 * Constructing an instance starts a periodic health-check timer; call
 * `cleanup()` to stop it.
 */
export class FailureRecoveryManager {
  failures = new Map();
  circuitBreakers = new Map();
  providers = new Map();
  config;
  healthCheckInterval;

  /**
   * @param {object} [config] Overrides for the defaults below.
   * @param {number} [config.maxRetries=3] Retries after the first attempt.
   * @param {number} [config.retryDelayMs=1000] Base delay between retries.
   * @param {boolean} [config.exponentialBackoff=true] Double the delay on each retry.
   * @param {number} [config.circuitBreakerThreshold=5] Failures that open a circuit.
   * @param {number} [config.circuitBreakerTimeoutMs=60000] How long a circuit stays open.
   * @param {number} [config.healthCheckIntervalMs=30000] Period of background health checks.
   * @param {string[]} [config.fallbackChain=[]] Provider ids in fallback order; when
   *   empty, registration order is used.
   */
  constructor(config) {
    this.config = {
      maxRetries: 3,
      retryDelayMs: 1000,
      exponentialBackoff: true,
      circuitBreakerThreshold: 5,
      circuitBreakerTimeoutMs: 60000, // 1 minute
      healthCheckIntervalMs: 30000, // 30 seconds
      fallbackChain: [],
      ...config,
    };
    this.startHealthChecks();
  }

  /**
   * Register a provider for failure recovery management.
   *
   * Re-registering an id replaces the provider and resets its circuit
   * breaker and failure history.
   *
   * @param {{id: string}} provider Provider object keyed by its `id`.
   */
  registerProvider(provider) {
    this.providers.set(provider.id, provider);
    this.circuitBreakers.set(provider.id, {
      state: 'closed',
      failureCount: 0,
    });
    this.failures.set(provider.id, []);
  }

  /**
   * Execute a provider call with retry and circuit breaker logic.
   *
   * @param {string} providerId Id of a registered provider.
   * @param {() => Promise<*>} operation The call to attempt.
   * @param {*} [context] Stored verbatim on each failure record.
   * @param {*} [options] Accepted for interface compatibility; not read here.
   * @returns {Promise<*>} Whatever `operation` resolves to.
   * @throws {Error} If the provider is unknown or circuit broken, or the
   *   last error thrown by `operation` once retrying stops.
   */
  async executeWithRecovery(providerId, operation, context, options) {
    const provider = this.providers.get(providerId);
    if (!provider) {
      throw new Error(`Provider ${providerId} not found`);
    }
    if (!this.isProviderAvailable(providerId)) {
      throw new Error(`Provider ${providerId} is circuit broken`);
    }

    let lastError = null;
    for (let retryCount = 0; retryCount <= this.config.maxRetries; retryCount++) {
      try {
        const result = await operation();
        this.recordSuccess(providerId);
        return result;
      } catch (error) {
        lastError = error;
        this.recordFailure(providerId, error, context, retryCount);

        // Stop as soon as the failure just recorded opened the circuit:
        // retrying against a provider the breaker has declared down would
        // defeat the purpose of the breaker.
        const canRetry =
          retryCount < this.config.maxRetries &&
          this.shouldRetry(error) &&
          this.isProviderAvailable(providerId);
        if (!canRetry) {
          break;
        }
        await this.delay(this.calculateRetryDelay(retryCount));
      }
    }

    throw (
      lastError ||
      new Error(
        `Provider ${providerId} failed after ${this.config.maxRetries} retries`,
      )
    );
  }

  /**
   * Get the next available provider from the fallback chain.
   *
   * The chain is walked cyclically starting just after the failed provider.
   * Ids in the chain that are not registered are skipped.
   *
   * @param {string} failedProviderId Provider that just failed.
   * @param {*} [context] Accepted for interface compatibility; not read here.
   * @returns {object|null} The next registered, available provider, or `null`.
   */
  getNextProvider(failedProviderId, context) {
    const fallbackChain =
      this.config.fallbackChain.length > 0
        ? this.config.fallbackChain
        : Array.from(this.providers.keys());
    const failedIndex = fallbackChain.indexOf(failedProviderId);

    // Providers after the failed one first, then those before it. When the
    // failed provider is not in the chain (index -1) this is the whole chain.
    const candidateIds = [
      ...fallbackChain.slice(failedIndex + 1),
      ...fallbackChain.slice(0, Math.max(failedIndex, 0)),
    ];
    for (const candidateId of candidateIds) {
      const candidate = this.providers.get(candidateId);
      if (candidate && this.isProviderAvailable(candidateId)) {
        return candidate;
      }
    }
    return null;
  }

  /**
   * Execute with automatic fallback to other providers.
   *
   * @param {string} primaryProviderId Provider to try first.
   * @param {(provider: object) => Promise<*>} operation Call to run against a provider.
   * @param {*} [context] Stored verbatim on each failure record.
   * @param {*} [options] Forwarded to `executeWithRecovery`.
   * @returns {Promise<{result: *, providerId: string}>}
   * @throws {Error} When no provider succeeded; `cause` holds the last error.
   */
  async executeWithFallback(primaryProviderId, operation, context, options) {
    const triedProviders = new Set();
    let lastError;
    let currentProviderId = primaryProviderId;

    while (currentProviderId && !triedProviders.has(currentProviderId)) {
      triedProviders.add(currentProviderId);
      try {
        const provider = this.providers.get(currentProviderId);
        if (!provider) {
          throw new Error(`Provider ${currentProviderId} not found`);
        }
        const result = await this.executeWithRecovery(
          currentProviderId,
          () => operation(provider),
          context,
          options,
        );
        return { result, providerId: currentProviderId };
      } catch (error) {
        lastError = error;
        console.warn(
          `Provider ${currentProviderId} failed, trying fallback:`,
          describeError(error),
        );
        const nextProvider = this.getNextProvider(currentProviderId, context);
        currentProviderId = nextProvider?.id || '';
      }
    }

    throw buildExhaustedError(
      'All providers failed or are circuit broken',
      lastError,
    );
  }

  /**
   * Check if a provider is available (not circuit broken).
   *
   * Side effect: an `open` breaker whose timeout has elapsed is moved to
   * `half-open`. Providers without a breaker record are reported available.
   *
   * @param {string} providerId
   * @returns {boolean}
   */
  isProviderAvailable(providerId) {
    const circuitBreaker = this.circuitBreakers.get(providerId);
    if (!circuitBreaker || circuitBreaker.state !== 'open') {
      return true;
    }
    if (
      circuitBreaker.nextAttemptTime &&
      new Date() >= circuitBreaker.nextAttemptTime
    ) {
      circuitBreaker.state = 'half-open';
      return true;
    }
    return false;
  }

  /**
   * Record a successful operation: closes the breaker and clears its counters.
   *
   * @param {string} providerId
   */
  recordSuccess(providerId) {
    const circuitBreaker = this.circuitBreakers.get(providerId);
    if (circuitBreaker) {
      resetBreaker(circuitBreaker);
    }
  }

  /**
   * Record a failure and update the circuit breaker.
   *
   * The breaker opens when the failure count reaches
   * `circuitBreakerThreshold`, or immediately when the failure happened
   * while the breaker was `half-open` (the recovery probe failed).
   *
   * @param {string} providerId
   * @param {*} error The thrown value; need not be an `Error`.
   * @param {*} [context] Stored verbatim on the failure record.
   * @param {number} [retryCount=0] Retry index at which the failure occurred.
   */
  recordFailure(providerId, error, context, retryCount = 0) {
    const failures = this.failures.get(providerId) || [];
    failures.push({
      providerId,
      timestamp: new Date(),
      errorType: error?.constructor?.name ?? typeof error,
      errorMessage: describeError(error),
      context,
      retryCount,
    });
    if (failures.length > MAX_FAILURE_HISTORY) {
      failures.shift();
    }
    this.failures.set(providerId, failures);

    const circuitBreaker = this.circuitBreakers.get(providerId);
    if (!circuitBreaker) {
      return;
    }
    const probeFailed = circuitBreaker.state === 'half-open';
    circuitBreaker.failureCount++;
    circuitBreaker.lastFailureTime = new Date();
    if (
      probeFailed ||
      circuitBreaker.failureCount >= this.config.circuitBreakerThreshold
    ) {
      circuitBreaker.state = 'open';
      circuitBreaker.nextAttemptTime = new Date(
        Date.now() + this.config.circuitBreakerTimeoutMs,
      );
      console.warn(
        `Circuit breaker opened for provider ${providerId} after ${circuitBreaker.failureCount} failures`,
      );
    }
  }

  /**
   * Calculate retry delay with optional exponential backoff.
   *
   * @param {number} retryCount Zero-based index of the retry about to happen.
   * @returns {number} Delay in milliseconds.
   */
  calculateRetryDelay(retryCount) {
    const { retryDelayMs, exponentialBackoff } = this.config;
    return exponentialBackoff ? retryDelayMs * 2 ** retryCount : retryDelayMs;
  }

  /**
   * Delay execution.
   *
   * @param {number} ms
   * @returns {Promise<void>}
   */
  delay(ms) {
    return new Promise((resolve) => setTimeout(resolve, ms));
  }

  /**
   * Determine if an error is retryable, based on its constructor name.
   *
   * @param {*} error The thrown value; need not be an `Error`.
   * @returns {boolean} `false` only for the known non-retryable error types.
   */
  shouldRetry(error) {
    return !NON_RETRYABLE_ERRORS.includes(error?.constructor?.name);
  }

  /**
   * Start (or restart) periodic health checks.
   */
  startHealthChecks() {
    if (this.healthCheckInterval) {
      clearInterval(this.healthCheckInterval);
    }
    this.healthCheckInterval = setInterval(async () => {
      await this.performHealthChecks();
    }, this.config.healthCheckIntervalMs);
    // In Node.js, do not let the background timer alone keep the process
    // alive. Other runtimes return a plain number with no `unref`.
    this.healthCheckInterval?.unref?.();
  }

  /**
   * Perform health checks on all providers via `validateCredentials()`.
   *
   * A healthy result closes a `half-open` breaker; an unhealthy result or
   * a thrown error is recorded as a failure.
   *
   * @returns {Promise<void>}
   */
  async performHealthChecks() {
    const healthCheckPromises = Array.from(this.providers.entries()).map(
      async ([providerId, provider]) => {
        try {
          const isHealthy = await provider.validateCredentials();
          if (!isHealthy) {
            this.recordFailure(
              providerId,
              new Error('Health check failed'),
              undefined,
              0,
            );
            return;
          }
          const circuitBreaker = this.circuitBreakers.get(providerId);
          if (circuitBreaker && circuitBreaker.state === 'half-open') {
            this.recordSuccess(providerId);
          }
        } catch (error) {
          this.recordFailure(providerId, error, undefined, 0);
        }
      },
    );
    await Promise.allSettled(healthCheckPromises);
  }

  /**
   * Get failure statistics for a provider.
   *
   * @param {string} providerId
   * @returns {{recentFailures: number, totalFailures: number,
   *   circuitBreakerState: string, lastFailureTime: (Date|undefined),
   *   failureRate: number}} `failureRate` is failures per hour, i.e. equal
   *   to `recentFailures`.
   */
  getFailureStats(providerId) {
    const failures = this.failures.get(providerId) || [];
    const circuitBreaker = this.circuitBreakers.get(providerId);
    const recentFailures = countRecentFailures(failures);
    return {
      recentFailures,
      totalFailures: failures.length,
      circuitBreakerState: circuitBreaker?.state || 'unknown',
      lastFailureTime: circuitBreaker?.lastFailureTime,
      failureRate: recentFailures,
    };
  }

  /**
   * Get all provider health status, keyed by provider id.
   *
   * @returns {Object<string, {available: boolean, circuitBreakerState: string,
   *   failureCount: number, lastFailureTime: (Date|undefined)}>}
   */
  getHealthStatus() {
    const status = {};
    for (const [providerId] of this.providers) {
      const circuitBreaker = this.circuitBreakers.get(providerId);
      status[providerId] = {
        available: this.isProviderAvailable(providerId),
        circuitBreakerState: circuitBreaker?.state || 'closed',
        failureCount: circuitBreaker?.failureCount || 0,
        lastFailureTime: circuitBreaker?.lastFailureTime,
      };
    }
    return status;
  }

  /**
   * Manually reset the circuit breaker for a provider.
   *
   * @param {string} providerId
   */
  resetCircuitBreaker(providerId) {
    const circuitBreaker = this.circuitBreakers.get(providerId);
    if (circuitBreaker) {
      resetBreaker(circuitBreaker);
    }
  }

  /**
   * Update recovery configuration. Health checks are restarted when a
   * truthy `healthCheckIntervalMs` is supplied.
   *
   * @param {object} [config] Partial configuration to merge in.
   */
  updateConfig(config) {
    this.config = { ...this.config, ...config };
    if (config?.healthCheckIntervalMs) {
      this.startHealthChecks();
    }
  }

  /**
   * Get the healthiest available provider.
   *
   * Score is `100 - failureCount - 5 * recentFailures` (higher is better);
   * ties go to the provider registered first.
   *
   * @param {Set<string>} [excludedProviderIds] Provider ids to leave out of
   *   consideration (for example ones already tried).
   * @returns {object|null} The best candidate, or `null` when none qualifies.
   */
  getHealthyProvider(excludedProviderIds) {
    const candidates = Array.from(this.providers.values()).filter(
      (provider) =>
        this.isProviderAvailable(provider.id) &&
        !excludedProviderIds?.has(provider.id),
    );
    if (candidates.length === 0) {
      return null;
    }

    let best = candidates[0];
    let bestScore = -Infinity;
    for (const provider of candidates) {
      const circuitBreaker = this.circuitBreakers.get(provider.id);
      const recentFailures = countRecentFailures(
        this.failures.get(provider.id) || [],
      );
      const healthScore =
        100 - (circuitBreaker?.failureCount || 0) - recentFailures * 5;
      // Strict comparison keeps the earliest-registered provider on ties.
      if (healthScore > bestScore) {
        best = provider;
        bestScore = healthScore;
      }
    }
    return best;
  }

  /**
   * Get provider health information.
   *
   * @param {string} providerId
   * @returns {{isAvailable: boolean, circuitBreakerState: string,
   *   failureCount: number, recentFailures: number, healthScore: number,
   *   lastFailure: (Date|undefined)}} `healthScore` is
   *   `100 - 10 * failureCount - 5 * recentFailures`, floored at 0.
   */
  getProviderHealth(providerId) {
    const circuitBreaker = this.circuitBreakers.get(providerId);
    const failureCount = circuitBreaker?.failureCount || 0;
    const recentFailures = countRecentFailures(
      this.failures.get(providerId) || [],
    );
    return {
      isAvailable: this.isProviderAvailable(providerId),
      circuitBreakerState: circuitBreaker?.state || 'closed',
      failureCount,
      recentFailures,
      healthScore: Math.max(0, 100 - failureCount * 10 - recentFailures * 5),
      lastFailure: circuitBreaker?.lastFailureTime,
    };
  }

  /**
   * Execute with smart fallback that considers provider health.
   *
   * Tries the preferred provider first (when given and available), then
   * the healthiest providers not yet tried, up to `maxAttempts` providers.
   *
   * @param {(provider: object) => Promise<*>} operation Call to run against a provider.
   * @param {*} [context] Stored verbatim on each failure record.
   * @param {object} [options]
   * @param {number} [options.maxAttempts=3] Maximum number of providers to try.
   * @param {string} [options.preferredProviderId] Provider to try first.
   * @param {boolean} [options.requireDiversity] When set, a failed provider's
   *   circuit is forced open for 10 seconds.
   * @returns {Promise<{result: *, providerId: string, attemptsUsed: number}>}
   * @throws {Error} When every attempt failed; `cause` holds the last error.
   */
  async executeWithSmartFallback(operation, context, options) {
    const maxAttempts = options?.maxAttempts || 3;
    const preferredProviderId = options?.preferredProviderId;
    const usedProviders = new Set();
    let attemptsUsed = 0;
    let lastError;

    if (preferredProviderId && this.isProviderAvailable(preferredProviderId)) {
      const preferredProvider = this.providers.get(preferredProviderId);
      if (preferredProvider) {
        try {
          attemptsUsed++;
          const result = await this.executeWithRecovery(
            preferredProviderId,
            () => operation(preferredProvider),
            context,
          );
          return { result, providerId: preferredProviderId, attemptsUsed };
        } catch (error) {
          lastError = error;
          usedProviders.add(preferredProviderId);
          console.warn(
            `Preferred provider ${preferredProviderId} failed:`,
            describeError(error),
          );
        }
      }
    }

    while (attemptsUsed < maxAttempts) {
      // Exclude providers already tried so that a failed provider which
      // still scores highest cannot end the search while others remain.
      const nextProvider = this.getHealthyProvider(usedProviders);
      if (!nextProvider || usedProviders.has(nextProvider.id)) {
        break;
      }
      try {
        attemptsUsed++;
        const result = await this.executeWithRecovery(
          nextProvider.id,
          () => operation(nextProvider),
          context,
        );
        return { result, providerId: nextProvider.id, attemptsUsed };
      } catch (error) {
        lastError = error;
        usedProviders.add(nextProvider.id);
        console.warn(
          `Provider ${nextProvider.id} failed:`,
          describeError(error),
        );
        if (options?.requireDiversity) {
          // Temporarily mark the provider unavailable.
          const circuitBreaker = this.circuitBreakers.get(nextProvider.id);
          if (circuitBreaker) {
            circuitBreaker.state = 'open';
            circuitBreaker.nextAttemptTime = new Date(
              Date.now() + DIVERSITY_COOLDOWN_MS,
            );
          }
        }
      }
    }

    throw buildExhaustedError(
      `All providers failed after ${attemptsUsed} attempts`,
      lastError,
    );
  }

  /**
   * Bulk health check for all providers. Does not touch failure history
   * or circuit breakers.
   *
   * @returns {Promise<Object<string, {healthy: boolean, latency: number,
   *   error: (string|undefined)}>>} Result per provider id; `latency` is in
   *   milliseconds.
   */
  async bulkHealthCheck() {
    const results = {};
    const healthChecks = Array.from(this.providers.entries()).map(
      async ([providerId, provider]) => {
        const startTime = Date.now();
        try {
          const isHealthy = await provider.validateCredentials();
          results[providerId] = {
            healthy: isHealthy,
            latency: Date.now() - startTime,
          };
        } catch (error) {
          results[providerId] = {
            healthy: false,
            latency: Date.now() - startTime,
            error: describeError(error),
          };
        }
      },
    );
    await Promise.allSettled(healthChecks);
    return results;
  }

  /**
   * Get recovery recommendations based on current state.
   *
   * @returns {{recommendations: Array<object>,
   *   overallHealth: ('good'|'degraded'|'critical')}}
   */
  getRecoveryRecommendations() {
    const recommendations = [];

    // Inspect breakers BEFORE computing availability: isProviderAvailable()
    // moves timed-out open breakers to half-open, which would otherwise make
    // this check unreachable.
    const now = new Date();
    for (const [providerId, circuitBreaker] of this.circuitBreakers) {
      if (
        circuitBreaker.state === 'open' &&
        circuitBreaker.nextAttemptTime &&
        now > circuitBreaker.nextAttemptTime
      ) {
        recommendations.push({
          type: 'circuit_breaker_reset',
          providerId,
          reason: `Circuit breaker timeout elapsed for ${providerId}`,
          action: `Consider resetting circuit breaker for ${providerId}`,
          priority: 'medium',
        });
      }
    }

    const providers = Array.from(this.providers.keys());
    const healthyProviders = providers.filter((id) =>
      this.isProviderAvailable(id),
    );
    // With no registered providers this is NaN, which fails every
    // comparison below and therefore reports 'critical'.
    const healthRatio = healthyProviders.length / providers.length;

    for (const [providerId, failures] of this.failures) {
      const recentFailures = countRecentFailures(failures);
      if (recentFailures > 5) {
        recommendations.push({
          type: 'health_check',
          providerId,
          reason: `High failure rate: ${recentFailures} failures in last hour`,
          action: `Perform detailed health check on ${providerId}`,
          priority: 'high',
        });
      }
    }

    if (healthRatio < 0.5) {
      recommendations.push({
        type: 'config_adjustment',
        reason: `Only ${(healthRatio * 100).toFixed(0)}% of providers are healthy`,
        action: 'Consider adjusting circuit breaker thresholds or retry limits',
        priority: 'high',
      });
    }
    if (healthyProviders.length <= 1) {
      recommendations.push({
        type: 'provider_rotation',
        reason: 'Only one or no providers available',
        action: 'Add more providers or increase retry tolerance',
        priority: 'high',
      });
    }

    let overallHealth;
    if (healthRatio >= 0.8) {
      overallHealth = 'good';
    } else if (healthRatio >= 0.5) {
      overallHealth = 'degraded';
    } else {
      overallHealth = 'critical';
    }
    return { recommendations, overallHealth };
  }

  /**
   * Cleanup resources: stops the periodic health-check timer.
   */
  cleanup() {
    if (this.healthCheckInterval) {
      clearInterval(this.healthCheckInterval);
      this.healthCheckInterval = undefined;
    }
  }
}
//# sourceMappingURL=failure-recovery.js.map