// quantum-cli-core
// Version:
// Quantum CLI Core - Multi-LLM Collaboration System
// 524 lines • 20.4 kB
// JavaScript
/**
* @license
* Copyright 2025 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
/**
 * Coordinates retries, per-provider circuit breakers, fallback selection and
 * periodic health checks for a set of registered providers.
 *
 * A provider is any object exposing an `id` and an async
 * `validateCredentials()` method (used by the health checks).
 *
 * Circuit breaker states:
 *  - `closed`    – calls are allowed.
 *  - `open`      – calls are rejected until `nextAttemptTime` has passed.
 *  - `half-open` – calls are allowed again; a success closes the breaker and
 *                  a failure re-opens it.
 */
export class FailureRecoveryManager {
  /** Look-back window used when counting "recent" failures (one hour). */
  static #RECENT_WINDOW_MS = 60 * 60 * 1000;
  /** Cool-down applied to a failed provider when `requireDiversity` is set. */
  static #DIVERSITY_COOLDOWN_MS = 10000;
  /** Maximum number of failure records kept per provider. */
  static #MAX_FAILURE_HISTORY = 100;
  /** Error class names that are never retried. */
  static #NON_RETRYABLE_ERRORS = [
    'AuthenticationError',
    'PermissionDeniedError',
    'InvalidInputError',
    'QuotaExceededError',
  ];

  /** providerId -> array of failure records (oldest first). */
  failures = new Map();
  /** providerId -> { state, failureCount, lastFailureTime?, nextAttemptTime? }. */
  circuitBreakers = new Map();
  /** providerId -> provider object. */
  providers = new Map();
  /** Effective configuration (defaults merged with caller overrides). */
  config;
  /** Handle of the periodic health-check timer, or undefined when stopped. */
  healthCheckInterval;

  /**
   * @param {object} [config] Overrides for the default recovery settings
   *   (maxRetries, retryDelayMs, exponentialBackoff, circuitBreakerThreshold,
   *   circuitBreakerTimeoutMs, healthCheckIntervalMs, fallbackChain).
   */
  constructor(config) {
    this.config = {
      maxRetries: 3,
      retryDelayMs: 1000,
      exponentialBackoff: true,
      circuitBreakerThreshold: 5,
      circuitBreakerTimeoutMs: 60000, // 1 minute
      healthCheckIntervalMs: 30000, // 30 seconds
      fallbackChain: [],
      ...config,
    };
    this.startHealthChecks();
  }

  /** Best-effort class name of a thrown value (which may not be an Error). */
  static #errorType(error) {
    return error?.constructor?.name || 'UnknownError';
  }

  /** Best-effort message of a thrown value (which may not be an Error). */
  static #errorMessage(error) {
    const message = error?.message;
    if (message != null) {
      return message;
    }
    try {
      return String(error);
    }
    catch {
      // e.g. objects without a usable toString()
      return 'Unknown error';
    }
  }

  /** Number of records in `failures` newer than the recent-failure window. */
  static #countRecentFailures(failures) {
    const cutoff = new Date(Date.now() - FailureRecoveryManager.#RECENT_WINDOW_MS);
    return failures.filter((f) => f.timestamp > cutoff).length;
  }

  /** Put a breaker back into its pristine closed state. */
  static #closeBreaker(circuitBreaker) {
    circuitBreaker.state = 'closed';
    circuitBreaker.failureCount = 0;
    circuitBreaker.lastFailureTime = undefined;
    circuitBreaker.nextAttemptTime = undefined;
  }

  /**
   * Register a provider for failure recovery management.
   * Re-registering an id resets its circuit breaker and failure history.
   * @param {{id: string}} provider
   */
  registerProvider(provider) {
    this.providers.set(provider.id, provider);
    // Initialize circuit breaker
    this.circuitBreakers.set(provider.id, {
      state: 'closed',
      failureCount: 0,
    });
    // Initialize failure history
    this.failures.set(provider.id, []);
  }

  /**
   * Execute a provider call with retry and circuit breaker logic.
   *
   * @param {string} providerId Registered provider to account the call to.
   * @param {() => Promise<*>} operation The call to run (and re-run on retry).
   * @param {*} [context] Opaque value stored with each failure record.
   * @param {*} [options] Currently unused.
   * @returns {Promise<*>} The result of `operation`.
   * @throws {Error} If the provider is unknown or circuit broken, or the last
   *   error thrown by `operation` once retries are exhausted. If `operation`
   *   threw a falsy value, a generic Error is thrown instead.
   */
  async executeWithRecovery(providerId, operation, context, options) {
    const provider = this.providers.get(providerId);
    if (!provider) {
      throw new Error(`Provider ${providerId} not found`);
    }
    // Check circuit breaker
    if (!this.isProviderAvailable(providerId)) {
      throw new Error(`Provider ${providerId} is circuit broken`);
    }
    let lastError = null;
    let retryCount = 0;
    while (retryCount <= this.config.maxRetries) {
      try {
        const result = await operation();
        // Success - reset circuit breaker
        this.recordSuccess(providerId);
        return result;
      }
      catch (error) {
        lastError = error;
        this.recordFailure(providerId, error, context, retryCount);
        const canRetry = retryCount < this.config.maxRetries && this.shouldRetry(error);
        if (!canRetry) {
          break;
        }
        await this.delay(this.calculateRetryDelay(retryCount));
        retryCount++;
      }
    }
    // All retries exhausted
    throw (lastError ||
      new Error(`Provider ${providerId} failed after ${this.config.maxRetries} retries`));
  }

  /**
   * Get the next available provider from the fallback chain.
   *
   * The chain is `config.fallbackChain` when non-empty, otherwise the
   * registration order. It is searched as a ring starting right after the
   * failed provider. Chain entries that are not registered, and the failed
   * provider itself, are skipped.
   *
   * @param {string} failedProviderId
   * @param {*} [context] Currently unused.
   * @returns {object|null} The next provider, or null when none is available.
   */
  getNextProvider(failedProviderId, context) {
    const fallbackChain = this.config.fallbackChain.length > 0
      ? this.config.fallbackChain
      : Array.from(this.providers.keys());
    const failedIndex = fallbackChain.indexOf(failedProviderId);
    // Ring order: everything after the failed provider, then everything
    // before it. When the failed provider is not in the chain (index -1) the
    // first slice already covers the whole chain and the second is empty.
    const candidates = [
      ...fallbackChain.slice(failedIndex + 1),
      ...fallbackChain.slice(0, Math.max(failedIndex, 0)),
    ];
    for (const providerId of candidates) {
      if (providerId === failedProviderId) {
        continue;
      }
      const provider = this.providers.get(providerId);
      // An unregistered id has no breaker and would look "available";
      // skip it instead of giving up on the rest of the chain.
      if (provider && this.isProviderAvailable(providerId)) {
        return provider;
      }
    }
    return null;
  }

  /**
   * Execute with automatic fallback to other providers.
   *
   * @param {string} primaryProviderId Provider to try first.
   * @param {(provider: object) => Promise<*>} operation
   * @param {*} [context] Passed through to executeWithRecovery.
   * @param {*} [options] Passed through to executeWithRecovery.
   * @returns {Promise<{result: *, providerId: string}>}
   * @throws {Error} When every reachable provider failed; the last underlying
   *   error is attached as `cause`.
   */
  async executeWithFallback(primaryProviderId, operation, context, options) {
    const triedProviders = new Set();
    let currentProviderId = primaryProviderId;
    let lastError;
    while (currentProviderId && !triedProviders.has(currentProviderId)) {
      triedProviders.add(currentProviderId);
      try {
        const provider = this.providers.get(currentProviderId);
        if (!provider) {
          throw new Error(`Provider ${currentProviderId} not found`);
        }
        const result = await this.executeWithRecovery(currentProviderId, () => operation(provider), context, options);
        return { result, providerId: currentProviderId };
      }
      catch (error) {
        lastError = error;
        console.warn(`Provider ${currentProviderId} failed, trying fallback:`, FailureRecoveryManager.#errorMessage(error));
        // Get next provider
        const nextProvider = this.getNextProvider(currentProviderId, context);
        currentProviderId = nextProvider?.id || '';
      }
    }
    throw new Error('All providers failed or are circuit broken', lastError === undefined ? undefined : { cause: lastError });
  }

  /**
   * Check if a provider is available (not circuit broken).
   *
   * Side effect: an `open` breaker whose timeout has elapsed is moved to
   * `half-open`. Ids without a breaker are reported as available.
   *
   * @param {string} providerId
   * @returns {boolean}
   */
  isProviderAvailable(providerId) {
    const circuitBreaker = this.circuitBreakers.get(providerId);
    if (!circuitBreaker) {
      return true;
    }
    if (circuitBreaker.state !== 'open') {
      // 'closed', 'half-open' and any unknown state allow calls.
      return true;
    }
    const timeoutElapsed = Boolean(circuitBreaker.nextAttemptTime) &&
      new Date() >= circuitBreaker.nextAttemptTime;
    if (timeoutElapsed) {
      circuitBreaker.state = 'half-open';
      return true;
    }
    return false;
  }

  /**
   * Record a successful operation: closes and resets the provider's breaker.
   * @param {string} providerId
   */
  recordSuccess(providerId) {
    const circuitBreaker = this.circuitBreakers.get(providerId);
    if (circuitBreaker) {
      FailureRecoveryManager.#closeBreaker(circuitBreaker);
    }
  }

  /**
   * Record a failure and update the circuit breaker.
   *
   * The breaker opens when the failure count reaches
   * `config.circuitBreakerThreshold`, or immediately when the failure happens
   * while the breaker is `half-open`.
   *
   * @param {string} providerId
   * @param {*} error The thrown value; need not be an Error instance.
   * @param {*} [context] Opaque value stored with the failure record.
   * @param {number} [retryCount=0] Retry attempt the failure occurred on.
   */
  recordFailure(providerId, error, context, retryCount = 0) {
    // Record failure history
    const failures = this.failures.get(providerId) || [];
    failures.push({
      providerId,
      timestamp: new Date(),
      errorType: FailureRecoveryManager.#errorType(error),
      errorMessage: FailureRecoveryManager.#errorMessage(error),
      context,
      retryCount,
    });
    if (failures.length > FailureRecoveryManager.#MAX_FAILURE_HISTORY) {
      failures.shift();
    }
    this.failures.set(providerId, failures);
    // Update circuit breaker
    const circuitBreaker = this.circuitBreakers.get(providerId);
    if (!circuitBreaker) {
      return;
    }
    const failedTrialCall = circuitBreaker.state === 'half-open';
    circuitBreaker.failureCount++;
    circuitBreaker.lastFailureTime = new Date();
    if (failedTrialCall ||
      circuitBreaker.failureCount >= this.config.circuitBreakerThreshold) {
      circuitBreaker.state = 'open';
      circuitBreaker.nextAttemptTime = new Date(Date.now() + this.config.circuitBreakerTimeoutMs);
      console.warn(`Circuit breaker opened for provider ${providerId} after ${circuitBreaker.failureCount} failures`);
    }
  }

  /**
   * Calculate retry delay with optional exponential backoff.
   * @param {number} retryCount Zero-based retry attempt.
   * @returns {number} Delay in milliseconds.
   */
  calculateRetryDelay(retryCount) {
    const { retryDelayMs, exponentialBackoff } = this.config;
    return exponentialBackoff ? retryDelayMs * 2 ** retryCount : retryDelayMs;
  }

  /**
   * Delay execution.
   * @param {number} ms
   * @returns {Promise<void>}
   */
  delay(ms) {
    return new Promise((resolve) => setTimeout(resolve, ms));
  }

  /**
   * Determine if an error is retryable, based on its class name.
   * @param {*} error The thrown value; need not be an Error instance.
   * @returns {boolean}
   */
  shouldRetry(error) {
    return !FailureRecoveryManager.#NON_RETRYABLE_ERRORS.includes(FailureRecoveryManager.#errorType(error));
  }

  /**
   * Start (or restart) periodic health checks.
   */
  startHealthChecks() {
    if (this.healthCheckInterval) {
      clearInterval(this.healthCheckInterval);
    }
    this.healthCheckInterval = setInterval(async () => {
      await this.performHealthChecks();
    }, this.config.healthCheckIntervalMs);
    // Background monitoring must not keep the process alive on its own.
    // `unref` exists on Node timers only, hence the optional call.
    this.healthCheckInterval?.unref?.();
  }

  /**
   * Perform health checks on all providers by calling
   * `provider.validateCredentials()`. A healthy result closes a `half-open`
   * breaker; an unhealthy result or a thrown error is recorded as a failure.
   * @returns {Promise<void>}
   */
  async performHealthChecks() {
    const healthCheckPromises = Array.from(this.providers.entries(), async ([providerId, provider]) => {
      try {
        const isHealthy = await provider.validateCredentials();
        if (!isHealthy) {
          this.recordFailure(providerId, new Error('Health check failed'), undefined, 0);
          return;
        }
        if (this.circuitBreakers.get(providerId)?.state === 'half-open') {
          this.recordSuccess(providerId);
        }
      }
      catch (error) {
        this.recordFailure(providerId, error, undefined, 0);
      }
    });
    await Promise.allSettled(healthCheckPromises);
  }

  /**
   * Get failure statistics for a provider.
   * @param {string} providerId
   * @returns {{recentFailures: number, totalFailures: number,
   *   circuitBreakerState: string, lastFailureTime: (Date|undefined),
   *   failureRate: number}} `failureRate` is failures per hour, which equals
   *   `recentFailures` because the window is one hour.
   */
  getFailureStats(providerId) {
    const failures = this.failures.get(providerId) || [];
    const circuitBreaker = this.circuitBreakers.get(providerId);
    const recentFailures = FailureRecoveryManager.#countRecentFailures(failures);
    return {
      recentFailures,
      totalFailures: failures.length,
      circuitBreakerState: circuitBreaker?.state || 'unknown',
      lastFailureTime: circuitBreaker?.lastFailureTime,
      failureRate: recentFailures,
    };
  }

  /**
   * Get the health status of every registered provider.
   * @returns {Object<string, {available: boolean, circuitBreakerState: string,
   *   failureCount: number, lastFailureTime: (Date|undefined)}>}
   */
  getHealthStatus() {
    const status = {};
    for (const providerId of this.providers.keys()) {
      const circuitBreaker = this.circuitBreakers.get(providerId);
      // Availability first: the check may move the breaker to half-open and
      // the reported state should reflect that.
      const available = this.isProviderAvailable(providerId);
      status[providerId] = {
        available,
        circuitBreakerState: circuitBreaker?.state || 'closed',
        failureCount: circuitBreaker?.failureCount || 0,
        lastFailureTime: circuitBreaker?.lastFailureTime,
      };
    }
    return status;
  }

  /**
   * Manually reset the circuit breaker for a provider.
   * @param {string} providerId
   */
  resetCircuitBreaker(providerId) {
    const circuitBreaker = this.circuitBreakers.get(providerId);
    if (circuitBreaker) {
      FailureRecoveryManager.#closeBreaker(circuitBreaker);
    }
  }

  /**
   * Update recovery configuration. Health checks are restarted whenever a
   * (truthy) `healthCheckIntervalMs` is supplied.
   * @param {object} [config] Partial configuration to merge in.
   */
  updateConfig(config) {
    this.config = { ...this.config, ...config };
    if (config?.healthCheckIntervalMs) {
      this.startHealthChecks();
    }
  }

  /**
   * Get the healthiest available provider.
   *
   * The score is `100 - failureCount - 5 * recentFailures`; ties go to the
   * provider registered first.
   *
   * @param {Iterable<string>} [excludedProviderIds] Provider ids that must not
   *   be returned (e.g. providers already tried for the current request).
   * @returns {object|null} The best provider, or null when none qualifies.
   */
  getHealthyProvider(excludedProviderIds) {
    const excluded = new Set(excludedProviderIds ?? []);
    const availableProviders = Array.from(this.providers.values()).filter((provider) => !excluded.has(provider.id) && this.isProviderAvailable(provider.id));
    if (availableProviders.length === 0) {
      return null;
    }
    let best = null;
    let bestScore = -Infinity;
    for (const provider of availableProviders) {
      const failureCount = this.circuitBreakers.get(provider.id)?.failureCount || 0;
      const recentFailures = FailureRecoveryManager.#countRecentFailures(this.failures.get(provider.id) || []);
      // Health score (higher is better)
      const healthScore = 100 - failureCount - recentFailures * 5;
      if (healthScore > bestScore) {
        best = provider;
        bestScore = healthScore;
      }
    }
    return best;
  }

  /**
   * Get provider health information.
   * @param {string} providerId
   * @returns {{isAvailable: boolean, circuitBreakerState: string,
   *   failureCount: number, recentFailures: number, healthScore: number,
   *   lastFailure: (Date|undefined)}} `healthScore` is
   *   `100 - 10 * failureCount - 5 * recentFailures`, floored at 0.
   */
  getProviderHealth(providerId) {
    const circuitBreaker = this.circuitBreakers.get(providerId);
    const recentFailures = FailureRecoveryManager.#countRecentFailures(this.failures.get(providerId) || []);
    const healthScore = Math.max(0, 100 - (circuitBreaker?.failureCount || 0) * 10 - recentFailures * 5);
    // Availability first: the check may move the breaker to half-open.
    const isAvailable = this.isProviderAvailable(providerId);
    return {
      isAvailable,
      circuitBreakerState: circuitBreaker?.state || 'closed',
      failureCount: circuitBreaker?.failureCount || 0,
      recentFailures,
      healthScore,
      lastFailure: circuitBreaker?.lastFailureTime,
    };
  }

  /**
   * Execute with smart fallback that considers provider health.
   *
   * The preferred provider (if given, registered and available) is tried
   * first; after that the healthiest provider not yet tried is picked until
   * `maxAttempts` is reached or no candidate remains.
   *
   * @param {(provider: object) => Promise<*>} operation
   * @param {*} [context] Passed through to executeWithRecovery.
   * @param {{preferredProviderId?: string, maxAttempts?: number,
   *   requireDiversity?: boolean}} [options] `maxAttempts` defaults to 3 (a
   *   falsy value also means 3). With `requireDiversity`, a failed provider's
   *   breaker is held open for at least 10 seconds.
   * @returns {Promise<{result: *, providerId: string, attemptsUsed: number}>}
   * @throws {Error} When no attempt succeeded; the last underlying error, if
   *   any, is attached as `cause`.
   */
  async executeWithSmartFallback(operation, context, options) {
    const maxAttempts = options?.maxAttempts || 3;
    const usedProviders = new Set();
    let attemptsUsed = 0;
    let lastError;
    // Try preferred provider first if specified and available
    const preferredId = options?.preferredProviderId;
    if (preferredId && this.isProviderAvailable(preferredId)) {
      const preferredProvider = this.providers.get(preferredId);
      if (preferredProvider) {
        try {
          attemptsUsed++;
          const result = await this.executeWithRecovery(preferredId, () => operation(preferredProvider), context);
          return { result, providerId: preferredId, attemptsUsed };
        }
        catch (error) {
          lastError = error;
          usedProviders.add(preferredId);
          console.warn(`Preferred provider ${preferredId} failed:`, FailureRecoveryManager.#errorMessage(error));
        }
      }
    }
    // Try the remaining providers in health order
    while (attemptsUsed < maxAttempts) {
      // Exclude providers already tried so a still-"healthiest" failed
      // provider cannot hide untried candidates.
      const nextProvider = this.getHealthyProvider(usedProviders);
      if (!nextProvider) {
        break;
      }
      try {
        attemptsUsed++;
        const result = await this.executeWithRecovery(nextProvider.id, () => operation(nextProvider), context);
        return { result, providerId: nextProvider.id, attemptsUsed };
      }
      catch (error) {
        lastError = error;
        usedProviders.add(nextProvider.id);
        console.warn(`Provider ${nextProvider.id} failed:`, FailureRecoveryManager.#errorMessage(error));
        if (options?.requireDiversity) {
          const circuitBreaker = this.circuitBreakers.get(nextProvider.id);
          if (circuitBreaker) {
            const cooldownEnd = new Date(Date.now() + FailureRecoveryManager.#DIVERSITY_COOLDOWN_MS);
            // Never shorten a longer timeout set by a threshold trip.
            const alreadyOpenLonger = circuitBreaker.state === 'open' &&
              Boolean(circuitBreaker.nextAttemptTime) &&
              circuitBreaker.nextAttemptTime > cooldownEnd;
            if (!alreadyOpenLonger) {
              circuitBreaker.state = 'open';
              circuitBreaker.nextAttemptTime = cooldownEnd;
            }
          }
        }
      }
    }
    throw new Error(`All providers failed after ${attemptsUsed} attempts`, lastError === undefined ? undefined : { cause: lastError });
  }

  /**
   * Bulk health check for all providers. Unlike performHealthChecks, this
   * does not touch circuit breakers or failure history.
   * @returns {Promise<Object<string, {healthy: boolean, latency: number,
   *   error?: string}>>} Result per provider id; `latency` is in milliseconds.
   */
  async bulkHealthCheck() {
    const results = {};
    const healthChecks = Array.from(this.providers.entries(), async ([providerId, provider]) => {
      const startTime = Date.now();
      try {
        const isHealthy = await provider.validateCredentials();
        results[providerId] = {
          healthy: isHealthy,
          latency: Date.now() - startTime,
        };
      }
      catch (error) {
        results[providerId] = {
          healthy: false,
          latency: Date.now() - startTime,
          error: FailureRecoveryManager.#errorMessage(error),
        };
      }
    });
    await Promise.allSettled(healthChecks);
    return results;
  }

  /**
   * Get recovery recommendations based on current state.
   * @returns {{recommendations: Array<object>,
   *   overallHealth: ('good'|'degraded'|'critical')}}
   */
  getRecoveryRecommendations() {
    const recommendations = [];
    const now = new Date();
    // Check for circuit breakers that could be reset. This must run BEFORE
    // any isProviderAvailable() call: that method moves elapsed 'open'
    // breakers to 'half-open', which would hide them from this check.
    for (const [providerId, circuitBreaker] of this.circuitBreakers) {
      if (circuitBreaker.state === 'open' &&
        circuitBreaker.nextAttemptTime &&
        now > circuitBreaker.nextAttemptTime) {
        recommendations.push({
          type: 'circuit_breaker_reset',
          providerId,
          reason: `Circuit breaker timeout elapsed for ${providerId}`,
          action: `Consider resetting circuit breaker for ${providerId}`,
          priority: 'medium',
        });
      }
    }
    const providers = Array.from(this.providers.keys());
    const healthyProviders = providers.filter((id) => this.isProviderAvailable(id));
    // With no providers registered there is no meaningful ratio; treat it
    // as 0 so the overall health is reported as critical.
    const healthRatio = providers.length === 0
      ? 0
      : healthyProviders.length / providers.length;
    // Check for providers with high failure rates
    for (const [providerId, failures] of this.failures) {
      const recentFailures = FailureRecoveryManager.#countRecentFailures(failures);
      if (recentFailures > 5) {
        recommendations.push({
          type: 'health_check',
          providerId,
          reason: `High failure rate: ${recentFailures} failures in last hour`,
          action: `Perform detailed health check on ${providerId}`,
          priority: 'high',
        });
      }
    }
    // Overall system recommendations
    if (providers.length > 0 && healthRatio < 0.5) {
      recommendations.push({
        type: 'config_adjustment',
        reason: `Only ${(healthRatio * 100).toFixed(0)}% of providers are healthy`,
        action: 'Consider adjusting circuit breaker thresholds or retry limits',
        priority: 'high',
      });
    }
    if (healthyProviders.length <= 1) {
      recommendations.push({
        type: 'provider_rotation',
        reason: 'Only one or no providers available',
        action: 'Add more providers or increase retry tolerance',
        priority: 'high',
      });
    }
    // Determine overall health
    let overallHealth;
    if (healthRatio >= 0.8) {
      overallHealth = 'good';
    }
    else if (healthRatio >= 0.5) {
      overallHealth = 'degraded';
    }
    else {
      overallHealth = 'critical';
    }
    return { recommendations, overallHealth };
  }

  /**
   * Cleanup resources: stops the periodic health checks.
   */
  cleanup() {
    if (this.healthCheckInterval) {
      clearInterval(this.healthCheckInterval);
      this.healthCheckInterval = undefined;
    }
  }
}
//# sourceMappingURL=failure-recovery.js.map