UNPKG

@stackmemoryai/stackmemory

Version:

Project-scoped memory for AI coding tools. Durable context across sessions with MCP integration, frames, smart retrieval, Claude Code skills, and automatic hooks.

463 lines (462 loc) 15.6 kB
import { fileURLToPath as __fileURLToPath } from 'url';
import { dirname as __pathDirname } from 'path';
const __filename = __fileURLToPath(import.meta.url);
const __dirname = __pathDirname(__filename);
import * as fs from "fs/promises";
import * as path from "path";
import { logger } from "../../../core/monitoring/logger.js";

/**
 * Crash detection, checkpointing, and automatic recovery for a swarm
 * coordinator.
 *
 * Responsibilities:
 *  - Periodically snapshot swarm state (+ git status) to JSON checkpoints
 *    under `recoveryDir`.
 *  - Intercept `unhandledRejection` / `uncaughtException`, classify the
 *    error, and run matching recovery strategies with retry + exponential
 *    backoff.
 *  - Restore swarm/git/database state from a checkpoint on demand.
 *  - Report recent error frequency and overall system health.
 *
 * NOTE(review): `swarmCoordinator` is an opaque collaborator; this class
 * reads `swarmState` / `activeAgents` and calls `forceCleanup()` on it —
 * the exact shapes of those values are not visible here.
 */
class CrashRecoverySystem {
  // checkpoint id -> checkpoint record (also persisted to disk as JSON)
  checkpoints = /* @__PURE__ */ new Map();
  // chronological crash reports, most recent last
  crashReports = [];
  // ordered strategies; the first whose condition matches is applied
  recoveryStrategies = [];
  swarmCoordinator;
  // timer handle from startPeriodicCheckpoints(); cleared by shutdown()
  checkpointInterval;
  recoveryDir;

  /**
   * @param {object} swarmCoordinator - Coordinator exposing `swarmState`,
   *   `activeAgents` (Map) and `forceCleanup()`.
   * @param {string} [recoveryDir=".swarm/recovery"] - Directory for
   *   checkpoint and crash-report JSON files.
   */
  constructor(swarmCoordinator, recoveryDir = ".swarm/recovery") {
    this.swarmCoordinator = swarmCoordinator;
    this.recoveryDir = recoveryDir;
    this.setupRecoveryStrategies();
  }

  /**
   * Initialize crash recovery system: ensure the recovery directory exists,
   * load previously persisted checkpoints, start periodic checkpointing,
   * and install global process-level crash handlers.
   */
  async initialize() {
    await this.ensureRecoveryDirectory();
    await this.loadExistingCheckpoints();
    this.startPeriodicCheckpoints();
    process.on("unhandledRejection", (reason, promise) => {
      // FIX: handleCrash is async — attach a catch so the crash handler
      // can never itself produce a new unhandled rejection.
      this.handleCrash(new Error(`Unhandled Rejection: ${reason}`), {
        type: "unhandled_rejection",
        promise: promise.toString()
      }).catch((err) => logger.error("Crash handler failed:", err));
    });
    process.on("uncaughtException", (error) => {
      this.handleCrash(error, { type: "uncaught_exception" }).catch((err) =>
        logger.error("Crash handler failed:", err)
      );
    });
    logger.info("Crash recovery system initialized");
  }

  /**
   * Create a recovery checkpoint for a swarm and persist it to disk.
   *
   * @param {string} swarmId - Swarm to snapshot.
   * @param {string} [reason="periodic"] - Why the checkpoint was taken
   *   (logged only).
   * @returns {Promise<string>} The new checkpoint id.
   * @throws Rethrows any persistence failure after logging it.
   */
  async createCheckpoint(swarmId, reason = "periodic") {
    try {
      const swarmState = this.swarmCoordinator.swarmState;
      const agents = Array.from(this.swarmCoordinator.activeAgents.values());
      const checkpoint = {
        id: this.generateId(),
        swarmId,
        timestamp: Date.now(),
        // shallow copies: nested structures are still shared with the live
        // state, but JSON persistence below snapshots them anyway
        swarmState: { ...swarmState },
        agents: agents.map((agent) => ({ ...agent })),
        tasks: swarmState.tasks || [],
        errorLog: this.crashReports.slice(-10),
        // Last 10 errors
        gitState: await this.captureGitState()
      };
      const checkpointPath = path.join(
        this.recoveryDir,
        `checkpoint-${checkpoint.id}.json`
      );
      await fs.writeFile(checkpointPath, JSON.stringify(checkpoint, null, 2));
      this.checkpoints.set(checkpoint.id, checkpoint);
      logger.info(
        `Created checkpoint ${checkpoint.id} for swarm ${swarmId} (${reason})`
      );
      return checkpoint.id;
    } catch (error) {
      logger.error("Failed to create checkpoint:", error);
      throw error;
    }
  }

  /**
   * Handle a crash or error: record a report, attempt automatic recovery,
   * escalate unresolved critical failures, and persist the report.
   *
   * @param {Error} error - The error that occurred.
   * @param {object} [context={}] - Extra info (e.g. `type`, `agentId`,
   *   `timeout`) used for classification and severity assessment.
   */
  async handleCrash(error, context = {}) {
    const report = {
      id: this.generateId(),
      timestamp: Date.now(),
      agentId: context.agentId,
      errorType: this.classifyError(error, context),
      error,
      context,
      recoveryAction: "",
      severity: this.assessSeverity(error, context),
      resolved: false
    };
    this.crashReports.push(report);
    logger.error(`Crash detected [${report.id}]:`, error);
    const recovered = await this.attemptRecovery(report);
    if (recovered) {
      report.resolved = true;
      report.recoveryAction = "auto_recovered";
      logger.info(`Successfully recovered from crash ${report.id}`);
    } else {
      logger.error(`Failed to recover from crash ${report.id}`);
      if (report.severity === "critical") {
        await this.escalateCriticalFailure(report);
      }
    }
    await this.saveCrashReport(report);
  }

  /**
   * Restore git, database (if backed up) and swarm state from a checkpoint.
   *
   * @param {string} checkpointId - Id of a previously created checkpoint.
   * @returns {Promise<boolean>} True on success, false if the checkpoint is
   *   missing or restoration failed (never throws).
   */
  async restoreFromCheckpoint(checkpointId) {
    try {
      const checkpoint = this.checkpoints.get(checkpointId);
      if (!checkpoint) {
        logger.error(`Checkpoint ${checkpointId} not found`);
        return false;
      }
      logger.info(`Restoring from checkpoint ${checkpointId}`);
      await this.restoreGitState(checkpoint.gitState);
      if (checkpoint.databaseBackup) {
        await this.restoreDatabase(checkpoint.databaseBackup);
      }
      await this.restoreSwarmState(checkpoint);
      logger.info(`Successfully restored from checkpoint ${checkpointId}`);
      return true;
    } catch (error) {
      logger.error(`Failed to restore from checkpoint ${checkpointId}:`, error);
      return false;
    }
  }

  /**
   * Summarize the last hour of activity: recent checkpoints, error
   * frequency by type, suggested recovery actions, and a coarse health
   * rating ("good" | "degraded" | "critical").
   *
   * @returns {{recentCheckpoints: object[], frequentErrors: object[],
   *   recoveryActions: string[], systemHealth: string}}
   */
  getRecoveryRecommendations() {
    const recent = Date.now() - 36e5; // 1 hour window
    const recentCheckpoints = Array.from(this.checkpoints.values())
      .filter((cp) => cp.timestamp > recent)
      .sort((a, b) => b.timestamp - a.timestamp)
      .slice(0, 5);
    const errorCounts = /* @__PURE__ */ new Map();
    const errorTimes = /* @__PURE__ */ new Map();
    for (const report of this.crashReports.filter(
      (r) => r.timestamp > recent
    )) {
      errorCounts.set(
        report.errorType,
        (errorCounts.get(report.errorType) || 0) + 1
      );
      errorTimes.set(
        report.errorType,
        Math.max(errorTimes.get(report.errorType) || 0, report.timestamp)
      );
    }
    const frequentErrors = Array.from(errorCounts.entries())
      .map(([type, count]) => ({
        type,
        count,
        lastOccurrence: errorTimes.get(type) || 0
      }))
      .sort((a, b) => b.count - a.count);
    const criticalErrors = this.crashReports.filter(
      (r) => r.severity === "critical" && r.timestamp > recent && !r.resolved
    );
    const systemHealth =
      criticalErrors.length > 0
        ? "critical"
        : frequentErrors.length > 3
          ? "degraded"
          : "good";
    return {
      recentCheckpoints,
      frequentErrors,
      recoveryActions: this.generateRecoveryActions(frequentErrors),
      systemHealth
    };
  }

  /**
   * Auto-recovery from common failures: prefer restoring the most recent
   * checkpoint (last 30 min); otherwise force-clean the coordinator and
   * clear problematic state.
   *
   * @param {string} swarmId - Swarm to recover.
   * @returns {Promise<boolean>} True if recovery succeeded.
   */
  async attemptAutoRecovery(swarmId) {
    logger.info(`Attempting auto-recovery for swarm ${swarmId}`);
    try {
      const recentCheckpoint = this.findRecentCheckpoint(swarmId);
      if (recentCheckpoint) {
        logger.info(`Found recent checkpoint: ${recentCheckpoint.id}`);
        return await this.restoreFromCheckpoint(recentCheckpoint.id);
      }
      await this.swarmCoordinator.forceCleanup();
      await this.clearProblematicState();
      logger.info("Restarting swarm with minimal configuration");
      return true;
    } catch (error) {
      logger.error("Auto-recovery failed:", error);
      return false;
    }
  }

  /**
   * Run the first matching recovery strategy with retries and exponential
   * backoff. Returns true as soon as one attempt succeeds.
   */
  async attemptRecovery(report) {
    for (const strategy of this.recoveryStrategies) {
      if (strategy.condition(report.error, report.context)) {
        logger.info(`Applying recovery strategy: ${strategy.errorType}`);
        let retries = 0;
        while (retries < strategy.maxRetries) {
          try {
            const success = await strategy.action(report, this);
            if (success) {
              report.recoveryAction = strategy.errorType;
              return true;
            }
          } catch (error) {
            logger.warn(`Recovery attempt ${retries + 1} failed:`, error);
          }
          retries++;
          if (retries < strategy.maxRetries) {
            // backoff doubles per retry (first wait is backoffMs * 2)
            await this.sleep(strategy.backoffMs * Math.pow(2, retries));
          }
        }
      }
    }
    return false;
  }

  /**
   * Classify an error by keyword-matching its message (and context flags).
   *
   * FIX: previously fell through to "database_failure" for unrecognized
   * errors, mislabelling them in reports and recommendations; now returns
   * "unknown". (Recovery matching uses strategy `condition` predicates,
   * not this label, so behavior of attemptRecovery is unaffected.)
   */
  classifyError(error, context) {
    const message = error.message.toLowerCase();
    if (message.includes("database") || message.includes("sqlite")) {
      return "database_failure";
    } else if (message.includes("git") || message.includes("branch")) {
      return "git_conflict";
    } else if (message.includes("timeout") || context.timeout) {
      return "agent_timeout";
    } else if (message.includes("memory") || message.includes("heap")) {
      return "memory_overflow";
    } else if (message.includes("network") || message.includes("connect")) {
      return "network_error";
    }
    return "unknown";
  }

  /**
   * Coarse severity rating from context type and message keywords.
   * @returns {"critical"|"high"|"medium"|"low"}
   */
  assessSeverity(error, context) {
    if (context.type === "uncaught_exception") return "critical";
    if (error.message.includes("unhandled")) return "high";
    if (error.message.includes("database")) return "medium";
    return "low";
  }

  /**
   * Register the built-in recovery strategies, in priority order:
   * database failure, git conflict, agent timeout, memory overflow.
   */
  setupRecoveryStrategies() {
    this.recoveryStrategies = [
      {
        errorType: "database_failure",
        condition: (error) =>
          error.message.includes("database") ||
          error.message.includes("sqlite"),
        action: async (report, recovery) => {
          logger.info("Attempting database recovery");
          try {
            await recovery.clearProblematicState();
            return true;
          } catch {
            return false;
          }
        },
        maxRetries: 3,
        backoffMs: 1e3
      },
      {
        errorType: "git_conflict",
        condition: (error) =>
          error.message.includes("git") || error.message.includes("branch"),
        action: async (report, recovery) => {
          logger.info("Attempting git conflict resolution");
          try {
            const { execSync } = await import("child_process");
            // destructive: drops local changes to get back to a clean main
            execSync("git checkout main", { stdio: "ignore" });
            execSync("git reset --hard HEAD", { stdio: "ignore" });
            return true;
          } catch {
            return false;
          }
        },
        maxRetries: 2,
        backoffMs: 500
      },
      {
        errorType: "agent_timeout",
        condition: (error, context) =>
          error.message.includes("timeout") || context.timeout,
        action: async (report, recovery) => {
          logger.info("Attempting agent timeout recovery");
          await recovery.swarmCoordinator.forceCleanup();
          return true;
        },
        maxRetries: 1,
        backoffMs: 2e3
      },
      {
        errorType: "memory_overflow",
        condition: (error) =>
          error.message.includes("memory") || error.message.includes("heap"),
        action: async (report, recovery) => {
          logger.info("Attempting memory recovery");
          // global.gc only exists when node runs with --expose-gc
          if (global.gc) global.gc();
          await recovery.cleanupOldCheckpoints(5);
          return true;
        },
        maxRetries: 1,
        backoffMs: 5e3
      }
    ];
  }

  /**
   * Snapshot the current git branch, uncommitted changes, and local
   * branches. Never throws; returns a placeholder state on failure.
   */
  async captureGitState() {
    try {
      const { execSync } = await import("child_process");
      const currentBranch = execSync("git branch --show-current", {
        encoding: "utf8"
      }).trim();
      const statusOutput = execSync("git status --porcelain", {
        encoding: "utf8"
      });
      const uncommittedChanges = statusOutput.trim().split("\n").filter(Boolean);
      const branchesOutput = execSync("git branch", { encoding: "utf8" });
      const activeBranches = branchesOutput
        .split("\n")
        .map((line) => line.trim().replace(/^\*?\s*/, ""))
        .filter(Boolean);
      return { currentBranch, uncommittedChanges, activeBranches };
    } catch (error) {
      logger.warn("Failed to capture git state:", error);
      return {
        currentBranch: "unknown",
        uncommittedChanges: [],
        activeBranches: []
      };
    }
  }

  /**
   * Check out the branch recorded in a captured git state. Best-effort:
   * failures are logged, not thrown.
   *
   * FIX: uses execFileSync with an argument vector instead of interpolating
   * the branch name into a shell string — the (externally sourced) branch
   * name can no longer be interpreted by a shell.
   */
  async restoreGitState(gitState) {
    try {
      const { execFileSync } = await import("child_process");
      execFileSync("git", ["checkout", gitState.currentBranch], {
        stdio: "ignore"
      });
      logger.info(`Restored git branch: ${gitState.currentBranch}`);
    } catch (error) {
      logger.warn("Failed to restore git state:", error);
    }
  }

  // Stub: database restoration is not implemented yet (logs only).
  async restoreDatabase(backupPath) {
    logger.info(`Restoring database from ${backupPath}`);
  }

  // Stub: swarm-state restoration is not implemented yet (logs only).
  async restoreSwarmState(checkpoint) {
    logger.info(`Restoring swarm state from checkpoint ${checkpoint.id}`);
  }

  /**
   * Most recent checkpoint for a swarm taken within the last 30 minutes,
   * or null if none exists.
   */
  findRecentCheckpoint(swarmId) {
    const recent = Date.now() - 18e5; // 30 minutes
    return (
      Array.from(this.checkpoints.values())
        .filter((cp) => cp.swarmId === swarmId && cp.timestamp > recent)
        .sort((a, b) => b.timestamp - a.timestamp)[0] || null
    );
  }

  // Best-effort cleanup of state that may be causing failures.
  async clearProblematicState() {
    try {
      await this.cleanupTempFiles();
      logger.info("Cleared problematic state");
    } catch (error) {
      logger.error("Failed to clear problematic state:", error);
    }
  }

  // Stub: temp-file cleanup is not implemented yet.
  async cleanupTempFiles() {}

  /**
   * Delete all but the `keepCount` newest checkpoints from memory and disk.
   * Individual deletion failures are logged and skipped.
   */
  async cleanupOldCheckpoints(keepCount) {
    const sorted = Array.from(this.checkpoints.values()).sort(
      (a, b) => b.timestamp - a.timestamp
    );
    const toDelete = sorted.slice(keepCount);
    for (const checkpoint of toDelete) {
      try {
        const checkpointPath = path.join(
          this.recoveryDir,
          `checkpoint-${checkpoint.id}.json`
        );
        await fs.unlink(checkpointPath);
        this.checkpoints.delete(checkpoint.id);
      } catch (error) {
        logger.warn(`Failed to delete checkpoint ${checkpoint.id}:`, error);
      }
    }
    logger.info(`Cleaned up ${toDelete.length} old checkpoints`);
  }

  /**
   * Last-resort handling for an unrecovered critical failure: take an
   * emergency checkpoint (best-effort) then force-clean the coordinator.
   */
  async escalateCriticalFailure(report) {
    logger.error(`CRITICAL FAILURE [${report.id}]: ${report.error.message}`);
    try {
      const swarmState = this.swarmCoordinator.swarmState;
      if (swarmState?.id) {
        await this.createCheckpoint(swarmState.id, "critical_failure");
      }
    } catch {
      logger.error("Failed to create emergency checkpoint");
    }
    await this.swarmCoordinator.forceCleanup();
  }

  /**
   * Map error types that occurred more than 3 times to human-readable
   * remediation suggestions.
   *
   * @param {{type: string, count: number}[]} frequentErrors
   * @returns {string[]}
   */
  generateRecoveryActions(frequentErrors) {
    const actions = [];
    for (const { type, count } of frequentErrors) {
      if (count > 3) {
        switch (type) {
          case "database_failure":
            actions.push("Consider upgrading database configuration");
            break;
          case "git_conflict":
            actions.push("Review git workflow and branch strategy");
            break;
          case "agent_timeout":
            actions.push(
              "Increase agent timeout limits or reduce task complexity"
            );
            break;
          case "memory_overflow":
            actions.push("Monitor memory usage and consider increasing limits");
            break;
        }
      }
    }
    return actions;
  }

  /**
   * Checkpoint active swarms every 5 minutes.
   *
   * FIX: createCheckpoint rethrows on failure; without the try/catch an
   * error here would become an unhandled rejection and re-enter the crash
   * handler from inside the recovery system itself.
   */
  startPeriodicCheckpoints() {
    this.checkpointInterval = setInterval(async () => {
      try {
        const swarmState = this.swarmCoordinator.swarmState;
        if (swarmState?.id && swarmState.status === "active") {
          await this.createCheckpoint(swarmState.id, "periodic");
        }
      } catch (error) {
        logger.error("Periodic checkpoint failed:", error);
      }
    }, 3e5);
  }

  /**
   * Stop periodic checkpointing. Backward-compatible addition — without it
   * the interval timer is never cleared and keeps the event loop alive.
   */
  shutdown() {
    if (this.checkpointInterval) {
      clearInterval(this.checkpointInterval);
      this.checkpointInterval = void 0;
    }
  }

  // Create the recovery directory (and parents) if missing; log on failure.
  async ensureRecoveryDirectory() {
    try {
      await fs.mkdir(this.recoveryDir, { recursive: true });
    } catch (error) {
      logger.error("Failed to create recovery directory:", error);
    }
  }

  /**
   * Load previously persisted checkpoint JSON files from the recovery
   * directory into memory. Unreadable files are skipped with a warning.
   */
  async loadExistingCheckpoints() {
    try {
      const files = await fs.readdir(this.recoveryDir);
      for (const file of files) {
        if (file.startsWith("checkpoint-") && file.endsWith(".json")) {
          try {
            const content = await fs.readFile(
              path.join(this.recoveryDir, file),
              "utf8"
            );
            const checkpoint = JSON.parse(content);
            this.checkpoints.set(checkpoint.id, checkpoint);
          } catch (error) {
            logger.warn(`Failed to load checkpoint ${file}:`, error);
          }
        }
      }
      logger.info(`Loaded ${this.checkpoints.size} existing checkpoints`);
    } catch (error) {
      logger.warn("Failed to load existing checkpoints:", error);
    }
  }

  // Persist a crash report as JSON; failures are logged, not thrown.
  async saveCrashReport(report) {
    try {
      const reportPath = path.join(this.recoveryDir, `crash-${report.id}.json`);
      await fs.writeFile(reportPath, JSON.stringify(report, null, 2));
    } catch (error) {
      logger.error("Failed to save crash report:", error);
    }
  }

  // Promise-based delay used for retry backoff.
  sleep(ms) {
    return new Promise((resolve) => setTimeout(resolve, ms));
  }

  /**
   * Timestamp + random suffix id. FIX: deprecated String.prototype.substr
   * replaced with the equivalent slice(2, 11).
   */
  generateId() {
    return `recovery_${Date.now()}_${Math.random().toString(36).slice(2, 11)}`;
  }
}
var crash_recovery_default = CrashRecoverySystem;
export { CrashRecoverySystem, crash_recovery_default as default };