@stackmemoryai/stackmemory
Version:
Project-scoped memory for AI coding tools. Durable context across sessions with MCP integration, frames, smart retrieval, Claude Code skills, and automatic hooks.
463 lines (462 loc) • 15.6 kB
JavaScript
// CommonJS-compat shim for this ESM bundle: reconstruct __filename/__dirname
// from import.meta.url (Node does not provide them in ES modules). Not
// referenced in the visible code — presumably used by bundled code elsewhere
// in the original build; TODO confirm before removing.
import { fileURLToPath as __fileURLToPath } from 'url';
import { dirname as __pathDirname } from 'path';
const __filename = __fileURLToPath(import.meta.url);
const __dirname = __pathDirname(__filename);
import * as fs from "fs/promises";
import * as path from "path";
import { logger } from "../../../core/monitoring/logger.js";
class CrashRecoverySystem {
  // Checkpoint id -> checkpoint record; mirrored on disk as
  // `${recoveryDir}/checkpoint-<id>.json`.
  checkpoints = /* @__PURE__ */ new Map();
  // Chronological crash reports for this process (each also persisted to disk).
  crashReports = [];
  // Ordered strategies: { errorType, condition, action, maxRetries, backoffMs }.
  recoveryStrategies = [];
  // Coordinator exposing `swarmState`, `activeAgents` and `forceCleanup()`
  // (shape assumed from usage in this file — TODO confirm against its class).
  swarmCoordinator;
  // Timer handle from startPeriodicCheckpoints(); cleared by shutdown().
  checkpointInterval;
  // Directory holding checkpoint and crash-report JSON files.
  recoveryDir;
  /**
   * @param {object} swarmCoordinator - swarm coordinator instance.
   * @param {string} [recoveryDir=".swarm/recovery"] - persistence directory.
   */
  constructor(swarmCoordinator, recoveryDir = ".swarm/recovery") {
    this.swarmCoordinator = swarmCoordinator;
    this.recoveryDir = recoveryDir;
    this.setupRecoveryStrategies();
  }
  /**
   * Initialize crash recovery system: prepare the recovery directory, reload
   * persisted checkpoints, start periodic checkpointing, and hook the
   * process-wide error events.
   */
  async initialize() {
    await this.ensureRecoveryDirectory();
    await this.loadExistingCheckpoints();
    this.startPeriodicCheckpoints();
    process.on("unhandledRejection", (reason, promise) => {
      // handleCrash is async; attach a catch so a failure inside the crash
      // handler cannot itself surface as another unhandled rejection.
      this.handleCrash(new Error(`Unhandled Rejection: ${reason}`), {
        type: "unhandled_rejection",
        promise: promise.toString()
      }).catch((err) => logger.error("Crash handler failed:", err));
    });
    process.on("uncaughtException", (error) => {
      this.handleCrash(error, { type: "uncaught_exception" }).catch((err) =>
        logger.error("Crash handler failed:", err)
      );
    });
    logger.info("Crash recovery system initialized");
  }
  /**
   * Stop the periodic checkpoint timer. Safe to call repeatedly. Without this,
   * the interval keeps the Node event loop alive after the swarm finishes.
   */
  shutdown() {
    if (this.checkpointInterval) {
      clearInterval(this.checkpointInterval);
      this.checkpointInterval = void 0;
    }
  }
  /**
   * Create a recovery checkpoint for a swarm and persist it to disk.
   * @param {string} swarmId
   * @param {string} [reason="periodic"] - audit label for the log line.
   * @returns {Promise<string>} the new checkpoint id.
   * @throws when the checkpoint cannot be serialized or written.
   */
  async createCheckpoint(swarmId, reason = "periodic") {
    try {
      const swarmState = this.swarmCoordinator.swarmState;
      const agents = Array.from(
        this.swarmCoordinator.activeAgents.values()
      );
      const checkpoint = {
        id: this.generateId(),
        swarmId,
        timestamp: Date.now(),
        // Shallow copies: nested structures remain shared with live
        // coordinator state until serialized below.
        swarmState: { ...swarmState },
        agents: agents.map((agent) => ({ ...agent })),
        tasks: swarmState.tasks || [],
        errorLog: this.crashReports.slice(-10),
        // Last 10 errors
        gitState: await this.captureGitState()
      };
      const checkpointPath = path.join(
        this.recoveryDir,
        `checkpoint-${checkpoint.id}.json`
      );
      await fs.writeFile(checkpointPath, JSON.stringify(checkpoint, null, 2));
      this.checkpoints.set(checkpoint.id, checkpoint);
      logger.info(
        `Created checkpoint ${checkpoint.id} for swarm ${swarmId} (${reason})`
      );
      return checkpoint.id;
    } catch (error) {
      logger.error("Failed to create checkpoint:", error);
      throw error;
    }
  }
  /**
   * Record a crash, attempt automatic recovery, and persist the report.
   * Escalates unrecovered critical failures.
   * @param {Error} error
   * @param {object} [context] - e.g. { type, agentId, timeout, promise }.
   */
  async handleCrash(error, context = {}) {
    const report = {
      id: this.generateId(),
      timestamp: Date.now(),
      agentId: context.agentId,
      errorType: this.classifyError(error, context),
      error,
      context,
      recoveryAction: "",
      severity: this.assessSeverity(error, context),
      resolved: false
    };
    this.crashReports.push(report);
    logger.error(`Crash detected [${report.id}]:`, error);
    const recovered = await this.attemptRecovery(report);
    if (recovered) {
      report.resolved = true;
      report.recoveryAction = "auto_recovered";
      logger.info(`Successfully recovered from crash ${report.id}`);
    } else {
      logger.error(`Failed to recover from crash ${report.id}`);
      if (report.severity === "critical") {
        await this.escalateCriticalFailure(report);
      }
    }
    await this.saveCrashReport(report);
  }
  /**
   * Restore git, database and swarm state from a known checkpoint.
   * @param {string} checkpointId
   * @returns {Promise<boolean>} true on success, false if the checkpoint is
   *   missing or any restore step throws (never rethrows).
   */
  async restoreFromCheckpoint(checkpointId) {
    try {
      const checkpoint = this.checkpoints.get(checkpointId);
      if (!checkpoint) {
        logger.error(`Checkpoint ${checkpointId} not found`);
        return false;
      }
      logger.info(`Restoring from checkpoint ${checkpointId}`);
      await this.restoreGitState(checkpoint.gitState);
      if (checkpoint.databaseBackup) {
        await this.restoreDatabase(checkpoint.databaseBackup);
      }
      await this.restoreSwarmState(checkpoint);
      logger.info(`Successfully restored from checkpoint ${checkpointId}`);
      return true;
    } catch (error) {
      logger.error(
        `Failed to restore from checkpoint ${checkpointId}:`,
        error
      );
      return false;
    }
  }
  /**
   * Summarize the last hour of activity: recent checkpoints, error frequency,
   * suggested actions and an overall health verdict.
   * @returns {{recentCheckpoints: object[], frequentErrors: object[],
   *   recoveryActions: string[], systemHealth: string}}
   */
  getRecoveryRecommendations() {
    const recent = Date.now() - 36e5;
    // 36e5 ms = 1 hour lookback window.
    const recentCheckpoints = Array.from(this.checkpoints.values()).filter((cp) => cp.timestamp > recent).sort((a, b) => b.timestamp - a.timestamp).slice(0, 5);
    const errorCounts = /* @__PURE__ */ new Map();
    const errorTimes = /* @__PURE__ */ new Map();
    for (const report of this.crashReports.filter(
      (r) => r.timestamp > recent
    )) {
      errorCounts.set(
        report.errorType,
        (errorCounts.get(report.errorType) || 0) + 1
      );
      errorTimes.set(
        report.errorType,
        Math.max(errorTimes.get(report.errorType) || 0, report.timestamp)
      );
    }
    const frequentErrors = Array.from(errorCounts.entries()).map(([type, count]) => ({
      type,
      count,
      lastOccurrence: errorTimes.get(type) || 0
    })).sort((a, b) => b.count - a.count);
    const criticalErrors = this.crashReports.filter(
      (r) => r.severity === "critical" && r.timestamp > recent && !r.resolved
    );
    const systemHealth = criticalErrors.length > 0 ? "critical" : frequentErrors.length > 3 ? "degraded" : "good";
    return {
      recentCheckpoints,
      frequentErrors,
      recoveryActions: this.generateRecoveryActions(frequentErrors),
      systemHealth
    };
  }
  /**
   * Auto-recovery from common failures: prefer restoring the most recent
   * checkpoint; otherwise clean up and signal a minimal restart.
   * @param {string} swarmId
   * @returns {Promise<boolean>} whether recovery is believed to have succeeded.
   */
  async attemptAutoRecovery(swarmId) {
    logger.info(`Attempting auto-recovery for swarm ${swarmId}`);
    try {
      const recentCheckpoint = this.findRecentCheckpoint(swarmId);
      if (recentCheckpoint) {
        logger.info(`Found recent checkpoint: ${recentCheckpoint.id}`);
        return await this.restoreFromCheckpoint(recentCheckpoint.id);
      }
      await this.swarmCoordinator.forceCleanup();
      await this.clearProblematicState();
      logger.info("Restarting swarm with minimal configuration");
      return true;
    } catch (error) {
      logger.error("Auto-recovery failed:", error);
      return false;
    }
  }
  /**
   * Run the first strategy whose condition matches, retrying with exponential
   * backoff up to the strategy's maxRetries.
   * @returns {Promise<boolean>} true if any attempt reported success.
   */
  async attemptRecovery(report) {
    for (const strategy of this.recoveryStrategies) {
      if (strategy.condition(report.error, report.context)) {
        logger.info(`Applying recovery strategy: ${strategy.errorType}`);
        let retries = 0;
        while (retries < strategy.maxRetries) {
          try {
            const success = await strategy.action(report, this);
            if (success) {
              report.recoveryAction = strategy.errorType;
              return true;
            }
          } catch (error) {
            logger.warn(
              `Recovery attempt ${retries + 1} failed:`,
              error
            );
          }
          retries++;
          if (retries < strategy.maxRetries) {
            await this.sleep(strategy.backoffMs * Math.pow(2, retries));
          }
        }
      }
    }
    return false;
  }
  /**
   * Map an error to a coarse category used for reporting and recommendations.
   * Matching is substring-based on the lowercased message.
   */
  classifyError(error, context) {
    const message = error.message.toLowerCase();
    if (message.includes("database") || message.includes("sqlite")) {
      return "database_failure";
    } else if (message.includes("git") || message.includes("branch")) {
      return "git_conflict";
    } else if (message.includes("timeout") || context.timeout) {
      return "agent_timeout";
    } else if (message.includes("memory") || message.includes("heap")) {
      return "memory_overflow";
    }
    // Fix: previously every unmatched error fell through to
    // "database_failure", skewing the recommendation report toward database
    // advice for unrelated crashes.
    return "unknown";
  }
  /** Coarse severity heuristic based on crash source and message keywords. */
  assessSeverity(error, context) {
    if (context.type === "uncaught_exception") return "critical";
    if (error.message.includes("unhandled")) return "high";
    if (error.message.includes("database")) return "medium";
    return "low";
  }
  /**
   * Register the built-in recovery strategies. Strategies are matched by
   * their `condition` predicate (not by errorType) in array order.
   */
  setupRecoveryStrategies() {
    this.recoveryStrategies = [
      {
        errorType: "database_failure",
        condition: (error) => error.message.includes("database") || error.message.includes("sqlite"),
        action: async (report, recovery) => {
          logger.info("Attempting database recovery");
          try {
            await recovery.clearProblematicState();
            return true;
          } catch {
            return false;
          }
        },
        maxRetries: 3,
        backoffMs: 1e3
      },
      {
        errorType: "git_conflict",
        condition: (error) => error.message.includes("git") || error.message.includes("branch"),
        action: async (report, recovery) => {
          logger.info("Attempting git conflict resolution");
          try {
            // NOTE(review): destructive — discards uncommitted work on the
            // current branch; presumably acceptable for swarm worktrees.
            const { execSync } = await import("child_process");
            execSync("git checkout main", { stdio: "ignore" });
            execSync("git reset --hard HEAD", { stdio: "ignore" });
            return true;
          } catch {
            return false;
          }
        },
        maxRetries: 2,
        backoffMs: 500
      },
      {
        errorType: "agent_timeout",
        condition: (error, context) => error.message.includes("timeout") || context.timeout,
        action: async (report, recovery) => {
          logger.info("Attempting agent timeout recovery");
          await recovery.swarmCoordinator.forceCleanup();
          return true;
        },
        maxRetries: 1,
        backoffMs: 2e3
      },
      {
        errorType: "memory_overflow",
        condition: (error) => error.message.includes("memory") || error.message.includes("heap"),
        action: async (report, recovery) => {
          logger.info("Attempting memory recovery");
          // global.gc only exists when Node runs with --expose-gc.
          if (global.gc) global.gc();
          await recovery.cleanupOldCheckpoints(5);
          return true;
        },
        maxRetries: 1,
        backoffMs: 5e3
      }
    ];
  }
  /**
   * Snapshot the current git branch, dirty files and local branches.
   * Falls back to a placeholder state if git is unavailable.
   */
  async captureGitState() {
    try {
      const { execSync } = await import("child_process");
      const currentBranch = execSync("git branch --show-current", {
        encoding: "utf8"
      }).trim();
      const statusOutput = execSync("git status --porcelain", {
        encoding: "utf8"
      });
      const uncommittedChanges = statusOutput.trim().split("\n").filter(Boolean);
      const branchesOutput = execSync("git branch", { encoding: "utf8" });
      const activeBranches = branchesOutput.split("\n").map((line) => line.trim().replace(/^\*?\s*/, "")).filter(Boolean);
      return {
        currentBranch,
        uncommittedChanges,
        activeBranches
      };
    } catch (error) {
      logger.warn("Failed to capture git state:", error);
      return {
        currentBranch: "unknown",
        uncommittedChanges: [],
        activeBranches: []
      };
    }
  }
  /** Check out the branch recorded in a checkpoint's git state (best effort). */
  async restoreGitState(gitState) {
    try {
      // Guard against shell injection: the branch name is interpolated into
      // an execSync command line and checkpoint JSON on disk is not trusted.
      if (!/^[\w./-]+$/.test(gitState.currentBranch)) {
        logger.warn(`Refusing to checkout suspicious branch name: ${gitState.currentBranch}`);
        return;
      }
      const { execSync } = await import("child_process");
      execSync(`git checkout ${gitState.currentBranch}`, { stdio: "ignore" });
      logger.info(`Restored git branch: ${gitState.currentBranch}`);
    } catch (error) {
      logger.warn("Failed to restore git state:", error);
    }
  }
  // TODO: placeholder — logs only; no database restore is implemented yet.
  async restoreDatabase(backupPath) {
    logger.info(`Restoring database from ${backupPath}`);
  }
  // TODO: placeholder — logs only; no swarm-state restore is implemented yet.
  async restoreSwarmState(checkpoint) {
    logger.info(`Restoring swarm state from checkpoint ${checkpoint.id}`);
  }
  /**
   * Most recent checkpoint for a swarm created within the last 30 minutes
   * (18e5 ms), or null.
   */
  findRecentCheckpoint(swarmId) {
    const recent = Date.now() - 18e5;
    return Array.from(this.checkpoints.values()).filter((cp) => cp.swarmId === swarmId && cp.timestamp > recent).sort((a, b) => b.timestamp - a.timestamp)[0] || null;
  }
  /** Best-effort cleanup of transient state; never throws. */
  async clearProblematicState() {
    try {
      await this.cleanupTempFiles();
      logger.info("Cleared problematic state");
    } catch (error) {
      logger.error("Failed to clear problematic state:", error);
    }
  }
  // TODO: placeholder — temp-file cleanup is not implemented yet.
  async cleanupTempFiles() {
  }
  /**
   * Delete all but the newest `keepCount` checkpoints, both on disk and in
   * memory. Per-file failures are logged and skipped.
   * @param {number} keepCount
   */
  async cleanupOldCheckpoints(keepCount) {
    const sorted = Array.from(this.checkpoints.values()).sort(
      (a, b) => b.timestamp - a.timestamp
    );
    const toDelete = sorted.slice(keepCount);
    for (const checkpoint of toDelete) {
      try {
        const checkpointPath = path.join(
          this.recoveryDir,
          `checkpoint-${checkpoint.id}.json`
        );
        await fs.unlink(checkpointPath);
        this.checkpoints.delete(checkpoint.id);
      } catch (error) {
        logger.warn(
          `Failed to delete checkpoint ${checkpoint.id}:`,
          error
        );
      }
    }
    logger.info(`Cleaned up ${toDelete.length} old checkpoints`);
  }
  /**
   * Last-resort handling for unrecovered critical crashes: take an emergency
   * checkpoint if possible, then force coordinator cleanup.
   */
  async escalateCriticalFailure(report) {
    logger.error(`CRITICAL FAILURE [${report.id}]: ${report.error.message}`);
    try {
      const swarmState = this.swarmCoordinator.swarmState;
      if (swarmState?.id) {
        await this.createCheckpoint(swarmState.id, "critical_failure");
      }
    } catch {
      logger.error("Failed to create emergency checkpoint");
    }
    await this.swarmCoordinator.forceCleanup();
  }
  /**
   * Turn frequency data into human-readable advice; only error types seen
   * more than 3 times in the window produce an action.
   */
  generateRecoveryActions(frequentErrors) {
    const actions = [];
    for (const { type, count } of frequentErrors) {
      if (count > 3) {
        switch (type) {
          case "database_failure":
            actions.push("Consider upgrading database configuration");
            break;
          case "git_conflict":
            actions.push("Review git workflow and branch strategy");
            break;
          case "agent_timeout":
            actions.push(
              "Increase agent timeout limits or reduce task complexity"
            );
            break;
          case "memory_overflow":
            actions.push("Monitor memory usage and consider increasing limits");
            break;
        }
      }
    }
    return actions;
  }
  /**
   * Checkpoint every 5 minutes (3e5 ms) while the swarm is active.
   * Stop via shutdown().
   */
  startPeriodicCheckpoints() {
    this.checkpointInterval = setInterval(async () => {
      try {
        const swarmState = this.swarmCoordinator.swarmState;
        if (swarmState?.id && swarmState.status === "active") {
          await this.createCheckpoint(swarmState.id, "periodic");
        }
      } catch (error) {
        // createCheckpoint rethrows; without this guard a periodic failure
        // escaped the timer callback as an unhandled rejection, which would
        // in turn retrigger handleCrash.
        logger.error("Periodic checkpoint failed:", error);
      }
    }, 3e5);
  }
  /** Create the recovery directory if missing; logs instead of throwing. */
  async ensureRecoveryDirectory() {
    try {
      await fs.mkdir(this.recoveryDir, { recursive: true });
    } catch (error) {
      logger.error("Failed to create recovery directory:", error);
    }
  }
  /**
   * Reload checkpoint-*.json files from the recovery directory into memory.
   * Individual unreadable files are skipped with a warning.
   */
  async loadExistingCheckpoints() {
    try {
      const files = await fs.readdir(this.recoveryDir);
      for (const file of files) {
        if (file.startsWith("checkpoint-") && file.endsWith(".json")) {
          try {
            const content = await fs.readFile(
              path.join(this.recoveryDir, file),
              "utf8"
            );
            const checkpoint = JSON.parse(content);
            this.checkpoints.set(checkpoint.id, checkpoint);
          } catch (error) {
            logger.warn(`Failed to load checkpoint ${file}:`, error);
          }
        }
      }
      logger.info(`Loaded ${this.checkpoints.size} existing checkpoints`);
    } catch (error) {
      logger.warn("Failed to load existing checkpoints:", error);
    }
  }
  /**
   * Persist a crash report as crash-<id>.json. Note: JSON.stringify drops
   * most Error fields, so the serialized `error` is lossy.
   */
  async saveCrashReport(report) {
    try {
      const reportPath = path.join(this.recoveryDir, `crash-${report.id}.json`);
      await fs.writeFile(reportPath, JSON.stringify(report, null, 2));
    } catch (error) {
      logger.error("Failed to save crash report:", error);
    }
  }
  /** Promise-based delay helper used for retry backoff. */
  sleep(ms) {
    return new Promise((resolve) => setTimeout(resolve, ms));
  }
  /** Unique-ish id: timestamp plus 9 random base36 chars (not crypto-grade). */
  generateId() {
    // slice(2, 11) replaces the deprecated String.prototype.substr(2, 9).
    return `recovery_${Date.now()}_${Math.random().toString(36).slice(2, 11)}`;
  }
}
var crash_recovery_default = CrashRecoverySystem;
export {
CrashRecoverySystem,
crash_recovery_default as default
};
//# sourceMappingURL=crash-recovery.js.map