UNPKG

aiwg

Version:

Cognitive architecture for AI-augmented software development with structured memory, ensemble validation, and closed-loop correction. FAIR-aligned artifacts, 84% cost reduction via human-in-the-loop, standards adopted by 100+ organizations.

500 lines (420 loc) 13.6 kB
/**
 * Recovery Engine for External Ralph Loop
 *
 * Handles crash detection, state recovery, and session resumption.
 * Enhanced with per-loop crash detection and recovery support.
 *
 * @implements @.aiwg/requirements/design-ralph-external.md
 * @implements @.aiwg/requirements/use-cases/UC-273-multi-loop-supervisor.md
 */

import { existsSync, readFileSync, readdirSync, writeFileSync } from 'fs';
import { join } from 'path';
import { gunzipSync } from 'zlib';
import { StateManager } from './state-manager.mjs';

/**
 * @typedef {Object} RecoveryState
 * @property {boolean} crashed - Whether a crash was detected
 * @property {number} [iteration] - Last iteration before crash
 * @property {string} [lastCheckpoint] - Last checkpoint identifier
 * @property {Object} [recoveryStrategy] - Recommended recovery strategy
 */

/**
 * @typedef {Object} RecoveryStrategy
 * @property {'resume_internal'|'continue_external'|'restart'} type
 * @property {string} action - Description of recovery action
 * @property {string} [prompt] - Suggested prompt for recovery
 */

/**
 * @typedef {Object} CheckpointInfo
 * @property {string} checkpointId - Checkpoint identifier
 * @property {string} path - Path to checkpoint file
 * @property {number} iteration - Iteration number
 * @property {number} timestamp - Checkpoint timestamp
 */

export class RecoveryEngine {
  /**
   * Create a recovery engine rooted at a project directory.
   *
   * Derives the internal Ralph state path
   * (`.aiwg/ralph/current-loop.json`) and the per-loop directory
   * (`.aiwg/ralph/loops`) from the project root.
   *
   * @param {string} projectRoot - Project root directory
   */
  constructor(projectRoot) {
    this.projectRoot = projectRoot;
    this.stateManager = new StateManager(projectRoot);
    this.internalRalphStatePath = join(projectRoot, '.aiwg', 'ralph', 'current-loop.json');
    this.loopsDir = join(projectRoot, '.aiwg', 'ralph', 'loops');
  }

  /**
   * Read internal Ralph state.
   *
   * @returns {Object|null} Parsed JSON content, or null when the file is
   *   missing, unreadable, or not valid JSON.
   */
  readInternalRalphState() {
    if (!existsSync(this.internalRalphStatePath)) {
      return null;
    }

    try {
      const content = readFileSync(this.internalRalphStatePath, 'utf8');
      return JSON.parse(content);
    } catch {
      return null;
    }
  }

  /**
   * Check if a process is still running.
   *
   * @param {number} pid - Process ID
   * @returns {boolean} True when the process exists (including when it
   *   exists but cannot be signalled); false for a missing/invalid pid or
   *   any other failure.
   */
  isProcessAlive(pid) {
    if (!pid || pid <= 0) {
      return false;
    }

    try {
      // Signal 0 checks if process exists without killing it
      process.kill(pid, 0);
      return true;
    } catch (error) {
      // ESRCH = No such process
      if (error.code === 'ESRCH') {
        return false;
      }
      // EPERM = Process exists but no permission to signal
      if (error.code === 'EPERM') {
        return true;
      }
      // Anything else (e.g. an invalid pid argument) is treated as not alive
      return false;
    }
  }

  /**
   * Detect crashed loops across all active loops.
   *
   * A loop counts as crashed when its `state.json` says `running` with a
   * recorded `currentPid` whose process is no longer alive, or when the
   * state file cannot be read or parsed.
   *
   * @returns {string[]} Array of crashed loop IDs
   */
  detectCrashedLoops() {
    const crashedLoops = [];

    if (!existsSync(this.loopsDir)) {
      return crashedLoops;
    }

    const loopDirs = readdirSync(this.loopsDir, { withFileTypes: true })
      .filter((dirent) => dirent.isDirectory())
      .map((dirent) => dirent.name);

    for (const loopId of loopDirs) {
      const stateFile = join(this.loopsDir, loopId, 'state.json');
      if (!existsSync(stateFile)) {
        continue;
      }

      try {
        const state = JSON.parse(readFileSync(stateFile, 'utf8'));

        // Check if loop is marked as running but process is dead
        if (
          state.status === 'running' &&
          state.currentPid &&
          !this.isProcessAlive(state.currentPid)
        ) {
          crashedLoops.push(loopId);
        }
      } catch {
        // Corrupted state file indicates crash
        crashedLoops.push(loopId);
      }
    }

    return crashedLoops;
  }

  /**
   * Detect crash state for a specific loop.
   *
   * With a loop ID the state is read from that loop's `state.json`; without
   * one it is loaded through the state manager. A missing, unreadable, or
   * empty state is reported as "not crashed".
   *
   * @param {string} [loopId] - Loop ID (uses default if not provided)
   * @returns {RecoveryState}
   */
  detectCrash(loopId = null) {
    let state;

    if (loopId) {
      const stateFile = join(this.loopsDir, loopId, 'state.json');
      if (!existsSync(stateFile)) {
        return { crashed: false };
      }

      try {
        state = JSON.parse(readFileSync(stateFile, 'utf8'));
      } catch {
        return { crashed: false };
      }
    } else {
      state = this.stateManager.load();
    }

    // A state file containing JSON `null` (or no default state) has nothing
    // to inspect; guard here so the property reads below cannot throw.
    if (!state || state.status !== 'running') {
      return { crashed: false };
    }

    // Status is 'running': if the recorded process is dead, it crashed
    if (this.isProcessAlive(state.currentPid)) {
      return { crashed: false };
    }

    return {
      crashed: true,
      iteration: state.currentIteration,
      lastCheckpoint: `iteration-${state.currentIteration}`,
      recoveryStrategy: this.determineRecoveryStrategy(state),
    };
  }

  /**
   * Recover a specific loop.
   *
   * When a crash is detected, marks the loop's state as `recovering`,
   * increments `recoveryAttempts`, stamps `lastRecoveryAt`, and writes the
   * state back to disk.
   *
   * @param {string} loopId - Loop ID to recover
   * @param {Object} [options] - Recovery options (passed through unchanged)
   * @returns {Object|null} Recovery context or null if no recovery needed
   */
  recoverLoop(loopId, options = {}) {
    const crashState = this.detectCrash(loopId);
    if (!crashState.crashed) {
      return null;
    }

    const stateFile = join(this.loopsDir, loopId, 'state.json');
    const state = JSON.parse(readFileSync(stateFile, 'utf8'));

    // Update state to indicate recovery
    state.status = 'recovering';
    state.recoveryAttempts = (state.recoveryAttempts || 0) + 1;
    state.lastRecoveryAt = new Date().toISOString();

    writeFileSync(stateFile, JSON.stringify(state, null, 2));

    return {
      loopId,
      state,
      strategy: crashState.recoveryStrategy,
      options,
    };
  }

  /**
   * Restore loop from checkpoint.
   *
   * Reads the gzip-compressed JSON checkpoint, then overwrites the loop's
   * `state.json` with its content.
   *
   * @param {string} loopId - Loop ID
   * @param {string} [checkpointId] - Specific checkpoint ID (uses latest if not provided)
   * @returns {Object} Restored state
   * @throws {Error} When the checkpoints directory, the requested
   *   checkpoint, or any usable checkpoint is missing.
   */
  restoreFromCheckpoint(loopId, checkpointId = null) {
    const checkpointsDir = join(this.loopsDir, loopId, 'checkpoints');
    if (!existsSync(checkpointsDir)) {
      throw new Error(`No checkpoints found for loop ${loopId}`);
    }

    let checkpoint;
    if (checkpointId) {
      // Restore from specific checkpoint
      const checkpointFile = join(checkpointsDir, `${checkpointId}.json.gz`);
      if (!existsSync(checkpointFile)) {
        throw new Error(`Checkpoint ${checkpointId} not found`);
      }
      checkpoint = checkpointFile;
    } else {
      // Get latest checkpoint
      const latest = this.getLatestCheckpoint(loopId);
      if (!latest) {
        throw new Error(`No checkpoints available for loop ${loopId}`);
      }
      checkpoint = latest.path;
    }

    // Read and decompress checkpoint
    const compressed = readFileSync(checkpoint);
    const decompressed = gunzipSync(compressed);
    const state = JSON.parse(decompressed.toString('utf8'));

    // Write restored state
    const stateFile = join(this.loopsDir, loopId, 'state.json');
    writeFileSync(stateFile, JSON.stringify(state, null, 2));

    return state;
  }

  /**
   * Get latest checkpoint for a loop.
   *
   * Considers `*.json.gz` files whose name contains
   * `checkpoint-<iteration>-<timestamp>` and picks the one with the largest
   * timestamp.
   *
   * @param {string} loopId - Loop ID
   * @returns {CheckpointInfo|null} Latest checkpoint, or null when the
   *   directory is missing or holds no matching file.
   */
  getLatestCheckpoint(loopId) {
    const checkpointsDir = join(this.loopsDir, loopId, 'checkpoints');
    if (!existsSync(checkpointsDir)) {
      return null;
    }

    const suffix = '.json.gz';
    const checkpoints = readdirSync(checkpointsDir)
      .filter((f) => f.endsWith(suffix))
      .map((f) => {
        const match = f.match(/checkpoint-(\d+)-(\d+)\.json\.gz/);
        if (!match) return null;
        return {
          // Strip the trailing suffix only; String#replace would remove the
          // first occurrence, which is wrong if the name contains it twice.
          checkpointId: f.slice(0, -suffix.length),
          path: join(checkpointsDir, f),
          iteration: parseInt(match[1], 10),
          timestamp: parseInt(match[2], 10),
        };
      })
      .filter((c) => c !== null)
      .sort((a, b) => b.timestamp - a.timestamp);

    return checkpoints[0] || null;
  }

  /**
   * Notify about crash (can be extended to send notifications).
   *
   * Appends an entry to the loop's `crash.log` and reports the crash on
   * stderr. Logging is best-effort: a failed write never throws.
   *
   * @param {string} loopId - Loop ID
   * @param {Error} error - Error that caused crash
   */
  notifyCrash(loopId, error) {
    // Tolerate non-Error values (null, strings, plain objects) so reporting
    // a crash can never itself throw.
    const message = error?.message ?? String(error);
    const stack = error?.stack ?? '';
    const timestamp = new Date().toISOString();

    // Write crash log
    const crashLog = join(this.loopsDir, loopId, 'crash.log');
    const logEntry = `[${timestamp}] ${message}\n${stack}\n\n`;

    try {
      // Append in place instead of re-reading and rewriting the whole log
      writeFileSync(crashLog, logEntry, { flag: 'a' });
    } catch (writeError) {
      // Still non-fatal, but surface the failure instead of hiding it
      console.error(`[Recovery] Failed to write crash log for loop ${loopId}: ${writeError.message}`);
    }

    console.error(`[Recovery] Loop ${loopId} crashed: ${message}`);
  }

  /**
   * Determine best recovery strategy.
   *
   * Preference order: resume the internal Ralph loop when its state is
   * marked active; otherwise continue the external loop when the last
   * iteration's analysis says to continue; otherwise restart.
   *
   * @param {Object} state - Loop state
   * @returns {RecoveryStrategy}
   */
  determineRecoveryStrategy(state) {
    // Check internal Ralph state
    const internalState = this.readInternalRalphState();

    if (internalState?.active) {
      return {
        type: 'resume_internal',
        action: 'Resume internal Ralph loop with /ralph-resume',
        prompt: this.buildInternalResumePrompt(state, internalState),
      };
    }

    // Check last iteration analysis. A state without an iterations array
    // (e.g. crashed before the first iteration was recorded) simply has no
    // analysis and falls through to a restart.
    const iterations = Array.isArray(state.iterations) ? state.iterations : [];
    const lastIteration = iterations[iterations.length - 1];
    const lastAnalysis = lastIteration?.analysis;

    if (lastAnalysis?.shouldContinue) {
      return {
        type: 'continue_external',
        action: 'Continue with accumulated learnings',
        prompt: this.buildContinuationPrompt(state, lastAnalysis),
      };
    }

    // Default: restart with learnings
    return {
      type: 'restart',
      action: 'Restart with accumulated learnings',
      prompt: this.buildRestartPrompt(state),
    };
  }

  /**
   * Build prompt for resuming internal Ralph.
   *
   * @param {Object} externalState - External loop state
   * @param {Object} internalState - Internal Ralph state
   * @returns {string} Markdown prompt
   */
  buildInternalResumePrompt(externalState, internalState) {
    return `# Recovery: Resume Internal Ralph Loop

## External Loop Context
- Loop ID: ${externalState.loopId}
- External Iteration: ${externalState.currentIteration}
- Objective: ${externalState.objective}

## Internal Ralph State
- Internal Iteration: ${internalState.currentIteration || 'unknown'}
- Task: ${internalState.task || externalState.objective}
- Status: Active (was interrupted)

## Recovery Action

First, check the internal Ralph status:
\`\`\`
/ralph-status
\`\`\`

Then resume the internal loop:
\`\`\`
/ralph-resume
\`\`\`

## Previous Learnings
${externalState.accumulatedLearnings || 'None recorded'}
`;
  }

  /**
   * Build prompt for continuing external loop.
   *
   * @param {Object} state - Loop state
   * @param {Object} lastAnalysis - Last analysis result
   * @returns {string} Markdown prompt
   */
  buildContinuationPrompt(state, lastAnalysis) {
    // Render every blocker as its own bullet; joining with '\n- ' alone left
    // the first item without a list marker.
    const blockers = lastAnalysis?.blockers;
    const blockerList = Array.isArray(blockers)
      ? blockers.map((blocker) => `- ${blocker}`).join('\n')
      : blockers;

    return `# Recovery: Continue External Loop

## Context
- Loop ID: ${state.loopId}
- Objective: ${state.objective}
- Completion Criteria: ${state.completionCriteria}
- Progress: ${lastAnalysis?.completionPercentage || 0}%

## Session was interrupted. Continuing from last state.

### Last Analysis
${lastAnalysis?.learnings || 'No learnings recorded'}

### Suggested Approach
${lastAnalysis?.nextApproach || 'Continue with accumulated context'}

### Blockers (if any)
${blockerList || 'None identified'}

## Instructions
Continue working on the objective. Check git status and matric-memory for latest state.
`;
  }

  /**
   * Build prompt for restarting with learnings.
   *
   * @param {Object} state - Loop state
   * @returns {string} Markdown prompt
   */
  buildRestartPrompt(state) {
    return `# Recovery: Restart with Accumulated Learnings

## Context
- Loop ID: ${state.loopId}
- Objective: ${state.objective}
- Completion Criteria: ${state.completionCriteria}
- Previous Iterations: ${state.currentIteration}

## Session crashed and needs fresh start with context.

### Accumulated Learnings
${state.accumulatedLearnings || 'None recorded'}

### Files Modified
${state.filesModified?.map((f) => `- ${f}`).join('\n') || 'None recorded'}

## Instructions
Start fresh but apply the learnings above. Use \`/ralph\` for iterative implementation.
`;
  }

  /**
   * Perform recovery.
   *
   * With a loop ID this delegates to {@link RecoveryEngine#recoverLoop}.
   * Without one it works on the state held by the state manager: when a
   * crash is detected the state is marked `recovering` and saved.
   *
   * @param {string} [loopId] - Specific loop ID (uses default if not provided)
   * @returns {Object|null} Recovery context or null if no recovery needed
   */
  recover(loopId = null) {
    if (loopId) {
      return this.recoverLoop(loopId);
    }

    const crashState = this.detectCrash();
    if (!crashState.crashed) {
      return null;
    }

    const state = this.stateManager.load();
    if (!state) {
      // State disappeared between detection and reload; nothing to recover
      return null;
    }

    // Update state to indicate recovery
    state.status = 'recovering';
    this.stateManager.save(state);

    return {
      state,
      strategy: crashState.recoveryStrategy,
    };
  }

  /**
   * Mark recovery complete.
   *
   * Flips a `recovering` state back to `running` and persists it; any other
   * status (or a missing state) is left untouched.
   *
   * @param {string} [loopId] - Specific loop ID (uses default if not provided)
   * @throws {SyntaxError} When the per-loop state file is not valid JSON.
   */
  markRecovered(loopId = null) {
    if (loopId) {
      const stateFile = join(this.loopsDir, loopId, 'state.json');
      if (!existsSync(stateFile)) {
        return;
      }

      const state = JSON.parse(readFileSync(stateFile, 'utf8'));
      if (state.status === 'recovering') {
        state.status = 'running';
        writeFileSync(stateFile, JSON.stringify(state, null, 2));
      }
      return;
    }

    const state = this.stateManager.load();
    if (state && state.status === 'recovering') {
      state.status = 'running';
      this.stateManager.save(state);
    }
  }
}

export default RecoveryEngine;