UNPKG

claude-flow-novice

Version:

Claude Flow Novice - Advanced orchestration platform for multi-agent AI workflows with CFN Loop architecture. Includes Local RuVector Accelerator and all CFN skills for complete functionality.

837 lines (836 loc) 33.1 kB
/**
 * Checkpoint Manager
 *
 * Manages dual persistence model with Redis (runtime/ephemeral) and SQLite (durable/persistent).
 * Provides idempotent checkpointing with state validation.
 *
 * Task: Integration Standardization Plan - Task 4.5
 * Version: 1.0.0
 *
 * Persistence Boundaries:
 * - Redis: Agent execution state, coordination signals, temporary queues, active locks
 * - SQLite: Completed task results, agent metrics, audit trail, skill metadata
 *
 * @example
 * ```typescript
 * const checkpointMgr = new CheckpointManager(dbService);
 * await checkpointMgr.initialize();
 *
 * // Create checkpoint on task completion
 * await checkpointMgr.createCheckpoint('task-123', CheckpointTrigger.TASK_COMPLETION);
 *
 * // Recover from checkpoint
 * const state = await checkpointMgr.recoverFromCheckpoint('task-123');
 * ```
 */
import * as crypto from 'crypto';
import { getGlobalLogger } from './logging.js';
import { StandardError, ErrorCode } from './errors.js';

const logger = getGlobalLogger();

// ============================================================================
// Type Definitions
// ============================================================================

/**
 * Checkpoint trigger types
 */
export var CheckpointTrigger = /*#__PURE__*/ function(CheckpointTrigger) {
    /** Triggered when a task completes */
    CheckpointTrigger["TASK_COMPLETION"] = "task_completion";
    /** Triggered at iteration boundaries (CFN Loop iterations) */
    CheckpointTrigger["ITERATION_BOUNDARY"] = "iteration_boundary";
    /** Triggered periodically (default: 5 minutes) */
    CheckpointTrigger["PERIODIC"] = "periodic";
    /** Manually triggered checkpoint */
    CheckpointTrigger["MANUAL"] = "manual";
    return CheckpointTrigger;
}({});

/**
 * Checkpoint status
 */
export var CheckpointStatus = /*#__PURE__*/ function(CheckpointStatus) {
    /** Checkpoint creation in progress */
    CheckpointStatus["IN_PROGRESS"] = "in_progress";
    /** Checkpoint completed successfully */
    CheckpointStatus["COMPLETED"] = "completed";
    /** Checkpoint failed */
    CheckpointStatus["FAILED"] = "failed";
    /** Checkpoint recovered and applied */
    CheckpointStatus["RECOVERED"] = "recovered";
    return CheckpointStatus;
}({});

// ============================================================================
// Internal Helpers
// ============================================================================

/**
 * Build a JSON-serializable copy of `value` in which the keys of every object,
 * at every nesting level, appear in sorted order.
 *
 * Serializing the result with `JSON.stringify` therefore yields the same text
 * for two structurally equal values regardless of property insertion order.
 * Values exposing `toJSON` (e.g. `Date`) are converted through it first, the
 * same way `JSON.stringify` would.
 *
 * @param {*} value - Any JSON-compatible value (must not contain cycles).
 * @returns {*} A key-sorted copy of `value`.
 */
function canonicalize(value) {
    if (value === null || typeof value !== 'object') {
        return value;
    }
    const plain = typeof value.toJSON === 'function' ? value.toJSON() : value;
    if (plain === null || typeof plain !== 'object') {
        return plain;
    }
    if (Array.isArray(plain)) {
        return plain.map((item) => canonicalize(item));
    }
    // Object.fromEntries defines own properties, so a key such as "__proto__"
    // is kept as data instead of rewriting the prototype.
    return Object.fromEntries(
        Object.keys(plain).sort().map((key) => [key, canonicalize(plain[key])])
    );
}

// ============================================================================
// Checkpoint Manager
// ============================================================================

/**
 * Checkpoint Manager
 *
 * Snapshots a task's Redis keys and SQLite rows into two SQLite tables
 * (`checkpoints` for metadata, `checkpoint_data` for the JSON payloads) and
 * can write the Redis part of a snapshot back.
 */
export class CheckpointManager {
    dbService;
    redisAdapter;
    sqliteAdapter;
    config;
    periodicCheckpointTimer;
    initialized = false;

    /**
     * @param {object} dbService - Service exposing `getAdapter(name)`; the
     *   'redis' and 'sqlite' adapters are resolved immediately.
     * @param {object} [config] - Optional overrides.
     * @param {boolean} [config.enablePeriodicCheckpoints=true] - Start the interval timer in `initialize()`.
     * @param {number} [config.periodicInterval=300000] - Timer interval in milliseconds.
     * @param {number} [config.retentionPeriod=604800000] - Age in milliseconds after which `cleanupOldCheckpoints()` deletes a checkpoint.
     * @param {boolean} [config.enableAutoCleanup=true] - When false, `cleanupOldCheckpoints()` is a no-op.
     * @param {number} [config.validationTimeout=5000] - Stored in the config; not read anywhere in this file.
     */
    constructor(dbService, config = {}) {
        this.dbService = dbService;
        // Get adapters
        this.redisAdapter = dbService.getAdapter('redis');
        this.sqliteAdapter = dbService.getAdapter('sqlite');
        // Set config with defaults (?? keeps explicit falsy values such as `false` or 0)
        this.config = {
            enablePeriodicCheckpoints: config.enablePeriodicCheckpoints ?? true,
            periodicInterval: config.periodicInterval ?? 300000,
            retentionPeriod: config.retentionPeriod ?? 7 * 24 * 60 * 60 * 1000,
            enableAutoCleanup: config.enableAutoCleanup ?? true,
            validationTimeout: config.validationTimeout ?? 5000
        };
        logger.info('CheckpointManager initialized', {
            config: this.config
        });
    }

    /**
     * Initialize checkpoint manager and setup periodic checkpoints.
     *
     * Creates the SQLite tables and, when enabled, starts the interval timer.
     * Calling it again while initialized only logs a warning.
     *
     * @returns {Promise<void>}
     * @throws {StandardError} CONFIGURATION_ERROR when table creation fails.
     */
    async initialize() {
        if (this.initialized) {
            logger.warn('CheckpointManager already initialized');
            return;
        }
        try {
            // Create SQLite tables for checkpoint storage
            await this.createCheckpointTables();
            // Start periodic checkpoints if enabled
            if (this.config.enablePeriodicCheckpoints) {
                this.startPeriodicCheckpoints();
            }
            this.initialized = true;
            logger.info('CheckpointManager initialization complete');
        } catch (error) {
            const err = error;
            logger.error('Failed to initialize CheckpointManager', err);
            throw new StandardError(ErrorCode.CONFIGURATION_ERROR, 'Failed to initialize CheckpointManager', {
                error: err.message
            }, err);
        }
    }

    /**
     * Shutdown checkpoint manager and cleanup resources.
     *
     * Stops the interval timer (if any) and marks the manager uninitialized.
     *
     * @returns {Promise<void>}
     */
    async shutdown() {
        if (this.periodicCheckpointTimer) {
            clearInterval(this.periodicCheckpointTimer);
            this.periodicCheckpointTimer = undefined;
        }
        this.initialized = false;
        logger.info('CheckpointManager shutdown complete');
    }

    /**
     * Create a checkpoint for the given task.
     *
     * Idempotent: when a completed checkpoint with the same runtime and durable
     * state hashes already exists for the task, that checkpoint is returned and
     * nothing new is written.
     *
     * @param {string} taskId - Task whose state is captured.
     * @param {string} trigger - One of the `CheckpointTrigger` values.
     * @param {object} [metadata] - Free-form data stored as JSON with the checkpoint.
     * @returns {Promise<object>} The checkpoint metadata (new or pre-existing).
     * @throws {StandardError} OPERATION_TIMEOUT wrapping the underlying failure.
     */
    async createCheckpoint(taskId, trigger, metadata) {
        this.ensureInitialized();
        const startTime = Date.now();
        const checkpointId = this.generateCheckpointId(taskId, trigger);
        logger.info('Creating checkpoint', {
            taskId,
            checkpointId,
            trigger
        });
        // Tracks whether a row for checkpointId already exists, so the failure
        // path knows whether to UPDATE it or INSERT a fresh "failed" row.
        let metadataStored = false;
        try {
            // 1. Capture runtime state from Redis
            const runtimeState = await this.captureRuntimeState(taskId);
            const runtimeStateHash = this.hashState(runtimeState);
            // 2. Capture durable state from SQLite
            const durableState = await this.captureDurableState(taskId);
            const durableStateHash = this.hashState(durableState);
            // 3. Check for idempotency (same state hash = skip checkpoint)
            const existingCheckpoint = await this.findCheckpointByHash(taskId, runtimeStateHash, durableStateHash);
            if (existingCheckpoint && existingCheckpoint.status === CheckpointStatus.COMPLETED) {
                logger.info('Checkpoint already exists with same state hash (idempotent)', {
                    checkpointId: existingCheckpoint.checkpointId,
                    taskId
                });
                return existingCheckpoint;
            }
            // 4. Create checkpoint metadata
            const checkpointMetadata = {
                checkpointId,
                taskId,
                trigger,
                status: CheckpointStatus.IN_PROGRESS,
                runtimeStateHash,
                durableStateHash,
                createdAt: new Date(),
                metadata
            };
            // 5. Store checkpoint metadata
            await this.storeCheckpointMetadata(checkpointMetadata);
            metadataStored = true;
            // 6. Validate state before storing
            await this.validateState(runtimeState, durableState);
            // 7. Store checkpoint data
            await this.storeCheckpointData(checkpointId, runtimeState, durableState);
            // 8. Update checkpoint status to completed
            checkpointMetadata.status = CheckpointStatus.COMPLETED;
            checkpointMetadata.completedAt = new Date();
            await this.updateCheckpointMetadata(checkpointMetadata);
            const duration = Date.now() - startTime;
            logger.info('Checkpoint created successfully', {
                checkpointId,
                taskId,
                trigger,
                duration,
                runtimeStateHash,
                durableStateHash
            });
            return checkpointMetadata;
        } catch (error) {
            const err = error;
            logger.error('Failed to create checkpoint', err, {
                taskId,
                checkpointId
            });
            // Mark checkpoint as failed. The hashes are empty strings because
            // the failure may have happened before they were computed and the
            // columns are NOT NULL.
            const failedMetadata = {
                checkpointId,
                taskId,
                trigger,
                status: CheckpointStatus.FAILED,
                runtimeStateHash: '',
                durableStateHash: '',
                createdAt: new Date(),
                error: err.message,
                metadata
            };
            try {
                if (metadataStored) {
                    // A row with this primary key exists; inserting again would
                    // raise a constraint error, so flip its status instead.
                    await this.updateCheckpointMetadata(failedMetadata);
                } else {
                    await this.storeCheckpointMetadata(failedMetadata);
                }
            } catch (recordError) {
                // Bookkeeping is best-effort: never let it replace the error
                // that actually caused the checkpoint to fail.
                logger.error('Failed to record checkpoint failure', recordError, {
                    taskId,
                    checkpointId
                });
            }
            throw new StandardError(ErrorCode.OPERATION_TIMEOUT, 'Failed to create checkpoint', {
                taskId,
                checkpointId,
                trigger
            }, err);
        }
    }

    /**
     * Recover from the latest completed checkpoint for the given task.
     *
     * Failures inside the recovery steps are not thrown; they are reported
     * through the returned object (`success: false`, `errors`). The checkpoint
     * is marked "recovered" only when both restore steps report success, so a
     * failed attempt can be retried.
     *
     * @param {string} taskId - Task to recover.
     * @returns {Promise<object>} `{ success, checkpointId, taskId,
     *   runtimeStateRestored, durableStateRestored, timestamp, errors? }`.
     * @throws {StandardError} CONFIGURATION_ERROR when the manager is not initialized.
     */
    async recoverFromCheckpoint(taskId) {
        this.ensureInitialized();
        logger.info('Starting checkpoint recovery', {
            taskId
        });
        try {
            // 1. Find latest completed checkpoint
            const checkpoint = await this.findLatestCheckpoint(taskId);
            if (!checkpoint) {
                throw new StandardError(ErrorCode.DB_NOT_FOUND, 'No checkpoint found for task', {
                    taskId
                });
            }
            // 2. Load checkpoint data
            const checkpointData = await this.loadCheckpointData(checkpoint.checkpointId);
            // 3. Validate checkpoint data
            await this.validateCheckpointData(checkpointData);
            // 4. Restore runtime state to Redis
            const runtimeRestored = await this.restoreRuntimeState(taskId, checkpointData.runtimeState);
            // 5. Restore durable state to SQLite (if needed)
            const durableRestored = await this.restoreDurableState(taskId, checkpointData.durableState);
            const fullyRestored = runtimeRestored && durableRestored;
            // 6. Update checkpoint status. findLatestCheckpoint() only returns
            // "completed" rows, so the status is changed only after a full
            // restore; otherwise the checkpoint stays available for a retry.
            if (fullyRestored) {
                checkpoint.status = CheckpointStatus.RECOVERED;
                await this.updateCheckpointMetadata(checkpoint);
            }
            const result = {
                success: fullyRestored,
                checkpointId: checkpoint.checkpointId,
                taskId,
                runtimeStateRestored: runtimeRestored,
                durableStateRestored: durableRestored,
                timestamp: new Date()
            };
            if (fullyRestored) {
                logger.info('Checkpoint recovery completed', result);
            } else {
                const errors = [];
                if (!runtimeRestored) {
                    errors.push('Failed to restore runtime state');
                }
                if (!durableRestored) {
                    errors.push('Failed to restore durable state');
                }
                result.errors = errors;
                logger.warn('Checkpoint recovery incomplete', result);
            }
            return result;
        } catch (error) {
            const err = error;
            logger.error('Failed to recover from checkpoint', err, {
                taskId
            });
            return {
                success: false,
                checkpointId: '',
                taskId,
                runtimeStateRestored: false,
                durableStateRestored: false,
                timestamp: new Date(),
                errors: [
                    err.message
                ]
            };
        }
    }

    /**
     * List checkpoints for a task, newest first.
     *
     * @param {string} taskId - Task whose checkpoints are listed.
     * @returns {Promise<object[]>} Deserialized checkpoint metadata records.
     * @throws {StandardError} DB_QUERY_FAILED when the query fails.
     */
    async listCheckpoints(taskId) {
        this.ensureInitialized();
        try {
            const rows = await this.sqliteAdapter.list('checkpoints', {
                filters: [
                    {
                        field: 'task_id',
                        operator: 'eq',
                        value: taskId
                    }
                ],
                orderBy: 'created_at',
                order: 'desc'
            });
            return rows.map((row) => this.deserializeCheckpointMetadata(row));
        } catch (error) {
            const err = error;
            logger.error('Failed to list checkpoints', err, {
                taskId
            });
            throw new StandardError(ErrorCode.DB_QUERY_FAILED, 'Failed to list checkpoints', {
                taskId
            }, err);
        }
    }

    /**
     * Delete checkpoints older than the retention period, together with their
     * stored state payloads.
     *
     * @returns {Promise<number>} Number of `checkpoints` rows deleted (0 when
     *   automatic cleanup is disabled).
     * @throws {StandardError} DB_QUERY_FAILED when a delete statement fails.
     */
    async cleanupOldCheckpoints() {
        this.ensureInitialized();
        if (!this.config.enableAutoCleanup) {
            logger.info('Automatic cleanup disabled');
            return 0;
        }
        try {
            const cutoffDate = new Date(Date.now() - this.config.retentionPeriod);
            const cutoff = cutoffDate.toISOString();
            // Remove the payload rows first: the subquery needs the metadata
            // rows to still exist in order to derive the payload keys.
            // NOTE(review): assumes insert('checkpoint_data', { key, value })
            // writes `key` verbatim into the key column - confirm against the
            // SQLite adapter. The two statements are not wrapped in a transaction.
            await this.sqliteAdapter.raw(`
                DELETE FROM checkpoint_data
                WHERE key IN (
                    SELECT 'checkpoint_runtime:' || checkpoint_id FROM checkpoints WHERE created_at < ?
                    UNION ALL
                    SELECT 'checkpoint_durable:' || checkpoint_id FROM checkpoints WHERE created_at < ?
                )
            `, [
                cutoff,
                cutoff
            ]);
            const result = await this.sqliteAdapter.raw(`
                DELETE FROM checkpoints
                WHERE created_at < ?
            `, [
                cutoff
            ]);
            const deletedCount = result.rowsAffected || 0;
            logger.info('Cleaned up old checkpoints', {
                deletedCount,
                cutoffDate: cutoff
            });
            return deletedCount;
        } catch (error) {
            const err = error;
            logger.error('Failed to cleanup old checkpoints', err);
            throw new StandardError(ErrorCode.DB_QUERY_FAILED, 'Failed to cleanup old checkpoints', {}, err);
        }
    }

    // ============================================================================
    // Private Methods - State Capture
    // ============================================================================

    /**
     * Run Redis `KEYS` for a pattern and return the matches in sorted order.
     *
     * Sorting makes the captured arrays (and therefore the state hash) the same
     * for the same key set, whatever order the server returns keys in.
     * NOTE(review): KEYS walks the whole keyspace; consider SCAN for large
     * production databases.
     *
     * @param {string} pattern - Glob pattern passed to `KEYS`.
     * @returns {Promise<string[]>} Matching keys, sorted.
     */
    async listRedisKeys(pattern) {
        const keys = await this.redisAdapter.raw('KEYS', [
            pattern
        ]);
        return [...keys].sort();
    }

    /**
     * Snapshot the Redis keys belonging to a task: `agent:<taskId>:*`,
     * `swarm:<taskId>:*`, `queue:<taskId>:*` and `lock:<taskId>:*`.
     *
     * @param {string} taskId - Task whose keys are captured.
     * @returns {Promise<object>} `{ taskId, agents, coordinationSignals,
     *   queueData, activeLocks, capturedAt }`.
     * @throws {StandardError} DB_QUERY_FAILED when any Redis call fails.
     */
    async captureRuntimeState(taskId) {
        try {
            // Capture agent execution state
            const agents = [];
            for (const key of await this.listRedisKeys(`agent:${taskId}:*`)) {
                const data = await this.redisAdapter.get(key);
                if (data) {
                    agents.push({
                        agentId: data.agentId || '',
                        agentType: data.agentType || '',
                        status: data.status || 'in_progress',
                        confidence: data.confidence,
                        // Falls back to "now" when the agent record has no start time
                        startedAt: new Date(data.startedAt || Date.now()),
                        completedAt: data.completedAt ? new Date(data.completedAt) : undefined,
                        metadata: data.metadata
                    });
                }
            }
            // Capture coordination signals
            const coordinationSignals = [];
            for (const key of await this.listRedisKeys(`swarm:${taskId}:*`)) {
                const value = await this.redisAdapter.get(key);
                const ttl = await this.redisAdapter.raw('TTL', [
                    key
                ]);
                if (value) {
                    coordinationSignals.push({
                        key,
                        value,
                        // Non-positive TTL replies are stored as "no TTL"
                        ttl: ttl > 0 ? ttl : undefined,
                        // Capture time, not the time the signal was first written
                        createdAt: new Date()
                    });
                }
            }
            // Capture queue data
            const queueData = [];
            for (const key of await this.listRedisKeys(`queue:${taskId}:*`)) {
                const items = await this.redisAdapter.raw('LRANGE', [
                    key,
                    '0',
                    '-1'
                ]);
                if (items && items.length > 0) {
                    queueData.push({
                        queueName: key,
                        items
                    });
                }
            }
            // Capture active locks (only those that still have time to live)
            const activeLocks = [];
            for (const key of await this.listRedisKeys(`lock:${taskId}:*`)) {
                const owner = await this.redisAdapter.get(key);
                const ttl = await this.redisAdapter.raw('TTL', [
                    key
                ]);
                if (owner && ttl > 0) {
                    activeLocks.push({
                        lockKey: key,
                        owner,
                        // Capture time, not the real acquisition time
                        acquiredAt: new Date(),
                        expiresAt: new Date(Date.now() + ttl * 1000)
                    });
                }
            }
            return {
                taskId,
                agents,
                coordinationSignals,
                queueData,
                activeLocks,
                capturedAt: new Date()
            };
        } catch (error) {
            const err = error;
            logger.error('Failed to capture runtime state', err, {
                taskId
            });
            throw new StandardError(ErrorCode.DB_QUERY_FAILED, 'Failed to capture runtime state from Redis', {
                taskId
            }, err);
        }
    }

    /**
     * Snapshot the SQLite rows for a task from `task_results`, `agent_metrics`
     * and `audit_trail` (filtered by `taskId`), plus every row of
     * `skill_metadata` (unfiltered).
     *
     * @param {string} taskId - Task whose rows are captured.
     * @returns {Promise<object>} `{ taskId, taskResults, agentMetrics,
     *   auditTrail, skillMetadata, capturedAt }`.
     * @throws {StandardError} DB_QUERY_FAILED when any query fails.
     */
    async captureDurableState(taskId) {
        try {
            const byTask = {
                filters: [
                    {
                        field: 'taskId',
                        operator: 'eq',
                        value: taskId
                    }
                ]
            };
            // Capture task results
            const taskResults = await this.sqliteAdapter.list('task_results', byTask);
            // Capture agent metrics
            const agentMetrics = await this.sqliteAdapter.list('agent_metrics', byTask);
            // Capture audit trail
            const auditTrail = await this.sqliteAdapter.list('audit_trail', byTask);
            // Capture skill metadata
            const skillMetadata = await this.sqliteAdapter.list('skill_metadata', {});
            return {
                taskId,
                taskResults,
                agentMetrics,
                auditTrail,
                skillMetadata,
                capturedAt: new Date()
            };
        } catch (error) {
            const err = error;
            logger.error('Failed to capture durable state', err, {
                taskId
            });
            throw new StandardError(ErrorCode.DB_QUERY_FAILED, 'Failed to capture durable state from SQLite', {
                taskId
            }, err);
        }
    }

    // ============================================================================
    // Private Methods - State Storage
    // ============================================================================

    /**
     * Persist both state snapshots as JSON rows in `checkpoint_data`, keyed
     * `checkpoint_runtime:<id>` and `checkpoint_durable:<id>`.
     *
     * The two inserts run one after the other; nothing in this method wraps
     * them in a transaction.
     *
     * @param {string} checkpointId - Checkpoint the payloads belong to.
     * @param {object} runtimeState - Result of `captureRuntimeState()`.
     * @param {object} durableState - Result of `captureDurableState()`.
     * @returns {Promise<void>}
     * @throws {StandardError} DB_QUERY_FAILED when an insert fails.
     */
    async storeCheckpointData(checkpointId, runtimeState, durableState) {
        try {
            // Store runtime state as JSON in a key-value table
            await this.sqliteAdapter.insert('checkpoint_data', {
                key: `checkpoint_runtime:${checkpointId}`,
                value: JSON.stringify(runtimeState)
            });
            // Store durable state as JSON in a key-value table
            await this.sqliteAdapter.insert('checkpoint_data', {
                key: `checkpoint_durable:${checkpointId}`,
                value: JSON.stringify(durableState)
            });
            logger.debug('Checkpoint data stored', {
                checkpointId
            });
        } catch (error) {
            const err = error;
            logger.error('Failed to store checkpoint data', err, {
                checkpointId
            });
            throw new StandardError(ErrorCode.DB_QUERY_FAILED, 'Failed to store checkpoint data', {
                checkpointId
            }, err);
        }
    }

    /**
     * Load a checkpoint's metadata row and both JSON payloads.
     *
     * Values that were `Date` objects at capture time come back as ISO strings
     * because the payloads round-trip through JSON.
     *
     * @param {string} checkpointId - Checkpoint to load.
     * @returns {Promise<object>} `{ metadata, runtimeState, durableState }`.
     * @throws {StandardError} DB_NOT_FOUND when a row is missing;
     *   DB_QUERY_FAILED for any other failure.
     */
    async loadCheckpointData(checkpointId) {
        try {
            // Load metadata from checkpoints table
            const metadataRow = await this.sqliteAdapter.get(`checkpoints:${checkpointId}`);
            if (!metadataRow) {
                throw new StandardError(ErrorCode.DB_NOT_FOUND, 'Checkpoint metadata not found', {
                    checkpointId
                });
            }
            // Load runtime state from checkpoint_data table
            const runtimeStateRow = await this.sqliteAdapter.get(`checkpoint_data:checkpoint_runtime:${checkpointId}`);
            if (!runtimeStateRow || !runtimeStateRow.value) {
                throw new StandardError(ErrorCode.DB_NOT_FOUND, 'Checkpoint runtime state not found', {
                    checkpointId
                });
            }
            // Load durable state from checkpoint_data table
            const durableStateRow = await this.sqliteAdapter.get(`checkpoint_data:checkpoint_durable:${checkpointId}`);
            if (!durableStateRow || !durableStateRow.value) {
                throw new StandardError(ErrorCode.DB_NOT_FOUND, 'Checkpoint durable state not found', {
                    checkpointId
                });
            }
            return {
                metadata: this.deserializeCheckpointMetadata(metadataRow),
                runtimeState: JSON.parse(runtimeStateRow.value),
                durableState: JSON.parse(durableStateRow.value)
            };
        } catch (error) {
            const err = error;
            logger.error('Failed to load checkpoint data', err, {
                checkpointId
            });
            // Keep the specific "not found" errors raised above instead of
            // flattening them into a generic query failure.
            if (err instanceof StandardError) {
                throw err;
            }
            throw new StandardError(ErrorCode.DB_QUERY_FAILED, 'Failed to load checkpoint data', {
                checkpointId
            }, err);
        }
    }

    // ============================================================================
    // Private Methods - State Restoration
    // ============================================================================

    /**
     * Write a runtime snapshot back to Redis: agents, coordination signals
     * (with their TTL), queues (deleted, then refilled in order) and locks
     * that have not expired yet.
     *
     * @param {string} taskId - Task being restored.
     * @param {object} runtimeState - Snapshot as returned by `loadCheckpointData()`.
     * @returns {Promise<boolean>} true on success, false when any step threw
     *   (the error is logged, not rethrown).
     */
    async restoreRuntimeState(taskId, runtimeState) {
        try {
            // Restore agent execution state
            for (const agent of runtimeState.agents) {
                await this.redisAdapter.insert(`agent:${taskId}:${agent.agentId}`, agent);
            }
            // Restore coordination signals
            for (const signal of runtimeState.coordinationSignals) {
                await this.redisAdapter.insert(signal.key, signal.value);
                if (signal.ttl) {
                    await this.redisAdapter.raw('EXPIRE', [
                        signal.key,
                        signal.ttl.toString()
                    ]);
                }
            }
            // Restore queue data
            for (const queue of runtimeState.queueData) {
                await this.redisAdapter.raw('DEL', [
                    queue.queueName
                ]);
                for (const item of queue.items) {
                    // NOTE(review): items are re-encoded with JSON.stringify; if
                    // LRANGE handed back raw strings at capture time this
                    // double-encodes them - confirm against the Redis adapter.
                    await this.redisAdapter.raw('RPUSH', [
                        queue.queueName,
                        JSON.stringify(item)
                    ]);
                }
            }
            // Restore active locks
            for (const lock of runtimeState.activeLocks) {
                // expiresAt is an ISO string after the JSON round-trip, so it
                // must be re-wrapped in a Date before calling getTime().
                const expiresAtMs = new Date(lock.expiresAt).getTime();
                const ttl = Math.floor((expiresAtMs - Date.now()) / 1000);
                if (ttl > 0) {
                    await this.redisAdapter.insert(lock.lockKey, lock.owner);
                    await this.redisAdapter.raw('EXPIRE', [
                        lock.lockKey,
                        ttl.toString()
                    ]);
                }
            }
            logger.info('Runtime state restored to Redis', {
                taskId
            });
            return true;
        } catch (error) {
            const err = error;
            logger.error('Failed to restore runtime state', err, {
                taskId
            });
            return false;
        }
    }

    /**
     * Placeholder for durable-state restoration: writes nothing and only logs.
     *
     * @param {string} taskId - Task being restored.
     * @param {object} durableState - Durable snapshot (currently unused).
     * @returns {Promise<boolean>} true unless logging itself throws.
     */
    async restoreDurableState(taskId, durableState) {
        try {
            // Durable state in SQLite is already persistent
            // This method is for future enhancements (e.g., restoring to different instance)
            logger.info('Durable state validation complete', {
                taskId
            });
            return true;
        } catch (error) {
            const err = error;
            logger.error('Failed to restore durable state', err, {
                taskId
            });
            return false;
        }
    }

    // ============================================================================
    // Private Methods - Validation
    // ============================================================================

    /**
     * Check that both snapshots carry a task id, that the ids match, and that
     * the runtime snapshot's `agents` is an array.
     *
     * @param {object} runtimeState - Runtime snapshot.
     * @param {object} durableState - Durable snapshot.
     * @returns {Promise<void>}
     * @throws {StandardError} VALIDATION_FAILED on the first failing check.
     */
    async validateState(runtimeState, durableState) {
        // Validate runtime state
        if (!runtimeState.taskId) {
            throw new StandardError(ErrorCode.VALIDATION_FAILED, 'Invalid runtime state: missing taskId', {
                runtimeState
            });
        }
        if (!Array.isArray(runtimeState.agents)) {
            throw new StandardError(ErrorCode.VALIDATION_FAILED, 'Invalid runtime state: agents must be an array', {
                runtimeState
            });
        }
        // Validate durable state
        if (!durableState.taskId) {
            throw new StandardError(ErrorCode.VALIDATION_FAILED, 'Invalid durable state: missing taskId', {
                durableState
            });
        }
        if (runtimeState.taskId !== durableState.taskId) {
            throw new StandardError(ErrorCode.VALIDATION_FAILED, 'Task ID mismatch between runtime and durable state', {
                runtimeTaskId: runtimeState.taskId,
                durableTaskId: durableState.taskId
            });
        }
        logger.debug('State validation passed', {
            taskId: runtimeState.taskId
        });
    }

    /**
     * Validate loaded checkpoint data before recovery: the two snapshots must
     * pass `validateState()` and the checkpoint status must be "completed".
     *
     * @param {object} checkpointData - Result of `loadCheckpointData()`.
     * @returns {Promise<void>}
     * @throws {StandardError} VALIDATION_FAILED when a check fails.
     */
    async validateCheckpointData(checkpointData) {
        await this.validateState(checkpointData.runtimeState, checkpointData.durableState);
        if (checkpointData.metadata.status !== CheckpointStatus.COMPLETED) {
            throw new StandardError(ErrorCode.VALIDATION_FAILED, 'Cannot recover from incomplete checkpoint', {
                status: checkpointData.metadata.status
            });
        }
        logger.debug('Checkpoint data validation passed', {
            checkpointId: checkpointData.metadata.checkpointId
        });
    }

    // ============================================================================
    // Private Methods - Utilities
    // ============================================================================

    /**
     * Build a checkpoint id from the task id, trigger, current time in
     * milliseconds and 4 random bytes (hex).
     *
     * @param {string} taskId
     * @param {string} trigger
     * @returns {string} `checkpoint_<taskId>_<trigger>_<timestamp>_<random>`.
     */
    generateCheckpointId(taskId, trigger) {
        const timestamp = Date.now();
        const random = crypto.randomBytes(4).toString('hex');
        return `checkpoint_${taskId}_${trigger}_${timestamp}_${random}`;
    }

    /**
     * Return a copy of a captured state without the fields that the capture
     * methods stamp from the wall clock or from a remaining TTL:
     * top-level `capturedAt`, each signal's `createdAt`/`ttl`, and each lock's
     * `acquiredAt`/`expiresAt`. These differ between two captures of otherwise
     * unchanged state, so they must not take part in the idempotency hash.
     *
     * An agent's `startedAt` is kept; it still varies between captures when
     * the stored agent record has no start time of its own.
     *
     * @param {object} state - Runtime or durable snapshot.
     * @returns {object} Shallow copy with the clock-stamped fields removed.
     */
    stableStateView(state) {
        const { capturedAt, ...view } = state;
        if (Array.isArray(view.coordinationSignals)) {
            view.coordinationSignals = view.coordinationSignals.map(({ createdAt, ttl, ...signal }) => signal);
        }
        if (Array.isArray(view.activeLocks)) {
            view.activeLocks = view.activeLocks.map(({ acquiredAt, expiresAt, ...lock }) => lock);
        }
        return view;
    }

    /**
     * SHA-256 hash (hex) of a captured state.
     *
     * The state is reduced with `stableStateView()` and serialized with keys
     * sorted at every level. Passing a key array as the `JSON.stringify`
     * replacer is deliberately avoided: an array replacer is an allowlist that
     * is applied to nested objects too, which would drop every nested property
     * whose name is not also a top-level key.
     *
     * @param {object} state - Runtime or durable snapshot.
     * @returns {string} 64-character hex digest.
     */
    hashState(state) {
        const stateJson = JSON.stringify(canonicalize(this.stableStateView(state)));
        return crypto.createHash('sha256').update(stateJson).digest('hex');
    }

    /**
     * Find a completed checkpoint of the task with the given state hashes.
     *
     * The status filter is part of the query because only one row is fetched;
     * without it a failed or in-progress row with the same hashes could hide a
     * completed one.
     *
     * @param {string} taskId
     * @param {string} runtimeHash
     * @param {string} durableHash
     * @returns {Promise<object|null>} Checkpoint metadata, or null when none
     *   matches or the query fails (the failure is logged).
     */
    async findCheckpointByHash(taskId, runtimeHash, durableHash) {
        try {
            const rows = await this.sqliteAdapter.list('checkpoints', {
                filters: [
                    {
                        field: 'task_id',
                        operator: 'eq',
                        value: taskId
                    },
                    {
                        field: 'runtime_state_hash',
                        operator: 'eq',
                        value: runtimeHash
                    },
                    {
                        field: 'durable_state_hash',
                        operator: 'eq',
                        value: durableHash
                    },
                    {
                        field: 'status',
                        operator: 'eq',
                        value: CheckpointStatus.COMPLETED
                    }
                ],
                limit: 1
            });
            if (rows.length === 0) {
                return null;
            }
            return this.deserializeCheckpointMetadata(rows[0]);
        } catch (error) {
            const err = error;
            logger.error('Failed to find checkpoint by hash', err, {
                taskId
            });
            return null;
        }
    }

    /**
     * Find the most recently created checkpoint of the task whose status is
     * "completed".
     *
     * @param {string} taskId
     * @returns {Promise<object|null>} Checkpoint metadata, or null when none
     *   exists or the query fails (the failure is logged).
     */
    async findLatestCheckpoint(taskId) {
        try {
            const rows = await this.sqliteAdapter.list('checkpoints', {
                filters: [
                    {
                        field: 'task_id',
                        operator: 'eq',
                        value: taskId
                    },
                    {
                        field: 'status',
                        operator: 'eq',
                        value: CheckpointStatus.COMPLETED
                    }
                ],
                orderBy: 'created_at',
                order: 'desc',
                limit: 1
            });
            if (rows.length === 0) {
                return null;
            }
            return this.deserializeCheckpointMetadata(rows[0]);
        } catch (error) {
            const err = error;
            logger.error('Failed to find latest checkpoint', err, {
                taskId
            });
            return null;
        }
    }

    /**
     * Insert a checkpoint metadata row into `checkpoints`.
     *
     * @param {object} metadata - Checkpoint metadata in camelCase form.
     * @returns {Promise<void>}
     * @throws {StandardError} DB_QUERY_FAILED when the insert fails.
     */
    async storeCheckpointMetadata(metadata) {
        try {
            await this.sqliteAdapter.insert('checkpoints', {
                checkpoint_id: metadata.checkpointId,
                task_id: metadata.taskId,
                trigger: metadata.trigger,
                status: metadata.status,
                runtime_state_hash: metadata.runtimeStateHash,
                durable_state_hash: metadata.durableStateHash,
                created_at: metadata.createdAt.toISOString(),
                completed_at: metadata.completedAt?.toISOString(),
                error: metadata.error,
                metadata: JSON.stringify(metadata.metadata || {})
            });
            logger.debug('Checkpoint metadata stored', {
                checkpointId: metadata.checkpointId
            });
        } catch (error) {
            const err = error;
            logger.error('Failed to store checkpoint metadata', err, {
                checkpointId: metadata.checkpointId
            });
            throw new StandardError(ErrorCode.DB_QUERY_FAILED, 'Failed to store checkpoint metadata', {
                checkpointId: metadata.checkpointId
            }, err);
        }
    }

    /**
     * Update `status`, `completed_at` and `error` of an existing checkpoint
     * row; when the update throws, fall back to inserting the full row.
     *
     * @param {object} metadata - Checkpoint metadata in camelCase form.
     * @returns {Promise<void>}
     * @throws {StandardError} DB_QUERY_FAILED when the fallback insert fails.
     */
    async updateCheckpointMetadata(metadata) {
        try {
            await this.sqliteAdapter.update('checkpoints', metadata.checkpointId, {
                status: metadata.status,
                completed_at: metadata.completedAt?.toISOString(),
                error: metadata.error
            });
        } catch (error) {
            // Leave a trace of why the update did not go through before
            // falling back, so real adapter errors are not lost.
            logger.warn('Checkpoint metadata update failed, falling back to insert', {
                checkpointId: metadata.checkpointId,
                error: error?.message
            });
            // If update fails, try insert (first time)
            await this.storeCheckpointMetadata(metadata);
        }
    }

    /**
     * Convert a `checkpoints` table row (snake_case columns, ISO date strings,
     * JSON metadata) into the camelCase metadata object used by this class.
     *
     * @param {object} row - Row as returned by the SQLite adapter.
     * @returns {object} Checkpoint metadata.
     */
    deserializeCheckpointMetadata(row) {
        return {
            checkpointId: row.checkpoint_id,
            taskId: row.task_id,
            trigger: row.trigger,
            status: row.status,
            runtimeStateHash: row.runtime_state_hash,
            durableStateHash: row.durable_state_hash,
            createdAt: new Date(row.created_at),
            completedAt: row.completed_at ? new Date(row.completed_at) : undefined,
            error: row.error,
            metadata: row.metadata ? JSON.parse(row.metadata) : undefined
        };
    }

    /**
     * Create the `checkpoints` and `checkpoint_data` tables and the
     * `checkpoints` indexes if they do not exist yet.
     *
     * @returns {Promise<void>}
     * @throws {StandardError} DB_QUERY_FAILED when a statement fails.
     */
    async createCheckpointTables() {
        try {
            // Create checkpoints table
            await this.sqliteAdapter.raw(`
                CREATE TABLE IF NOT EXISTS checkpoints (
                    checkpoint_id TEXT PRIMARY KEY,
                    task_id TEXT NOT NULL,
                    trigger TEXT NOT NULL,
                    status TEXT NOT NULL,
                    runtime_state_hash TEXT NOT NULL,
                    durable_state_hash TEXT NOT NULL,
                    created_at TEXT NOT NULL,
                    completed_at TEXT,
                    error TEXT,
                    metadata TEXT
                )
            `, []);
            // Create checkpoint_data table for storing state JSON
            await this.sqliteAdapter.raw(`
                CREATE TABLE IF NOT EXISTS checkpoint_data (
                    key TEXT PRIMARY KEY,
                    value TEXT NOT NULL
                )
            `, []);
            // Create indexes
            await this.sqliteAdapter.raw(`
                CREATE INDEX IF NOT EXISTS idx_checkpoints_task_id ON checkpoints(task_id)
            `, []);
            await this.sqliteAdapter.raw(`
                CREATE INDEX IF NOT EXISTS idx_checkpoints_status ON checkpoints(status)
            `, []);
            await this.sqliteAdapter.raw(`
                CREATE INDEX IF NOT EXISTS idx_checkpoints_created_at ON checkpoints(created_at)
            `, []);
            logger.info('Checkpoint tables created successfully');
        } catch (error) {
            const err = error;
            logger.error('Failed to create checkpoint tables', err);
            throw new StandardError(ErrorCode.DB_QUERY_FAILED, 'Failed to create checkpoint tables', {}, err);
        }
    }

    /**
     * Start the interval timer. Each tick runs `cleanupOldCheckpoints()`; no
     * checkpoint is created by the timer. Errors from a tick are logged and
     * do not stop the timer.
     */
    startPeriodicCheckpoints() {
        this.periodicCheckpointTimer = setInterval(async () => {
            try {
                logger.debug('Running periodic checkpoint cleanup');
                await this.cleanupOldCheckpoints();
            } catch (error) {
                const err = error;
                logger.error('Periodic checkpoint cleanup failed', err);
            }
        }, this.config.periodicInterval);
        logger.info('Periodic checkpoints started', {
            interval: this.config.periodicInterval
        });
    }

    /**
     * Guard used by the public methods.
     *
     * @throws {StandardError} CONFIGURATION_ERROR when `initialize()` has not
     *   completed (or `shutdown()` has been called since).
     */
    ensureInitialized() {
        if (!this.initialized) {
            throw new StandardError(ErrorCode.CONFIGURATION_ERROR, 'CheckpointManager not initialized. Call initialize() first.', {});
        }
    }
}

/**
 * Export convenience functions
 */

/**
 * Create a checkpoint manager instance.
 *
 * @param {object} dbService - Service exposing `getAdapter(name)`.
 * @param {object} [config] - Optional `CheckpointManager` configuration.
 * @returns {CheckpointManager} A new, not yet initialized manager.
 */
export function createCheckpointManager(dbService, config) {
    return new CheckpointManager(dbService, config);
}

//# sourceMappingURL=checkpoint-manager.js.map