claude-flow-novice
Version:
Claude Flow Novice - Advanced orchestration platform for multi-agent AI workflows with CFN Loop architecture. Includes Local RuVector Accelerator and all CFN skills for complete functionality.
837 lines (836 loc) • 33.1 kB
JavaScript
/**
* Checkpoint Manager
*
* Manages dual persistence model with Redis (runtime/ephemeral) and SQLite (durable/persistent).
* Provides idempotent checkpointing with atomic operations and state validation.
*
* Task: Integration Standardization Plan - Task 4.5
* Version: 1.0.0
*
* Persistence Boundaries:
* - Redis: Agent execution state, coordination signals, temporary queues, active locks
* - SQLite: Completed task results, agent metrics, audit trail, skill metadata
*
* @example
* ```typescript
* const checkpointMgr = new CheckpointManager(dbService);
* await checkpointMgr.initialize();
*
* // Create checkpoint on task completion
* await checkpointMgr.createCheckpoint('task-123', CheckpointTrigger.TASK_COMPLETION);
*
* // Recover from checkpoint
* const state = await checkpointMgr.recoverFromCheckpoint('task-123');
* ```
*/ import * as crypto from 'crypto';
import { getGlobalLogger } from './logging.js';
import { StandardError, ErrorCode } from './errors.js';
const logger = getGlobalLogger();
// ============================================================================
// Type Definitions
// ============================================================================
/**
 * Reasons a checkpoint can be taken. The string values are what gets
 * persisted in the `trigger` column and embedded in checkpoint ids.
 */
export const CheckpointTrigger = {
    /** Triggered when a task completes */
    TASK_COMPLETION: "task_completion",
    /** Triggered at iteration boundaries (CFN Loop iterations) */
    ITERATION_BOUNDARY: "iteration_boundary",
    /** Triggered periodically (default: 5 minutes) */
    PERIODIC: "periodic",
    /** Manually triggered checkpoint */
    MANUAL: "manual"
};
/**
 * Lifecycle states of a checkpoint. The string values are what gets
 * persisted in the `status` column.
 */
export const CheckpointStatus = {
    /** Checkpoint creation in progress */
    IN_PROGRESS: "in_progress",
    /** Checkpoint completed successfully */
    COMPLETED: "completed",
    /** Checkpoint failed */
    FAILED: "failed",
    /** Checkpoint recovered and applied */
    RECOVERED: "recovered"
};
// ============================================================================
// Checkpoint Manager
// ============================================================================
/**
 * Checkpoint Manager
 *
 * Snapshots a task's state from two stores and can replay the runtime part later:
 * - runtime state is read from / written back to the `redis` adapter;
 * - durable state is read from the `sqlite` adapter, which also holds the
 *   `checkpoints` (metadata) and `checkpoint_data` (JSON payload) tables.
 *
 * `createCheckpoint`, `recoverFromCheckpoint`, `listCheckpoints` and
 * `cleanupOldCheckpoints` throw a CONFIGURATION_ERROR StandardError until
 * `initialize()` has completed.
 */
export class CheckpointManager {
    dbService;
    redisAdapter;
    sqliteAdapter;
    config;
    periodicCheckpointTimer;
    initialized = false;

    /**
     * @param dbService Service exposing `getAdapter(name)`; the 'redis' and
     *   'sqlite' adapters are resolved once, here.
     * @param config Optional overrides; every missing (null/undefined) field
     *   falls back to its default.
     */
    constructor(dbService, config = {}) {
        // `config = {}` only covers `undefined`; also tolerate an explicit null.
        const options = config ?? {};
        this.dbService = dbService;
        this.redisAdapter = dbService.getAdapter('redis');
        this.sqliteAdapter = dbService.getAdapter('sqlite');
        this.config = {
            enablePeriodicCheckpoints: options.enablePeriodicCheckpoints ?? true,
            periodicInterval: options.periodicInterval ?? 300000, // 5 minutes, in ms
            retentionPeriod: options.retentionPeriod ?? 7 * 24 * 60 * 60 * 1000, // 7 days, in ms
            enableAutoCleanup: options.enableAutoCleanup ?? true,
            // NOTE(review): stored but not read anywhere in this class.
            validationTimeout: options.validationTimeout ?? 5000
        };
        logger.info('CheckpointManager initialized', {
            config: this.config
        });
    }

    /**
     * Create the checkpoint tables and, when enabled, start the periodic timer.
     * Calling it again after a successful initialization only logs a warning.
     *
     * @throws StandardError (CONFIGURATION_ERROR) wrapping any setup failure.
     */
    async initialize() {
        if (this.initialized) {
            logger.warn('CheckpointManager already initialized');
            return;
        }
        try {
            await this.createCheckpointTables();
            if (this.config.enablePeriodicCheckpoints) {
                this.startPeriodicCheckpoints();
            }
            this.initialized = true;
            logger.info('CheckpointManager initialization complete');
        } catch (error) {
            const err = error;
            logger.error('Failed to initialize CheckpointManager', err);
            throw new StandardError(ErrorCode.CONFIGURATION_ERROR, 'Failed to initialize CheckpointManager', {
                error: err.message
            }, err);
        }
    }

    /**
     * Stop the periodic timer and mark the manager as uninitialized.
     */
    async shutdown() {
        if (this.periodicCheckpointTimer) {
            clearInterval(this.periodicCheckpointTimer);
            this.periodicCheckpointTimer = undefined;
        }
        this.initialized = false;
        logger.info('CheckpointManager shutdown complete');
    }

    /**
     * Create a checkpoint for the given task.
     *
     * If a completed checkpoint with the same runtime and durable state hashes
     * already exists for the task, that checkpoint's metadata is returned and
     * nothing new is written.
     *
     * @param taskId Task whose state is captured.
     * @param trigger Why the checkpoint is taken (a CheckpointTrigger value).
     * @param metadata Optional caller data, persisted as JSON with the checkpoint.
     * @returns Metadata of the created (or pre-existing identical) checkpoint.
     * @throws StandardError (OPERATION_TIMEOUT, kept for compatibility) whose
     *   cause is the underlying failure.
     */
    async createCheckpoint(taskId, trigger, metadata) {
        this.ensureInitialized();
        const startTime = Date.now();
        const checkpointId = this.generateCheckpointId(taskId, trigger);
        // Tracks whether the metadata row exists, so the failure path knows
        // whether to UPDATE it or INSERT a fresh one.
        let metadataStored = false;
        logger.info('Creating checkpoint', {
            taskId,
            checkpointId,
            trigger
        });
        try {
            // 1. Capture runtime state from Redis
            const runtimeState = await this.captureRuntimeState(taskId);
            const runtimeStateHash = this.hashState(runtimeState);
            // 2. Capture durable state from SQLite
            const durableState = await this.captureDurableState(taskId);
            const durableStateHash = this.hashState(durableState);
            // 3. Idempotency: identical hashes on a completed checkpoint => reuse it
            const existingCheckpoint = await this.findCheckpointByHash(taskId, runtimeStateHash, durableStateHash);
            if (existingCheckpoint && existingCheckpoint.status === "completed") {
                logger.info('Checkpoint already exists with same state hash (idempotent)', {
                    checkpointId: existingCheckpoint.checkpointId,
                    taskId
                });
                return existingCheckpoint;
            }
            // 4. Build and 5. persist the in-progress metadata row
            const checkpointMetadata = {
                checkpointId,
                taskId,
                trigger,
                status: "in_progress",
                runtimeStateHash,
                durableStateHash,
                createdAt: new Date(),
                metadata
            };
            await this.storeCheckpointMetadata(checkpointMetadata);
            metadataStored = true;
            // 6. Validate before the payload is written
            await this.validateState(runtimeState, durableState);
            // 7. Store the payload.
            // NOTE(review): two separate inserts, not wrapped in a transaction here.
            await this.storeCheckpointData(checkpointId, runtimeState, durableState);
            // 8. Flip the row to completed
            checkpointMetadata.status = "completed";
            checkpointMetadata.completedAt = new Date();
            await this.updateCheckpointMetadata(checkpointMetadata);
            const duration = Date.now() - startTime;
            logger.info('Checkpoint created successfully', {
                checkpointId,
                taskId,
                trigger,
                duration,
                runtimeStateHash,
                durableStateHash
            });
            return checkpointMetadata;
        } catch (error) {
            const err = error;
            logger.error('Failed to create checkpoint', err, {
                taskId,
                checkpointId
            });
            const failedMetadata = {
                checkpointId,
                taskId,
                trigger,
                status: "failed",
                runtimeStateHash: '',
                durableStateHash: '',
                createdAt: new Date(),
                error: err.message,
                metadata
            };
            await this.recordCheckpointFailure(failedMetadata, metadataStored);
            throw new StandardError(ErrorCode.OPERATION_TIMEOUT, 'Failed to create checkpoint', {
                taskId,
                checkpointId,
                trigger
            }, err);
        }
    }

    /**
     * Best-effort persistence of a failed checkpoint's status.
     *
     * Re-inserting a row whose primary key already exists would fail and hide
     * the error that caused the checkpoint to fail, so an existing row is
     * updated instead. Any error raised here is logged and swallowed for the
     * same reason.
     *
     * @param failedMetadata Metadata with status "failed" and the error message.
     * @param rowExists Whether the metadata row was already inserted.
     */
    async recordCheckpointFailure(failedMetadata, rowExists) {
        try {
            if (rowExists) {
                await this.updateCheckpointMetadata(failedMetadata);
            } else {
                await this.storeCheckpointMetadata(failedMetadata);
            }
        } catch (recordError) {
            logger.error('Failed to record checkpoint failure', recordError, {
                checkpointId: failedMetadata.checkpointId
            });
        }
    }

    /**
     * Recover from the latest completed checkpoint for the given task.
     *
     * Never rejects for recovery problems: failures are reported through
     * `success: false` and `errors`. The checkpoint is marked "recovered" only
     * when both restore steps report success.
     *
     * NOTE(review): once marked "recovered" a checkpoint no longer matches the
     * "completed" filter used to find the latest checkpoint, so it is not
     * picked again by a later recovery — confirm this is intended.
     *
     * @param taskId Task to recover.
     * @returns Result object: success flag, checkpoint id, per-store restore
     *   flags, timestamp and, on failure, `errors`.
     */
    async recoverFromCheckpoint(taskId) {
        this.ensureInitialized();
        logger.info('Starting checkpoint recovery', {
            taskId
        });
        try {
            // 1. Find latest completed checkpoint
            const checkpoint = await this.findLatestCheckpoint(taskId);
            if (!checkpoint) {
                throw new StandardError(ErrorCode.DB_NOT_FOUND, 'No checkpoint found for task', {
                    taskId
                });
            }
            // 2. Load and 3. validate the stored payload
            const checkpointData = await this.loadCheckpointData(checkpoint.checkpointId);
            await this.validateCheckpointData(checkpointData);
            // 4. Restore runtime state to Redis
            const runtimeRestored = await this.restoreRuntimeState(taskId, checkpointData.runtimeState);
            // 5. Restore durable state to SQLite (currently a no-op)
            const durableRestored = await this.restoreDurableState(taskId, checkpointData.durableState);
            const restored = runtimeRestored && durableRestored;
            const errors = [];
            if (!runtimeRestored) {
                errors.push('Failed to restore runtime state');
            }
            if (!durableRestored) {
                errors.push('Failed to restore durable state');
            }
            // 6. Only a fully applied checkpoint is flagged as recovered
            if (restored) {
                checkpoint.status = "recovered";
                await this.updateCheckpointMetadata(checkpoint);
            }
            const result = {
                success: restored,
                checkpointId: checkpoint.checkpointId,
                taskId,
                runtimeStateRestored: runtimeRestored,
                durableStateRestored: durableRestored,
                timestamp: new Date(),
                ...(errors.length > 0 ? { errors } : {})
            };
            if (restored) {
                logger.info('Checkpoint recovery completed', result);
            } else {
                logger.warn('Checkpoint recovery incomplete', result);
            }
            return result;
        } catch (error) {
            const err = error;
            logger.error('Failed to recover from checkpoint', err, {
                taskId
            });
            return {
                success: false,
                checkpointId: '',
                taskId,
                runtimeStateRestored: false,
                durableStateRestored: false,
                timestamp: new Date(),
                errors: [
                    err.message
                ]
            };
        }
    }

    /**
     * List all checkpoints of a task, newest first.
     *
     * @param taskId Task to list checkpoints for.
     * @returns Array of checkpoint metadata objects.
     * @throws StandardError (DB_QUERY_FAILED) if the query or row parsing fails.
     */
    async listCheckpoints(taskId) {
        this.ensureInitialized();
        try {
            const rows = await this.sqliteAdapter.list('checkpoints', {
                filters: [
                    {
                        field: 'task_id',
                        operator: 'eq',
                        value: taskId
                    }
                ],
                orderBy: 'created_at',
                order: 'desc'
            });
            return rows.map((row) => this.deserializeCheckpointMetadata(row));
        } catch (error) {
            const err = error;
            logger.error('Failed to list checkpoints', err, {
                taskId
            });
            throw new StandardError(ErrorCode.DB_QUERY_FAILED, 'Failed to list checkpoints', {
                taskId
            }, err);
        }
    }

    /**
     * Delete checkpoints older than the retention period, together with their
     * stored state payloads.
     *
     * @returns Number of checkpoint rows deleted (0 when auto cleanup is disabled).
     * @throws StandardError (DB_QUERY_FAILED) if a delete statement fails.
     */
    async cleanupOldCheckpoints() {
        this.ensureInitialized();
        if (!this.config.enableAutoCleanup) {
            logger.info('Automatic cleanup disabled');
            return 0;
        }
        try {
            const cutoffDate = new Date(Date.now() - this.config.retentionPeriod);
            const cutoffIso = cutoffDate.toISOString();
            // Payload rows are keyed by checkpoint id, so they must go first:
            // once the checkpoints rows are deleted the sub-select can no
            // longer identify them and they would be orphaned forever.
            await this.sqliteAdapter.raw(`
                DELETE FROM checkpoint_data
                WHERE key IN (
                    SELECT 'checkpoint_runtime:' || checkpoint_id FROM checkpoints WHERE created_at < ?
                    UNION ALL
                    SELECT 'checkpoint_durable:' || checkpoint_id FROM checkpoints WHERE created_at < ?
                )
            `, [
                cutoffIso,
                cutoffIso
            ]);
            const result = await this.sqliteAdapter.raw(`
                DELETE FROM checkpoints
                WHERE created_at < ?
            `, [
                cutoffIso
            ]);
            const deletedCount = result?.rowsAffected ?? 0;
            logger.info('Cleaned up old checkpoints', {
                deletedCount,
                cutoffDate: cutoffIso
            });
            return deletedCount;
        } catch (error) {
            const err = error;
            logger.error('Failed to cleanup old checkpoints', err);
            throw new StandardError(ErrorCode.DB_QUERY_FAILED, 'Failed to cleanup old checkpoints', {}, err);
        }
    }

    // ============================================================================
    // Private Methods - State Capture
    // ============================================================================

    /**
     * Return the Redis keys matching a glob pattern (empty array if the adapter
     * yields nothing).
     *
     * NOTE(review): uses the KEYS command, which walks the whole keyspace;
     * consider SCAN if the adapter's `raw` supports cursor replies.
     */
    async listRedisKeys(pattern) {
        const keys = await this.redisAdapter.raw('KEYS', [
            pattern
        ]);
        return keys ?? [];
    }

    /**
     * Read the task's runtime state from Redis: agents (`agent:<task>:*`),
     * coordination signals (`swarm:<task>:*`), queues (`queue:<task>:*`) and
     * locks (`lock:<task>:*`).
     *
     * `createdAt`, `acquiredAt` and `capturedAt` are stamped with the capture
     * time; Redis does not supply them.
     *
     * @throws StandardError (DB_QUERY_FAILED) if any Redis call fails.
     */
    async captureRuntimeState(taskId) {
        try {
            // Agent execution state
            const agents = [];
            for (const key of await this.listRedisKeys(`agent:${taskId}:*`)) {
                const data = await this.redisAdapter.get(key);
                if (!data) {
                    continue;
                }
                agents.push({
                    agentId: data.agentId || '',
                    agentType: data.agentType || '',
                    status: data.status || 'in_progress',
                    confidence: data.confidence,
                    startedAt: new Date(data.startedAt || Date.now()),
                    completedAt: data.completedAt ? new Date(data.completedAt) : undefined,
                    metadata: data.metadata
                });
            }
            // Coordination signals
            const coordinationSignals = [];
            for (const key of await this.listRedisKeys(`swarm:${taskId}:*`)) {
                const value = await this.redisAdapter.get(key);
                const ttl = await this.redisAdapter.raw('TTL', [
                    key
                ]);
                if (!value) {
                    continue;
                }
                coordinationSignals.push({
                    key,
                    value,
                    // Non-positive TTL replies (no expiry / missing key) are not recorded.
                    ttl: ttl > 0 ? ttl : undefined,
                    createdAt: new Date()
                });
            }
            // Queue contents
            const queueData = [];
            for (const key of await this.listRedisKeys(`queue:${taskId}:*`)) {
                const items = await this.redisAdapter.raw('LRANGE', [
                    key,
                    '0',
                    '-1'
                ]);
                if (items && items.length > 0) {
                    queueData.push({
                        queueName: key,
                        items
                    });
                }
            }
            // Active locks: only locks that still have a positive TTL are kept
            const activeLocks = [];
            for (const key of await this.listRedisKeys(`lock:${taskId}:*`)) {
                const owner = await this.redisAdapter.get(key);
                const ttl = await this.redisAdapter.raw('TTL', [
                    key
                ]);
                if (owner && ttl > 0) {
                    activeLocks.push({
                        lockKey: key,
                        owner,
                        acquiredAt: new Date(),
                        expiresAt: new Date(Date.now() + ttl * 1000)
                    });
                }
            }
            return {
                taskId,
                agents,
                coordinationSignals,
                queueData,
                activeLocks,
                capturedAt: new Date()
            };
        } catch (error) {
            const err = error;
            logger.error('Failed to capture runtime state', err, {
                taskId
            });
            throw new StandardError(ErrorCode.DB_QUERY_FAILED, 'Failed to capture runtime state from Redis', {
                taskId
            }, err);
        }
    }

    /**
     * Read the task's durable state from SQLite: task results, agent metrics
     * and audit trail filtered by `taskId`, plus every `skill_metadata` row
     * (that table is read unfiltered).
     *
     * @throws StandardError (DB_QUERY_FAILED) if any query fails.
     */
    async captureDurableState(taskId) {
        try {
            const byTask = {
                filters: [
                    {
                        field: 'taskId',
                        operator: 'eq',
                        value: taskId
                    }
                ]
            };
            // A fresh options object per call, in case the adapter mutates it.
            const taskResults = await this.sqliteAdapter.list('task_results', structuredCloneFilters(byTask));
            const agentMetrics = await this.sqliteAdapter.list('agent_metrics', structuredCloneFilters(byTask));
            const auditTrail = await this.sqliteAdapter.list('audit_trail', structuredCloneFilters(byTask));
            const skillMetadata = await this.sqliteAdapter.list('skill_metadata', {});
            return {
                taskId,
                taskResults,
                agentMetrics,
                auditTrail,
                skillMetadata,
                capturedAt: new Date()
            };
        } catch (error) {
            const err = error;
            logger.error('Failed to capture durable state', err, {
                taskId
            });
            throw new StandardError(ErrorCode.DB_QUERY_FAILED, 'Failed to capture durable state from SQLite', {
                taskId
            }, err);
        }

        /** Build an independent copy of a `{ filters: [...] }` options object. */
        function structuredCloneFilters(options) {
            return {
                filters: options.filters.map((filter) => ({ ...filter }))
            };
        }
    }

    // ============================================================================
    // Private Methods - State Storage
    // ============================================================================

    /**
     * Persist both state snapshots as JSON rows in `checkpoint_data`, keyed
     * `checkpoint_runtime:<id>` and `checkpoint_durable:<id>`.
     *
     * @throws StandardError (DB_QUERY_FAILED) if either insert fails.
     */
    async storeCheckpointData(checkpointId, runtimeState, durableState) {
        try {
            await this.sqliteAdapter.insert('checkpoint_data', {
                key: `checkpoint_runtime:${checkpointId}`,
                value: JSON.stringify(runtimeState)
            });
            await this.sqliteAdapter.insert('checkpoint_data', {
                key: `checkpoint_durable:${checkpointId}`,
                value: JSON.stringify(durableState)
            });
            logger.debug('Checkpoint data stored', {
                checkpointId
            });
        } catch (error) {
            const err = error;
            logger.error('Failed to store checkpoint data', err, {
                checkpointId
            });
            throw new StandardError(ErrorCode.DB_QUERY_FAILED, 'Failed to store checkpoint data', {
                checkpointId
            }, err);
        }
    }

    /**
     * Load a checkpoint's metadata and both JSON payloads.
     *
     * Because the payloads went through JSON, Date fields inside the returned
     * states are ISO strings, not Date objects.
     *
     * @throws StandardError (DB_NOT_FOUND) when a row is missing, or
     *   (DB_QUERY_FAILED) for any other failure.
     */
    async loadCheckpointData(checkpointId) {
        try {
            const metadataRow = await this.sqliteAdapter.get(`checkpoints:${checkpointId}`);
            if (!metadataRow) {
                throw new StandardError(ErrorCode.DB_NOT_FOUND, 'Checkpoint metadata not found', {
                    checkpointId
                });
            }
            const runtimeStateRow = await this.sqliteAdapter.get(`checkpoint_data:checkpoint_runtime:${checkpointId}`);
            if (!runtimeStateRow || !runtimeStateRow.value) {
                throw new StandardError(ErrorCode.DB_NOT_FOUND, 'Checkpoint runtime state not found', {
                    checkpointId
                });
            }
            const durableStateRow = await this.sqliteAdapter.get(`checkpoint_data:checkpoint_durable:${checkpointId}`);
            if (!durableStateRow || !durableStateRow.value) {
                throw new StandardError(ErrorCode.DB_NOT_FOUND, 'Checkpoint durable state not found', {
                    checkpointId
                });
            }
            return {
                metadata: this.deserializeCheckpointMetadata(metadataRow),
                runtimeState: JSON.parse(runtimeStateRow.value),
                durableState: JSON.parse(durableStateRow.value)
            };
        } catch (error) {
            const err = error;
            logger.error('Failed to load checkpoint data', err, {
                checkpointId
            });
            // Keep the specific not-found error (code and message) instead of
            // burying it under a generic query failure.
            if (err instanceof StandardError) {
                throw err;
            }
            throw new StandardError(ErrorCode.DB_QUERY_FAILED, 'Failed to load checkpoint data', {
                checkpointId
            }, err);
        }
    }

    // ============================================================================
    // Private Methods - State Restoration
    // ============================================================================

    /**
     * Write a captured runtime state back to Redis.
     *
     * Accepts both a freshly captured state and one loaded from storage (where
     * dates are ISO strings).
     *
     * @returns true on success, false if any Redis call failed (the error is logged).
     */
    async restoreRuntimeState(taskId, runtimeState) {
        try {
            // Agent execution state
            for (const agent of runtimeState.agents) {
                await this.redisAdapter.insert(`agent:${taskId}:${agent.agentId}`, agent);
            }
            // Coordination signals, re-applying the recorded TTL if any
            for (const signal of runtimeState.coordinationSignals) {
                await this.redisAdapter.insert(signal.key, signal.value);
                if (signal.ttl) {
                    await this.redisAdapter.raw('EXPIRE', [
                        signal.key,
                        signal.ttl.toString()
                    ]);
                }
            }
            // Queues are replaced wholesale, preserving item order
            for (const queue of runtimeState.queueData) {
                await this.redisAdapter.raw('DEL', [
                    queue.queueName
                ]);
                for (const item of queue.items) {
                    // Items captured via LRANGE are already strings; encoding
                    // them again would wrap each one in an extra pair of quotes.
                    const payload = typeof item === 'string' ? item : JSON.stringify(item);
                    await this.redisAdapter.raw('RPUSH', [
                        queue.queueName,
                        payload
                    ]);
                }
            }
            // Locks: only those that have not expired in the meantime
            for (const lock of runtimeState.activeLocks) {
                // expiresAt is a Date on a fresh capture but an ISO string after
                // the JSON round-trip through storage; new Date() handles both.
                const expiresAtMs = new Date(lock.expiresAt).getTime();
                const ttl = Math.floor((expiresAtMs - Date.now()) / 1000);
                if (ttl > 0) {
                    await this.redisAdapter.insert(lock.lockKey, lock.owner);
                    await this.redisAdapter.raw('EXPIRE', [
                        lock.lockKey,
                        ttl.toString()
                    ]);
                }
            }
            logger.info('Runtime state restored to Redis', {
                taskId
            });
            return true;
        } catch (error) {
            const err = error;
            logger.error('Failed to restore runtime state', err, {
                taskId
            });
            return false;
        }
    }

    /**
     * Placeholder for restoring durable state. Nothing is written: the method
     * only logs and returns true (`durableState` is currently unused).
     */
    async restoreDurableState(taskId, durableState) {
        try {
            logger.info('Durable state validation complete', {
                taskId
            });
            return true;
        } catch (error) {
            const err = error;
            logger.error('Failed to restore durable state', err, {
                taskId
            });
            return false;
        }
    }

    // ============================================================================
    // Private Methods - Validation
    // ============================================================================

    /**
     * Check that both states carry the same non-empty taskId and that
     * `runtimeState.agents` is an array.
     *
     * @throws StandardError (VALIDATION_FAILED) on the first violated rule.
     */
    async validateState(runtimeState, durableState) {
        if (!runtimeState.taskId) {
            throw new StandardError(ErrorCode.VALIDATION_FAILED, 'Invalid runtime state: missing taskId', {
                runtimeState
            });
        }
        if (!Array.isArray(runtimeState.agents)) {
            throw new StandardError(ErrorCode.VALIDATION_FAILED, 'Invalid runtime state: agents must be an array', {
                runtimeState
            });
        }
        if (!durableState.taskId) {
            throw new StandardError(ErrorCode.VALIDATION_FAILED, 'Invalid durable state: missing taskId', {
                durableState
            });
        }
        if (runtimeState.taskId !== durableState.taskId) {
            throw new StandardError(ErrorCode.VALIDATION_FAILED, 'Task ID mismatch between runtime and durable state', {
                runtimeTaskId: runtimeState.taskId,
                durableTaskId: durableState.taskId
            });
        }
        logger.debug('State validation passed', {
            taskId: runtimeState.taskId
        });
    }

    /**
     * Validate loaded checkpoint data: the states must pass `validateState`
     * and the checkpoint status must be "completed".
     *
     * @throws StandardError (VALIDATION_FAILED) otherwise.
     */
    async validateCheckpointData(checkpointData) {
        await this.validateState(checkpointData.runtimeState, checkpointData.durableState);
        if (checkpointData.metadata.status !== "completed") {
            throw new StandardError(ErrorCode.VALIDATION_FAILED, 'Cannot recover from incomplete checkpoint', {
                status: checkpointData.metadata.status
            });
        }
        logger.debug('Checkpoint data validation passed', {
            checkpointId: checkpointData.metadata.checkpointId
        });
    }

    // ============================================================================
    // Private Methods - Utilities
    // ============================================================================

    /**
     * Build a unique id: `checkpoint_<taskId>_<trigger>_<epoch ms>_<8 hex chars>`.
     */
    generateCheckpointId(taskId, trigger) {
        const timestamp = Date.now();
        const random = crypto.randomBytes(4).toString('hex');
        return `checkpoint_${taskId}_${trigger}_${timestamp}_${random}`;
    }

    /**
     * SHA-256 (hex) of a state's content, independent of property order.
     *
     * The top-level `capturedAt` stamp is left out: it differs on every capture
     * and would make two captures of identical state hash differently.
     *
     * NOTE(review): capture-time stamps nested inside runtime state (signal
     * `createdAt`, lock `acquiredAt`/`expiresAt`) are still hashed, so states
     * containing signals or locks will not deduplicate.
     */
    hashState(state) {
        const stable = Object.fromEntries(Object.entries(state).filter(([key]) => key !== 'capturedAt'));
        // Passing a key list as JSON.stringify's replacer would act as an
        // allow-list at every nesting level and silently drop nested fields,
        // so keys are sorted recursively instead.
        const stateJson = JSON.stringify(this.canonicalize(stable));
        return crypto.createHash('sha256').update(stateJson).digest('hex');
    }

    /**
     * Return a copy of `value` in which every plain object has its keys in
     * sorted order. Arrays keep their element order; values with a `toJSON`
     * method (e.g. Date) are replaced by its result.
     */
    canonicalize(value) {
        if (value === null || typeof value !== 'object') {
            return value;
        }
        if (typeof value.toJSON === 'function') {
            return value.toJSON();
        }
        if (Array.isArray(value)) {
            return value.map((element) => this.canonicalize(element));
        }
        const sorted = {};
        for (const key of Object.keys(value).sort()) {
            sorted[key] = this.canonicalize(value[key]);
        }
        return sorted;
    }

    /**
     * Find a completed checkpoint of the task with exactly these state hashes.
     *
     * The status filter matters: with `limit: 1`, a failed or in-progress row
     * sharing the hashes could otherwise be returned in place of a completed one.
     *
     * @returns Checkpoint metadata, or null if none matches or the query fails
     *   (failures are logged, not thrown).
     */
    async findCheckpointByHash(taskId, runtimeHash, durableHash) {
        try {
            const rows = await this.sqliteAdapter.list('checkpoints', {
                filters: [
                    {
                        field: 'task_id',
                        operator: 'eq',
                        value: taskId
                    },
                    {
                        field: 'runtime_state_hash',
                        operator: 'eq',
                        value: runtimeHash
                    },
                    {
                        field: 'durable_state_hash',
                        operator: 'eq',
                        value: durableHash
                    },
                    {
                        field: 'status',
                        operator: 'eq',
                        value: "completed"
                    }
                ],
                limit: 1
            });
            if (rows.length === 0) {
                return null;
            }
            return this.deserializeCheckpointMetadata(rows[0]);
        } catch (error) {
            const err = error;
            logger.error('Failed to find checkpoint by hash', err, {
                taskId
            });
            return null;
        }
    }

    /**
     * Find the most recently created checkpoint of the task with status "completed".
     *
     * @returns Checkpoint metadata, or null if none exists or the query fails
     *   (failures are logged, not thrown).
     */
    async findLatestCheckpoint(taskId) {
        try {
            const rows = await this.sqliteAdapter.list('checkpoints', {
                filters: [
                    {
                        field: 'task_id',
                        operator: 'eq',
                        value: taskId
                    },
                    {
                        field: 'status',
                        operator: 'eq',
                        value: "completed"
                    }
                ],
                orderBy: 'created_at',
                order: 'desc',
                limit: 1
            });
            if (rows.length === 0) {
                return null;
            }
            return this.deserializeCheckpointMetadata(rows[0]);
        } catch (error) {
            const err = error;
            logger.error('Failed to find latest checkpoint', err, {
                taskId
            });
            return null;
        }
    }

    /**
     * Insert a checkpoint metadata row (camelCase fields mapped to snake_case
     * columns, dates as ISO strings, `metadata` as JSON).
     *
     * @throws StandardError (DB_QUERY_FAILED) if the insert fails.
     */
    async storeCheckpointMetadata(metadata) {
        try {
            await this.sqliteAdapter.insert('checkpoints', {
                checkpoint_id: metadata.checkpointId,
                task_id: metadata.taskId,
                trigger: metadata.trigger,
                status: metadata.status,
                runtime_state_hash: metadata.runtimeStateHash,
                durable_state_hash: metadata.durableStateHash,
                created_at: metadata.createdAt.toISOString(),
                completed_at: metadata.completedAt?.toISOString(),
                error: metadata.error,
                metadata: JSON.stringify(metadata.metadata || {})
            });
            logger.debug('Checkpoint metadata stored', {
                checkpointId: metadata.checkpointId
            });
        } catch (error) {
            const err = error;
            logger.error('Failed to store checkpoint metadata', err, {
                checkpointId: metadata.checkpointId
            });
            throw new StandardError(ErrorCode.DB_QUERY_FAILED, 'Failed to store checkpoint metadata', {
                checkpointId: metadata.checkpointId
            }, err);
        }
    }

    /**
     * Update status, completion time and error of an existing checkpoint row.
     * If the update throws, an insert of the full row is attempted instead.
     *
     * @throws StandardError (DB_QUERY_FAILED) if the fallback insert also fails.
     */
    async updateCheckpointMetadata(metadata) {
        try {
            await this.sqliteAdapter.update('checkpoints', metadata.checkpointId, {
                status: metadata.status,
                completed_at: metadata.completedAt?.toISOString(),
                error: metadata.error
            });
        } catch (error) {
            // Leave a trace of the original update error before falling back.
            logger.warn('Checkpoint metadata update failed, falling back to insert', {
                checkpointId: metadata.checkpointId,
                error: error?.message
            });
            await this.storeCheckpointMetadata(metadata);
        }
    }

    /**
     * Convert a `checkpoints` row (snake_case, ISO date strings, JSON metadata)
     * into a checkpoint metadata object.
     */
    deserializeCheckpointMetadata(row) {
        return {
            checkpointId: row.checkpoint_id,
            taskId: row.task_id,
            trigger: row.trigger,
            status: row.status,
            runtimeStateHash: row.runtime_state_hash,
            durableStateHash: row.durable_state_hash,
            createdAt: new Date(row.created_at),
            completedAt: row.completed_at ? new Date(row.completed_at) : undefined,
            error: row.error,
            metadata: row.metadata ? JSON.parse(row.metadata) : undefined
        };
    }

    /**
     * Create the `checkpoints` and `checkpoint_data` tables and the indexes on
     * `checkpoints` (task_id, status, created_at) if they do not exist yet.
     *
     * @throws StandardError (DB_QUERY_FAILED) if any statement fails.
     */
    async createCheckpointTables() {
        try {
            await this.sqliteAdapter.raw(`
                CREATE TABLE IF NOT EXISTS checkpoints (
                    checkpoint_id TEXT PRIMARY KEY,
                    task_id TEXT NOT NULL,
                    trigger TEXT NOT NULL,
                    status TEXT NOT NULL,
                    runtime_state_hash TEXT NOT NULL,
                    durable_state_hash TEXT NOT NULL,
                    created_at TEXT NOT NULL,
                    completed_at TEXT,
                    error TEXT,
                    metadata TEXT
                )
            `, []);
            // Key/value table holding the JSON state payloads
            await this.sqliteAdapter.raw(`
                CREATE TABLE IF NOT EXISTS checkpoint_data (
                    key TEXT PRIMARY KEY,
                    value TEXT NOT NULL
                )
            `, []);
            await this.sqliteAdapter.raw(`
                CREATE INDEX IF NOT EXISTS idx_checkpoints_task_id ON checkpoints(task_id)
            `, []);
            await this.sqliteAdapter.raw(`
                CREATE INDEX IF NOT EXISTS idx_checkpoints_status ON checkpoints(status)
            `, []);
            await this.sqliteAdapter.raw(`
                CREATE INDEX IF NOT EXISTS idx_checkpoints_created_at ON checkpoints(created_at)
            `, []);
            logger.info('Checkpoint tables created successfully');
        } catch (error) {
            const err = error;
            logger.error('Failed to create checkpoint tables', err);
            throw new StandardError(ErrorCode.DB_QUERY_FAILED, 'Failed to create checkpoint tables', {}, err);
        }
    }

    /**
     * Start the periodic maintenance timer. Despite the name, each tick only
     * runs `cleanupOldCheckpoints()`; no checkpoint is created here.
     */
    startPeriodicCheckpoints() {
        // Never leave a previous interval running if this is called twice.
        if (this.periodicCheckpointTimer) {
            clearInterval(this.periodicCheckpointTimer);
        }
        this.periodicCheckpointTimer = setInterval(async () => {
            try {
                logger.debug('Running periodic checkpoint cleanup');
                await this.cleanupOldCheckpoints();
            } catch (error) {
                const err = error;
                logger.error('Periodic checkpoint cleanup failed', err);
            }
        }, this.config.periodicInterval);
        // A background maintenance timer should not by itself keep the process
        // alive; unref is optional-called because not every runtime's timer has it.
        this.periodicCheckpointTimer.unref?.();
        logger.info('Periodic checkpoints started', {
            interval: this.config.periodicInterval
        });
    }

    /**
     * Guard for methods that need `initialize()` to have completed.
     *
     * @throws StandardError (CONFIGURATION_ERROR) when not initialized.
     */
    ensureInitialized() {
        if (!this.initialized) {
            throw new StandardError(ErrorCode.CONFIGURATION_ERROR, 'CheckpointManager not initialized. Call initialize() first.', {});
        }
    }
}
/**
 * Convenience factory: build a CheckpointManager for the given database
 * service. The instance still has to be initialized with `initialize()`.
 *
 * @param dbService Passed through to the CheckpointManager constructor.
 * @param config Optional configuration, passed through unchanged.
 * @returns A new CheckpointManager instance.
 */
export function createCheckpointManager(dbService, config) {
    const manager = new CheckpointManager(dbService, config);
    return manager;
}
//# sourceMappingURL=checkpoint-manager.js.map