UNPKG

claude-flow-novice

Version:

Claude Flow Novice - Advanced orchestration platform for multi-agent AI workflows with CFN Loop architecture. Includes Local RuVector Accelerator and all CFN skills for complete functionality.

390 lines (389 loc) 16 kB
/**
 * Health Check HTTP Endpoints
 *
 * Provides REST API endpoints for health monitoring:
 * - GET /health            - Overall system health status
 * - GET /health/ready      - Kubernetes readiness probe
 * - GET /health/live       - Kubernetes liveness probe
 * - GET /health/detailed   - Detailed component-level health report
 * - GET /health/database   - Database check only
 * - GET /health/redis      - Redis check only
 * - GET /health/filesystem - File system check only
 * - GET /health/agents     - Agent check only
 * - GET /health/ping       - Fast connectivity check
 * - GET /health/aggregate  - Aggregated health statistics
 *
 * Integrates with monitoring dashboards and Kubernetes orchestration.
 *
 * Part of Task P2-4.1: Comprehensive Health Checks
 */
import express from 'express';
import { HealthCheckSystem } from '../services/health-check-system.js';

/**
 * Maps each individually checkable service to the HealthCheckSystem method
 * that checks it. Key order is also the order in which `/health` reads
 * `overall.dependencies` (index 0 = database ... index 3 = agents).
 */
const SERVICE_CHECKS = Object.freeze({
    database: 'checkDatabase',
    redis: 'checkRedis',
    filesystem: 'checkFileSystem',
    agents: 'checkAgents'
});
const SERVICE_NAMES = Object.freeze(Object.keys(SERVICE_CHECKS));

/** Accepted `?timeout=` range (milliseconds) and default for /health/ping. */
const PING_TIMEOUT = Object.freeze({ fallback: 100, min: 1, max: 1000 });
/** Accepted `?timeout=` range (milliseconds) and default for /health/aggregate. */
const AGGREGATE_TIMEOUT = Object.freeze({ fallback: 5000, min: 100, max: 30000 });

/** Current time as an ISO-8601 string. */
const isoNow = () => new Date().toISOString();

/** HTTP status for a health status string: 200 only for 'healthy', otherwise 503. */
const statusCodeFor = (status) => (status === 'healthy' ? 200 : 503);

/**
 * Copy the listed fields from one service result into a new object.
 * Throws a TypeError when `service` is missing; callers run inside
 * `guarded`, which turns that into a 503 response.
 */
const pickFields = (service, fields) =>
    Object.fromEntries(fields.map((field) => [field, service[field]]));

/** Build the `{ database, redis, filesystem, agents }` summary map. */
const summarizeServices = (services, fields) =>
    Object.fromEntries(SERVICE_NAMES.map((name) => [name, pickFields(services[name], fields)]));

/**
 * Parse the optional `timeout` query value.
 *
 * @param {unknown} rawValue - `req.query.timeout` as delivered by Express.
 * @param {{fallback: number, min: number, max: number}} limits
 * @returns {number|null} The timeout in ms, or null when it is not a number
 *   or falls outside `[min, max]`. A missing/empty value yields `fallback`.
 */
function parseTimeout(rawValue, { fallback, min, max }) {
    const timeout = rawValue ? Number.parseInt(rawValue, 10) : fallback;
    if (Number.isNaN(timeout) || timeout < min || timeout > max) {
        return null;
    }
    return timeout;
}

/** Send the 400 response used when the `timeout` query parameter is invalid. */
function rejectTimeout(res, { min, max }) {
    res.status(400).json({
        error: 'Invalid timeout parameter',
        message: `Timeout must be between ${min} and ${max} milliseconds`,
        timestamp: isoNow()
    });
}

/**
 * Wrap an async route handler so that a thrown error or rejected promise is
 * answered with a 503 JSON body instead of escaping the handler (Express 4
 * does not catch rejections from async handlers, which would leave the
 * request hanging).
 *
 * @param {{status: string, fallbackMessage: string}} failure - `status` and
 *   default `message` reported in the 503 body.
 * @param {(req, res, startTime: number) => Promise<void>} handler
 * @returns {(req, res) => Promise<void>} Express-compatible handler.
 */
function guarded(failure, handler) {
    return async (req, res) => {
        const startTime = Date.now();
        try {
            await handler(req, res, startTime);
        } catch (error) {
            // Nothing more can be written once a response has started.
            if (res.headersSent) {
                return;
            }
            res.status(503).json({
                status: failure.status,
                latency: Date.now() - startTime,
                message: error?.message || failure.fallbackMessage,
                error: error?.code || 'UNKNOWN_ERROR',
                timestamp: isoNow()
            });
        }
    };
}

/**
 * Health check endpoints router
 */
export class HealthEndpoints {
    router;
    healthCheckSystem;

    /**
     * @param {object} [config] - Optional configuration; `config.systemConfig`
     *   is forwarded to the HealthCheckSystem constructor.
     */
    constructor(config) {
        this.router = express.Router();
        this.healthCheckSystem = new HealthCheckSystem(config?.systemConfig);
        this.setupRoutes();
    }

    /**
     * Setup all health check routes.
     *
     * Route paths already include the `/health` prefix, so the router is
     * meant to be mounted at the application root.
     */
    setupRoutes() {
        /**
         * GET /health - Overall system health
         *
         * Returns overall status, handler latency and a per-component
         * status summary.
         *
         * Status codes:
         *   200: System healthy
         *   503: System degraded or unhealthy, or the check itself failed
         */
        this.router.get('/health', guarded(
            { status: 'unhealthy', fallbackMessage: 'Health check failed' },
            async (_req, res, startTime) => {
                const overall = await this.healthCheckSystem.getOverallHealth();
                const latency = Date.now() - startTime;
                // `?.` on the element as well: a short or sparse dependencies
                // array must yield an omitted field, not a TypeError.
                const checks = Object.fromEntries(
                    SERVICE_NAMES.map((name, index) => [name, overall.dependencies?.[index]?.status])
                );
                res.status(statusCodeFor(overall.status)).json({
                    status: overall.status,
                    timestamp: isoNow(),
                    latency,
                    checks
                });
            }
        ));

        /**
         * GET /health/ready - Kubernetes readiness probe
         *
         * Ready only when the overall status is exactly 'healthy'.
         *
         * Status codes:
         *   200: Ready to accept traffic
         *   503: Not ready, or the check itself failed
         */
        this.router.get('/health/ready', guarded(
            { status: 'not-ready', fallbackMessage: 'Readiness check failed' },
            async (_req, res) => {
                const overall = await this.healthCheckSystem.getOverallHealth();
                const isReady = overall.status === 'healthy';
                res.status(isReady ? 200 : 503).json({
                    status: isReady ? 'ready' : 'not-ready',
                    timestamp: isoNow()
                });
            }
        ));

        /**
         * GET /health/live - Kubernetes liveness probe
         *
         * Alive unless the overall status is 'unhealthy' (degraded is allowed).
         *
         * Status codes:
         *   200: System alive
         *   503: System unhealthy, or the check itself failed
         */
        this.router.get('/health/live', guarded(
            { status: 'not-alive', fallbackMessage: 'Liveness check failed' },
            async (_req, res) => {
                const overall = await this.healthCheckSystem.getOverallHealth();
                const isAlive = overall.status !== 'unhealthy';
                res.status(isAlive ? 200 : 503).json({
                    status: isAlive ? 'alive' : 'not-alive',
                    timestamp: isoNow()
                });
            }
        ));

        /**
         * GET /health/detailed - Detailed health report
         *
         * Returns the overall status, per-service status/latency/message/
         * metadata, and alerts.
         *
         * Status codes:
         *   200: Report generated (regardless of health status)
         *   503: Report could not be generated
         */
        this.router.get('/health/detailed', guarded(
            { status: 'error', fallbackMessage: 'Failed to generate detailed health report' },
            async (_req, res, startTime) => {
                const report = await this.healthCheckSystem.getDetailedHealthReport();
                const latency = Date.now() - startTime;
                res.status(200).json({
                    timestamp: report.timestamp.toISOString(),
                    overallStatus: report.overallStatus,
                    latency,
                    totalLatency: report.latency,
                    services: summarizeServices(report.services, ['status', 'latency', 'message', 'metadata']),
                    alerts: report.alerts || []
                });
            }
        ));

        /**
         * GET /health/database, /health/redis, /health/filesystem, /health/agents
         *
         * Focused single-service checks, useful for targeted monitoring.
         *
         * Status codes:
         *   200: Service healthy
         *   503: Service not healthy, or the check itself failed
         */
        for (const [service, methodName] of Object.entries(SERVICE_CHECKS)) {
            this.router.get(`/health/${service}`, guarded(
                { status: 'unhealthy', fallbackMessage: `${service} health check failed` },
                async (_req, res, startTime) => {
                    // Looked up per request so the call keeps its `this` binding.
                    const check = await this.healthCheckSystem[methodName]();
                    const latency = Date.now() - startTime;
                    res.status(statusCodeFor(check.status)).json({
                        service,
                        status: check.status,
                        latency,
                        message: check.message,
                        metadata: check.metadata,
                        timestamp: isoNow()
                    });
                }
            ));
        }

        /**
         * GET /health/ping - Fast connectivity check
         *
         * Query parameters:
         *   - timeout: Optional timeout in milliseconds (default: 100,
         *     allowed range 1-1000)
         *
         * Status codes:
         *   200: Ping completed (body carries the reported status)
         *   400: Invalid timeout parameter
         *   503: Ping threw (e.g. system unresponsive or timeout)
         */
        this.router.get('/health/ping', guarded(
            { status: 'unhealthy', fallbackMessage: 'Ping failed' },
            async (req, res, startTime) => {
                const timeout = parseTimeout(req.query.timeout, PING_TIMEOUT);
                if (timeout === null) {
                    rejectTimeout(res, PING_TIMEOUT);
                    return;
                }
                const check = await this.healthCheckSystem.ping(timeout);
                const latency = Date.now() - startTime;
                res.status(200).json({
                    status: check.status,
                    latency,
                    message: check.message,
                    metadata: check.metadata,
                    timestamp: isoNow()
                });
            }
        ));

        /**
         * GET /health/aggregate - Aggregated health statistics
         *
         * Returns aggregated metrics from all services: service count,
         * average latency, warnings and errors.
         *
         * Query parameters:
         *   - timeout: Optional timeout in milliseconds (default: 5000,
         *     allowed range 100-30000)
         *
         * Status codes:
         *   200: Stats collected and overall status is healthy
         *   400: Invalid timeout parameter
         *   503: Overall status not healthy, timeout, or aggregation failure
         */
        this.router.get('/health/aggregate', guarded(
            { status: 'error', fallbackMessage: 'Failed to aggregate health stats' },
            async (req, res, startTime) => {
                const timeout = parseTimeout(req.query.timeout, AGGREGATE_TIMEOUT);
                if (timeout === null) {
                    rejectTimeout(res, AGGREGATE_TIMEOUT);
                    return;
                }
                const stats = await this.healthCheckSystem.getAggregateStats(timeout);
                const latency = Date.now() - startTime;
                res.status(statusCodeFor(stats.overallStatus)).json({
                    timestamp: stats.timestamp.toISOString(),
                    overallStatus: stats.overallStatus,
                    latency,
                    totalLatency: stats.latency,
                    averageServiceLatency: stats.averageServiceLatency,
                    serviceCount: stats.serviceCount,
                    services: summarizeServices(stats.services, ['status', 'latency', 'message']),
                    metadata: stats.metadata,
                    warnings: stats.warnings,
                    errors: stats.errors
                });
            }
        ));
    }

    /**
     * Get the configured router
     */
    getRouter() {
        return this.router;
    }

    /**
     * Get the health check system instance
     */
    getHealthCheckSystem() {
        return this.healthCheckSystem;
    }
}

/**
 * Create and configure health check endpoints.
 *
 * The routes already carry the `/health` prefix, so mount the router at the
 * application root (mounting it under '/health' would expose '/health/health').
 *
 * Usage:
 *   const app = express();
 *   const healthEndpoints = createHealthEndpoints();
 *   app.use(healthEndpoints.getRouter());
 *
 * @param {object} [config] - Forwarded to the HealthEndpoints constructor.
 * @returns {HealthEndpoints}
 */
export function createHealthEndpoints(config) {
    return new HealthEndpoints(config);
}

/**
 * Build a ready-to-mount Express router exposing the health check endpoints.
 *
 * Usage:
 *   const app = express();
 *   app.use(mountHealthEndpoints());
 *
 * @param {object} [config] - Forwarded to the HealthEndpoints constructor.
 * @returns The Express router with all health routes registered.
 */
export function mountHealthEndpoints(config) {
    const endpoints = new HealthEndpoints(config);
    return endpoints.getRouter();
}

export { HealthCheckSystem };

//# sourceMappingURL=health-endpoints.js.map