UNPKG

@tehreet/conduit

Version:

LLM API gateway with intelligent routing, robust process management, and health monitoring

1,835 lines (1,542 loc) 69.4 kB
# Robust Process Management & Service Architecture Implementation Guide

## Table of Contents

1. [Overview](#overview)
2. [Architecture Design](#architecture-design)
3. [Implementation Code](#implementation-code)
   - [Service Manager](#service-manager)
   - [Watchdog Process](#watchdog-process)
   - [Connection Manager](#connection-manager)
   - [Updated Server](#updated-server)
   - [CLI Integration](#cli-integration)
4. [Migration Guide](#migration-guide)
5. [Testing & Monitoring](#testing--monitoring)
6. [Troubleshooting](#troubleshooting)
7. [Best Practices](#best-practices)

---

## Overview

This guide details the implementation of a robust process management system for claude-code-router, replacing the current PID file-based approach with a production-ready service architecture.

### Current Issues

- Race conditions with PID file management
- No automatic restart on crashes
- Poor process lifecycle management
- File-based reference counting is unreliable
- No health monitoring or self-healing

### Solution Benefits

- Automatic service recovery
- Proper daemon/service integration
- Health checks and monitoring
- Graceful shutdown with connection draining
- Zero-downtime updates
- Better resource management

## Architecture Design

```
┌─────────────────────────────────────────────────────────────────┐
│                          Service Layer                          │
├─────────────────┬────────────────┬──────────────────────────────┤
│ Service Manager │ Watchdog Process│ Health Monitor              │
├─────────────────┴────────────────┴──────────────────────────────┤
│                        Application Layer                        │
├──────────────────┬───────────────┬──────────────────────────────┤
│ Main Process     │ Worker Pool   │ Connection Store             │
├──────────────────┴───────────────┴──────────────────────────────┤
│                       Communication Layer                       │
├──────────────────┬───────────────┬──────────────────────────────┤
│ IPC Channel      │ Health Socket │ Admin Socket                 │
└──────────────────┴───────────────┴──────────────────────────────┘
```

---

## Implementation Code

### Service Manager

**File:
`src/services/ServiceManager.ts`** ```typescript // src/services/ServiceManager.ts import { EventEmitter } from 'events'; import { ChildProcess, fork } from 'child_process'; import * as fs from 'fs/promises'; import * as path from 'path'; import { Service as WindowsService } from 'node-windows'; import { Service as MacService } from 'node-mac'; import { Service as LinuxService } from 'node-linux'; export interface ServiceConfig { name: string; displayName: string; description: string; script: string; env?: Record<string, string>; maxRestarts?: number; restartDelay?: number; gracefulShutdownTimeout?: number; } export interface ServiceStatus { pid?: number; status: 'running' | 'stopped' | 'starting' | 'stopping' | 'crashed'; uptime?: number; restartCount: number; lastError?: string; health: 'healthy' | 'unhealthy' | 'unknown'; } export class ServiceManager extends EventEmitter { private config: ServiceConfig; private service: any; private status: ServiceStatus; private healthCheckInterval?: NodeJS.Timeout; private watchdog?: ChildProcess; private ipcPath: string; constructor(config: ServiceConfig) { super(); this.config = { maxRestarts: 5, restartDelay: 5000, gracefulShutdownTimeout: 30000, ...config }; this.status = { status: 'stopped', restartCount: 0, health: 'unknown' }; this.ipcPath = process.platform === 'win32' ? 
'\\\\.\\pipe\\claude-code-router' : '/tmp/claude-code-router.sock'; } async install(): Promise<void> { const platform = process.platform; switch (platform) { case 'win32': await this.installWindows(); break; case 'darwin': await this.installMac(); break; case 'linux': await this.installLinux(); break; default: throw new Error(`Unsupported platform: ${platform}`); } } private async installWindows(): Promise<void> { this.service = new WindowsService({ name: this.config.name, description: this.config.description, script: this.config.script, env: Object.entries(this.config.env || {}).map(([name, value]) => ({ name, value })) }); return new Promise((resolve, reject) => { this.service.on('install', () => { this.emit('installed'); resolve(); }); this.service.on('error', reject); this.service.install(); }); } private async installMac(): Promise<void> { this.service = new MacService({ name: this.config.name, description: this.config.description, script: this.config.script, env: this.config.env }); return new Promise((resolve, reject) => { this.service.on('install', () => { this.emit('installed'); resolve(); }); this.service.on('error', reject); this.service.install(); }); } private async installLinux(): Promise<void> { // For Linux, we'll create a systemd service const serviceContent = ` [Unit] Description=${this.config.description} After=network.target [Service] Type=simple User=${process.env.USER} WorkingDirectory=${path.dirname(this.config.script)} ExecStart=${process.execPath} ${this.config.script} Restart=always RestartSec=${this.config.restartDelay / 1000} StandardOutput=syslog StandardError=syslog SyslogIdentifier=${this.config.name} ${Object.entries(this.config.env || {}).map(([k, v]) => `Environment="${k}=${v}"`).join('\n')} [Install] WantedBy=multi-user.target `; const servicePath = `/etc/systemd/system/${this.config.name}.service`; try { await fs.writeFile(servicePath, serviceContent); await this.execCommand('systemctl daemon-reload'); await 
this.execCommand(`systemctl enable ${this.config.name}`); this.emit('installed'); } catch (error) { throw new Error(`Failed to install Linux service: ${error}`); } } async start(): Promise<void> { if (this.status.status === 'running') { throw new Error('Service is already running'); } this.status.status = 'starting'; this.emit('starting'); try { if (process.platform === 'linux') { await this.execCommand(`systemctl start ${this.config.name}`); } else { await this.startNativeService(); } // Start watchdog process await this.startWatchdog(); // Start health monitoring this.startHealthMonitoring(); this.status.status = 'running'; this.emit('started'); } catch (error) { this.status.status = 'stopped'; this.status.lastError = error.message; throw error; } } private async startWatchdog(): Promise<void> { const watchdogScript = path.join(__dirname, 'watchdog.js'); this.watchdog = fork(watchdogScript, [], { detached: true, env: { ...process.env, SERVICE_NAME: this.config.name, IPC_PATH: this.ipcPath, MAX_RESTARTS: String(this.config.maxRestarts), RESTART_DELAY: String(this.config.restartDelay) } }); this.watchdog.on('message', (msg) => { this.handleWatchdogMessage(msg); }); } private handleWatchdogMessage(msg: any): void { switch (msg.type) { case 'service_crashed': this.status.status = 'crashed'; this.status.restartCount++; this.emit('crashed', msg.error); break; case 'service_restarted': this.status.status = 'running'; this.status.pid = msg.pid; this.emit('restarted'); break; case 'max_restarts_reached': this.status.status = 'stopped'; this.emit('failed', 'Maximum restart attempts reached'); break; } } private startHealthMonitoring(): void { this.healthCheckInterval = setInterval(async () => { try { const health = await this.checkHealth(); this.status.health = health ? 
'healthy' : 'unhealthy'; if (!health && this.status.status === 'running') { this.emit('unhealthy'); // Trigger restart if unhealthy await this.restart(); } } catch (error) { this.status.health = 'unknown'; } }, 10000); // Check every 10 seconds } private async checkHealth(): Promise<boolean> { try { const response = await fetch('http://localhost:3456/health', { timeout: 5000 }); const data = await response.json(); return data.status === 'ok'; } catch { return false; } } async stop(): Promise<void> { if (this.status.status !== 'running') { throw new Error('Service is not running'); } this.status.status = 'stopping'; this.emit('stopping'); try { // Stop health monitoring if (this.healthCheckInterval) { clearInterval(this.healthCheckInterval); } // Graceful shutdown await this.gracefulShutdown(); // Stop watchdog if (this.watchdog) { this.watchdog.kill(); } // Stop service if (process.platform === 'linux') { await this.execCommand(`systemctl stop ${this.config.name}`); } else { await this.stopNativeService(); } this.status.status = 'stopped'; this.emit('stopped'); } catch (error) { this.status.lastError = error.message; throw error; } } private async gracefulShutdown(): Promise<void> { return new Promise((resolve, reject) => { const timeout = setTimeout(() => { reject(new Error('Graceful shutdown timeout')); }, this.config.gracefulShutdownTimeout); // Send shutdown signal via IPC this.sendIPCMessage({ type: 'shutdown' }, (error) => { clearTimeout(timeout); if (error) reject(error); else resolve(); }); }); } async restart(): Promise<void> { this.emit('restarting'); await this.stop(); await new Promise(resolve => setTimeout(resolve, 1000)); await this.start(); } async uninstall(): Promise<void> { if (this.status.status === 'running') { await this.stop(); } if (process.platform === 'linux') { await this.execCommand(`systemctl disable ${this.config.name}`); await fs.unlink(`/etc/systemd/system/${this.config.name}.service`); await this.execCommand('systemctl 
daemon-reload'); } else { await this.uninstallNativeService(); } this.emit('uninstalled'); } getStatus(): ServiceStatus { return { ...this.status }; } private sendIPCMessage(message: any, callback?: (error?: Error) => void): void { // Implementation for IPC communication const client = require('net').createConnection(this.ipcPath, () => { client.write(JSON.stringify(message)); client.end(); callback?.(); }); client.on('error', callback); } private async execCommand(command: string): Promise<string> { const { exec } = require('child_process'); return new Promise((resolve, reject) => { exec(command, (error, stdout, stderr) => { if (error) reject(error); else resolve(stdout); }); }); } private startNativeService(): Promise<void> { return new Promise((resolve, reject) => { this.service.on('start', () => resolve()); this.service.on('error', reject); this.service.start(); }); } private stopNativeService(): Promise<void> { return new Promise((resolve, reject) => { this.service.on('stop', () => resolve()); this.service.on('error', reject); this.service.stop(); }); } private uninstallNativeService(): Promise<void> { return new Promise((resolve, reject) => { this.service.on('uninstall', () => resolve()); this.service.on('error', reject); this.service.uninstall(); }); } } ``` ### Watchdog Process **File: `src/services/watchdog.ts`** ```typescript // src/services/watchdog.ts import { fork, ChildProcess } from 'child_process'; import * as net from 'net'; import * as fs from 'fs'; import * as path from 'path'; interface WatchdogConfig { serviceName: string; scriptPath: string; ipcPath: string; maxRestarts: number; restartDelay: number; healthCheckInterval: number; memoryThreshold: number; // MB cpuThreshold: number; // percentage } interface ProcessMetrics { memory: number; cpu: number; handles: number; uptime: number; } class Watchdog { private config: WatchdogConfig; private childProcess?: ChildProcess; private restartCount: number = 0; private isShuttingDown: boolean = false; 
private metrics: ProcessMetrics = { memory: 0, cpu: 0, handles: 0, uptime: 0 }; private startTime: number = Date.now(); private cpuUsageHistory: number[] = []; constructor(config: WatchdogConfig) { this.config = config; this.setupSignalHandlers(); this.setupIPCServer(); } private setupSignalHandlers(): void { process.on('SIGINT', () => this.shutdown()); process.on('SIGTERM', () => this.shutdown()); process.on('uncaughtException', (error) => { console.error('Watchdog uncaught exception:', error); this.shutdown(); }); } private setupIPCServer(): void { const server = net.createServer((socket) => { socket.on('data', (data) => { try { const message = JSON.parse(data.toString()); this.handleIPCMessage(message); } catch (error) { console.error('Invalid IPC message:', error); } }); }); // Clean up existing socket if (fs.existsSync(this.config.ipcPath)) { fs.unlinkSync(this.config.ipcPath); } server.listen(this.config.ipcPath); } private handleIPCMessage(message: any): void { switch (message.type) { case 'status': this.sendStatus(); break; case 'restart': this.restartService(); break; case 'shutdown': this.shutdown(); break; case 'metrics': this.sendMetrics(); break; } } async start(): Promise<void> { console.log(`Watchdog starting for ${this.config.serviceName}`); await this.startService(); // Start monitoring setInterval(() => this.monitorService(), this.config.healthCheckInterval); setInterval(() => this.collectMetrics(), 5000); } private async startService(): Promise<void> { if (this.childProcess) { return; } try { this.childProcess = fork(this.config.scriptPath, [], { detached: false, env: { ...process.env, WATCHDOG_ENABLED: 'true', WATCHDOG_PID: String(process.pid) } }); this.childProcess.on('exit', (code, signal) => { this.handleServiceExit(code, signal); }); this.childProcess.on('error', (error) => { this.handleServiceError(error); }); this.childProcess.on('message', (message) => { this.handleServiceMessage(message); }); // Send parent message about successful 
start if (process.send) { process.send({ type: 'service_started', pid: this.childProcess.pid }); } this.startTime = Date.now(); console.log(`Service started with PID: ${this.childProcess.pid}`); } catch (error) { console.error('Failed to start service:', error); this.handleServiceError(error); } } private handleServiceExit(code: number | null, signal: string | null): void { console.log(`Service exited with code ${code} and signal ${signal}`); this.childProcess = undefined; if (this.isShuttingDown) { return; } // Check if we should restart if (this.restartCount >= this.config.maxRestarts) { console.error('Maximum restart attempts reached'); if (process.send) { process.send({ type: 'max_restarts_reached', restartCount: this.restartCount }); } process.exit(1); } // Restart after delay this.restartCount++; console.log(`Restarting service (attempt ${this.restartCount}/${this.config.maxRestarts})`); setTimeout(() => { this.startService(); }, this.config.restartDelay); if (process.send) { process.send({ type: 'service_crashed', code, signal, restartCount: this.restartCount }); } } private handleServiceError(error: Error): void { console.error('Service error:', error); if (process.send) { process.send({ type: 'service_error', error: error.message }); } } private handleServiceMessage(message: any): void { // Handle messages from the service switch (message.type) { case 'health': this.updateHealth(message.data); break; case 'metrics': this.updateMetrics(message.data); break; case 'ready': console.log('Service reported ready'); if (process.send) { process.send({ type: 'service_ready' }); } break; } } private async monitorService(): Promise<void> { if (!this.childProcess || this.isShuttingDown) { return; } // Check if process is responsive const isResponsive = await this.checkResponsiveness(); if (!isResponsive) { console.warn('Service is unresponsive, restarting...'); await this.restartService(); return; } // Check resource usage if (this.metrics.memory > 
this.config.memoryThreshold) { console.warn(`Memory usage (${this.metrics.memory}MB) exceeds threshold, restarting...`); await this.restartService(); return; } // Check CPU usage (average over last 5 samples) const avgCpu = this.cpuUsageHistory.slice(-5).reduce((a, b) => a + b, 0) / 5; if (avgCpu > this.config.cpuThreshold) { console.warn(`CPU usage (${avgCpu}%) exceeds threshold, restarting...`); await this.restartService(); return; } } private async checkResponsiveness(): Promise<boolean> { return new Promise((resolve) => { if (!this.childProcess) { resolve(false); return; } const timeout = setTimeout(() => { resolve(false); }, 5000); const messageHandler = (msg: any) => { if (msg.type === 'pong') { clearTimeout(timeout); this.childProcess?.off('message', messageHandler); resolve(true); } }; this.childProcess.on('message', messageHandler); this.childProcess.send({ type: 'ping' }); }); } private async collectMetrics(): Promise<void> { if (!this.childProcess || !this.childProcess.pid) { return; } try { const pid = this.childProcess.pid; // Get process info based on platform if (process.platform === 'linux') { await this.collectLinuxMetrics(pid); } else if (process.platform === 'win32') { await this.collectWindowsMetrics(pid); } else if (process.platform === 'darwin') { await this.collectMacMetrics(pid); } this.metrics.uptime = Date.now() - this.startTime; } catch (error) { console.error('Failed to collect metrics:', error); } } private async collectLinuxMetrics(pid: number): Promise<void> { const statPath = `/proc/${pid}/stat`; const statusPath = `/proc/${pid}/status`; try { const stat = await fs.promises.readFile(statPath, 'utf8'); const status = await fs.promises.readFile(statusPath, 'utf8'); // Parse memory from status const vmRssMatch = status.match(/VmRSS:\s+(\d+)\s+kB/); if (vmRssMatch) { this.metrics.memory = parseInt(vmRssMatch[1]) / 1024; // Convert to MB } // Parse CPU from stat (simplified) const statFields = stat.split(' '); const utime = 
parseInt(statFields[13]); const stime = parseInt(statFields[14]); const totalTime = utime + stime; // Calculate CPU percentage (this is simplified) const cpuPercent = (totalTime / this.metrics.uptime) * 100; this.metrics.cpu = cpuPercent; this.cpuUsageHistory.push(cpuPercent); // Keep only last 10 samples if (this.cpuUsageHistory.length > 10) { this.cpuUsageHistory.shift(); } } catch (error) { // Process might have died } } private async collectWindowsMetrics(pid: number): Promise<void> { const { exec } = require('child_process'); return new Promise((resolve) => { exec(`wmic process where ProcessId=${pid} get WorkingSetSize,UserModeTime,KernelModeTime`, (error, stdout) => { if (error) { resolve(); return; } const lines = stdout.trim().split('\n'); if (lines.length >= 2) { const values = lines[1].trim().split(/\s+/); if (values.length >= 3) { this.metrics.memory = parseInt(values[2]) / 1024 / 1024; // Convert to MB // CPU calculation would be more complex on Windows } } resolve(); }); }); } private async collectMacMetrics(pid: number): Promise<void> { const { exec } = require('child_process'); return new Promise((resolve) => { exec(`ps -p ${pid} -o rss,pcpu`, (error, stdout) => { if (error) { resolve(); return; } const lines = stdout.trim().split('\n'); if (lines.length >= 2) { const values = lines[1].trim().split(/\s+/); if (values.length >= 2) { this.metrics.memory = parseInt(values[0]) / 1024; // Convert to MB this.metrics.cpu = parseFloat(values[1]); this.cpuUsageHistory.push(this.metrics.cpu); if (this.cpuUsageHistory.length > 10) { this.cpuUsageHistory.shift(); } } } resolve(); }); }); } private async restartService(): Promise<void> { console.log('Restarting service...'); if (this.childProcess) { // Try graceful shutdown first this.childProcess.send({ type: 'shutdown' }); // Wait for graceful shutdown await new Promise<void>((resolve) => { const timeout = setTimeout(() => { // Force kill if not shut down gracefully if (this.childProcess) { 
this.childProcess.kill('SIGKILL'); } resolve(); }, 10000); if (this.childProcess) { this.childProcess.once('exit', () => { clearTimeout(timeout); resolve(); }); } else { clearTimeout(timeout); resolve(); } }); } // Clear the reference this.childProcess = undefined; // Start service again await this.startService(); if (process.send) { process.send({ type: 'service_restarted', pid: this.childProcess?.pid }); } } private updateHealth(health: any): void { // Update health status based on service report } private updateMetrics(metrics: any): void { // Update metrics based on service report } private sendStatus(): void { // Send current status via IPC } private sendMetrics(): void { // Send current metrics via IPC } private async shutdown(): Promise<void> { console.log('Watchdog shutting down...'); this.isShuttingDown = true; if (this.childProcess) { // Graceful shutdown this.childProcess.send({ type: 'shutdown' }); // Wait for child to exit await new Promise<void>((resolve) => { const timeout = setTimeout(() => { if (this.childProcess) { this.childProcess.kill('SIGKILL'); } resolve(); }, 30000); if (this.childProcess) { this.childProcess.once('exit', () => { clearTimeout(timeout); resolve(); }); } else { clearTimeout(timeout); resolve(); } }); } process.exit(0); } } // Main entry point const config: WatchdogConfig = { serviceName: process.env.SERVICE_NAME || 'claude-code-router', scriptPath: path.join(__dirname, '..', 'server.js'), ipcPath: process.env.IPC_PATH || '/tmp/claude-code-router.sock', maxRestarts: parseInt(process.env.MAX_RESTARTS || '5'), restartDelay: parseInt(process.env.RESTART_DELAY || '5000'), healthCheckInterval: parseInt(process.env.HEALTH_CHECK_INTERVAL || '10000'), memoryThreshold: parseInt(process.env.MEMORY_THRESHOLD || '512'), // MB cpuThreshold: parseInt(process.env.CPU_THRESHOLD || '80') // percentage }; const watchdog = new Watchdog(config); watchdog.start().catch((error) => { console.error('Failed to start watchdog:', error); process.exit(1); 
}); ``` ### Connection Manager **File: `src/services/ConnectionManager.ts`** ```typescript // src/services/ConnectionManager.ts import { EventEmitter } from 'events'; import * as http from 'http'; import * as net from 'net'; export interface ConnectionInfo { id: string; type: 'http' | 'websocket' | 'tcp'; remoteAddress: string; startTime: number; lastActivity: number; } export class ConnectionManager extends EventEmitter { private connections: Map<string, net.Socket | http.IncomingMessage> = new Map(); private connectionInfo: Map<string, ConnectionInfo> = new Map(); private isShuttingDown: boolean = false; private shutdownTimeout: number = 30000; // 30 seconds default private drainTimeout: number = 5000; // 5 seconds to drain new connections private connectionIdCounter: number = 0; constructor(options?: { shutdownTimeout?: number; drainTimeout?: number }) { super(); if (options?.shutdownTimeout) { this.shutdownTimeout = options.shutdownTimeout; } if (options?.drainTimeout) { this.drainTimeout = options.drainTimeout; } } /** * Track a new connection */ trackConnection(connection: net.Socket | http.IncomingMessage, type: 'http' | 'websocket' | 'tcp' = 'http'): string { if (this.isShuttingDown) { // Reject new connections during shutdown this.rejectConnection(connection); return ''; } const id = this.generateConnectionId(); const socket = this.getSocket(connection); this.connections.set(id, connection); this.connectionInfo.set(id, { id, type, remoteAddress: socket.remoteAddress || 'unknown', startTime: Date.now(), lastActivity: Date.now() }); // Set up cleanup on connection close socket.once('close', () => { this.untrackConnection(id); }); socket.once('error', () => { this.untrackConnection(id); }); // Track activity socket.on('data', () => { const info = this.connectionInfo.get(id); if (info) { info.lastActivity = Date.now(); } }); this.emit('connection:added', id); return id; } /** * Untrack a connection */ untrackConnection(id: string): void { 
this.connections.delete(id); this.connectionInfo.delete(id); this.emit('connection:removed', id); } /** * Get active connection count */ getActiveCount(): number { return this.connections.size; } /** * Get connection statistics */ getStats(): { total: number; byType: Record<string, number>; averageDuration: number; activeDurations: number[]; } { const byType: Record<string, number> = {}; const durations: number[] = []; const now = Date.now(); for (const info of this.connectionInfo.values()) { byType[info.type] = (byType[info.type] || 0) + 1; durations.push(now - info.startTime); } return { total: this.connections.size, byType, averageDuration: durations.length > 0 ? durations.reduce((a, b) => a + b, 0) / durations.length : 0, activeDurations: durations }; } /** * Start graceful shutdown */ async gracefulShutdown(): Promise<void> { if (this.isShuttingDown) { return; } this.isShuttingDown = true; this.emit('shutdown:started'); console.log(`Starting graceful shutdown with ${this.connections.size} active connections`); // Phase 1: Stop accepting new connections (handled by isShuttingDown flag) // Phase 2: Send connection close headers to HTTP connections await this.drainHttpConnections(); // Phase 3: Wait for connections to close naturally await this.waitForConnectionsToClose(); // Phase 4: Force close remaining connections await this.forceCloseConnections(); this.emit('shutdown:completed'); console.log('Graceful shutdown completed'); } /** * Drain HTTP connections by sending Connection: close */ private async drainHttpConnections(): Promise<void> { const httpConnections = Array.from(this.connectionInfo.entries()) .filter(([_, info]) => info.type === 'http'); for (const [id, info] of httpConnections) { const connection = this.connections.get(id); if (connection && 'setHeader' in connection) { try { (connection as any).setHeader('Connection', 'close'); } catch (error) { // Header already sent, ignore } } } // Give connections time to receive the close header await new 
Promise(resolve => setTimeout(resolve, this.drainTimeout)); } /** * Wait for connections to close naturally */ private async waitForConnectionsToClose(): Promise<void> { const startTime = Date.now(); const checkInterval = 100; // Check every 100ms return new Promise((resolve) => { const checkConnections = () => { const elapsed = Date.now() - startTime; if (this.connections.size === 0) { console.log('All connections closed naturally'); resolve(); return; } if (elapsed >= this.shutdownTimeout) { console.log(`Shutdown timeout reached with ${this.connections.size} connections remaining`); resolve(); return; } // Log progress if (elapsed % 1000 === 0) { console.log(`Waiting for ${this.connections.size} connections to close (${elapsed}ms elapsed)`); } setTimeout(checkConnections, checkInterval); }; checkConnections(); }); } /** * Force close remaining connections */ private async forceCloseConnections(): Promise<void> { if (this.connections.size === 0) { return; } console.log(`Force closing ${this.connections.size} remaining connections`); const closePromises: Promise<void>[] = []; for (const [id, connection] of this.connections.entries()) { closePromises.push(this.forceCloseConnection(id, connection)); } await Promise.all(closePromises); } /** * Force close a single connection */ private async forceCloseConnection(id: string, connection: net.Socket | http.IncomingMessage): Promise<void> { return new Promise((resolve) => { const socket = this.getSocket(connection); const info = this.connectionInfo.get(id); if (info) { console.log(`Force closing ${info.type} connection from ${info.remoteAddress} (active for ${Date.now() - info.startTime}ms)`); } // Set a timeout for the destroy operation const destroyTimeout = setTimeout(() => { resolve(); }, 1000); socket.once('close', () => { clearTimeout(destroyTimeout); resolve(); }); try { socket.destroy(); } catch (error) { // Socket already destroyed clearTimeout(destroyTimeout); resolve(); } }); } /** * Reject new connections 
during shutdown */ private rejectConnection(connection: net.Socket | http.IncomingMessage): void { const socket = this.getSocket(connection); if ('writeHead' in connection && typeof connection.writeHead === 'function') { // HTTP response try { (connection as any).writeHead(503, { 'Content-Type': 'text/plain', 'Connection': 'close', 'Retry-After': '60' }); (connection as any).end('Service is shutting down'); } catch (error) { // Headers already sent } } try { socket.destroy(); } catch (error) { // Already destroyed } } /** * Get the underlying socket from a connection */ private getSocket(connection: net.Socket | http.IncomingMessage): net.Socket { if ('socket' in connection && connection.socket) { return connection.socket; } return connection as net.Socket; } /** * Generate a unique connection ID */ private generateConnectionId(): string { return `conn_${Date.now()}_${++this.connectionIdCounter}`; } /** * Monitor connection health */ monitorHealth(): { healthy: boolean; issues: string[]; metrics: { activeConnections: number; connectionRate: number; errorRate: number; }; } { const stats = this.getStats(); const issues: string[] = []; let healthy = true; // Check for too many connections if (stats.total > 1000) { issues.push(`High connection count: ${stats.total}`); healthy = false; } // Check for long-lived connections const longConnections = stats.activeDurations.filter(d => d > 300000); // 5 minutes if (longConnections.length > stats.total * 0.1) { issues.push(`${longConnections.length} connections active for >5 minutes`); } return { healthy, issues, metrics: { activeConnections: stats.total, connectionRate: 0, // Would need to track this over time errorRate: 0 // Would need to track errors } }; } } // Singleton instance export const connectionManager = new ConnectionManager(); ``` ### Updated Server **File: `src/server.ts`** ```typescript // src/server.ts import express, { Application, Request, Response, NextFunction } from 'express'; import * as http from 
'http'; import * as net from 'net'; import { ConnectionManager } from './services/ConnectionManager'; import { HealthMonitor } from './services/HealthMonitor'; import { CONFIG_FILE } from './constants'; import { router } from './utils/router'; export interface ServerConfig { port: number; jsonPath: string; initialConfig: any; gracefulShutdownTimeout?: number; } export class ClaudeCodeRouterServer { private app: Application; private server?: http.Server; private config: ServerConfig; private connectionManager: ConnectionManager; private healthMonitor: HealthMonitor; private isShuttingDown: boolean = false; private startTime: number = Date.now(); constructor(config: ServerConfig) { this.config = config; this.app = express(); this.connectionManager = new ConnectionManager({ shutdownTimeout: config.gracefulShutdownTimeout || 30000 }); this.healthMonitor = new HealthMonitor(); this.setupMiddleware(); this.setupRoutes(); this.setupHealthEndpoints(); this.setupIPCHandlers(); } private setupMiddleware(): void { // Parse JSON bodies this.app.use(express.json({ limit: '50mb' })); // Track connections this.app.use((req: Request, res: Response, next: NextFunction) => { if (this.isShuttingDown) { res.status(503).json({ error: 'Service is shutting down', retryAfter: 60 }); return; } const connectionId = this.connectionManager.trackConnection(req, 'http'); res.locals.connectionId = connectionId; // Clean up on response finish res.on('finish', () => { if (connectionId) { this.connectionManager.untrackConnection(connectionId); } }); next(); }); // Request logging this.app.use((req: Request, res: Response, next: NextFunction) => { const start = Date.now(); res.on('finish', () => { const duration = Date.now() - start; console.log(`${req.method} ${req.path} - ${res.statusCode} (${duration}ms)`); // Track metrics this.healthMonitor.recordRequest({ method: req.method, path: req.path, statusCode: res.statusCode, duration }); }); next(); }); // Error handling this.app.use((err: Error, 
req: Request, res: Response, next: NextFunction) => { console.error('Request error:', err); this.healthMonitor.recordError(err); res.status(500).json({ error: 'Internal server error', message: process.env.NODE_ENV === 'development' ? err.message : undefined }); }); } private setupRoutes(): void { // Main routing logic this.app.use('/', async (req: Request, res: Response, next: NextFunction) => { try { await router(req, res, this.config.initialConfig); } catch (error) { next(error); } }); } private setupHealthEndpoints(): void { // Basic health check this.app.get('/health', (req: Request, res: Response) => { const health = this.healthMonitor.getHealth(); const status = health.status === 'healthy' ? 200 : 503; res.status(status).json(health); }); // Detailed health check this.app.get('/health/detailed', (req: Request, res: Response) => { const health = this.healthMonitor.getDetailedHealth(); const connections = this.connectionManager.getStats(); res.json({ ...health, connections, server: { uptime: Date.now() - this.startTime, version: process.env.npm_package_version || 'unknown', node: process.version, pid: process.pid, memory: process.memoryUsage(), cpu: process.cpuUsage() } }); }); // Readiness check this.app.get('/ready', (req: Request, res: Response) => { if (this.isShuttingDown) { res.status(503).json({ ready: false, reason: 'shutting_down' }); return; } const health = this.healthMonitor.getHealth(); if (health.status !== 'healthy') { res.status(503).json({ ready: false, reason: 'unhealthy' }); return; } res.json({ ready: true }); }); // Liveness check this.app.get('/alive', (req: Request, res: Response) => { res.json({ alive: true, pid: process.pid }); }); // Metrics endpoint this.app.get('/metrics', (req: Request, res: Response) => { const metrics = this.healthMonitor.getMetrics(); const connections = this.connectionManager.getStats(); // Prometheus format const output = [ `# HELP http_requests_total Total number of HTTP requests`, `# TYPE http_requests_total 
counter`, `http_requests_total ${metrics.totalRequests}`, '', `# HELP http_errors_total Total number of HTTP errors`, `# TYPE http_errors_total counter`, `http_errors_total ${metrics.totalErrors}`, '', `# HELP http_request_duration_seconds HTTP request latencies`, `# TYPE http_request_duration_seconds histogram`, `http_request_duration_seconds_sum ${metrics.totalDuration / 1000}`, `http_request_duration_seconds_count ${metrics.totalRequests}`, '', `# HELP active_connections Number of active connections`, `# TYPE active_connections gauge`, `active_connections ${connections.total}`, '', `# HELP process_uptime_seconds Process uptime`, `# TYPE process_uptime_seconds counter`, `process_uptime_seconds ${(Date.now() - this.startTime) / 1000}` ].join('\n'); res.type('text/plain').send(output); }); } private setupIPCHandlers(): void { // Handle messages from watchdog process.on('message', (message: any) => { switch (message.type) { case 'ping': if (process.send) { process.send({ type: 'pong' }); } break; case 'shutdown': this.gracefulShutdown(); break; case 'health': if (process.send) { process.send({ type: 'health', data: this.healthMonitor.getHealth() }); } break; case 'metrics': if (process.send) { process.send({ type: 'metrics', data: this.healthMonitor.getMetrics() }); } break; } }); // Setup IPC server for admin commands if (process.platform !== 'win32') { this.setupUnixSocketServer(); } } private setupUnixSocketServer(): void { const socketPath = `/tmp/claude-code-router-admin.sock`; // Clean up existing socket try { require('fs').unlinkSync(socketPath); } catch (error) { // Ignore if doesn't exist } const adminServer = net.createServer((socket) => { socket.on('data', (data) => { try { const command = JSON.parse(data.toString()); this.handleAdminCommand(command, socket); } catch (error) { socket.write(JSON.stringify({ error: 'Invalid command' })); socket.end(); } }); }); adminServer.listen(socketPath); } private handleAdminCommand(command: any, socket: net.Socket): 
void {
    // Every branch writes exactly one JSON reply; the socket is closed afterwards.
    switch (command.type) {
      case 'status':
        socket.write(
          JSON.stringify({
            status: 'ok',
            health: this.healthMonitor.getHealth(),
            connections: this.connectionManager.getStats()
          })
        );
        break;

      case 'reload-config':
        // reloadConfiguration() catches its own errors, so use its result to
        // avoid reporting success after a failed reload.
        if (this.reloadConfiguration()) {
          socket.write(JSON.stringify({ status: 'reloaded' }));
        } else {
          socket.write(JSON.stringify({ error: 'Failed to reload configuration' }));
        }
        break;

      case 'clear-cache':
        // Implement cache clearing
        socket.write(JSON.stringify({ status: 'cleared' }));
        break;

      default:
        socket.write(JSON.stringify({ error: 'Unknown command' }));
    }

    socket.end();
  }

  /**
   * Re-reads the configuration module at `this.config.jsonPath` without a
   * restart and stores it in `this.config.initialConfig`.
   *
   * @returns `true` when the new configuration was loaded and applied,
   *          `false` when loading threw (the error is logged and
   *          `initialConfig` is left untouched).
   */
  private reloadConfiguration(): boolean {
    try {
      // Evict the cached module so require() reads the file from disk again.
      delete require.cache[require.resolve(this.config.jsonPath)];
      const newConfig = require(this.config.jsonPath);

      // Update configuration
      this.config.initialConfig = newConfig;

      console.log('Configuration reloaded successfully');
      return true;
    } catch (error) {
      console.error('Failed to reload configuration:', error);
      return false;
    }
  }

  /**
   * Starts listening on `this.config.port`.
   *
   * Resolves once the listen callback fires (after sending `{ type: 'ready' }`
   * over IPC when a parent channel exists) and rejects if `listen` throws or
   * the server emits `'error'`. Every accepted socket is registered with the
   * connection manager as a `'tcp'` connection.
   */
  async start(): Promise<void> {
    return new Promise((resolve, reject) => {
      try {
        this.server = this.app.listen(this.config.port, () => {
          console.log(`Claude Code Router running on port ${this.config.port}`);

          // Notify watchdog that we're ready
          if (process.send) {
            process.send({ type: 'ready' });
          }

          resolve();
        });

        // Track server-level connections
        this.server.on('connection', (socket: net.Socket) => {
          this.connectionManager.trackConnection(socket, 'tcp');
        });

        // Handle server errors
        this.server.on('error', (error: Error) => {
          console.error('Server error:', error);
          this.healthMonitor.recordError(error);
          reject(error);
        });
      } catch (error) {
        reject(error);
      }
    });
  }

  /** Public entry point for shutting the server down; delegates to `gracefulShutdown`. */
  async stop(): Promise<void> {
    await this.gracefulShutdown();
  }

  /**
   * Stops accepting new connections, waits for the connection manager to
   * finish its own graceful shutdown and then exits the process with code 0.
   * Calls made while a shutdown is already in progress return immediately.
   */
  private async gracefulShutdown(): Promise<void> {
    if (this.isShuttingDown) {
      return;
    }

    console.log('Starting graceful shutdown...');
    this.isShuttingDown = true;

    // Stop accepting new connections
    if (this.server) {
      this.server.close(() => {
        console.log('Server stopped accepting new connections');
      });
    }

    // Wait for existing connections to close
    await
this.connectionManager.gracefulShutdown();

    // Final cleanup
    console.log('Shutdown complete');
    process.exit(0);
  }
}

// Health Monitor implementation
/**
 * In-memory request/error bookkeeping used to derive a health verdict and
 * the numbers exposed by the health and metrics endpoints.
 */
class HealthMonitor {
  /** Lifetime error rate (errors / requests) above which health is `unhealthy`. */
  private static readonly MAX_ERROR_RATE = 0.1;
  /** Number of errors inside the recent window above which health is `unhealthy`. */
  private static readonly MAX_RECENT_ERRORS = 10;
  /** Length of the "recent errors" window in milliseconds. */
  private static readonly RECENT_ERROR_WINDOW_MS = 60000;
  /** Maximum number of error records kept in memory. */
  private static readonly MAX_STORED_ERRORS = 100;

  // `lastError` is optional, which can only be expressed in the type
  // annotation; `lastError?: ...` inside an object literal does not parse.
  private metrics: {
    totalRequests: number;
    totalErrors: number;
    totalDuration: number;
    statusCodes: Map<number, number>;
    errors: Array<{ timestamp: number; error: string }>;
    lastError?: { timestamp: number; error: string };
  } = {
    totalRequests: 0,
    totalErrors: 0,
    totalDuration: 0,
    statusCodes: new Map<number, number>(),
    errors: []
  };

  /**
   * Records one finished request.
   *
   * @param data Request summary; `duration` is added to the running total
   *             and a `statusCode` of 500 or above counts as an error.
   */
  recordRequest(data: {
    method: string;
    path: string;
    statusCode: number;
    duration: number;
  }): void {
    this.metrics.totalRequests++;
    this.metrics.totalDuration += data.duration;

    const count = this.metrics.statusCodes.get(data.statusCode) ?? 0;
    this.metrics.statusCodes.set(data.statusCode, count + 1);

    if (data.statusCode >= 500) {
      this.metrics.totalErrors++;
    }
  }

  /**
   * Stores the error message with the current timestamp and remembers it as
   * the most recent error. Only the newest `MAX_STORED_ERRORS` records are kept.
   *
   * @param error The error to record.
   */
  recordError(error: Error): void {
    // Callers may forward arbitrary thrown values at runtime despite the
    // declared type; fall back to a string form rather than storing `undefined`.
    const message = error instanceof Error ? error.message : String(error);
    const errorData = { timestamp: Date.now(), error: message };

    this.metrics.errors.push(errorData);
    this.metrics.lastError = errorData;

    // Keep only last 100 errors
    if (this.metrics.errors.length > HealthMonitor.MAX_STORED_ERRORS) {
      this.metrics.errors.shift();
    }
  }

  /**
   * Evaluates the two health checks.
   *
   * @returns `status` is `'unhealthy'` when the lifetime error rate exceeds
   *          10% or more than 10 errors were recorded in the last minute,
   *          otherwise `'healthy'`; `checks` holds a pass/fail entry
   *          (with a message on failure) for `errorRate` and `recentErrors`.
   */
  getHealth(): { status: string; checks: any } {
    const errorRate = this.metrics.totalRequests > 0
      ? this.metrics.totalErrors / this.metrics.totalRequests
      : 0;

    // Compute the cutoff once so every record is compared against the same instant.
    const cutoff = Date.now() - HealthMonitor.RECENT_ERROR_WINDOW_MS;
    const recentErrors = this.metrics.errors.filter(e => e.timestamp > cutoff).length;

    let status = 'healthy';
    const checks: any = {};

    // Check error rate
    if (errorRate > HealthMonitor.MAX_ERROR_RATE) {
      status = 'unhealthy';
      checks.errorRate = {
        status: 'fail',
        message: `Error rate ${(errorRate * 100).toFixed(2)}%`
      };
    } else {
      checks.errorRate = { status: 'pass' };
    }

    // Check recent errors
    if (recentErrors > HealthMonitor.MAX_RECENT_ERRORS) {
      status = 'unhealthy';
      checks.recentErrors = {
        status: 'fail',
        message: `${recentErrors} errors in last minute`
      };
    } else {
      checks.recentErrors = { status: 'pass' };
    }

    return { status, checks };
  }

  /**
   * @returns The result of `getHealth()` extended with `metrics`
   *          (see `getMetrics()`) and the ten most recent error records.
   */
  getDetailedHealth(): any {
    return {
      ...this.getHealth(),
      metrics: this.getMetrics(),
      recentErrors: this.metrics.errors.slice(-10)
    };
  }

  /**
   * @returns A snapshot of the counters: request/error totals, summed
   *          duration, average latency (0 when no request was recorded),
   *          per-status-code counts as a plain object, and the last error.
   */
  getMetrics(): any {
    return {
      totalRequests: this.metrics.totalRequests,
      totalErrors: this.metrics.totalErrors,
      totalDuration: this.metrics.totalDuration,
      averageLatency: this.metrics.totalRequests > 0
        ? this.metrics.totalDuration / this.metrics.totalRequests
        : 0,
      statusCodes: Object.fromEntries(this.metrics.statusCodes),
      lastError: this.metrics.lastError
    };
  }
}

// Export factory function for backward compatibility
/**
 * Creates a `ClaudeCodeRouterServer` for the given configuration.
 *
 * @param config Server configuration passed straight to the constructor.
 * @returns The newly constructed server instance.
 */
export function createServer(config: ServerConfig): ClaudeCodeRouterServer {
  return new ClaudeCodeRouterServer(config);
}
```

### CLI Integration

**File: `src/cli-service.ts`**

```typescript
// src/cli-service.ts
import { ServiceManager } from './services/ServiceManager';
import * as path from 'path';
import * as fs from 'fs';
import { spawn } from 'child_process';

const HELP_TEXT = `
Claude Code Router Service Manager

Commands:
  install     Install as system service
  uninstall   Remove system service
  start       Start the service
  stop        Stop the service
  restart     Restart the service
  status      Show service status
  logs        Show service logs
  health      Check service health

Options:
  --port      Port to run on (default: 3456)
  --config    Path to config file
  --help      Show this help message
`;

class CLIServiceManager {
private serviceManager: ServiceManager; private serviceName = 'claude-code-router'; constructor() { const scriptPath = path.join(__dirname, 'server-wrapper.js'); this.serviceManager = new ServiceManager({ name: this.serviceName, displayName: 'Claude Code Router', description: 'Routes Claude Code requests to different LLM providers', script: scriptPath, env: { NODE_ENV: 'production', SERVICE_PORT: process.env.PORT || '3456', CONFIG_PATH: process.env.CONFIG_PATH || path.join(process.env.HOME || '', '.claude-code-router', 'config.json') }, maxRestarts: 5, restartDelay: 5000, gracefulShutdownTimeout: 30000 }); this.setupEventHandlers(); } private setupEventHandlers(): void { this.serviceManager.on('installed', () => { console.log('✅ Service installed successfully'); console.log('Run "ccr service start" to start the service'); }); this.serviceManager.on('uninstalled', () => { console.log('✅ Service uninstalled successfully'); }); this.serviceManager.on('started', () => { console.log('✅ Service started successfully'); }); this.serviceManager.on('stopped', () => { console.log('✅ Service stopped successfully'); }); this.serviceManager.on('restarted', () => { console.log('✅ Service restarted successfully'); }); this.serviceManager.on('crashed', (error) => { console.error('❌ Service crashed:', error); }); this.serviceManager.on('failed', (reason) => { console.error('❌ Service failed:', reason); }); } async execute(command: string, args: string[]): Promise<void> { try { switch (command) { case 'instal