@tehreet/conduit
Version:
LLM API gateway with intelligent routing, robust process management, and health monitoring
1,835 lines (1,542 loc) • 69.4 kB
Markdown
# Robust Process Management & Service Architecture Implementation Guide
## Table of Contents
1. [Overview](#overview)
2. [Architecture Design](#architecture-design)
3. [Implementation Code](#implementation-code)
   - [Service Manager](#service-manager)
   - [Watchdog Process](#watchdog-process)
   - [Connection Manager](#connection-manager)
   - [Updated Server](#updated-server)
   - [CLI Integration](#cli-integration)
4. [Migration Guide](#migration-guide)
5. [Testing & Monitoring](#testing--monitoring)
6. [Troubleshooting](#troubleshooting)
7. [Best Practices](#best-practices)
---
## Overview
This guide details the implementation of a robust process management system for claude-code-router, replacing the current PID file-based approach with a production-ready service architecture.
### Current Issues
- Race conditions with PID file management
- No automatic restart on crashes
- Poor process lifecycle management
- File-based reference counting is unreliable
- No health monitoring or self-healing
### Solution Benefits
- Automatic service recovery
- Proper daemon/service integration
- Health checks and monitoring
- Graceful shutdown with connection draining
- Zero-downtime updates
- Better resource management
## Architecture Design
```
┌─────────────────────────────────────────────────────────────────┐
│                          Service Layer                          │
├─────────────────┬────────────────┬──────────────────────────────┤
│ Service Manager │Watchdog Process│        Health Monitor        │
├─────────────────┴────────────────┴──────────────────────────────┤
│                        Application Layer                        │
├──────────────────┬───────────────┬──────────────────────────────┤
│   Main Process   │  Worker Pool  │       Connection Store       │
├──────────────────┴───────────────┴──────────────────────────────┤
│                       Communication Layer                       │
├──────────────────┬───────────────┬──────────────────────────────┤
│   IPC Channel    │ Health Socket │         Admin Socket         │
└──────────────────┴───────────────┴──────────────────────────────┘
```
---
## Implementation Code
### Service Manager
**File: `src/services/ServiceManager.ts`**
```typescript
// src/services/ServiceManager.ts
import { EventEmitter } from 'events';
import { ChildProcess, fork } from 'child_process';
import * as fs from 'fs/promises';
import * as path from 'path';
import { Service as WindowsService } from 'node-windows';
import { Service as MacService } from 'node-mac';
import { Service as LinuxService } from 'node-linux';
/**
 * Options accepted by the ServiceManager constructor.
 */
export interface ServiceConfig {
// Service identifier: passed to the native service wrappers and interpolated into
// the systemd unit file name and the systemctl commands.
name: string;
// Human-readable name. NOTE(review): not read by the ServiceManager code shown here.
displayName: string;
// Free-text description; written to the unit file's Description= line and passed to the native wrappers.
description: string;
// Path of the script the OS service executes (ExecStart / native `script` option).
script: string;
// Extra environment variables handed to the installed service.
env?: Record<string, string>;
// Forwarded to the watchdog as MAX_RESTARTS; the constructor defaults it to 5.
maxRestarts?: number;
// Milliseconds; forwarded as RESTART_DELAY and divided by 1000 for RestartSec. Defaults to 5000.
restartDelay?: number;
// Milliseconds before a graceful shutdown attempt is rejected as timed out. Defaults to 30000.
gracefulShutdownTimeout?: number;
}
/**
 * Snapshot of the managed service as tracked by ServiceManager.
 */
export interface ServiceStatus {
// PID reported by the watchdog in its `service_restarted` message.
pid?: number;
// Lifecycle state maintained by start()/stop() and by watchdog messages.
status: 'running' | 'stopped' | 'starting' | 'stopping' | 'crashed';
// NOTE(review): unit is presumably milliseconds — confirm against the code that populates it.
uptime?: number;
// Incremented each time the watchdog reports `service_crashed`.
restartCount: number;
// Message of the most recent error raised while starting or stopping.
lastError?: string;
// Result of the periodic health probe; 'unknown' until the first probe or when the probe itself throws.
health: 'healthy' | 'unhealthy' | 'unknown';
}
/**
 * Installs, starts, stops and monitors the router as an operating-system service.
 *
 * Linux is driven through `systemctl` and a generated systemd unit; Windows and
 * macOS go through the `node-windows` / `node-mac` wrappers. While the service is
 * running a forked watchdog reports crashes and restarts over the child-process
 * IPC channel, and an HTTP probe polls the local `/health` endpoint every 10 s.
 *
 * Events emitted: `installed`, `starting`, `started`, `stopping`, `stopped`,
 * `restarting`, `restarted`, `crashed`, `failed`, `unhealthy`, `uninstalled`.
 */
export class ServiceManager extends EventEmitter {
  private config: ServiceConfig;
  // Native (node-windows / node-mac) service handle; not used on Linux.
  private service: any;
  private status: ServiceStatus;
  private healthCheckInterval?: NodeJS.Timeout;
  private watchdog?: ChildProcess;
  private ipcPath: string;
  // Wall-clock time (ms) of the last (re)start; used to derive `uptime` in getStatus().
  private startedAt?: number;

  /**
   * @param config Service description. `maxRestarts`, `restartDelay` and
   *   `gracefulShutdownTimeout` fall back to 5, 5000 ms and 30000 ms when omitted
   *   or explicitly `undefined`.
   */
  constructor(config: ServiceConfig) {
    super();
    this.config = {
      ...config,
      // `??` instead of spreading over defaults, so an explicit `undefined` cannot erase a default.
      maxRestarts: config.maxRestarts ?? 5,
      restartDelay: config.restartDelay ?? 5000,
      gracefulShutdownTimeout: config.gracefulShutdownTimeout ?? 30000
    };
    this.status = {
      status: 'stopped',
      restartCount: 0,
      health: 'unknown'
    };
    this.ipcPath = process.platform === 'win32'
      ? '\\\\.\\pipe\\claude-code-router'
      : '/tmp/claude-code-router.sock';
  }

  /**
   * Registers the service with the platform's service manager.
   * @throws Error when the platform is not win32, darwin or linux.
   */
  async install(): Promise<void> {
    const platform = process.platform;
    switch (platform) {
      case 'win32':
        await this.installWindows();
        break;
      case 'darwin':
        await this.installMac();
        break;
      case 'linux':
        await this.installLinux();
        break;
      default:
        throw new Error(`Unsupported platform: ${platform}`);
    }
  }

  /** Installs through node-windows and resolves once its `install` event fires. */
  private async installWindows(): Promise<void> {
    this.service = this.createNativeService('win32');
    await this.runNativeAction('install');
    this.emit('installed');
  }

  /** Installs through node-mac and resolves once its `install` event fires. */
  private async installMac(): Promise<void> {
    this.service = this.createNativeService('darwin');
    await this.runNativeAction('install');
    this.emit('installed');
  }

  /** Writes a systemd unit under /etc/systemd/system, reloads systemd and enables the unit. */
  private async installLinux(): Promise<void> {
    const servicePath = `/etc/systemd/system/${this.config.name}.service`;
    try {
      await fs.writeFile(servicePath, this.buildSystemdUnit());
      await this.execCommand('systemctl daemon-reload');
      await this.execCommand(`systemctl enable ${this.config.name}`);
      this.emit('installed');
    } catch (error) {
      throw new Error(`Failed to install Linux service: ${ServiceManager.errorMessage(error)}`);
    }
  }

  /**
   * Renders the systemd unit file for the configured service.
   * The `User=` line is omitted when `$USER` is unset, rather than writing `User=undefined`.
   */
  private buildSystemdUnit(): string {
    const user = process.env.USER;
    const restartSeconds = (this.config.restartDelay ?? 5000) / 1000;
    const envLines = Object.entries(this.config.env ?? {}).map(
      ([key, value]) => `Environment="${key}=${value}"`
    );
    return [
      '[Unit]',
      `Description=${this.config.description}`,
      'After=network.target',
      '',
      '[Service]',
      'Type=simple',
      ...(user ? [`User=${user}`] : []),
      `WorkingDirectory=${path.dirname(this.config.script)}`,
      `ExecStart=${process.execPath} ${this.config.script}`,
      'Restart=always',
      `RestartSec=${restartSeconds}`,
      'StandardOutput=syslog',
      'StandardError=syslog',
      `SyslogIdentifier=${this.config.name}`,
      ...envLines,
      '',
      '[Install]',
      'WantedBy=multi-user.target',
      ''
    ].join('\n');
  }

  /**
   * Starts the OS service, then the watchdog and the health probe.
   * @throws Error('Service is already running') when already running; otherwise
   *   rethrows the underlying failure after recording it in `lastError`.
   */
  async start(): Promise<void> {
    if (this.status.status === 'running') {
      throw new Error('Service is already running');
    }
    this.status.status = 'starting';
    this.emit('starting');
    try {
      if (process.platform === 'linux') {
        await this.execCommand(`systemctl start ${this.config.name}`);
      } else {
        await this.startNativeService();
      }
      // Start watchdog process
      await this.startWatchdog();
      // Start health monitoring
      this.startHealthMonitoring();
      this.startedAt = Date.now();
      this.status.status = 'running';
      this.emit('started');
    } catch (error) {
      this.status.status = 'stopped';
      this.status.lastError = ServiceManager.errorMessage(error);
      throw error;
    }
  }

  /** Forks the watchdog script that sits next to this module and wires up its IPC messages. */
  private async startWatchdog(): Promise<void> {
    const watchdogScript = path.join(__dirname, 'watchdog.js');
    const child = fork(watchdogScript, [], {
      detached: true,
      env: {
        ...process.env,
        SERVICE_NAME: this.config.name,
        IPC_PATH: this.ipcPath,
        MAX_RESTARTS: String(this.config.maxRestarts),
        RESTART_DELAY: String(this.config.restartDelay)
      }
    });
    this.watchdog = child;
    child.on('message', (msg) => {
      this.handleWatchdogMessage(msg);
    });
    // Without an 'error' listener a failed spawn would crash the manager process.
    child.on('error', (error) => {
      this.status.lastError = `Watchdog error: ${error.message}`;
    });
    child.on('exit', () => {
      if (this.watchdog === child) {
        this.watchdog = undefined;
      }
    });
  }

  /**
   * Applies a status message sent by the watchdog.
   * Malformed (non-object) messages are ignored.
   */
  private handleWatchdogMessage(msg: any): void {
    if (!msg || typeof msg !== 'object') {
      return;
    }
    switch (msg.type) {
      case 'service_started':
        if (typeof msg.pid === 'number') {
          this.status.pid = msg.pid;
        }
        // After a crash the watchdog restarts the child itself and only reports
        // `service_started`; treat that as the recovery so status does not stay 'crashed'.
        if (this.status.status === 'crashed') {
          this.status.status = 'running';
          this.startedAt = Date.now();
          this.emit('restarted');
        }
        break;
      case 'service_crashed':
        this.status.status = 'crashed';
        this.status.restartCount++;
        // The watchdog reports exit code/signal rather than an `error` field.
        this.emit(
          'crashed',
          msg.error ?? `exit code ${msg.code ?? 'unknown'}, signal ${msg.signal ?? 'none'}`
        );
        break;
      case 'service_restarted':
        this.status.status = 'running';
        this.status.pid = msg.pid;
        this.startedAt = Date.now();
        this.emit('restarted');
        break;
      case 'service_error':
        if (typeof msg.error === 'string') {
          this.status.lastError = msg.error;
        }
        break;
      case 'max_restarts_reached':
        this.status.status = 'stopped';
        this.emit('failed', 'Maximum restart attempts reached');
        break;
    }
  }

  /** (Re)arms the 10-second health probe, clearing any previous timer first. */
  private startHealthMonitoring(): void {
    this.stopHealthMonitoring();
    this.healthCheckInterval = setInterval(async () => {
      try {
        const health = await this.checkHealth();
        this.status.health = health ? 'healthy' : 'unhealthy';
        if (!health && this.status.status === 'running') {
          this.emit('unhealthy');
          // Trigger restart if unhealthy
          await this.restart();
        }
      } catch (error) {
        this.status.health = 'unknown';
        this.status.lastError = ServiceManager.errorMessage(error);
      }
    }, 10000); // Check every 10 seconds
  }

  /** Cancels the health probe timer, if armed. */
  private stopHealthMonitoring(): void {
    if (this.healthCheckInterval) {
      clearInterval(this.healthCheckInterval);
      this.healthCheckInterval = undefined;
    }
  }

  /**
   * Probes the local `/health` endpoint with a 5-second deadline.
   * @returns true on a 2xx response whose body reports `healthy` (or the legacy `ok`).
   */
  private async checkHealth(): Promise<boolean> {
    // fetch() has no `timeout` option; the deadline is enforced through an AbortController.
    const controller = new AbortController();
    const deadline = setTimeout(() => controller.abort(), 5000);
    try {
      const response = await fetch('http://localhost:3456/health', {
        signal: controller.signal
      });
      if (!response.ok) {
        return false;
      }
      const data = await response.json();
      return data?.status === 'healthy' || data?.status === 'ok';
    } catch {
      return false;
    } finally {
      clearTimeout(deadline);
    }
  }

  /**
   * Stops health monitoring, the watchdog and the OS service.
   *
   * The IPC shutdown notification is best effort: if it fails, the failure is
   * recorded in `lastError` and the stop continues, since the watchdog is sent a
   * termination signal right after anyway.
   *
   * @throws Error('Service is not running') when not running; rethrows a failure
   *   of the OS-level stop after restoring status to 'running' so it can be retried.
   */
  async stop(): Promise<void> {
    if (this.status.status !== 'running') {
      throw new Error('Service is not running');
    }
    this.status.status = 'stopping';
    this.emit('stopping');
    // Stop health monitoring
    this.stopHealthMonitoring();
    // Graceful shutdown
    try {
      await this.gracefulShutdown();
    } catch (error) {
      this.status.lastError = `Graceful shutdown failed: ${ServiceManager.errorMessage(error)}`;
    }
    try {
      // Stop watchdog
      if (this.watchdog) {
        this.watchdog.kill();
        this.watchdog = undefined;
      }
      // Stop service
      if (process.platform === 'linux') {
        await this.execCommand(`systemctl stop ${this.config.name}`);
      } else {
        await this.stopNativeService();
      }
      this.startedAt = undefined;
      this.status.status = 'stopped';
      this.emit('stopped');
    } catch (error) {
      // Do not leave the state machine stuck in 'stopping'.
      this.status.status = 'running';
      this.status.lastError = ServiceManager.errorMessage(error);
      throw error;
    }
  }

  /** Sends `{ type: 'shutdown' }` over the IPC socket, rejecting on error or timeout. */
  private async gracefulShutdown(): Promise<void> {
    return new Promise((resolve, reject) => {
      const timeout = setTimeout(() => {
        reject(new Error('Graceful shutdown timeout'));
      }, this.config.gracefulShutdownTimeout);
      // Send shutdown signal via IPC
      this.sendIPCMessage({ type: 'shutdown' }, (error) => {
        clearTimeout(timeout);
        if (error) reject(error);
        else resolve();
      });
    });
  }

  /** Stops the service, waits one second, then starts it again. */
  async restart(): Promise<void> {
    this.emit('restarting');
    await this.stop();
    await new Promise(resolve => setTimeout(resolve, 1000));
    await this.start();
  }

  /** Stops the service if it is running and removes its OS registration. */
  async uninstall(): Promise<void> {
    if (this.status.status === 'running') {
      await this.stop();
    }
    if (process.platform === 'linux') {
      await this.execCommand(`systemctl disable ${this.config.name}`);
      await fs.unlink(`/etc/systemd/system/${this.config.name}.service`);
      await this.execCommand('systemctl daemon-reload');
    } else {
      await this.uninstallNativeService();
    }
    this.emit('uninstalled');
  }

  /** Returns a copy of the current status; `uptime` (ms) is filled in while running. */
  getStatus(): ServiceStatus {
    const snapshot: ServiceStatus = { ...this.status };
    if (this.status.status === 'running' && this.startedAt !== undefined) {
      snapshot.uptime = Date.now() - this.startedAt;
    }
    return snapshot;
  }

  /**
   * Writes one JSON message to the IPC socket.
   * @param callback Invoked at most once: with no argument after the write, or with the socket error.
   */
  private sendIPCMessage(message: any, callback?: (error?: Error) => void): void {
    let settled = false;
    const done = (error?: Error): void => {
      if (settled) return;
      settled = true;
      callback?.(error);
    };
    const client = require('net').createConnection(this.ipcPath, () => {
      client.write(JSON.stringify(message));
      client.end();
      done();
    });
    // Always attach a real listener: passing an undefined callback to on() throws.
    client.on('error', done);
  }

  /** Runs a shell command and resolves with its stdout. */
  private async execCommand(command: string): Promise<string> {
    const { exec } = require('child_process');
    return new Promise((resolve, reject) => {
      exec(command, (error: Error | null, stdout: string) => {
        if (error) reject(error);
        else resolve(stdout);
      });
    });
  }

  /** Builds the node-windows / node-mac service object for the given platform. */
  private createNativeService(platform: string): any {
    const { name, description, script, env } = this.config;
    if (platform === 'win32') {
      return new WindowsService({
        name,
        description,
        script,
        env: Object.entries(env ?? {}).map(([key, value]) => ({
          name: key,
          value
        }))
      });
    }
    if (platform === 'darwin') {
      return new MacService({ name, description, script, env });
    }
    throw new Error(`Unsupported platform: ${platform}`);
  }

  /**
   * Invokes a lifecycle method on the native service and settles on the matching
   * event or on `error`. Listeners are one-shot and removed afterwards so repeated
   * calls do not accumulate handlers. The native handle is created lazily, so
   * start/stop/uninstall also work when install() ran in a different process.
   */
  private runNativeAction(action: 'install' | 'start' | 'stop' | 'uninstall'): Promise<void> {
    return new Promise((resolve, reject) => {
      let service: any;
      try {
        service = this.service ?? (this.service = this.createNativeService(process.platform));
      } catch (error) {
        reject(error);
        return;
      }
      const onDone = (): void => {
        service.removeListener('error', onError);
        resolve();
      };
      const onError = (error: unknown): void => {
        service.removeListener(action, onDone);
        reject(error);
      };
      service.once(action, onDone);
      service.once('error', onError);
      service[action]();
    });
  }

  private startNativeService(): Promise<void> {
    return this.runNativeAction('start');
  }

  private stopNativeService(): Promise<void> {
    return this.runNativeAction('stop');
  }

  private uninstallNativeService(): Promise<void> {
    return this.runNativeAction('uninstall');
  }

  /** Extracts a printable message from an unknown thrown value. */
  private static errorMessage(error: unknown): string {
    return error instanceof Error ? error.message : String(error);
  }
}
```
### Watchdog Process
**File: `src/services/watchdog.ts`**
```typescript
// src/services/watchdog.ts
import { fork, ChildProcess } from 'child_process';
import * as net from 'net';
import * as fs from 'fs';
import * as path from 'path';
/**
 * Settings for the Watchdog; the entry point at the bottom of this file fills
 * them from environment variables.
 */
interface WatchdogConfig {
// Used only in log output in the code shown here.
serviceName: string;
// Script forked as the supervised child process.
scriptPath: string;
// Socket / named-pipe path the watchdog's IPC server listens on.
ipcPath: string;
// Number of automatic restarts after which the watchdog gives up and exits.
maxRestarts: number;
// Milliseconds to wait before restarting a child that exited.
restartDelay: number;
// Milliseconds between monitorService() runs.
healthCheckInterval: number;
memoryThreshold: number; // MB
cpuThreshold: number; // percentage
}
/**
 * Latest resource sample taken for the supervised child process.
 */
interface ProcessMetrics {
// Resident memory in MB (converted from kB or bytes by the platform collectors).
memory: number;
// CPU usage as a percentage.
cpu: number;
// NOTE(review): initialised to 0 and never written by the collectors shown here.
handles: number;
// Milliseconds since the child was last started.
uptime: number;
}
/**
 * Supervises the server process: forks it, restarts it when it exits, stops
 * answering pings, or exceeds the memory/CPU thresholds, and reports state
 * changes to the parent process over the child-process IPC channel.
 * A small socket server on `config.ipcPath` accepts JSON control messages
 * (`status`, `restart`, `shutdown`, `metrics`).
 */
class Watchdog {
  // Assumes the usual Linux USER_HZ of 100 clock ticks per second — TODO confirm with `getconf CLK_TCK`.
  private static readonly CLOCK_TICKS_PER_SECOND = 100;
  private static readonly MAX_CPU_SAMPLES = 10;

  private config: WatchdogConfig;
  private childProcess?: ChildProcess;
  private restartCount: number = 0;
  private isShuttingDown: boolean = false;
  // True while restartService() is replacing the child on purpose.
  private isRestarting: boolean = false;
  private metrics: ProcessMetrics = {
    memory: 0,
    cpu: 0,
    handles: 0,
    uptime: 0
  };
  private startTime: number = Date.now();
  private cpuUsageHistory: number[] = [];
  // Previous /proc sample, used to compute CPU usage over the sampling interval.
  private lastCpuSample?: { pid: number; ticks: number; at: number };
  private ipcServer?: net.Server;

  constructor(config: WatchdogConfig) {
    this.config = config;
    this.setupSignalHandlers();
    this.setupIPCServer();
  }

  /** Shuts down on SIGINT/SIGTERM (exit 0) and on uncaught exceptions (exit 1). */
  private setupSignalHandlers(): void {
    process.on('SIGINT', () => this.shutdown());
    process.on('SIGTERM', () => this.shutdown());
    process.on('uncaughtException', (error) => {
      console.error('Watchdog uncaught exception:', error);
      this.shutdown(1);
    });
  }

  /**
   * Opens the control socket. Each `data` chunk is parsed as one JSON message.
   * Listen or socket errors are logged instead of taking the watchdog down.
   */
  private setupIPCServer(): void {
    const server = net.createServer((socket) => {
      socket.on('data', (data) => {
        try {
          const message = JSON.parse(data.toString());
          this.handleIPCMessage(message);
        } catch (error) {
          console.error('Invalid IPC message:', error);
        }
      });
      socket.on('error', (error) => {
        console.error('IPC socket error:', error);
      });
    });
    server.on('error', (error) => {
      console.error('IPC server error:', error);
    });
    // Clean up a stale socket file; Windows named pipes are not files and must not be unlinked.
    if (process.platform !== 'win32' && fs.existsSync(this.config.ipcPath)) {
      fs.unlinkSync(this.config.ipcPath);
    }
    server.listen(this.config.ipcPath);
    this.ipcServer = server;
  }

  /** Dispatches a control message received on the IPC socket. */
  private handleIPCMessage(message: any): void {
    if (!message || typeof message !== 'object') {
      return;
    }
    switch (message.type) {
      case 'status':
        this.sendStatus();
        break;
      case 'restart':
        this.restartService();
        break;
      case 'shutdown':
        this.shutdown();
        break;
      case 'metrics':
        this.sendMetrics();
        break;
    }
  }

  /** Starts the child and the periodic monitoring and metrics timers. */
  async start(): Promise<void> {
    console.log(`Watchdog starting for ${this.config.serviceName}`);
    await this.startService();
    // Start monitoring
    setInterval(() => this.monitorService(), this.config.healthCheckInterval);
    setInterval(() => this.collectMetrics(), 5000);
  }

  /** Forks the service script unless a child is already being supervised. */
  private async startService(): Promise<void> {
    if (this.childProcess) {
      return;
    }
    try {
      const child = fork(this.config.scriptPath, [], {
        detached: false,
        env: {
          ...process.env,
          WATCHDOG_ENABLED: 'true',
          WATCHDOG_PID: String(process.pid)
        }
      });
      this.childProcess = child;
      // Samples from the previous process must not count against the new one.
      this.resetMetrics();
      child.on('exit', (code, signal) => {
        if (this.childProcess !== child) {
          // Late exit of a child that has already been replaced (e.g. after SIGKILL).
          console.log(`Previous service process exited with code ${code} and signal ${signal}`);
          return;
        }
        this.handleServiceExit(code, signal);
      });
      child.on('error', (error) => {
        this.handleServiceError(error);
      });
      child.on('message', (message) => {
        this.handleServiceMessage(message);
      });
      // Send parent message about successful start
      this.notifyParent({
        type: 'service_started',
        pid: child.pid
      });
      this.startTime = Date.now();
      console.log(`Service started with PID: ${child.pid}`);
    } catch (error) {
      console.error('Failed to start service:', error);
      this.handleServiceError(error instanceof Error ? error : new Error(String(error)));
    }
  }

  /**
   * Reacts to the supervised child exiting. Exits caused by shutdown() or
   * restartService() are not crashes: they neither consume a restart attempt
   * nor schedule a second start.
   */
  private handleServiceExit(code: number | null, signal: string | null): void {
    console.log(`Service exited with code ${code} and signal ${signal}`);
    this.childProcess = undefined;
    if (this.isShuttingDown || this.isRestarting) {
      return;
    }
    // Check if we should restart
    if (this.restartCount >= this.config.maxRestarts) {
      console.error('Maximum restart attempts reached');
      this.notifyParent({
        type: 'max_restarts_reached',
        restartCount: this.restartCount
      });
      process.exit(1);
    }
    // Restart after delay
    this.restartCount++;
    console.log(`Restarting service (attempt ${this.restartCount}/${this.config.maxRestarts})`);
    setTimeout(() => {
      this.startService();
    }, this.config.restartDelay);
    this.notifyParent({
      type: 'service_crashed',
      code,
      signal,
      restartCount: this.restartCount
    });
  }

  private handleServiceError(error: Error): void {
    console.error('Service error:', error);
    this.notifyParent({
      type: 'service_error',
      error: error.message
    });
  }

  /** Handles `health`, `metrics` and `ready` messages sent by the child. */
  private handleServiceMessage(message: any): void {
    if (!message || typeof message !== 'object') {
      return;
    }
    switch (message.type) {
      case 'health':
        this.updateHealth(message.data);
        break;
      case 'metrics':
        this.updateMetrics(message.data);
        break;
      case 'ready':
        console.log('Service reported ready');
        this.notifyParent({
          type: 'service_ready'
        });
        break;
    }
  }

  /** Restarts the child when it is unresponsive or above the memory/CPU thresholds. */
  private async monitorService(): Promise<void> {
    if (!this.childProcess || this.isShuttingDown || this.isRestarting) {
      return;
    }
    // Check if process is responsive
    const isResponsive = await this.checkResponsiveness();
    if (!isResponsive) {
      console.warn('Service is unresponsive, restarting...');
      await this.restartService();
      return;
    }
    // Check resource usage
    if (this.metrics.memory > this.config.memoryThreshold) {
      console.warn(`Memory usage (${this.metrics.memory}MB) exceeds threshold, restarting...`);
      await this.restartService();
      return;
    }
    // Check CPU usage (average over the last up-to-5 samples actually collected)
    const avgCpu = Watchdog.averageOf(this.cpuUsageHistory.slice(-5));
    if (avgCpu > this.config.cpuThreshold) {
      console.warn(`CPU usage (${avgCpu}%) exceeds threshold, restarting...`);
      await this.restartService();
      return;
    }
  }

  /**
   * Sends `ping` to the child and waits up to 5 s for `pong`.
   * The temporary message listener is removed on every outcome.
   */
  private async checkResponsiveness(): Promise<boolean> {
    return new Promise((resolve) => {
      const child = this.childProcess;
      if (!child || !child.connected) {
        resolve(false);
        return;
      }
      const finish = (responsive: boolean): void => {
        clearTimeout(timeout);
        child.off('message', messageHandler);
        resolve(responsive);
      };
      const messageHandler = (msg: any): void => {
        if (msg && msg.type === 'pong') {
          finish(true);
        }
      };
      const timeout = setTimeout(() => finish(false), 5000);
      child.on('message', messageHandler);
      try {
        child.send({ type: 'ping' });
      } catch {
        finish(false);
      }
    });
  }

  /** Samples memory and CPU for the child using the platform-specific collector. */
  private async collectMetrics(): Promise<void> {
    if (!this.childProcess || !this.childProcess.pid) {
      return;
    }
    try {
      const pid = this.childProcess.pid;
      // Get process info based on platform
      if (process.platform === 'linux') {
        await this.collectLinuxMetrics(pid);
      } else if (process.platform === 'win32') {
        await this.collectWindowsMetrics(pid);
      } else if (process.platform === 'darwin') {
        await this.collectMacMetrics(pid);
      }
      this.metrics.uptime = Date.now() - this.startTime;
    } catch (error) {
      console.error('Failed to collect metrics:', error);
    }
  }

  /**
   * Reads /proc/<pid>/status for resident memory and /proc/<pid>/stat for CPU time.
   * CPU usage is the CPU time consumed since the previous sample divided by the
   * wall-clock time between the two samples; the first sample only sets the baseline.
   */
  private async collectLinuxMetrics(pid: number): Promise<void> {
    const statPath = `/proc/${pid}/stat`;
    const statusPath = `/proc/${pid}/status`;
    try {
      const stat = await fs.promises.readFile(statPath, 'utf8');
      const status = await fs.promises.readFile(statusPath, 'utf8');
      // Parse memory from status
      const vmRssMatch = status.match(/VmRSS:\s+(\d+)\s+kB/);
      if (vmRssMatch) {
        this.metrics.memory = parseInt(vmRssMatch[1], 10) / 1024; // Convert to MB
      }
      const ticks = Watchdog.parseCpuTicks(stat);
      if (ticks === undefined) {
        return;
      }
      const now = Date.now();
      const previous = this.lastCpuSample;
      this.lastCpuSample = { pid, ticks, at: now };
      if (previous && previous.pid === pid && now > previous.at) {
        const cpuMs = ((ticks - previous.ticks) / Watchdog.CLOCK_TICKS_PER_SECOND) * 1000;
        this.recordCpuSample((cpuMs / (now - previous.at)) * 100);
      }
    } catch (error) {
      // Process might have died
    }
  }

  /** Queries wmic for the working-set size; CPU usage is not computed on Windows. */
  private async collectWindowsMetrics(pid: number): Promise<void> {
    const { exec } = require('child_process');
    return new Promise((resolve) => {
      exec(`wmic process where ProcessId=${pid} get WorkingSetSize,UserModeTime,KernelModeTime`, (error: Error | null, stdout: string) => {
        if (error) {
          resolve();
          return;
        }
        const lines = stdout.trim().split('\n');
        if (lines.length >= 2) {
          const values = lines[1].trim().split(/\s+/);
          if (values.length >= 3) {
            const bytes = parseInt(values[2], 10);
            if (!Number.isNaN(bytes)) {
              this.metrics.memory = bytes / 1024 / 1024; // Convert to MB
            }
            // CPU calculation would be more complex on Windows
          }
        }
        resolve();
      });
    });
  }

  /** Uses `ps -o rss,pcpu` for resident memory (kB) and CPU percentage. */
  private async collectMacMetrics(pid: number): Promise<void> {
    const { exec } = require('child_process');
    return new Promise((resolve) => {
      exec(`ps -p ${pid} -o rss,pcpu`, (error: Error | null, stdout: string) => {
        if (error) {
          resolve();
          return;
        }
        const lines = stdout.trim().split('\n');
        if (lines.length >= 2) {
          const values = lines[1].trim().split(/\s+/);
          if (values.length >= 2) {
            const rssKb = parseInt(values[0], 10);
            const cpu = parseFloat(values[1]);
            if (!Number.isNaN(rssKb)) {
              this.metrics.memory = rssKb / 1024; // Convert to MB
            }
            if (!Number.isNaN(cpu)) {
              this.recordCpuSample(cpu);
            }
          }
        }
        resolve();
      });
    });
  }

  /**
   * Replaces the child: asks it to shut down, force-kills it after 10 s, then
   * forks a new one. Concurrent calls and calls during shutdown are ignored.
   */
  private async restartService(): Promise<void> {
    if (this.isRestarting || this.isShuttingDown) {
      return;
    }
    console.log('Restarting service...');
    this.isRestarting = true;
    try {
      const child = this.childProcess;
      if (child) {
        await this.terminateChild(child, 10000);
      }
      if (this.isShuttingDown) {
        return;
      }
      // Clear the reference
      this.childProcess = undefined;
      // Start service again
      await this.startService();
    } finally {
      this.isRestarting = false;
    }
    this.notifyParent({
      type: 'service_restarted',
      pid: this.childProcess?.pid
    });
  }

  private updateHealth(health: any): void {
    // Update health status based on service report
  }

  private updateMetrics(metrics: any): void {
    // Update metrics based on service report
  }

  private sendStatus(): void {
    // Send current status via IPC
  }

  private sendMetrics(): void {
    // Send current metrics via IPC
  }

  /**
   * Stops the child (graceful request, SIGKILL after 30 s) and exits the watchdog.
   * @param exitCode Process exit code; 0 for a requested shutdown.
   */
  private async shutdown(exitCode: number = 0): Promise<void> {
    if (this.isShuttingDown) {
      // A second signal or IPC request while already shutting down is a no-op.
      return;
    }
    console.log('Watchdog shutting down...');
    this.isShuttingDown = true;
    const child = this.childProcess;
    if (child) {
      await this.terminateChild(child, 30000);
    }
    this.ipcServer?.close();
    process.exit(exitCode);
  }

  /**
   * Asks `child` to shut down and resolves when it exits, force-killing it once
   * `timeoutMs` elapses. Never rejects.
   */
  private terminateChild(child: ChildProcess, timeoutMs: number): Promise<void> {
    return new Promise<void>((resolve) => {
      if (child.exitCode !== null || child.signalCode !== null) {
        resolve();
        return;
      }
      const timeout = setTimeout(() => {
        // Force kill if not shut down gracefully
        child.kill('SIGKILL');
        resolve();
      }, timeoutMs);
      child.once('exit', () => {
        clearTimeout(timeout);
        resolve();
      });
      try {
        if (child.connected) {
          child.send({ type: 'shutdown' });
        } else {
          child.kill('SIGTERM');
        }
      } catch {
        child.kill('SIGTERM');
      }
    });
  }

  /** Forwards a message to the parent process if its IPC channel is still open. */
  private notifyParent(message: Record<string, unknown>): void {
    if (!process.send || !process.connected) {
      return;
    }
    try {
      process.send(message);
    } catch (error) {
      console.error('Failed to notify parent process:', error);
    }
  }

  /** Stores a CPU percentage sample, keeping only the most recent ones. */
  private recordCpuSample(cpuPercent: number): void {
    this.metrics.cpu = cpuPercent;
    this.cpuUsageHistory.push(cpuPercent);
    if (this.cpuUsageHistory.length > Watchdog.MAX_CPU_SAMPLES) {
      this.cpuUsageHistory.shift();
    }
  }

  /** Clears all samples so a freshly started child is judged only on its own data. */
  private resetMetrics(): void {
    this.metrics = { memory: 0, cpu: 0, handles: 0, uptime: 0 };
    this.cpuUsageHistory = [];
    this.lastCpuSample = undefined;
  }

  /** Arithmetic mean of `samples`, or 0 when there are none. */
  private static averageOf(samples: number[]): number {
    if (samples.length === 0) {
      return 0;
    }
    return samples.reduce((sum, value) => sum + value, 0) / samples.length;
  }

  /**
   * Extracts utime + stime (fields 14 and 15, in clock ticks) from the contents
   * of /proc/<pid>/stat. Parsing starts after the last ')' because the command
   * name in field 2 may itself contain spaces.
   * @returns the tick total, or undefined when the text cannot be parsed.
   */
  private static parseCpuTicks(stat: string): number | undefined {
    const commEnd = stat.lastIndexOf(')');
    if (commEnd === -1) {
      return undefined;
    }
    // fields[0] is field 3 (state), so fields 14/15 sit at indices 11/12.
    const fields = stat.slice(commEnd + 1).trim().split(/\s+/);
    const utime = parseInt(fields[11], 10);
    const stime = parseInt(fields[12], 10);
    if (Number.isNaN(utime) || Number.isNaN(stime)) {
      return undefined;
    }
    return utime + stime;
  }
}
// Main entry point

/**
 * Reads a non-negative integer from the environment.
 * Falls back when the variable is unset, empty, negative or not a number, so a
 * typo cannot produce NaN limits (which would disable the restart cap and turn
 * the restart delay into an immediate retry).
 */
function readIntEnv(name: string, fallback: number): number {
  const raw = process.env[name];
  if (raw === undefined || raw.trim() === '') {
    return fallback;
  }
  const parsed = Number.parseInt(raw, 10);
  return Number.isNaN(parsed) || parsed < 0 ? fallback : parsed;
}

const config: WatchdogConfig = {
  serviceName: process.env.SERVICE_NAME || 'claude-code-router',
  scriptPath: path.join(__dirname, '..', 'server.js'),
  ipcPath: process.env.IPC_PATH || '/tmp/claude-code-router.sock',
  maxRestarts: readIntEnv('MAX_RESTARTS', 5),
  restartDelay: readIntEnv('RESTART_DELAY', 5000),
  healthCheckInterval: readIntEnv('HEALTH_CHECK_INTERVAL', 10000),
  memoryThreshold: readIntEnv('MEMORY_THRESHOLD', 512), // MB
  cpuThreshold: readIntEnv('CPU_THRESHOLD', 80) // percentage
};
const watchdog = new Watchdog(config);
watchdog.start().catch((error) => {
  console.error('Failed to start watchdog:', error);
  process.exit(1);
});
```
### Connection Manager
**File: `src/services/ConnectionManager.ts`**
```typescript
// src/services/ConnectionManager.ts
import { EventEmitter } from 'events';
import * as http from 'http';
import * as net from 'net';
/**
 * Bookkeeping record kept for every tracked connection.
 */
export interface ConnectionInfo {
// Identifier produced by ConnectionManager.generateConnectionId().
id: string;
// Kind of connection as declared by the caller of trackConnection().
type: 'http' | 'websocket' | 'tcp';
// The socket's remoteAddress, or 'unknown' when it is not available.
remoteAddress: string;
// Date.now() at the moment the connection was tracked.
startTime: number;
// Date.now() of the last 'data' event seen on the underlying socket.
lastActivity: number;
}
/**
 * Tracks open connections so the server can shut down gracefully: new
 * connections are refused once shutdown begins, existing ones get time to
 * finish, and whatever remains is destroyed.
 *
 * Events emitted: `connection:added`, `connection:removed` (once per tracked
 * id), `shutdown:started`, `shutdown:completed`.
 */
export class ConnectionManager extends EventEmitter {
  private connections: Map<string, net.Socket | http.IncomingMessage> = new Map();
  private connectionInfo: Map<string, ConnectionInfo> = new Map();
  // Per-connection callbacks that detach the socket listeners added by trackConnection().
  private listenerCleanups: Map<string, () => void> = new Map();
  private isShuttingDown: boolean = false;
  private shutdownTimeout: number = 30000; // 30 seconds default
  private drainTimeout: number = 5000; // 5 seconds to drain new connections
  private connectionIdCounter: number = 0;

  /**
   * @param options Optional timeouts in milliseconds. Any finite value >= 0 is
   *   honoured, including 0 (no waiting).
   */
  constructor(options?: { shutdownTimeout?: number; drainTimeout?: number }) {
    super();
    if (ConnectionManager.isValidTimeout(options?.shutdownTimeout)) {
      this.shutdownTimeout = options.shutdownTimeout;
    }
    if (ConnectionManager.isValidTimeout(options?.drainTimeout)) {
      this.drainTimeout = options.drainTimeout;
    }
  }

  /**
   * Track a new connection.
   * @returns the connection id, or '' when the connection was rejected because
   *   shutdown is in progress.
   */
  trackConnection(connection: net.Socket | http.IncomingMessage, type: 'http' | 'websocket' | 'tcp' = 'http'): string {
    if (this.isShuttingDown) {
      // Reject new connections during shutdown
      this.rejectConnection(connection);
      return '';
    }
    const id = this.generateConnectionId();
    const socket = this.getSocket(connection);
    this.connections.set(id, connection);
    this.connectionInfo.set(id, {
      id,
      type,
      remoteAddress: socket.remoteAddress || 'unknown',
      startTime: Date.now(),
      lastActivity: Date.now()
    });
    // Set up cleanup on connection close
    const onGone = (): void => {
      this.untrackConnection(id);
    };
    // Track activity
    const onData = (): void => {
      const info = this.connectionInfo.get(id);
      if (info) {
        info.lastActivity = Date.now();
      }
    };
    socket.once('close', onGone);
    socket.once('error', onGone);
    socket.on('data', onData);
    // A keep-alive socket is tracked once per request; detach on untrack so
    // listeners do not pile up on it.
    this.listenerCleanups.set(id, () => {
      socket.removeListener('close', onGone);
      socket.removeListener('error', onGone);
      socket.removeListener('data', onData);
    });
    this.emit('connection:added', id);
    return id;
  }

  /**
   * Untrack a connection. Safe to call more than once: only the first call for
   * a tracked id detaches listeners and emits `connection:removed`.
   */
  untrackConnection(id: string): void {
    const wasTracked = this.connections.delete(id);
    this.connectionInfo.delete(id);
    const cleanup = this.listenerCleanups.get(id);
    if (cleanup) {
      this.listenerCleanups.delete(id);
      cleanup();
    }
    if (wasTracked) {
      this.emit('connection:removed', id);
    }
  }

  /**
   * Get active connection count
   */
  getActiveCount(): number {
    return this.connections.size;
  }

  /**
   * Get connection statistics: totals, counts per type, and how long each
   * tracked connection has been open (ms).
   */
  getStats(): {
    total: number;
    byType: Record<string, number>;
    averageDuration: number;
    activeDurations: number[];
  } {
    const byType: Record<string, number> = {};
    const durations: number[] = [];
    const now = Date.now();
    for (const info of this.connectionInfo.values()) {
      byType[info.type] = (byType[info.type] || 0) + 1;
      durations.push(now - info.startTime);
    }
    return {
      total: this.connections.size,
      byType,
      averageDuration: durations.length > 0
        ? durations.reduce((a, b) => a + b, 0) / durations.length
        : 0,
      activeDurations: durations
    };
  }

  /**
   * Start graceful shutdown. Subsequent calls return immediately.
   */
  async gracefulShutdown(): Promise<void> {
    if (this.isShuttingDown) {
      return;
    }
    this.isShuttingDown = true;
    this.emit('shutdown:started');
    console.log(`Starting graceful shutdown with ${this.connections.size} active connections`);
    // Phase 1: Stop accepting new connections (handled by isShuttingDown flag)
    // Phase 2: Send connection close headers to HTTP connections
    await this.drainHttpConnections();
    // Phase 3: Wait for connections to close naturally
    await this.waitForConnectionsToClose();
    // Phase 4: Force close remaining connections
    await this.forceCloseConnections();
    this.emit('shutdown:completed');
    console.log('Graceful shutdown completed');
  }

  /**
   * Drain HTTP connections by sending Connection: close, then wait
   * `drainTimeout` ms. Returns immediately when nothing is tracked.
   */
  private async drainHttpConnections(): Promise<void> {
    if (this.connections.size === 0) {
      return;
    }
    for (const [id, info] of this.connectionInfo.entries()) {
      if (info.type !== 'http') {
        continue;
      }
      const connection = this.connections.get(id);
      // NOTE(review): only objects exposing setHeader (responses) are affected;
      // a tracked http.IncomingMessage does not have it — confirm what callers pass.
      if (connection && 'setHeader' in connection) {
        try {
          (connection as any).setHeader('Connection', 'close');
        } catch (error) {
          // Header already sent, ignore
        }
      }
    }
    // Give connections time to receive the close header
    await new Promise(resolve => setTimeout(resolve, this.drainTimeout));
  }

  /**
   * Wait for connections to close naturally, up to `shutdownTimeout` ms.
   */
  private async waitForConnectionsToClose(): Promise<void> {
    const startTime = Date.now();
    const checkInterval = 100; // Check every 100ms
    let lastProgressLog = startTime;
    return new Promise((resolve) => {
      const checkConnections = () => {
        const now = Date.now();
        const elapsed = now - startTime;
        if (this.connections.size === 0) {
          console.log('All connections closed naturally');
          resolve();
          return;
        }
        if (elapsed >= this.shutdownTimeout) {
          console.log(`Shutdown timeout reached with ${this.connections.size} connections remaining`);
          resolve();
          return;
        }
        // Log progress about once per second; timers never land on exact
        // multiples of 1000 ms, so compare against the time of the last log.
        if (now - lastProgressLog >= 1000) {
          lastProgressLog = now;
          console.log(`Waiting for ${this.connections.size} connections to close (${elapsed}ms elapsed)`);
        }
        setTimeout(checkConnections, checkInterval);
      };
      checkConnections();
    });
  }

  /**
   * Force close remaining connections
   */
  private async forceCloseConnections(): Promise<void> {
    if (this.connections.size === 0) {
      return;
    }
    console.log(`Force closing ${this.connections.size} remaining connections`);
    const pending = Array.from(this.connections.entries()).map(
      ([id, connection]) => this.forceCloseConnection(id, connection)
    );
    await Promise.all(pending);
  }

  /**
   * Force close a single connection. Resolves when the socket reports 'close',
   * immediately if it is already destroyed, or after 1 s at the latest; the
   * connection is untracked in every case.
   */
  private async forceCloseConnection(id: string, connection: net.Socket | http.IncomingMessage): Promise<void> {
    return new Promise((resolve) => {
      const socket = this.getSocket(connection);
      const info = this.connectionInfo.get(id);
      if (info) {
        console.log(`Force closing ${info.type} connection from ${info.remoteAddress} (active for ${Date.now() - info.startTime}ms)`);
      }
      let settled = false;
      const finish = (): void => {
        if (settled) return;
        settled = true;
        clearTimeout(destroyTimeout);
        socket.removeListener('close', finish);
        this.untrackConnection(id);
        resolve();
      };
      // Set a timeout for the destroy operation
      const destroyTimeout = setTimeout(finish, 1000);
      if (socket.destroyed) {
        // 'close' will not fire again for a socket that is already gone.
        finish();
        return;
      }
      socket.once('close', finish);
      try {
        socket.destroy();
      } catch (error) {
        // Socket already destroyed
        finish();
      }
    });
  }

  /**
   * Reject new connections during shutdown: answer 503 when the object can
   * write a response, then destroy the socket.
   */
  private rejectConnection(connection: net.Socket | http.IncomingMessage): void {
    const socket = this.getSocket(connection);
    if ('writeHead' in connection && typeof (connection as any).writeHead === 'function') {
      // HTTP response
      try {
        (connection as any).writeHead(503, {
          'Content-Type': 'text/plain',
          'Connection': 'close',
          'Retry-After': '60'
        });
        (connection as any).end('Service is shutting down');
      } catch (error) {
        // Headers already sent
      }
    }
    try {
      socket.destroy();
    } catch (error) {
      // Already destroyed
    }
  }

  /**
   * Get the underlying socket from a connection
   */
  private getSocket(connection: net.Socket | http.IncomingMessage): net.Socket {
    if ('socket' in connection && connection.socket) {
      return connection.socket;
    }
    return connection as net.Socket;
  }

  /**
   * Generate a unique connection ID
   */
  private generateConnectionId(): string {
    return `conn_${Date.now()}_${++this.connectionIdCounter}`;
  }

  /** True for finite, non-negative timeout values. */
  private static isValidTimeout(value: number | undefined): value is number {
    return typeof value === 'number' && Number.isFinite(value) && value >= 0;
  }

  /**
   * Monitor connection health. More than 1000 connections marks the manager
   * unhealthy; many long-lived connections are reported as an issue only.
   */
  monitorHealth(): {
    healthy: boolean;
    issues: string[];
    metrics: {
      activeConnections: number;
      connectionRate: number;
      errorRate: number;
    };
  } {
    const stats = this.getStats();
    const issues: string[] = [];
    let healthy = true;
    // Check for too many connections
    if (stats.total > 1000) {
      issues.push(`High connection count: ${stats.total}`);
      healthy = false;
    }
    // Check for long-lived connections
    const longConnections = stats.activeDurations.filter(d => d > 300000); // 5 minutes
    if (longConnections.length > stats.total * 0.1) {
      issues.push(`${longConnections.length} connections active for >5 minutes`);
    }
    return {
      healthy,
      issues,
      metrics: {
        activeConnections: stats.total,
        connectionRate: 0, // Would need to track this over time
        errorRate: 0 // Would need to track errors
      }
    };
  }
}
// Singleton instance
// Shared ConnectionManager built with the default timeouts (30 s shutdown, 5 s drain).
// NOTE(review): the server class in this guide constructs its own ConnectionManager —
// confirm which instance callers are expected to use.
export const connectionManager = new ConnectionManager();
```
### Updated Server
**File: `src/server.ts`**
```typescript
// src/server.ts
import express, { Application, Request, Response, NextFunction } from 'express';
import * as http from 'http';
import * as net from 'net';
import { ConnectionManager } from './services/ConnectionManager';
import { HealthMonitor } from './services/HealthMonitor';
import { CONFIG_FILE } from './constants';
import { router } from './utils/router';
/**
 * Options accepted by the ClaudeCodeRouterServer constructor.
 */
export interface ServerConfig {
// NOTE(review): presumably the TCP port to listen on — not read in the code shown here.
port: number;
// NOTE(review): not read in the code shown here; presumably the path of the JSON config file — confirm.
jsonPath: string;
// Passed unchanged to router() for every request.
initialConfig: any;
// Milliseconds handed to ConnectionManager as its shutdown timeout; 30000 when omitted.
gracefulShutdownTimeout?: number;
}
export class ClaudeCodeRouterServer {
private app: Application;
private server?: http.Server;
private config: ServerConfig;
private connectionManager: ConnectionManager;
private healthMonitor: HealthMonitor;
private isShuttingDown: boolean = false;
private startTime: number = Date.now();
/**
 * Builds the Express application and wires middleware, health endpoints,
 * the catch-all router and the watchdog IPC handlers.
 * @param config Server settings; `gracefulShutdownTimeout` defaults to 30000 ms.
 */
constructor(config: ServerConfig) {
  this.config = config;
  this.app = express();
  this.connectionManager = new ConnectionManager({
    shutdownTimeout: config.gracefulShutdownTimeout ?? 30000
  });
  this.healthMonitor = new HealthMonitor();
  this.setupMiddleware();
  // Express matches handlers in registration order. The router is mounted on '/'
  // and is not given `next`, so anything registered after it is unreachable;
  // the health/metrics endpoints therefore have to be registered first.
  this.setupHealthEndpoints();
  this.setupRoutes();
  this.setupIPCHandlers();
}
/**
 * Registers the middleware that runs before every route: JSON body parsing,
 * connection tracking (with a 503 answer during shutdown), request logging,
 * and an error handler for failures raised by those steps.
 */
private setupMiddleware(): void {
  // Parse JSON bodies
  this.app.use(express.json({ limit: '50mb' }));
  // Track connections
  this.app.use((req: Request, res: Response, next: NextFunction) => {
    if (this.isShuttingDown) {
      // Same headers ConnectionManager.rejectConnection() sends, so clients and
      // proxies get a machine-readable back-off hint and drop the connection.
      res.set({ 'Retry-After': '60', Connection: 'close' });
      res.status(503).json({
        error: 'Service is shutting down',
        retryAfter: 60
      });
      return;
    }
    const connectionId = this.connectionManager.trackConnection(req, 'http');
    res.locals.connectionId = connectionId;
    // Clean up on response finish
    res.on('finish', () => {
      if (connectionId) {
        this.connectionManager.untrackConnection(connectionId);
      }
    });
    next();
  });
  // Request logging
  this.app.use((req: Request, res: Response, next: NextFunction) => {
    const start = Date.now();
    res.on('finish', () => {
      const duration = Date.now() - start;
      console.log(`${req.method} ${req.path} - ${res.statusCode} (${duration}ms)`);
      // Track metrics
      this.healthMonitor.recordRequest({
        method: req.method,
        path: req.path,
        statusCode: res.statusCode,
        duration
      });
    });
    next();
  });
  // Error handling. Express only routes an error to handlers registered AFTER
  // the point of failure, so this one covers the middleware above (for example
  // a malformed JSON body) but not routes registered later.
  this.app.use((err: Error, req: Request, res: Response, next: NextFunction) => {
    console.error('Request error:', err);
    this.healthMonitor.recordError(err);
    res.status(500).json({
      error: 'Internal server error',
      message: process.env.NODE_ENV === 'development' ? err.message : undefined
    });
  });
}
/**
 * Mounts the catch-all router and, after it, the error handler that turns
 * routing failures into a JSON 500 response.
 */
private setupRoutes(): void {
  // Main routing logic
  this.app.use('/', async (req: Request, res: Response, next: NextFunction) => {
    try {
      await router(req, res, this.config.initialConfig);
    } catch (error) {
      next(error);
    }
  });
  // Express hands an error only to error middleware registered after the failing
  // handler, so the handler for routing errors has to follow the route itself.
  this.app.use((err: Error, req: Request, res: Response, next: NextFunction) => {
    console.error('Request error:', err);
    this.healthMonitor.recordError(err);
    if (res.headersSent) {
      // The response has already started; let Express close the connection.
      next(err);
      return;
    }
    res.status(500).json({
      error: 'Internal server error',
      message: process.env.NODE_ENV === 'development' ? err.message : undefined
    });
  });
}
/**
 * Registers the operational endpoints:
 * `/health` (200 when healthy, otherwise 503), `/health/detailed`,
 * `/ready`, `/alive` and `/metrics` (Prometheus text format).
 */
private setupHealthEndpoints(): void {
  // Basic health check
  this.app.get('/health', (req: Request, res: Response) => {
    const health = this.healthMonitor.getHealth();
    const status = health.status === 'healthy' ? 200 : 503;
    res.status(status).json(health);
  });
  // Detailed health check
  this.app.get('/health/detailed', (req: Request, res: Response) => {
    const health = this.healthMonitor.getDetailedHealth();
    const connections = this.connectionManager.getStats();
    res.json({
      ...health,
      connections,
      server: {
        uptime: Date.now() - this.startTime,
        version: process.env.npm_package_version || 'unknown',
        node: process.version,
        pid: process.pid,
        memory: process.memoryUsage(),
        cpu: process.cpuUsage()
      }
    });
  });
  // Readiness check
  this.app.get('/ready', (req: Request, res: Response) => {
    if (this.isShuttingDown) {
      res.status(503).json({ ready: false, reason: 'shutting_down' });
      return;
    }
    const health = this.healthMonitor.getHealth();
    if (health.status !== 'healthy') {
      res.status(503).json({ ready: false, reason: 'unhealthy' });
      return;
    }
    res.json({ ready: true });
  });
  // Liveness check
  this.app.get('/alive', (req: Request, res: Response) => {
    res.json({ alive: true, pid: process.pid });
  });
  // Metrics endpoint
  this.app.get('/metrics', (req: Request, res: Response) => {
    const metrics = this.healthMonitor.getMetrics();
    const connections = this.connectionManager.getStats();
    // Prometheus text exposition format
    const lines = [
      `# HELP http_requests_total Total number of HTTP requests`,
      `# TYPE http_requests_total counter`,
      `http_requests_total ${metrics.totalRequests}`,
      '',
      `# HELP http_errors_total Total number of HTTP errors`,
      `# TYPE http_errors_total counter`,
      `http_errors_total ${metrics.totalErrors}`,
      '',
      `# HELP http_request_duration_seconds HTTP request latencies`,
      // Only _sum and _count are exported; a histogram would also need _bucket
      // series, so the family is declared as a summary.
      `# TYPE http_request_duration_seconds summary`,
      `http_request_duration_seconds_sum ${metrics.totalDuration / 1000}`,
      `http_request_duration_seconds_count ${metrics.totalRequests}`,
      '',
      `# HELP active_connections Number of active connections`,
      `# TYPE active_connections gauge`,
      `active_connections ${connections.total}`,
      '',
      `# HELP process_uptime_seconds Process uptime`,
      // Uptime is a point-in-time value that resets on restart, hence a gauge.
      `# TYPE process_uptime_seconds gauge`,
      `process_uptime_seconds ${(Date.now() - this.startTime) / 1000}`
    ];
    // The format requires the final line to end with a line feed.
    const output = `${lines.join('\n')}\n`;
    res.set('Content-Type', 'text/plain; version=0.0.4; charset=utf-8').send(output);
  });
}
/**
 * Wires up control channels for this process:
 * - IPC messages from the parent (watchdog): `ping`, `shutdown`, `health`,
 *   `metrics`; any other message type is ignored.
 * - On non-Windows platforms, the Unix-socket admin server.
 */
private setupIPCHandlers(): void {
  // Sends a reply to the parent when an IPC channel exists. The payload is
  // built lazily so nothing is computed when there is nobody to answer.
  const reply = (build: () => Record<string, unknown>): void => {
    if (process.send) {
      process.send(build());
    }
  };

  // Handle messages from watchdog
  process.on('message', (message: unknown) => {
    // Anything that is not an object cannot carry a `type`; reading a
    // property off null would throw inside the event handler.
    if (message === null || typeof message !== 'object') {
      return;
    }

    switch ((message as { type?: unknown }).type) {
      case 'ping':
        reply(() => ({ type: 'pong' }));
        break;
      case 'shutdown':
        // Fire-and-forget, but never leave the promise unobserved: a rejected
        // shutdown would otherwise surface as an unhandled rejection.
        this.gracefulShutdown().catch((error: unknown) => {
          console.error('Graceful shutdown failed:', error);
        });
        break;
      case 'health':
        reply(() => ({
          type: 'health',
          data: this.healthMonitor.getHealth()
        }));
        break;
      case 'metrics':
        reply(() => ({
          type: 'metrics',
          data: this.healthMonitor.getMetrics()
        }));
        break;
    }
  });

  // Setup IPC server for admin commands
  if (process.platform !== 'win32') {
    this.setupUnixSocketServer();
  }
}
/**
 * Starts the admin command server on a Unix domain socket. Each chunk
 * received from a client is parsed as one JSON command and dispatched to
 * `handleAdminCommand`; unparsable input gets an `Invalid command` reply.
 *
 * NOTE(review): a single 'data' event is assumed to contain one complete JSON
 * document; large or fragmented commands would need framing — confirm
 * against the admin client.
 */
private setupUnixSocketServer(): void {
  const socketPath = `/tmp/claude-code-router-admin.sock`;

  // Clean up existing socket
  try {
    require('fs').unlinkSync(socketPath);
  } catch (error) {
    // A missing file is the normal case. Anything else (e.g. permissions) is
    // reported but not fatal: listen() below surfaces the real failure.
    const code = (error as { code?: unknown } | null)?.code;
    if (code !== 'ENOENT') {
      console.warn(`Could not remove stale admin socket ${socketPath}:`, error);
    }
  }

  const adminServer = net.createServer((socket) => {
    // Without a listener, a socket 'error' (e.g. the client resetting the
    // connection) is thrown as an uncaught exception and kills the process.
    socket.on('error', (error: Error) => {
      console.error('Admin socket error:', error);
    });

    socket.on('data', (data) => {
      try {
        const command = JSON.parse(data.toString());
        this.handleAdminCommand(command, socket);
      } catch (error) {
        socket.write(JSON.stringify({ error: 'Invalid command' }));
        socket.end();
      }
    });
  });

  // Same reasoning for the server itself: a failed listen (address in use,
  // permission denied) is emitted as 'error' and must not crash the router.
  adminServer.on('error', (error: Error) => {
    console.error('Admin server error:', error);
  });

  adminServer.listen(socketPath);
}
/**
 * Executes one admin command and answers on the given socket.
 *
 * Supported `command.type` values are `status`, `reload-config` and
 * `clear-cache`; anything else yields an `Unknown command` error. Exactly one
 * JSON document is written, after which the socket is ended.
 *
 * NOTE(review): `reload-config` always answers `reloaded`, because
 * `reloadConfiguration` logs failures instead of throwing.
 *
 * @param command Parsed command object; only its `type` field is read.
 * @param socket  Client connection that receives the reply.
 */
private handleAdminCommand(command: any, socket: net.Socket): void {
  const requested = command.type;
  let answer: Record<string, unknown>;

  if (requested === 'status') {
    answer = {
      status: 'ok',
      health: this.healthMonitor.getHealth(),
      connections: this.connectionManager.getStats()
    };
  } else if (requested === 'reload-config') {
    // Swap in the configuration from disk before acknowledging.
    this.reloadConfiguration();
    answer = { status: 'reloaded' };
  } else if (requested === 'clear-cache') {
    // No cache is cleared yet; the command is only acknowledged.
    answer = { status: 'cleared' };
  } else {
    answer = { error: 'Unknown command' };
  }

  socket.write(JSON.stringify(answer));
  socket.end();
}
/**
 * Re-reads the configuration module at `config.jsonPath` and installs it as
 * `config.initialConfig` without restarting the process. Failures are logged
 * and leave the current configuration in place.
 */
private reloadConfiguration(): void {
  try {
    // Evict the cached module so the next require() reads from disk again.
    const cacheKey = require.resolve(this.config.jsonPath);
    delete require.cache[cacheKey];

    const freshConfig = require(this.config.jsonPath);
    this.config.initialConfig = freshConfig;

    console.log('Configuration reloaded successfully');
  } catch (reloadError) {
    console.error('Failed to reload configuration:', reloadError);
  }
}
/**
 * Starts listening on the configured port.
 *
 * The returned promise resolves once the server is listening (after a
 * `ready` message has been sent to the parent process, when an IPC channel
 * exists) and rejects if the server emits `error` or if setting it up throws.
 */
async start(): Promise<void> {
  return new Promise((resolve, reject) => {
    // Runs once the port is bound.
    const onListening = (): void => {
      console.log(`Claude Code Router running on port ${this.config.port}`);
      // Notify watchdog that we're ready
      if (process.send) {
        process.send({ type: 'ready' });
      }
      resolve();
    };

    // Registers every accepted TCP socket with the connection manager.
    const onConnection = (socket: net.Socket): void => {
      this.connectionManager.trackConnection(socket, 'tcp');
    };

    // Logs and records server-level failures; rejects the startup promise.
    const onServerError = (error: Error): void => {
      console.error('Server error:', error);
      this.healthMonitor.recordError(error);
      reject(error);
    };

    try {
      this.server = this.app.listen(this.config.port, onListening);
      this.server.on('connection', onConnection);
      this.server.on('error', onServerError);
    } catch (setupError) {
      reject(setupError);
    }
  });
}
/**
 * Public entry point for stopping the server; delegates to the graceful
 * shutdown routine and settles when it does.
 */
async stop(): Promise<void> {
  return this.gracefulShutdown();
}
/**
 * Shuts the process down in order: flag the shutdown (so new requests are
 * rejected), stop accepting connections, let the connection manager drain
 * existing ones, then exit.
 *
 * Calls made while a shutdown is already in progress return immediately.
 * The process exits with code 0 on success and 1 when draining fails, so a
 * failed drain can never leave the process stuck in the shutting-down state.
 */
private async gracefulShutdown(): Promise<void> {
  if (this.isShuttingDown) {
    return;
  }

  console.log('Starting graceful shutdown...');
  this.isShuttingDown = true;

  let exitCode = 0;
  try {
    // Stop accepting new connections
    if (this.server) {
      this.server.close((closeError?: Error) => {
        // close() reports an error when the server was not listening.
        if (closeError) {
          console.error('Error while closing server:', closeError);
          return;
        }
        console.log('Server stopped accepting new connections');
      });
    }

    // Wait for existing connections to close
    await this.connectionManager.gracefulShutdown();

    // Final cleanup
    console.log('Shutdown complete');
  } catch (error) {
    console.error('Graceful shutdown failed:', error);
    exitCode = 1;
  }

  process.exit(exitCode);
}
}
/**
 * In-memory collector for request and error statistics, and the source of
 * the health verdict derived from them.
 *
 * A request counts as an error when its status code is 500 or above. The
 * service is reported `unhealthy` when the lifetime error rate exceeds 10%
 * or when more than 10 errors were recorded in the last 60 seconds. Only the
 * 100 most recent errors are retained.
 */
class HealthMonitor {
  // The shape is declared as a type annotation: optional members such as
  // `lastError` cannot be expressed inside an object-literal initializer.
  private metrics: {
    totalRequests: number;
    totalErrors: number;
    totalDuration: number;
    statusCodes: Map<number, number>;
    errors: Array<{ timestamp: number; error: string }>;
    lastError?: { timestamp: number; error: string };
  } = {
    totalRequests: 0,
    totalErrors: 0,
    totalDuration: 0,
    statusCodes: new Map<number, number>(),
    errors: []
  };

  /**
   * Records one completed request.
   *
   * @param data Request outcome; `duration` is added to the running total
   *             and `statusCode` is tallied per code.
   */
  recordRequest(data: {
    method: string;
    path: string;
    statusCode: number;
    duration: number;
  }): void {
    this.metrics.totalRequests++;
    this.metrics.totalDuration += data.duration;

    const count = this.metrics.statusCodes.get(data.statusCode) ?? 0;
    this.metrics.statusCodes.set(data.statusCode, count + 1);

    if (data.statusCode >= 500) {
      this.metrics.totalErrors++;
    }
  }

  /**
   * Appends an error to the recent-error history and remembers it as the
   * last error.
   *
   * @param error The error to record; only its message is stored.
   */
  recordError(error: Error): void {
    const errorData = {
      timestamp: Date.now(),
      error: error.message
    };
    this.metrics.errors.push(errorData);
    this.metrics.lastError = errorData;

    // Keep only last 100 errors
    if (this.metrics.errors.length > 100) {
      this.metrics.errors.shift();
    }
  }

  /**
   * Evaluates the health checks.
   *
   * @returns `status` (`healthy` or `unhealthy`) and a per-check result for
   *          `errorRate` and `recentErrors`.
   */
  getHealth(): { status: string; checks: any } {
    const errorRate = this.metrics.totalRequests > 0
      ? this.metrics.totalErrors / this.metrics.totalRequests
      : 0;

    // Errors recorded within the last minute.
    const cutoff = Date.now() - 60000;
    const recentErrors = this.metrics.errors.filter(
      e => e.timestamp > cutoff
    ).length;

    let status = 'healthy';
    const checks: any = {};

    // Check error rate
    if (errorRate > 0.1) {
      status = 'unhealthy';
      checks.errorRate = { status: 'fail', message: `Error rate ${(errorRate * 100).toFixed(2)}%` };
    } else {
      checks.errorRate = { status: 'pass' };
    }

    // Check recent errors
    if (recentErrors > 10) {
      status = 'unhealthy';
      checks.recentErrors = { status: 'fail', message: `${recentErrors} errors in last minute` };
    } else {
      checks.recentErrors = { status: 'pass' };
    }

    return { status, checks };
  }

  /**
   * @returns The health verdict together with the metrics snapshot and the
   *          ten most recently recorded errors.
   */
  getDetailedHealth(): any {
    return {
      ...this.getHealth(),
      metrics: this.getMetrics(),
      recentErrors: this.metrics.errors.slice(-10)
    };
  }

  /**
   * @returns A snapshot of the counters; `averageLatency` is the mean
   *          recorded duration (0 when no request was recorded) and
   *          `statusCodes` is a plain object keyed by status code.
   */
  getMetrics(): any {
    return {
      totalRequests: this.metrics.totalRequests,
      totalErrors: this.metrics.totalErrors,
      totalDuration: this.metrics.totalDuration,
      averageLatency: this.metrics.totalRequests > 0
        ? this.metrics.totalDuration / this.metrics.totalRequests
        : 0,
      statusCodes: Object.fromEntries(this.metrics.statusCodes),
      lastError: this.metrics.lastError
    };
  }
}
/**
 * Factory kept for backward compatibility: builds a router server from the
 * given configuration.
 *
 * @param config Server configuration handed to the constructor unchanged.
 * @returns A newly constructed server instance.
 */
export function createServer(config: ServerConfig): ClaudeCodeRouterServer {
  const instance = new ClaudeCodeRouterServer(config);
  return instance;
}
```
### CLI Integration
**File: `src/cli-service.ts`**
```typescript
// src/cli-service.ts
import { ServiceManager } from './services/ServiceManager';
import * as path from 'path';
import * as fs from 'fs';
import { spawn } from 'child_process';
// Usage summary for the service CLI: lists the supported commands
// (install, uninstall, start, stop, restart, status, logs, health) and the
// available options (--port, --config, --help).
const HELP_TEXT = `
Claude Code Router Service Manager
Commands:
install Install as system service
uninstall Remove system service
start Start the service
stop Stop the service
restart Restart the service
status Show service status
logs Show service logs
health Check service health
Options:
--port Port to run on (default: 3456)
--config Path to config file
--help Show this help message
`;
class CLIServiceManager {
private serviceManager: ServiceManager;
private serviceName = 'claude-code-router';
constructor() {
const scriptPath = path.join(__dirname, 'server-wrapper.js');
this.serviceManager = new ServiceManager({
name: this.serviceName,
displayName: 'Claude Code Router',
description: 'Routes Claude Code requests to different LLM providers',
script: scriptPath,
env: {
NODE_ENV: 'production',
SERVICE_PORT: process.env.PORT || '3456',
CONFIG_PATH: process.env.CONFIG_PATH || path.join(process.env.HOME || '', '.claude-code-router', 'config.json')
},
maxRestarts: 5,
restartDelay: 5000,
gracefulShutdownTimeout: 30000
});
this.setupEventHandlers();
}
private setupEventHandlers(): void {
this.serviceManager.on('installed', () => {
console.log('✅ Service installed successfully');
console.log('Run "ccr service start" to start the service');
});
this.serviceManager.on('uninstalled', () => {
console.log('✅ Service uninstalled successfully');
});
this.serviceManager.on('started', () => {
console.log('✅ Service started successfully');
});
this.serviceManager.on('stopped', () => {
console.log('✅ Service stopped successfully');
});
this.serviceManager.on('restarted', () => {
console.log('✅ Service restarted successfully');
});
this.serviceManager.on('crashed', (error) => {
console.error('❌ Service crashed:', error);
});
this.serviceManager.on('failed', (reason) => {
console.error('❌ Service failed:', reason);
});
}
async execute(command: string, args: string[]): Promise<void> {
try {
switch (command) {
case 'instal