UNPKG

graceful-cluster

Version:

Gracefully restart node.js http cluster with zero downtime. Shutdown server without active inbound connections reset.

222 lines (175 loc) 7.47 kB
var cluster = require('cluster'); var numCPUs = require('os').cpus().length; var GracefulCluster = module.exports; /* Starts node.js cluster with graceful restart/shutdown. Params: - options.serverFunction - required, function with worker logic. - options.log - function, custom log function, console.log used by default. - options.shutdownTimeout - ms, force worker shutdown on SIGTERM timeout. - options.disableGraceful - disable graceful shutdown for faster debug. - options.restartOnMemory - bytes, restart worker on memory usage. - options.restartOnTimeout - ms, restart worker by timer. - options.workersCount - workers count, if not specified `os.cpus().length` will be used. Graceful restart performed by USR2 signal: pkill -USR2 <cluster_process_name> or kill -s SIGUSR2 <cluster_pid> */ GracefulCluster.start = function(options) { var serverFunction = options.serverFunction; if (!serverFunction) { throw new Error('Graceful cluster: `options.serverFunction` required.'); } var exitFunction = options.exitFunction || (function gracefulClusterExit () { process.exit(0); }); var log = options.log || console.log; var shutdownTimeout = options.shutdownTimeout || 5000; var disableGraceful = options.disableGraceful; var workersCount = options.workersCount || numCPUs; if (cluster.isMaster) { var currentRestartingPid = null; var currentWorkersCount = 0; var listeningWorkersCount = 0; var restartQueue = []; var shutdownTimer = null; var sigkill = false; // Prevent killing all workers at same time when restarting. function checkRestartQueue() { // Kill one worker only if maximum count are working. if (restartQueue.length > 0 && listeningWorkersCount === workersCount && !currentRestartingPid) { var pid = restartQueue.shift(); try { // Store process id to wait for its finish. currentRestartingPid = pid; // Send SIGTERM signal to worker. SIGTERM starts graceful shutdown of worker inside it. process.kill(pid); } catch(ex) { // Reset current pid. currentRestartingPid = null; // If no process killed, try next in queue. process.nextTick(checkRestartQueue); // Fail silent on 'No such process'. May occur when kill message received after kill initiated but not finished. if (ex.code !== 'ESRCH') { throw ex; } } } } // Create fork with 'on restart' message event listener. function fork() { cluster.fork().on('message', function(message) { if (message.cmd === 'restart' && message.pid && restartQueue.indexOf(message.pid) === -1) { // When worker asks to restart gracefully in cluster, then add it to restart queue. restartQueue.push(message.pid); checkRestartQueue(); } }); } // Fork workers. for (var i = 0; i < workersCount; i++) { fork(); } // Check if has alive workers and exit. function checkIfNoWorkersAndExit() { if (!currentWorkersCount) { log('Cluster graceful shutdown: done.'); if (shutdownTimer) clearTimeout(shutdownTimer); exitFunction(); } else { log('Cluster graceful shutdown: wait ' + currentWorkersCount + ' worker' + (currentWorkersCount > 1 ? 's' : '') + '.'); } } function startShutdown() { if (disableGraceful) { if (shutdownTimer) clearTimeout(shutdownTimer); exitFunction(); return; } // Log how many workers alive. checkIfNoWorkersAndExit(); if (sigkill) { return; } // Shutdown timeout. shutdownTimer = setTimeout(function() { log('Cluster graceful shutdown: timeout, force exit.'); exitFunction(); }, shutdownTimeout); // Shutdown mode. sigkill = true; for (var id in cluster.workers) { // Send SIGTERM signal to all workers. SIGTERM starts graceful shutdown of worker inside it. process.kill(cluster.workers[id].process.pid); } } process.on('SIGTERM',startShutdown); process.on('SIGINT',startShutdown); // Gracefuly restart with 'kill -s SIGUSR2 <pid>'. process.on('SIGUSR2',function() { for (var id in cluster.workers) { // Push all workers to restart queue. var pid = cluster.workers[id].process.pid; if (restartQueue.indexOf(pid) === -1) { restartQueue.push(pid); } } checkRestartQueue(); }); cluster.on('fork', function(worker) { currentWorkersCount++; worker.on('listening', function() { listeningWorkersCount++; // New worker online, maybe all online, try restart other. checkRestartQueue(); }); log('Cluster: worker ' + worker.process.pid + ' started.'); }); cluster.on('exit', function(worker, code, signal) { // Mark process finished. if (currentRestartingPid === worker.process.pid) { currentRestartingPid = null; } currentWorkersCount--; listeningWorkersCount--; if (sigkill) { checkIfNoWorkersAndExit(); return; } log('Cluster: worker ' + worker.process.pid + ' died (code: ' + code + '), restarting...'); fork(); }); process.on('uncaughtException', function(err) { if (disableGraceful) { log('Cluster error:', err.stack); } else { log('Cluster error:', err.message); } }); } else { // Start worker. serverFunction(); // Self restart logic. if (options.restartOnMemory) { setInterval(function() { var mem = process.memoryUsage().rss; if (mem > options.restartOnMemory) { log('Cluster: worker ' + process.pid + ' used too much memory (' + Math.round(mem / (1024*1024)) + ' MB), restarting...'); GracefulCluster.gracefullyRestartCurrentWorker(); } }, 1000); } if (options.restartOnTimeout) { setInterval(function() { log('Cluster: worker ' + process.pid + ' restarting by timer...'); GracefulCluster.gracefullyRestartCurrentWorker(); }, options.restartOnTimeout); } } }; GracefulCluster.gracefullyRestartCurrentWorker = function() { // Perform restart by cluster to prevent all workers offline. process.send({ cmd: 'restart', pid: process.pid }); };