UNPKG

elasticsearch-watchdog

Version:

A watchdog for Elasticsearch: monitors the statuses of cluster nodes, restarts them automatically, and keeps the PRIMARY node unique.

465 lines (432 loc) 13 kB
var util = require('util'),
  helper = require('./util/helper'),
  path = require('path'),
  fs = require('fs'),
  OpenSSH = require('ssh2'),
  async = require('async'),
  req = require('req-fast'),
  dynamicTimer = require('dynamic-timer'),
  EventEmitter = require('events').EventEmitter,
  exec = require('child_process').exec,
  _ = require('lodash'),
  VARS = require('./util/vars');

module.exports = Watchdog;

/**
 * Watching statuses of ElasticSearch.
 * Can be called with or without `new`. Emits `info` events (see `_emitInfo`).
 * @param {Object} options must carry `conf`; may carry `uid`.
 * @return {Watchdog}
 * @constructor
 */
function Watchdog(options){
  if (!(this instanceof Watchdog)) {
    return new Watchdog(options);
  }
  EventEmitter.call(this);
  this.status = VARS.STATUS.WAITING;
  this._reset();
  this._genOptions(options);
}

// Expose the status enumeration on the constructor.
Watchdog.status = VARS.STATUS;

util.inherits(Watchdog, EventEmitter);

// methods.
_.assign(Watchdog.prototype, {
  watching   : watching,
  end        : end,
  _persistent: _persistent,
  _esStatuses: _esStatuses,
  _esStatus  : _esStatus,
  _forward   : _forward,
  _restartES : _restartES,
  _openSSH   : _openSSH,
  _genOptions: _genOptions,
  _reset     : _reset
});

/**
 * Emit an `info` event with a bracketed, upper-cased, space-padded prefix.
 * Must be invoked with the watchdog as `this` (`_emitInfo.call(watchdog, ...)`).
 * @param {Object} m `{prefix: String, message: String}`
 * @param {String} [t] event type, defaults to `'info'`.
 */
function _emitInfo(m, t){
  // Clamp the array length: a prefix longer than 15 characters would otherwise
  // make `Array(negative)` throw a RangeError.
  var padding = Array(Math.max(0, 15 - m.prefix.length)).join(' ');
  this.emit('info', {
    type   : t || 'info',
    message: '[' + m.prefix.toUpperCase() + padding + '] ' + m.message
  });
}

/**
 * Resolve the JSON file used to persist nodes' data for one watchdog.
 * @param {String} name watchdog name taken from the configuration.
 * @param {String|Number} uid unique id of the watchdog instance.
 * @return {String} absolute path of the data file.
 */
function _dataFile(name, uid){
  return path.resolve(helper.ROOT, 'data', name + '.' + uid + '.json');
}

/**
 * Start watching.
 * When the persisted health snapshot is younger than `conf.http.wait` and
 * contains a node in ERROR or REBOOT state, sleep for the remaining time
 * first and start again afterwards.
 * @return {*} the sleep timer handle when sleeping, otherwise `undefined`.
 */
function watching(){
  var timestamp = this._localStorage.timestamp,
    health = this._localStorage.health,
    wait = this.conf.http.wait;

  if (timestamp && health && (timestamp = Date.now() - timestamp) < wait && _.some(health, function(v){
      return v == VARS.NODE_STATUS.ERROR || v == VARS.NODE_STATUS.REBOOT;
    })) {
    // `timestamp` now holds the elapsed time; turn it into the remaining time.
    timestamp = wait - timestamp;
    if (timestamp > wait || timestamp < 0) {
      timestamp = wait;
    }
    this.status = VARS.STATUS.SLEEPING;
    _emitInfo.call(this, {
      prefix : 'sleeping',
      message: timestamp + ' milliseconds'
    });
    // Keep the handle so that `end()` can cancel the pending restart.
    this._sleepTimer = setTimeout(function(ctx){
      ctx._sleepTimer = null;
      ctx.watching();
    }, timestamp, this);
    return this._sleepTimer;
  }

  var dynamicTimeout = dynamicTimer({
    seed    : 1e3,
    // Map the configured frequency onto a dynamic-timer strategy name.
    strategy: {
      'low'     : 'lucas',
      'medium'  : 'fibonacci',
      'high'    : 'dayan',
      'critical': 'procession'
    }[this.conf.watchdog.frequency],
    overrun : dynamicTimer.state.RESET
  });

  dynamicTimeout.on('tick', function(){
    // Read from the local timer: `this._dynamicTimeout` is nulled by `end()`.
    _emitInfo.call(this, {
      prefix : 'checking',
      message: dynamicTimeout.attempts + ' next after ' + dynamicTimeout.delay + ' millisecond(s).'
    });
    // Paused while one check loop runs; resumed in `_forward` / `_restartES`.
    dynamicTimeout.pause();
    this._esStatuses(this.conf.nodes, VARS.FACET.HEALTH);
  }.bind(this));

  this._dynamicTimeout = dynamicTimeout;
  dynamicTimeout.start();

  this.status = VARS.STATUS.WATCHING;
}

/**
 * End it.
 * Cancels a pending sleep, stops the timer and removes every listener.
 */
function end(){
  clearTimeout(this._sleepTimer);
  this._sleepTimer = null;
  this._dynamicTimeout && this._dynamicTimeout.stop();
  this._dynamicTimeout = null;
  this.removeAllListeners();
  this.status = VARS.STATUS.RECALL;
}

/**
 * Check ElasticSearch nodes' healths(_cluster/health) or states(_cluster/state/master_node,nodes) with retry attempts.
 * @param {Object} nodes
 * @param {String} facet
 * @private
 */
function _esStatuses(nodes, facet){
  async.parallel(_.mapValues(nodes, function(v){
    return function(next){
      this._esStatus({
        uri    : v.elasticsearch + '/_cluster/' + facet,
        timeout: this.conf.http.timeout
      }, next);
    }.bind(this);
  }.bind(this)), function(err, result){
    // The watchdog was ended (or is sleeping) in the meantime, drop the result.
    if (this.status != VARS.STATUS.WATCHING) {
      return;
    }
    // merge normal nodes.
    this._nornodes = _(result).pick(function(v){
      return v != VARS.NODE_STATUS.UNKNOWN;
    }).merge(this._nornodes).value();

    // increase attempts.
    this._attempts += 1;

    // when over max attempts, means nodes' healths were in terrible conditions.
    this._forward(this._attempts >= this.conf.http.retry, facet);
  }.bind(this));
}

/**
 * Check a node's status in the cluster.
 * Calls back with the cluster status (health facet), the address of the
 * master node (state facet) or `VARS.NODE_STATUS.UNKNOWN` on any failure.
 * @param {Object} options request options, `uri` must end with the facet.
 * @param {Function} callback `function(err, status)`; `err` is always null.
 * @private
 */
function _esStatus(options, callback){
  var facet = options.uri.substr(options.uri.lastIndexOf('/') + 1);
  if (facet == VARS.FACET.STATE) {
    options.uri += '/master_node,nodes';
  }
  req(options, function(err, resp){
    if (this.status != VARS.STATUS.WATCHING) {
      return callback();
    }
    // caught error.
    var errMsg;
    if (err) {
      errMsg = err.message;
    } else if (!resp) {
      errMsg = 'unresponsive';
    } else if (resp.statusCode != 200) {
      // Parenthesized: `+` binds tighter than `||`, so the fallback was dead code.
      errMsg = 'status code ' + (resp.statusCode || '---');
    }
    if (errMsg) {
      _emitInfo.call(this, {
        prefix : 'request',
        message: '`' + options.uri + '`: ' + errMsg
      }, 'error');
      return callback(null, VARS.NODE_STATUS.UNKNOWN);
    }

    var body = resp.body || {},
      arg = VARS.NODE_STATUS.UNKNOWN;
    if (facet == VARS.FACET.HEALTH) {
      // A body without `status` is reported as unknown instead of `undefined`.
      arg = body.status || VARS.NODE_STATUS.UNKNOWN;
    } else if (facet == VARS.FACET.STATE) {
      // The master node may be missing from `nodes` and the transport address
      // may not match the expected `inet[/host:port]` form; stay UNKNOWN then.
      var master = body.nodes && body.nodes[body.master_node],
        addr = master && String(master.transport_address).match(/inet\[\/(\S+):\d+\]/i);
      if (addr && addr.length >= 2) {
        arg = addr[1];
      }
    }

    var tag;
    if (facet == VARS.FACET.STATE) {
      tag = 'MASTER_NODE';
    } else {
      tag = facet.toUpperCase();
    }
    // log
    // NOTE(review): `options.host` is never set in this file; presumably the
    // request library fills it in from `uri` - confirm.
    _emitInfo.call(this, {
      prefix : tag,
      message: options.host + ' -> ' + String(arg).toUpperCase()
    });

    // trigger callback.
    callback(null, arg);
  }.bind(this));
}

/**
 * Do jobs after node status being checked.
 * Retries the abnormal nodes until `overrun`, persists the collected data,
 * restarts abnormal nodes when `autorestart` is on, then either continues
 * with the state facet or resumes the timer.
 * @param {Boolean} overrun whether the maximum number of attempts was reached.
 * @param {String} facet
 * @private
 */
function _forward(overrun, facet){
  // abnormal nodes: configured nodes that did not answer with a known status.
  var abnNodes = _.omit(this.conf.nodes, _.keys(this._nornodes)),
    hasAbnormalNode = _.size(abnNodes) > 0,
    k;

  // has abnormal node, retry until maximize attempts, otherwise go to STEP-1.
  if (hasAbnormalNode && !overrun) {
    return setTimeout(function(ctx){
      ctx._esStatuses(abnNodes, facet);
    }, this.conf.http.delay, this);
  }

  var stuck = hasAbnormalNode && overrun;

  // Note: this extends `this._nornodes` in place with the UNKNOWN nodes.
  var _nodes = _.assign(this._nornodes, _.mapValues(abnNodes, function(){
    return VARS.NODE_STATUS.UNKNOWN;
  }));
  if (facet == VARS.FACET.STATE) {
    // Group the node keys by the master address each of them reported.
    var _primNodes = {};
    for (k in _nodes) {
      var reported = _nodes[k];
      (_primNodes[reported] = _primNodes[reported] || []).push(k);
    }
    _nodes = _primNodes;
  }

  // Persist nodes' data.
  this._persistent(_nodes, facet);

  if (this.conf.elasticsearch.autorestart && stuck) {
    return this._restartES(abnNodes, facet);
  }

  // Resort nodes by statuses.
  var suspects;
  if (facet == VARS.FACET.HEALTH) {
    // Everything goes fine, we need to check status whether it satisfies the configured condition or not.
    suspects = _(this._nornodes).omit(function(value){
      return !!~this.conf.elasticsearch.status.indexOf(value);
    }, this).keys().value();
  } else if (facet == VARS.FACET.STATE) {
    // Check primary node.
    if (this.conf.elasticsearch.primary == 'MS2M') {
      // The minority is subordinate to the majority.
      var children = 0, majority;
      for (k in _nodes) {
        if (_nodes[k].length > children) {
          children = _nodes[k].length;
          majority = k;
        }
      }
      delete _nodes[majority];
      suspects = Object.keys(_nodes);
    } else {
      // Primary node is specific.
      suspects = _(_nodes).omit(this.conf.elasticsearch.primary).values().flatten().value();
    }
  }

  // If have abnormal nodes, try to restart ElasticSearch.
  if (this.conf.elasticsearch.autorestart && suspects && suspects.length > 0) {
    var restartNodes = _.pick(this.conf.nodes, suspects);
    if (_.size(restartNodes) > 0) {
      return this._restartES(restartNodes, facet);
    }
  }

  // Otherwise reset binding properties.
  this._reset();

  // If current facet is health, then try to check cluster's states.
  if (facet == VARS.FACET.HEALTH && _.size(this.conf.nodes) > 1) {
    return this._esStatuses(this.conf.nodes, VARS.FACET.STATE);
  }
  // One loop was finished, resume timer.
  this._dynamicTimeout.resume();
}

/**
 * Restart ElasticSearch.
 * Restarts all given nodes in parallel, persists the outcome as health data
 * and resumes the timer after `conf.http.wait` milliseconds.
 * @param {Object} nodes
 * @param {String} facet
 * @private
 */
function _restartES(nodes, facet){
  var es_delay = this.conf.elasticsearch.delay;
  async.parallel(_.mapValues(nodes, function(v){
    return this._openSSH(v.ssh, es_delay);
  }.bind(this)), function(err, result){
    if (err) {
      _emitInfo.call(this, {
        prefix : 'restart',
        message: err.message
      }, 'error');
    } else {
      // `health` may not exist yet; never hand `undefined` to `_.assign`.
      this._persistent(_.assign(this._localStorage.health || {}, result), VARS.FACET.HEALTH);
      this._reset();
      _emitInfo.call(this, {
        prefix : 'sleeping',
        message: this.conf.http.wait + ' milliseconds'
      });
    }
    setTimeout(function(ctx){
      // `end()` nulls the timer; nothing to resume in that case.
      if (!ctx._dynamicTimeout) {
        return;
      }
      ctx._dynamicTimeout.resume();
      _emitInfo.call(ctx, {
        prefix : 'awake',
        message: '_dynamicTimeout resume.'
      });
    }, this.conf.http.wait, this);
  }.bind(this));
}

/**
 * Connect to server through openSSH.
 * Builds an `async` task that stops and starts ElasticSearch on one node and
 * always calls back with `(null, status)`, where status is
 * `VARS.NODE_STATUS.REBOOT` on success and `VARS.NODE_STATUS.ERROR` otherwise.
 * @param {Object} ssh connection settings plus `es_stop` / `es_start` commands.
 * @param {Number} delay milliseconds to wait after each shell command.
 * @return {Function}
 * @private
 */
function _openSSH(ssh, delay){
  var called = false,
    onComplete = function(next, err){
      // Complete (and log) only once, e.g. when an SSH `error` event arrives
      // while the restart steps are still running.
      if (called) {
        return;
      }
      called = true;
      var status = VARS.NODE_STATUS.REBOOT;
      if (err) {
        status = VARS.NODE_STATUS.ERROR;
        _emitInfo.call(this, {
          prefix : 'restart',
          message: ssh.host + ' failed: ' + err.message
        }, 'error');
      } else {
        _emitInfo.call(this, {
          prefix : 'restarted',
          message: ssh.host
        });
      }
      next(null, status);
    },
    fallbackStop = function(stopped, next){
      if (stopped) {
        return next();
      }
      // Only the PID column is handed to `kill`; feeding whole `ps` lines
      // would pass user names and memory figures as PIDs. The `[o]` bracket
      // keeps the grep process itself out of the match.
      var cmd = 'ps aux | grep \'[o]rg.elasticsearch.bootstrap.Elasticsearch\' | awk \'{print $2}\' | xargs kill -9';
      _emitInfo.call(this, {
        prefix : 'stop',
        message: ssh.host + ' failed to stop, fallback to ' + cmd
      });
      // Best effort: the outcome of the kill is deliberately ignored.
      exec(cmd, function(){
        _.delay(next, delay);
      });
    },
    // stop -> (kill when stop failed) -> start
    esWaterfall = [
      function(shellDone){
        exec(ssh.es_stop, function(err){
          _.delay(shellDone, delay, null, !err);
        });
      },
      fallbackStop.bind(this),
      function(shellDone){
        exec(ssh.es_start, function(err){
          _.delay(shellDone, delay, err ? new Error('can not start ElasticSearch') : null);
        });
      }];

  // Local
  if (this._ips && !!~this._ips.indexOf(ssh.host)) {
    return function(next){
      _emitInfo.call(this, {
        prefix : 'restarting',
        message: 'Local ' + ssh.host
      });
      async.waterfall(esWaterfall, onComplete.bind(this, next));
    }.bind(this);
  }

  return function(next){
    // Connection settings only; the shell commands are not part of them.
    var sshCloned = _(ssh).chain().omit('stop', 'start', 'es_stop', 'es_start').clone().value();
    try {
      sshCloned.password = helper.decrypt(sshCloned.password);
    } catch (err) {
      return onComplete.call(this, next, new Error('decrypt password failed.'));
    }

    _emitInfo.call(this, {
      prefix : 'restarting',
      message: 'Remote ' + ssh.host
    });

    // open ssh connection
    var conn = new OpenSSH(),
      _onComplete = function(err){
        conn.end();
        onComplete.call(this, next, err);
      };

    // listening events.
    // NOTE(review): the waterfall runs its commands through the local
    // `child_process.exec`, not through `conn`, so the commands execute on
    // this machine even for a remote node - confirm whether that is intended.
    conn
      .on('ready', function(){
        async.waterfall(esWaterfall, _onComplete.bind(this));
      }.bind(this))
      .on('error', _onComplete.bind(this));
    conn.connect(sshCloned);
  }.bind(this);
}

/**
 * Generate options.
 * Loads the configuration and restores previously persisted nodes' data.
 * @param {Object} options `conf` is mandatory, `uid` is optional.
 * @throws {Error} when `conf` is missing or the configuration fails to load.
 * @private
 */
function _genOptions(options){
  // configuration file must be provided.
  if (!options || !options.conf) {
    throw new Error('`conf` is required.');
  }
  // Loading errors propagate to the caller unchanged.
  this.conf = helper.loadConfig(options.conf);

  this._uid = options.uid || helper._randomNo();
  this._ips = helper.ips;

  // nodes' data.
  this._localStorage = {};
  var dataPath = _dataFile(this.conf.watchdog.name, this._uid);
  if (fs.existsSync(dataPath)) {
    try {
      var parsed = JSON.parse(fs.readFileSync(dataPath, {encoding: 'utf-8'}));
      // `null` or a primitive would break later property reads.
      if (parsed !== null && typeof parsed === 'object') {
        this._localStorage = parsed;
      }
    } catch (err) {
      // read or parse failed, ignore data and start with an empty storage.
    }
  }
}

/**
 * Reset bind properties.
 * @private
 */
function _reset(){
  this._attempts = 0;
  this._nornodes = {};
}

/**
 * Persist nodes' statuses.
 * Stores a shallow copy of `nodes` under `facet` together with the current
 * timestamp and writes the whole storage to the data file synchronously.
 * @param {Object} nodes
 * @param {String} facet
 * @private
 */
function _persistent(nodes, facet){
  var data = this._localStorage;
  data.timestamp = Date.now();
  data[facet] = _.clone(nodes);

  var file = _dataFile(this.conf.watchdog.name, this._uid),
    dir = path.dirname(file);
  if (!fs.existsSync(dir)) {
    fs.mkdirSync(dir);
  }
  // Write the string directly; the `new Buffer()` constructor is deprecated.
  fs.writeFileSync(file, JSON.stringify(data, null, 2), 'utf-8');

  this._localStorage = data;
}