elasticsearch-watchdog
Version:
A watchdog for Elasticsearch: it monitors the statuses of the cluster nodes, restarts unhealthy nodes automatically, and keeps the PRIMARY node unique.
465 lines (432 loc) • 13 kB
JavaScript
var util = require('util'),
helper = require('./util/helper'),
path = require('path'),
fs = require('fs'),
OpenSSH = require('ssh2'),
async = require('async'),
req = require('req-fast'),
dynamicTimer = require('dynamic-timer'),
EventEmitter = require('events').EventEmitter,
exec = require('child_process').exec,
_ = require('lodash'),
VARS = require('./util/vars');
// The module's public API is the Watchdog constructor itself (a hoisted function declaration below).
module.exports = Watchdog;
/**
 * Watchdog constructor: observes the statuses of ElasticSearch nodes.
 * May be invoked with or without `new`.
 * @param {Object} options forwarded untouched to `_genOptions`.
 * @return {Watchdog}
 * @constructor
 */
function Watchdog(options){
  if (this instanceof Watchdog) {
    // Regular construction path: set up the emitter, then the watchdog state.
    EventEmitter.call(this);
    this.status = VARS.STATUS.WAITING;
    this._reset();
    this._genOptions(options);
    return;
  }
  // Called as a plain function: build the instance on behalf of the caller.
  return new Watchdog(options);
}
// Expose the status constants on the constructor, readable as `Watchdog.status`.
Watchdog.status = VARS.STATUS;
// Set up inheritance from EventEmitter before any method is attached to the prototype.
util.inherits(Watchdog, EventEmitter);
// Public methods.
Watchdog.prototype.watching = watching;
Watchdog.prototype.end = end;
// Internal methods (underscore-prefixed by convention).
Watchdog.prototype._persistent = _persistent;
Watchdog.prototype._esStatuses = _esStatuses;
Watchdog.prototype._esStatus = _esStatus;
Watchdog.prototype._forward = _forward;
Watchdog.prototype._restartES = _restartES;
Watchdog.prototype._openSSH = _openSSH;
Watchdog.prototype._genOptions = _genOptions;
Watchdog.prototype._reset = _reset;
/**
 * Emit an `info` event carrying a formatted log line.
 * The prefix is upper-cased and padded with spaces up to a 14-character column.
 * Must be invoked with the watchdog (an event emitter) as `this`.
 * @param {Object} m descriptor with a `prefix` (tag) and a `message` (text).
 * @param {String} [t] event type, `info` when omitted.
 * @private
 */
function _emitInfo(m, t){
  // `new Array(n).join(' ')` yields n - 1 spaces. The length is clamped because a prefix
  // longer than 15 characters would otherwise ask for a negative length and throw a RangeError.
  var padding = new Array(Math.max(15 - m.prefix.length, 0)).join(' ');
  this.emit('info', {
    type   : t || 'info',
    message: '[' + m.prefix.toUpperCase() + padding + '] ' + m.message
  });
}
/**
 * Begin the monitoring loop.
 * When the persisted health snapshot is younger than `conf.http.wait` and contains a node
 * marked as ERROR or REBOOT, the start is postponed for the rest of that waiting period.
 * Otherwise a dynamic timer is created whose `tick` triggers a health check of all nodes.
 * @return {*} the pending timeout handle when the start is postponed, nothing otherwise.
 */
function watching(){
  var self = this,
    lastSaved = this._localStorage.timestamp,
    lastHealth = this._localStorage.health,
    wait = this.conf.http.wait;

  if (lastSaved && lastHealth) {
    var elapsed = Date.now() - lastSaved;
    var recovering = elapsed < wait && _.some(lastHealth, function(state){
      return state == VARS.NODE_STATUS.ERROR || state == VARS.NODE_STATUS.REBOOT;
    });
    if (recovering) {
      // Sleep for what is left of the waiting period, never longer than the period itself.
      var remaining = wait - elapsed;
      if (remaining > wait || remaining < 0) {
        remaining = wait;
      }
      this.status = VARS.STATUS.SLEEPING;
      _emitInfo.call(this, {
        prefix : 'sleeping',
        message: remaining + ' milliseconds'
      });
      return setTimeout(function(){
        self.watching();
      }, remaining);
    }
  }

  // Map the configured frequency onto a timer strategy name.
  var timer = dynamicTimer({
    seed    : 1e3,
    strategy: {
      'low'     : 'lucas',
      'medium'  : 'fibonacci',
      'high'    : 'dayan',
      'critical': 'procession'
    }[this.conf.watchdog.frequency],
    overrun : dynamicTimer.state.RESET
  });

  timer.on('tick', function(){
    _emitInfo.call(self, {
      prefix : 'checking',
      message: self._dynamicTimeout.attempts + ' next after ' + self._dynamicTimeout.delay + ' millisecond(s).'
    });
    // The timer stays paused while the check round is in flight.
    timer.pause();
    self._esStatuses(self.conf.nodes, VARS.FACET.HEALTH);
  });
  timer.start();

  this._dynamicTimeout = timer;
  this.status = VARS.STATUS.WATCHING;
}
/**
 * Shut the watchdog down: stop the timer (if one exists), drop every event listener
 * and mark the instance with the RECALL status.
 */
function end(){
  var timer = this._dynamicTimeout;
  if (timer) {
    timer.stop();
  }
  this._dynamicTimeout = null;
  this.removeAllListeners();
  this.status = VARS.STATUS.RECALL;
}
/**
 * Query every given node in parallel for one `_cluster/<facet>` endpoint, fold the answers
 * into `_nornodes`, count the attempt and hand over to `_forward`.
 * @param {Object} nodes node descriptors keyed by node name; each provides an `elasticsearch` base URL.
 * @param {String} facet endpoint name appended to `/_cluster/`.
 * @private
 */
function _esStatuses(nodes, facet){
  var self = this;

  // One request task per node, keyed like `nodes`.
  var probes = _.mapValues(nodes, function(node){
    return function(next){
      self._esStatus({
        uri    : node.elasticsearch + '/_cluster/' + facet,
        timeout: self.conf.http.timeout
      }, next);
    };
  });

  async.parallel(probes, function(err, result){
    // Ignore answers that arrive while the watchdog is not watching.
    if (self.status != VARS.STATUS.WATCHING) {
      return;
    }
    // Keep the nodes that answered, then merge in the ones collected by earlier attempts.
    self._nornodes = _(result).pick(function(state){
      return state != VARS.NODE_STATUS.UNKNOWN;
    }).merge(self._nornodes).value();
    // Count this attempt; `_forward` is told whether the retry budget is used up.
    self._attempts += 1;
    self._forward(self._attempts >= self.conf.http.retry, facet);
  });
}
/**
 * Request one `_cluster/<facet>` endpoint of a node and report the outcome.
 * For the health facet the reported value is the `status` field of the response body; for the
 * state facet it is the address of the master node extracted from its `transport_address`.
 * Any failure (request error, no response, non-200 status, unusable body) is reported as
 * `VARS.NODE_STATUS.UNKNOWN`; the callback never receives an error.
 * @param {Object} options request options; `uri` must end with the facet name. Note that `uri` is
 *   extended in place for the state facet.
 * @param {Function} callback invoked as `callback(null, value)`, or without arguments when the
 *   watchdog is no longer watching by the time the response arrives.
 * @private
 */
function _esStatus(options, callback){
  var facet = options.uri.slice(options.uri.lastIndexOf('/') + 1);
  if (facet == VARS.FACET.STATE) {
    // Only the master node id and the node list are needed from the cluster state.
    options.uri += '/master_node,nodes';
  }
  req(options, function(err, resp){
    if (this.status != VARS.STATUS.WATCHING) {
      return callback();
    }
    // caught error.
    var errMsg;
    if (err) {
      // Always produce a non-empty text so an error without message is still treated as a failure.
      errMsg = err.message || 'request failed';
    } else if (!resp) {
      errMsg = 'unresponsive';
    } else if (resp.statusCode != 200) {
      // Parentheses matter: `+` binds tighter than `||`, so without them the fallback never applies.
      errMsg = 'status code ' + (resp.statusCode || '---');
    }
    if (errMsg) {
      _emitInfo.call(this, {
        prefix : 'request',
        message: '`' + options.uri + '`: ' + errMsg
      }, 'error');
      return callback(null, VARS.NODE_STATUS.UNKNOWN);
    }

    // From here on the response is a 200; the body may still be missing or not parsed.
    var body = resp.body || {},
      arg = VARS.NODE_STATUS.UNKNOWN;
    if (facet == VARS.FACET.HEALTH) {
      if (typeof body.status === 'string') {
        arg = body.status;
      }
    } else if (facet == VARS.FACET.STATE) {
      var master = body.nodes && body.nodes[body.master_node],
        trans_addr = master && master.transport_address,
        // `match` returns null when the address is not in the `inet[/host:port]` form.
        addr = typeof trans_addr === 'string' ? trans_addr.match(/inet\[\/(\S+):\d+\]/i) : null;
      if (addr && addr.length >= 2) {
        arg = addr[1];
      }
    }

    var tag = facet == VARS.FACET.STATE ? 'MASTER_NODE' : facet.toUpperCase();
    // log; fall back to the uri when the options carry no `host`.
    _emitInfo.call(this, {
      prefix : tag,
      message: (options.host || options.uri) + ' -> ' + String(arg).toUpperCase()
    });
    // trigger callback.
    callback(null, arg);
  }.bind(this));
}
/**
 * Decide what happens after a round of node checks.
 * Depending on the outcome it either retries the nodes that did not answer, or persists the
 * collected statuses and then restarts abnormal nodes, continues with the state facet, or
 * resumes the timer.
 * @param {Boolean} overrun whether the attempts have reached `conf.http.retry`.
 * @param {String} facet the facet that was just checked (compared against `VARS.FACET.HEALTH` and `VARS.FACET.STATE`).
 * @private
 */
function _forward(overrun, facet){
// Abnormal nodes: configured nodes that have no entry in `_nornodes`.
var abnNodes = _.omit(this.conf.nodes, _.keys(this._nornodes)),
hasAbnormalNode = _.size(abnNodes) > 0;
// Some nodes are still missing and attempts are left: check only those again after `conf.http.delay`.
if (hasAbnormalNode && !overrun) {
return setTimeout(function(ctx){
ctx._esStatuses(abnNodes, facet);
}, this.conf.http.delay, this);
}
// `stuck`: nodes are still missing although the attempts are used up.
var stuck = hasAbnormalNode && overrun;
// Mark the missing nodes as UNKNOWN. `_.assign` writes into `this._nornodes` itself,
// so `_nodes` and `this._nornodes` refer to the same object from here on.
var _nodes = _.assign(this._nornodes, _.mapValues(abnNodes, function(){
return VARS.NODE_STATUS.UNKNOWN;
}));
if (facet == VARS.FACET.STATE) {
// Invert the map: reported master address -> list of node names that reported it.
var _primNodes = {};
for (var k in _nodes) {
var primary = _nodes[k];
(_primNodes[primary] = _primNodes[primary] || []).push(k);
}
_nodes = _primNodes;
}
// Persist nodes' data.
this._persistent(_nodes, facet);
// Unreachable nodes with auto-restart enabled: restart exactly those nodes.
if (this.conf.elasticsearch.autorestart && stuck) {
return this._restartES(abnNodes, facet);
}
// Re-sort nodes by statuses.
// NOTE: this is a redeclaration without initializer; because of `var` hoisting `abnNodes`
// keeps the object assigned at the top until one of the branches below reassigns it.
var abnNodes;
if (facet == VARS.FACET.HEALTH) {
// Every node has an entry now; keep the names of those whose status is not listed in `conf.elasticsearch.status`.
abnNodes = _(this._nornodes).omit(function(value){
return !!~this.conf.elasticsearch.status.indexOf(value);
}, this).keys().value();
} else if (facet == VARS.FACET.STATE) {
// Check primary node.
// 'MS2M': the minority is subordinate to the majority.
if (this.conf.elasticsearch.primary == 'MS2M') {
// `primary` and `k` are the same function-scoped variables as in the loop above; `primary` is not reset here.
var children = 0, primary;
// Find the master address that most nodes reported.
for (var k in _nodes) {
(_nodes[k].length > children) && (children = _nodes[k].length, primary = k);
}
delete _nodes[primary];
// NOTE(review): this collects the remaining keys of `_nodes` (master addresses), whereas the branch
// below collects node names; the result is used as keys of `conf.nodes` further down - confirm this is intended.
abnNodes = Object.keys(_nodes);
} else {
// Primary node is specific: every node that reported another master is abnormal.
abnNodes = _(_nodes).omit(this.conf.elasticsearch.primary).values().flatten().value();
}
}
// If there are abnormal nodes that exist in the configuration, try to restart ElasticSearch on them.
if (this.conf.elasticsearch.autorestart && abnNodes && abnNodes.length > 0) {
abnNodes = _.pick(this.conf.nodes, abnNodes);
if (_.size(abnNodes) > 0) {
return this._restartES(abnNodes, facet);
}
}
// Otherwise reset the attempt counter and the collected statuses.
this._reset();
// After a health round with more than one configured node, continue with the cluster state.
if (facet == VARS.FACET.HEALTH && _.size(this.conf.nodes) > 1) {
return this._esStatuses(this.conf.nodes, VARS.FACET.STATE);
}
// One loop was finished, resume timer.
this._dynamicTimeout.resume();
};
/**
 * Restart ElasticSearch on the given nodes in parallel, persist the resulting node statuses
 * as the health snapshot, then resume the timer after `conf.http.wait` milliseconds.
 * @param {Object} nodes node descriptors keyed by node name; each provides an `ssh` section.
 * @param {String} facet the facet that led to the restart (currently unused).
 * @private
 */
function _restartES(nodes, facet){
  var self = this,
    es_delay = this.conf.elasticsearch.delay;

  var restarts = _.mapValues(nodes, function(v){
    return self._openSSH(v.ssh, es_delay);
  });

  async.parallel(restarts, function(err, result){
    if (err) {
      _emitInfo.call(self, {
        prefix : 'restart',
        message: err.message
      }, 'error');
    } else {
      // Merge into a fresh object: works when no health snapshot exists yet and does not
      // depend on mutating the stored one (`_persistent` stores a copy anyway).
      self._persistent(_.assign({}, self._localStorage.health, result), VARS.FACET.HEALTH);
      self._reset();
      _emitInfo.call(self, {
        prefix : 'sleeping',
        message: self.conf.http.wait + ' milliseconds'
      });
    }
    setTimeout(function(){
      var timer = self._dynamicTimeout;
      // `end()` sets the timer to null; if that happened during the wait there is nothing to resume.
      if (!timer) {
        return;
      }
      timer.resume();
      _emitInfo.call(self, {
        prefix : 'awake',
        message: '_dynamicTimeout resume.'
      });
    }, self.conf.http.wait);
  });
}
/**
 * Build the task that restarts ElasticSearch on one node.
 * The restart is: run `ssh.es_stop`, fall back to killing the process when that fails, then run
 * `ssh.es_start`, waiting `delay` milliseconds after every step. When `ssh.host` is one of the
 * local addresses (`this._ips`) the commands run on this machine; otherwise they are executed
 * on the remote host through an SSH connection.
 * @param {Object} ssh connection settings plus the `es_stop` / `es_start` commands.
 * @param {Number} delay pause after each command, in milliseconds.
 * @return {Function} task `function(next)`; `next(null, status)` is called exactly once with
 *   `VARS.NODE_STATUS.REBOOT` on success or `VARS.NODE_STATUS.ERROR` on failure.
 * @private
 */
function _openSSH(ssh, delay){
  var self = this,
    called = false,
    // Kill by PID only (second column of `ps aux`); piping whole `ps` lines into `kill` would treat
    // every numeric column as a PID. The `[o]rg` pattern keeps grep from matching itself.
    killCmd = "ps aux | grep '[o]rg.elasticsearch.bootstrap.Elasticsearch' | awk '{print $2}' | xargs kill -9";

  // Report the outcome once; later calls (e.g. an SSH error after completion) are ignored.
  function onComplete(next, err){
    if (called) {
      return;
    }
    called = true;
    var status = VARS.NODE_STATUS.REBOOT;
    if (err) {
      status = VARS.NODE_STATUS.ERROR;
      _emitInfo.call(self, {
        prefix : 'restart',
        message: ssh.host + ' failed: ' + err.message
      }, 'error');
    } else {
      _emitInfo.call(self, {
        prefix : 'restarted',
        message: ssh.host
      });
    }
    next(null, status);
  }

  // Run a shell command on this machine.
  function runLocally(cmd, done){
    exec(cmd, function(err){
      done(err);
    });
  }

  // Create a runner that executes shell commands on the remote host behind `conn`.
  function runOver(conn){
    return function(cmd, done){
      // The task has already been completed (connection failed): do not touch the connection again.
      if (called) {
        return done(new Error('connection closed'));
      }
      conn.exec(cmd, function(err, stream){
        if (err) {
          return done(err);
        }
        var failed = false;
        stream.on('exit', function(code){
          // A null code means the command was terminated by a signal.
          failed = code !== 0;
        });
        stream.on('close', function(){
          done(failed ? new Error('`' + cmd + '` failed') : null);
        });
        // Discard the output so the remote command is never blocked on a full pipe.
        if (typeof stream.resume === 'function') {
          stream.resume();
        }
        if (stream.stderr && typeof stream.stderr.resume === 'function') {
          stream.stderr.resume();
        }
      });
    };
  }

  // The stop / fallback-kill / start sequence, executed through the given command runner.
  function restartSteps(run){
    return [
      function(stepDone){
        run(ssh.es_stop, function(err){
          _.delay(stepDone, delay, null, !err);
        });
      },
      function(stopped, stepDone){
        if (stopped) {
          return stepDone();
        }
        _emitInfo.call(self, {
          prefix : 'stop',
          message: ssh.host + ' failed to stop, fallback to ' + killCmd
        });
        // Best effort: the outcome of the kill is not checked, the start step decides.
        run(killCmd, function(){
          _.delay(stepDone, delay);
        });
      },
      function(stepDone){
        run(ssh.es_start, function(err){
          _.delay(stepDone, delay, err ? new Error('can not start ElasticSearch') : null);
        });
      }];
  }

  // Local
  if (this._ips && this._ips.indexOf(ssh.host) !== -1) {
    return function(next){
      _emitInfo.call(self, {
        prefix : 'restarting',
        message: 'Local ' + ssh.host
      });
      async.waterfall(restartSteps(runLocally), function(err){
        onComplete(next, err);
      });
    };
  }

  // Remote
  return function(next){
    // Work on a copy without the command entries, so the caller's settings stay untouched.
    var sshCloned = _.omit(ssh, ['stop', 'start', 'es_stop', 'es_start']);
    try {
      sshCloned.password = helper.decrypt(sshCloned.password);
    } catch (err) {
      return onComplete(next, new Error('decrypt password failed.'));
    }
    _emitInfo.call(self, {
      prefix : 'restarting',
      message: 'Remote ' + ssh.host
    });
    // open ssh connection
    var conn = new OpenSSH();
    function finish(err){
      conn.end();
      onComplete(next, err);
    }
    // listening events.
    conn
      .on('ready', function(){
        async.waterfall(restartSteps(runOver(conn)), finish);
      })
      .on('error', finish);
    conn.connect(sshCloned);
  };
}
/**
 * Load the configuration and restore the persisted node data of this watchdog.
 * Sets `conf`, `_uid`, `_ips` and `_localStorage` on the instance.
 * @param {Object} options must carry `conf` (passed to `helper.loadConfig`); `uid` is optional,
 *   a random one is generated when it is missing.
 * @throws {Error} when `conf` is not provided; errors of `helper.loadConfig` propagate unchanged.
 * @private
 */
function _genOptions(options){
  // configuration file must be provided.
  if (!options || !options.conf) {
    throw new Error('`conf` is required.');
  }
  this.conf = helper.loadConfig(options.conf);
  this._uid = options.uid || helper._randomNo();
  this._ips = helper.ips;
  // nodes' data.
  this._localStorage = {};
  var dataPath = path.resolve(helper.ROOT, 'data', this.conf.watchdog.name + '.' + this._uid + '.json');
  if (!fs.existsSync(dataPath)) {
    return;
  }
  var data = fs.readFileSync(dataPath, {encoding: 'utf-8'});
  try {
    var stored = JSON.parse(data);
    // Valid JSON is not necessarily usable: `null`, scalars or arrays would break the
    // property reads and writes done on `_localStorage` later, so only plain objects are accepted.
    if (stored && typeof stored === 'object' && !Array.isArray(stored)) {
      this._localStorage = stored;
    }
  } catch (err) {
    // Deliberate best effort: unparsable data is ignored and the empty store is kept.
  }
}
/**
 * Clear the per-round bookkeeping: the statuses collected so far and the attempt counter.
 * @private
 */
function _reset(){
  // A fresh object each time, so earlier references are not affected.
  this._nornodes = {};
  this._attempts = 0;
}
/**
 * Store the statuses of one facet in `_localStorage`, stamp it with the current time and
 * write the whole store as pretty-printed JSON to `<ROOT>/data/<name>.<uid>.json`.
 * @param {Object} nodes statuses to store; a shallow copy is kept.
 * @param {String} facet key under which the statuses are stored.
 * @private
 */
function _persistent(nodes, facet){
  var data = this._localStorage;
  data.timestamp = Date.now();
  data[facet] = _.clone(nodes);
  var dir = path.resolve(helper.ROOT, 'data');
  if (!fs.existsSync(dir)) {
    fs.mkdirSync(dir);
  }
  // Write the string directly with an encoding instead of going through the deprecated
  // `new Buffer(string, encoding)` constructor; the bytes on disk are the same.
  fs.writeFileSync(path.resolve(dir, this.conf.watchdog.name + '.' + this._uid + '.json'),
    JSON.stringify(data, null, 2), 'utf-8');
  this._localStorage = data;
}