UNPKG

antietcd

Version:

Simplistic etcd replacement based on TinyRaft

539 lines (511 loc) 19.4 kB
// Clustering for AntiEtcd
// (c) Vitaliy Filippov, 2024
// License: Mozilla Public License 2.0 or Vitastor Network Public License 1.1

const ws = require('ws');

const TinyRaft = require('tinyraft');
const { runCallbacks, RequestError } = require('./common.js');

// Error text sent back to a peer whose load/replicate/compact message does not
// come from the leader we currently follow, or carries a different term
const LEADER_MISMATCH = 'raft leader/term mismatch';

// Values accepted by checkRaftState() in its <leaderonly> argument
const LEADER_ONLY = 1;
const NO_WAIT_QUORUM = 2;
const READ_FROM_FOLLOWER = 4;

/**
 * Cluster layer of AntiEtcd: keeps websocket connections to the other nodes,
 * runs TinyRaft leader election over them, resynchronises the database when
 * this node becomes leader and replicates changes/compactions to followers.
 */
class AntiCluster {
    /**
     * @param {object} antietcd - owning AntiEtcd instance. This class reads its
     *   cfg, stored_term, etctree, clients, tls and persistence members and
     *   calls its _startWebsocket(), emit() and api() methods.
     * @throws {Error} if cfg.node_id or cfg.cluster_key is not set
     */
    constructor(antietcd) {
        this.antietcd = antietcd;
        this.cfg = antietcd.cfg;
        // node_id => client id of the outgoing connection to that node
        this.cluster_connections = {};
        this.last_request_id = 1;
        // request_id => { client_id, cb, timer_id } for requests awaiting a peer reply
        this.subrequests = {};
        this.synced = false;
        // Callbacks of requests waiting for this.synced to become true
        this.wait_sync = [];
        if (!this.cfg.node_id || !this.cfg.cluster_key) {
            throw new Error('node_id and cluster_key are required in configuration if cluster is set');
        }
        if (!(this.cfg.cluster instanceof Object)) {
            // Parse "id1=url1,id2=url2" into { id1: url1, id2: url2 }
            this.cfg.cluster = (''+this.cfg.cluster).trim().split(/[\s,]*,[\s,]*/)
                .reduce((a, c) => { c = c.split(/\s*=\s*/); a[c[0]] = c[1]; return a; }, {});
        }
        this.raft = new TinyRaft({
            nodes: Object.keys(this.cfg.cluster),
            nodeId: this.cfg.node_id,
            heartbeatTimeout: this.cfg.heartbeat_timeout,
            electionTimeout: this.cfg.election_timeout,
            leaderPriority: this.cfg.leader_priority||undefined,
            initialTerm: this.antietcd.stored_term,
            send: (to, msg) => this._sendRaftMessage(to, msg),
        });
        this.raft.on('change', (event) => this._handleRaftChange(event));
        this.raft.start();
        // Connect to all nodes and reconnect forever
        for (const node_id in this.cfg.cluster) {
            this.connectToNode(node_id);
        }
    }

    /**
     * Open an outgoing websocket to <node_id> unless it is this node, is not
     * listed in cfg.cluster, or already has a registered client.
     * The callback passed to _startWebsocket() schedules another connection
     * attempt after cfg.reconnect_interval (default 1000 ms).
     *
     * @param {string} node_id - key of the peer in cfg.cluster
     */
    connectToNode(node_id) {
        if (node_id != this.cfg.node_id && this.cfg.cluster[node_id] &&
            (!this.cluster_connections[node_id] || !this.antietcd.clients[this.cluster_connections[node_id]])) {
            const socket = new ws.WebSocket(this.cfg.cluster[node_id].replace(/^http/, 'ws'), this.antietcd.tls);
            const client_id = this.antietcd._startWebsocket(socket, () => setTimeout(() => this.connectToNode(node_id), this.cfg.reconnect_interval||1000));
            this.cluster_connections[node_id] = client_id;
            socket.on('open', () => {
                if (this.antietcd.clients[client_id]) {
                    this.antietcd.clients[client_id].ready = true;
                    this.antietcd.clients[client_id].raft_node_id = node_id;
                    this.antietcd.clients[client_id].addr = socket._socket.remoteAddress+':'+socket._socket.remotePort;
                    // Introduce ourselves so the remote side can map this connection to our node_id
                    socket.send(JSON.stringify({ identify: { key: this.cfg.cluster_key, node_id: this.cfg.node_id } }));
                    this.raft.start();
                }
            });
        }
    }

    /**
     * Send <request> to a peer and wait for its reply.
     * Assigns a fresh request_id to <request> (the object is mutated).
     *
     * @param {object} client - peer client; its socket and id members are used
     * @param {object} request - message to send
     * @param {number} timeout - milliseconds to wait for the reply
     * @returns {Promise<object>} resolves with the value passed to
     *   _completeRequest(), or with { error: 'timeout' } when the timer fires first
     */
    _peerRequest(client, request, timeout) {
        const request_id = this.last_request_id++;
        request.request_id = request_id;
        client.socket.send(JSON.stringify(request));
        const req = this.subrequests[request_id] = { client_id: client.id };
        const promise = new Promise(ok => req.cb = ok);
        req.timer_id = setTimeout(() => this._completeRequest(null, request_id, { error: 'timeout' }), timeout);
        return promise;
    }

    /**
     * Replicate a change to all followers; does nothing unless this node is the leader.
     * After successful replication, asks followers to compact and compacts the
     * local tree once more than 2*cfg.compact_revisions (default 1000)
     * revisions have accumulated since compact_revision.
     *
     * @param {object} msg - replication message, sent as { replicate: msg }
     * @throws {RequestError} 503, propagated from _requestFollowers()
     */
    async replicateChange(msg) {
        if (this.raft.state !== TinyRaft.LEADER) {
            return;
        }
        const mod_revision = this.antietcd.etctree.mod_revision;
        await this._requestFollowers({ replicate: msg }, this.cfg.replication_timeout||1000);
        // We have a guarantee that all revisions before mod_revision are applied by followers,
        // because replication messages are either processed synchronously or serialized in
        // AntiPersistence against <wait_persist>
        this.sync_revision = mod_revision;
        if (this.sync_revision - this.antietcd.etctree.compact_revision > (this.cfg.compact_revisions||1000)*2) {
            const revision = this.sync_revision - (this.cfg.compact_revisions||1000);
            await this._requestFollowers({ compact: { revision } }, this.cfg.compact_timeout||1000);
            this.antietcd.etctree.compact(revision);
        }
    }

    /**
     * Print <msg> with console.log when cfg.log_level is greater than 0.
     *
     * @param {string} msg
     */
    _log(msg) {
        if (this.cfg.log_level > 0) {
            console.log(msg);
        }
    }

    /**
     * Send <msg>, stamped with the current raft term, to every follower except
     * this node and wait for all replies.
     *
     * @param {object} msg - message to send; its term member is overwritten
     * @param {number} timeout - per-peer reply timeout in milliseconds
     * @throws {RequestError} 503 if a follower has no ready connection or any
     *   reply is missing or contains an error; raft.start() is called first
     */
    async _requestFollowers(msg, timeout) {
        msg.term = this.raft.term;
        const followers = this.raft.followers;
        // First pass: make sure every follower is reachable before sending anything
        for (const follower of followers) {
            if (follower != this.cfg.node_id) {
                const client = this._getPeer(follower);
                if (!client) {
                    // One of peers is unavailable - immediate failure, request should be retried
                    this._log('Lost peer connection during replication - restarting election');
                    this.raft.start();
                    throw new RequestError(503, 'Peer connection is lost, please retry request');
                }
            }
        }
        const promises = [];
        for (const follower of followers) {
            if (follower != this.cfg.node_id) {
                const client = this._getPeer(follower);
                const promise = this._peerRequest(client, msg, timeout);
                promises.push(promise);
            }
        }
        const results = await Promise.all(promises);
        // <results> has one entry per follower other than this node, in the same order
        let i = 0;
        for (const follower of followers) {
            if (follower != this.cfg.node_id) {
                const result = results[i];
                if (!result || result.error) {
                    // One of peers is unavailable - immediate failure, request should be retried
                    this._log('Replication failed ('+follower+': '+(result ? result.error : 'no result')+') - restarting election');
                    this.raft.start();
                    throw new RequestError(503, 'Replication failed, please retry request');
                }
                i++;
            }
        }
    }

    /**
     * Resolve a pending peer request and cancel its timeout timer.
     * Ignored if the request is unknown, or if <client_id> is set and differs
     * from the client the request was sent to.
     *
     * @param {*} client_id - id of the replying client; null for timeouts
     * @param {number} request_id
     * @param {object} result - value the request promise is resolved with
     */
    _completeRequest(client_id, request_id, result) {
        const req = this.subrequests[request_id];
        if (!req || client_id && req.client_id != client_id) {
            return;
        }
        delete this.subrequests[request_id];
        if (req.timer_id) {
            clearTimeout(req.timer_id);
            req.timer_id = null;
        }
        req.cb(result);
    }

    /**
     * Handler of the TinyRaft 'change' event: re-emits it as 'raftchange' on
     * the AntiEtcd instance, logs the new state, then either starts a resync
     * and resumes leases (leader) or drops the synced state and pauses leases.
     *
     * @param {object} event - its state, term, leader and followers members are used
     */
    _handleRaftChange(event) {
        this.antietcd.emit('raftchange', event);
        this._log(
            'Raft '+this.cfg.node_id+': '+(event.state == TinyRaft.FOLLOWER ? 'following '+event.leader : event.state)+
            ', term '+event.term+(event.state == TinyRaft.LEADER ? ', followers: '+event.followers.join(', ') : '')
        );
        if (event.state == TinyRaft.LEADER) {
            // (Re)sync with the new set of followers
            this._resync(event.followers);
            this.antietcd.etctree.resume_leases();
        } else {
            this.synced = false;
            this.resync_state = null;
            this.antietcd.etctree.pause_leases();
        }
    }

    /**
     * Start or continue resynchronisation with <followers>: request a dump
     * from every follower not requested yet, forget dumps of nodes which are
     * no longer followers, then try to proceed with _continueResync().
     *
     * @param {string[]} followers - node ids of the current followers
     */
    _resync(followers) {
        this.synced = false;
        if (!this.resync_state) {
            this.resync_state = {
                dumps: {},
                loads: {},
            };
        }
        const seen = {};
        for (const f of followers) {
            seen[f] = true;
            if (f != this.cfg.node_id && !(f in this.resync_state.dumps)) {
                const client = this._getPeer(f);
                if (client) {
                    // null marks the dump as requested but not received yet
                    this.resync_state.dumps[f] = null;
                    this._peerRequest(client, { request: {}, handler: 'dump' }, this.cfg.dump_timeout||5000).then(res => {
                        // Skip the reply if the resync was reset or the node was dropped in the meantime
                        if (this.resync_state && client.raft_node_id &&
                            (client.raft_node_id in this.resync_state.dumps)) {
                            if (res.error) {
                                console.error(client.raft_node_id+' dump failed with error: '+res.error);
                            } else {
                                this._log(
                                    'Got dump from '+client.raft_node_id+' with stored term '+res.term+
                                    ', mod_revision '+res.mod_revision+', compact_revision '+res.compact_revision
                                );
                            }
                            this.resync_state.dumps[client.raft_node_id] = res.error ? null : res;
                            this._continueResync();
                        }
                    });
                }
            }
        }
        for (const f in this.resync_state.dumps) {
            if (!seen[f]) {
                delete this.resync_state.dumps[f];
            }
        }
        this._continueResync();
    }

    /**
     * Second resync stage, run once no requested dump is still missing:
     * merge the databases of the nodes having the maximum stored term into the
     * local tree and send the resulting dump to the followers.
     *
     * @throws {Error} 'BUG: no max term during resync' if no maximum term is found
     */
    _continueResync() {
        if (!this.resync_state ||
            Object.values(this.resync_state.dumps).filter(d => !d).length > 0) {
            // Some dump(s) are still pending
            return;
        }
        this.resync_state.dumps[this.cfg.node_id] = { ...this.antietcd.etctree.dump(), term: this.antietcd.stored_term };
        let max_term = -1, with_max = [];
        for (const follower in this.resync_state.dumps) {
            const dump = this.resync_state.dumps[follower];
            if (dump.term > max_term) {
                max_term = dump.term;
                with_max = [ follower ];
            } else if (dump.term == max_term) {
                with_max.push(follower);
            }
        }
        if (max_term < 0 || with_max.length == 0) {
            throw new Error('BUG: no max term during resync');
        }
        this._log('Local term '+this.antietcd.stored_term+', max follower term '+max_term+' at nodes '+with_max.join(', '));
        with_max = with_max.filter(w => w != this.cfg.node_id);
        // Merge databases of all nodes with maximum term
        // Force other nodes to replicate the merged DB, throwing away their own states
        for (let i = 0; i < with_max.length; i++) {
            // Only the first dump is loaded with update_only=false, and only
            // when the local stored term differs from the maximum one
            const update_only = !(i == 0 && this.antietcd.stored_term != max_term);
            this._log(update_only ? 'Updating database from node '+with_max[i]+' state' : 'Copying node '+with_max[i]+' state');
            this.antietcd.etctree.load(this.resync_state.dumps[with_max[i]], update_only);
        }
        let wait = 0;
        const load_request = { term: this.raft.term, load: this.antietcd.etctree.dump() };
        for (const follower in this.resync_state.dumps) {
            if (follower != this.cfg.node_id) {
                const dump = this.resync_state.dumps[follower];
                // NOTE(review): max_term is the maximum over all dumps, so this
                // condition appears to always hold - confirm whether it is intended
                if (dump.term <= max_term) {
                    const client = this._getPeer(follower);
                    if (!client) {
                        this._log('Lost peer connection during resync - restarting election');
                        this.raft.start();
                        return;
                    }
                    this._log('Copying state to '+follower);
                    const loadstate = this.resync_state.loads[follower] = {};
                    wait++;
                    // NOTE(review): the reply is stored without inspecting res.error -
                    // confirm that a failed or timed out load should count as completed
                    this._peerRequest(client, load_request, this.cfg.load_timeout||5000).then(res => {
                        loadstate.result = res;
                        this._finishResync();
                    });
                }
            }
        }
        if (!wait) {
            this._finishResync();
        }
    }

    /**
     * Final resync stage: once every dump is present and every load request
     * has a result, store the current raft term, mark the node as synced and
     * run the callbacks queued in wait_sync.
     */
    _finishResync() {
        if (!this.resync_state ||
            Object.values(this.resync_state.dumps).filter(d => !d).length > 0 ||
            Object.values(this.resync_state.loads).filter(d => !d.result).length > 0) {
            return;
        }
        // All current peers have copied the database, we can proceed
        this.antietcd.stored_term = this.raft.term;
        this.synced = true;
        runCallbacks(this, 'wait_sync', []);
        this._log(
            'Synchronized with followers, new term is '+this.raft.term+
            ', mod_revision '+this.antietcd.etctree.mod_revision+', compact_revision '+this.antietcd.etctree.compact_revision
        );
    }

    /**
     * Tell whether an API request is treated as a write.
     * Everything except 'kv_range' is a write; a 'kv_txn' is a write when it
     * has a non-empty compare list or any put / delete_range operation in its
     * success or failure branch.
     *
     * @param {string} path - API handler name
     * @param {object} data - request body
     * @returns {*} truthy for writes, falsy otherwise (not always a boolean)
     */
    _isWrite(path, data) {
        if (path == 'kv_txn') {
            return (data.compare && data.compare.length ||
                data.success && data.success.filter(f => f.request_put || f.requestPut || f.request_delete_range || f.requestDeleteRange).length ||
                data.failure && data.failure.filter(f => f.request_put || f.requestPut || f.request_delete_range || f.requestDeleteRange).length);
        }
        return path != 'kv_range';
    }

    /**
     * Decide whether an API request may be served by this node.
     * Waits up to cfg.wait_quorum_timeout (default 30000 ms) for the node to
     * become synced. On a follower, forwards writes to the leader, and also
     * reads unless cfg.stale_read or the READ_FROM_FOLLOWER bit is set.
     *
     * @param {string} path - API handler name
     * @param {number} leaderonly - LEADER_ONLY, NO_WAIT_QUORUM and/or READ_FROM_FOLLOWER
     * @param {object} data - request body
     * @returns {Promise<object|null>} the leader's reply when the request was
     *   forwarded, null when it should be handled locally
     * @throws {RequestError} 503 'Not leader', 'Quorum not available' or
     *   'Leader is unavailable'
     */
    async checkRaftState(path, leaderonly, data) {
        if (!this.raft) {
            return null;
        }
        if (leaderonly == LEADER_ONLY && this.raft.state != TinyRaft.LEADER) {
            throw new RequestError(503, 'Not leader');
        }
        if (leaderonly == NO_WAIT_QUORUM && this.raft.state == TinyRaft.CANDIDATE) {
            throw new RequestError(503, 'Quorum not available');
        }
        if (!this.synced) {
            // Wait for quorum / initial sync with timeout
            await new Promise((ok, no) => {
                let timer_id = null;
                const done = () => {
                    // Cancel the timeout, otherwise every request which waited for sync
                    // leaves a live timer behind for the whole wait_quorum_timeout
                    clearTimeout(timer_id);
                    ok();
                };
                // Queued callbacks are run through runCallbacks() when sync completes
                this.wait_sync.push(done);
                timer_id = setTimeout(() => {
                    this.wait_sync = this.wait_sync.filter(cb => cb != done);
                    no(new RequestError(503, 'Quorum not available'));
                }, this.cfg.wait_quorum_timeout||30000);
            });
        }
        if (this.raft.state == TinyRaft.FOLLOWER &&
            (this._isWrite(path, data) || !this.cfg.stale_read && !(leaderonly & READ_FROM_FOLLOWER))) {
            // Forward to leader
            return await this._forwardToLeader(path, data);
        }
        return null;
    }

    /**
     * Send an API request to the current leader and return its reply.
     *
     * @param {string} handler - API handler name
     * @param {object} data - request body
     * @returns {Promise<object>} leader reply, or { error: 'timeout' } after
     *   cfg.forward_timeout (default 1000 ms)
     * @throws {RequestError} 503 if there is no ready connection to the leader
     */
    async _forwardToLeader(handler, data) {
        const client = this._getPeer(this.raft.leader);
        if (!client) {
            throw new RequestError(503, 'Leader is unavailable');
        }
        return await this._peerRequest(client, { handler, request: data }, this.cfg.forward_timeout||1000);
    }

    /**
     * Dispatch a cluster message received over a websocket.
     * Raft messages are only accepted from clients with a known raft_node_id;
     * 'identify' assigns that id when the cluster key matches and the node id
     * is not our own.
     *
     * @param {object} client - client the message arrived from
     * @param {object} msg - parsed message
     */
    handleWsMsg(client, msg) {
        if (msg.raft) {
            if (client.raft_node_id) {
                this.raft.onReceive(client.raft_node_id, msg.raft);
            }
        } else if (msg.identify) {
            if (msg.identify.key === this.cfg.cluster_key &&
                msg.identify.node_id != this.cfg.node_id) {
                client.raft_node_id = msg.identify.node_id;
                this._log('Got a connection from '+client.raft_node_id);
            }
        } else if (msg.load) {
            this._handleLoadMsg(client, msg).catch(console.error);
        } else if (msg.replicate) {
            this._handleReplicateMsg(client, msg).catch(console.error);
        } else if (msg.request) {
            this._handleRequestMsg(client, msg).catch(console.error);
        } else if (msg.reply) {
            this._completeRequest(client.id, msg.request_id, msg.reply);
        } else if (msg.compact) {
            this._handleCompactMsg(client, msg);
        }
    }

    /**
     * Execute an API request received from a peer and send the result back;
     * errors are logged and returned as { error: message }.
     *
     * @param {object} client - requesting peer
     * @param {object} msg - its handler, request and request_id members are used
     */
    async _handleRequestMsg(client, msg) {
        try {
            const res = await this.antietcd.api(msg.handler, msg.request);
            client.socket.send(JSON.stringify({ request_id: msg.request_id, reply: res }));
        } catch (e) {
            console.error(e);
            client.socket.send(JSON.stringify({ request_id: msg.request_id, reply: { error: e.message } }));
        }
    }

    /**
     * Handle a full database load sent by the leader during resync.
     * Accepted only from the leader this node follows and for the current
     * term; otherwise LEADER_MISMATCH is replied.
     *
     * @param {object} client - sending peer
     * @param {object} msg - its load, term and request_id members are used
     */
    async _handleLoadMsg(client, msg) {
        if (client.raft_node_id && this.raft.state == TinyRaft.FOLLOWER &&
            this.raft.leader === client.raft_node_id && this.raft.term == msg.term) {
            this.antietcd.etctree.load(msg.load);
            if (this.antietcd.persistence) {
                await this.antietcd.persistence.persist();
            }
            this.antietcd.stored_term = msg.term;
            this.synced = true;
            runCallbacks(this, 'wait_sync', []);
            this._log(
                'Synchronized with leader, new term is '+this.raft.term+
                ', mod_revision '+this.antietcd.etctree.mod_revision+', compact_revision '+this.antietcd.etctree.compact_revision
            );
            client.socket.send(JSON.stringify({ request_id: msg.request_id, reply: {} }));
        } else {
            client.socket.send(JSON.stringify({ request_id: msg.request_id, reply: { error: LEADER_MISMATCH } }));
        }
    }

    /**
     * Apply a replicated change sent by the leader.
     * Accepted only from the leader this node follows and for the current
     * term; otherwise LEADER_MISMATCH is replied.
     *
     * @param {object} client - sending peer
     * @param {object} msg - its replicate, term and request_id members are used
     */
    async _handleReplicateMsg(client, msg) {
        if (client.raft_node_id && this.raft.state == TinyRaft.FOLLOWER &&
            this.raft.leader === client.raft_node_id && this.raft.term == msg.term) {
            await this.antietcd.etctree.apply_replication(msg.replicate);
            client.socket.send(JSON.stringify({ request_id: msg.request_id, reply: {} }));
        } else {
            client.socket.send(JSON.stringify({ request_id: msg.request_id, reply: { error: LEADER_MISMATCH } }));
        }
    }

    /**
     * Compact the local tree up to the revision requested by the leader.
     * Accepted only from the leader this node follows and for the current
     * term; otherwise LEADER_MISMATCH is replied.
     *
     * @param {object} client - sending peer
     * @param {object} msg - its compact.revision, term and request_id members are used
     */
    _handleCompactMsg(client, msg) {
        if (client.raft_node_id && this.raft.state == TinyRaft.FOLLOWER &&
            this.raft.leader === client.raft_node_id && this.raft.term == msg.term) {
            this.antietcd.etctree.compact(msg.compact.revision);
            this._log('Compacted deletions up to '+msg.compact.revision);
            client.socket.send(JSON.stringify({ request_id: msg.request_id, reply: {} }));
        } else {
            client.socket.send(JSON.stringify({ request_id: msg.request_id, reply: { error: LEADER_MISMATCH } }));
        }
    }

    /**
     * Look up the ready outgoing connection to node <to>.
     *
     * @param {string} to - peer node id
     * @returns {object|null} the client, or null if there is no connection to
     *   that node or it is not ready yet
     * @throws {Error} if <to> is this node's own id
     */
    _getPeer(to) {
        if (to == this.cfg.node_id) {
            throw new Error('BUG: attempt to get connection to self');
        }
        const client_id = this.cluster_connections[to];
        if (!client_id) {
            return null;
        }
        const client = this.antietcd.clients[client_id];
        if (!client || !client.ready) {
            return null;
        }
        return client;
    }

    /**
     * 'send' callback given to TinyRaft: deliver a raft message to node <to>,
     * silently dropping it when there is no ready connection.
     *
     * @param {string} to - destination node id
     * @param {object} msg - raft message, sent as { raft: msg }
     */
    _sendRaftMessage(to, msg) {
        const client = this._getPeer(to);
        if (client) {
            client.socket.send(JSON.stringify({ raft: msg }));
        }
    }
}

AntiCluster.LEADER_ONLY = LEADER_ONLY;
AntiCluster.NO_WAIT_QUORUM = NO_WAIT_QUORUM;
AntiCluster.READ_FROM_FOLLOWER = READ_FROM_FOLLOWER;

module.exports = AntiCluster;