// antietcd
// Version:
// Simplistic etcd replacement based on TinyRaft
// 539 lines (511 loc) • 19.4 kB
// JavaScript
// Clustering for AntiEtcd
// (c) Vitaliy Filippov, 2024
// License: Mozilla Public License 2.0 or Vitastor Network Public License 1.1
const ws = require('ws');
const TinyRaft = require('tinyraft');
const { runCallbacks, RequestError } = require('./common.js');
// Error text returned to a peer whose load/replicate/compact message does not
// match the leader and term this node currently follows
const LEADER_MISMATCH = 'raft leader/term mismatch';
// Values accepted by the <leaderonly> argument of checkRaftState():
// LEADER_ONLY - refuse the request unless this node is the leader
const LEADER_ONLY = 1 << 0;
// NO_WAIT_QUORUM - refuse the request immediately while this node is a candidate
const NO_WAIT_QUORUM = 1 << 1;
// READ_FROM_FOLLOWER - allow a follower to serve the read locally
const READ_FROM_FOLLOWER = 1 << 2;
// Clustering layer of AntiEtcd: keeps websocket connections to the other
// configured nodes, runs TinyRaft leader election over them, resynchronizes
// databases when this node becomes the leader, replicates changes to followers
// and forwards requests from followers to the leader.
class AntiCluster
{
    // <antietcd> is the owning server object; this class reads its <cfg>, <clients>,
    // <etctree>, <persistence>, <stored_term> and <tls> members.
    // Throws Error when node_id or cluster_key is missing from the configuration.
    // Starts Raft and begins connecting to all peers immediately.
    constructor(antietcd)
    {
        this.antietcd = antietcd;
        this.cfg = antietcd.cfg;
        // node_id => client_id of the outgoing websocket connection to that node
        this.cluster_connections = {};
        this.last_request_id = 1;
        // request_id => { client_id, cb, timer_id } for requests awaiting a peer reply
        this.subrequests = {};
        this.synced = false;
        // Callbacks of requests waiting for <synced> to become true
        this.wait_sync = [];
        if (!this.cfg.node_id || !this.cfg.cluster_key)
        {
            throw new Error('node_id and cluster_key are required in configuration if cluster is set');
        }
        if (!(this.cfg.cluster instanceof Object))
        {
            // Parse "id1=url1,id2=url2" into { id1: url1, id2: url2 }
            this.cfg.cluster = (''+this.cfg.cluster).trim().split(/[\s,]*,[\s,]*/)
                .reduce((a, c) => { c = c.split(/\s*=\s*/); a[c[0]] = c[1]; return a; }, {});
        }
        this.raft = new TinyRaft({
            nodes: Object.keys(this.cfg.cluster),
            nodeId: this.cfg.node_id,
            heartbeatTimeout: this.cfg.heartbeat_timeout,
            electionTimeout: this.cfg.election_timeout,
            leaderPriority: this.cfg.leader_priority||undefined,
            initialTerm: this.antietcd.stored_term,
            send: (to, msg) => this._sendRaftMessage(to, msg),
        });
        this.raft.on('change', (event) => this._handleRaftChange(event));
        this.raft.start();
        // Connect to all nodes and reconnect forever
        for (const node_id in this.cfg.cluster)
        {
            this.connectToNode(node_id);
        }
    }
    // Open a websocket connection to peer <node_id>.
    // Does nothing for this node itself, for an unknown node, or when a live
    // connection to that node is already registered.
    connectToNode(node_id)
    {
        if (node_id != this.cfg.node_id && this.cfg.cluster[node_id] &&
            (!this.cluster_connections[node_id] || !this.antietcd.clients[this.cluster_connections[node_id]]))
        {
            const socket = new ws.WebSocket(this.cfg.cluster[node_id].replace(/^http/, 'ws'), this.antietcd.tls);
            // The callback schedules a reconnect after reconnect_interval (default 1000 ms)
            // NOTE(review): presumably _startWebsocket invokes it when the socket closes - confirm
            const client_id = this.antietcd._startWebsocket(socket, () => setTimeout(() => this.connectToNode(node_id), this.cfg.reconnect_interval||1000));
            this.cluster_connections[node_id] = client_id;
            socket.on('open', () =>
            {
                if (this.antietcd.clients[client_id])
                {
                    this.antietcd.clients[client_id].ready = true;
                    this.antietcd.clients[client_id].raft_node_id = node_id;
                    this.antietcd.clients[client_id].addr = socket._socket.remoteAddress+':'+socket._socket.remotePort;
                    // Introduce ourselves so that the peer can map this connection to our node_id
                    socket.send(JSON.stringify({ identify: { key: this.cfg.cluster_key, node_id: this.cfg.node_id } }));
                    // A new peer is reachable - restart election
                    this.raft.start();
                }
            });
        }
    }
    // Send <request> to <client> and wait for its reply for at most <timeout> ms.
    // Stamps <request> with a fresh request_id (the object is modified in place).
    // Returns a promise resolved with the reply, or with { error: 'timeout' }.
    _peerRequest(client, request, timeout)
    {
        const request_id = this.last_request_id++;
        request.request_id = request_id;
        client.socket.send(JSON.stringify(request));
        const req = this.subrequests[request_id] = { client_id: client.id };
        const promise = new Promise(ok => req.cb = ok);
        req.timer_id = setTimeout(() => this._completeRequest(null, request_id, { error: 'timeout' }), timeout);
        return promise;
    }
    // Replicate change <msg> to all followers, then compact old revisions on the
    // followers and locally when enough of them have accumulated.
    // Does nothing unless this node is the leader.
    // Rejects with RequestError(503) when any follower fails (see _requestFollowers).
    async replicateChange(msg)
    {
        if (this.raft.state !== TinyRaft.LEADER)
        {
            return;
        }
        const mod_revision = this.antietcd.etctree.mod_revision;
        await this._requestFollowers({ replicate: msg }, this.cfg.replication_timeout||1000);
        // We have a guarantee that all revisions before mod_revision are applied by followers,
        // because replication messages are either processed synchronously or serialized in
        // AntiPersistence against <wait_persist>
        this.sync_revision = mod_revision;
        if (this.sync_revision - this.antietcd.etctree.compact_revision > (this.cfg.compact_revisions||1000)*2)
        {
            const revision = this.sync_revision - (this.cfg.compact_revisions||1000);
            await this._requestFollowers({ compact: { revision } }, this.cfg.compact_timeout||1000);
            this.antietcd.etctree.compact(revision);
        }
    }
    // Print <msg> to console when cfg.log_level is greater than 0
    _log(msg)
    {
        if (this.cfg.log_level > 0)
        {
            console.log(msg);
        }
    }
    // Send <msg>, stamped with the current Raft term, to every follower and wait
    // for all replies, each with <timeout> ms.
    // Throws RequestError(503) and restarts the election when a follower is not
    // connected or answers with an error/timeout.
    async _requestFollowers(msg, timeout)
    {
        msg.term = this.raft.term;
        const followers = this.raft.followers;
        // First check that all peers are connected, so nothing is sent if one of them is missing
        for (const follower of followers)
        {
            if (follower != this.cfg.node_id)
            {
                const client = this._getPeer(follower);
                if (!client)
                {
                    // One of peers is unavailable - immediate failure, request should be retried
                    this._log('Lost peer connection during replication - restarting election');
                    this.raft.start();
                    throw new RequestError(503, 'Peer connection is lost, please retry request');
                }
            }
        }
        const promises = [];
        for (const follower of followers)
        {
            if (follower != this.cfg.node_id)
            {
                const client = this._getPeer(follower);
                const promise = this._peerRequest(client, msg, timeout);
                promises.push(promise);
            }
        }
        const results = await Promise.all(promises);
        // <results> has one entry per remote follower, in the same order as <followers>
        let i = 0;
        for (const follower of followers)
        {
            if (follower != this.cfg.node_id)
            {
                const result = results[i];
                if (!result || result.error)
                {
                    // One of peers is unavailable - immediate failure, request should be retried
                    this._log('Replication failed ('+follower+': '+(result ? result.error : 'no result')+') - restarting election');
                    this.raft.start();
                    throw new RequestError(503, 'Replication failed, please retry request');
                }
                i++;
            }
        }
    }
    // Finish pending peer request <request_id> with <result>.
    // Ignored when the request is unknown (already completed) or, if <client_id>
    // is given, when it was sent over a different connection.
    _completeRequest(client_id, request_id, result)
    {
        const req = this.subrequests[request_id];
        if (!req || client_id && req.client_id != client_id)
        {
            return;
        }
        delete this.subrequests[request_id];
        if (req.timer_id)
        {
            clearTimeout(req.timer_id);
            req.timer_id = null;
        }
        req.cb(result);
    }
    // TinyRaft 'change' event handler: re-emits the event as 'raftchange', logs it,
    // starts a resync when this node is the leader and drops sync state otherwise.
    _handleRaftChange(event)
    {
        this.antietcd.emit('raftchange', event);
        this._log(
            'Raft '+this.cfg.node_id+': '+(event.state == TinyRaft.FOLLOWER ? 'following '+event.leader : event.state)+
            ', term '+event.term+(event.state == TinyRaft.LEADER ? ', followers: '+event.followers.join(', ') : '')
        );
        if (event.state == TinyRaft.LEADER)
        {
            // (Re)sync with the new set of followers
            this._resync(event.followers);
            this.antietcd.etctree.resume_leases();
        }
        else
        {
            this.synced = false;
            this.resync_state = null;
            this.antietcd.etctree.pause_leases();
        }
    }
    // Leader side, step 1 of resync: request database dumps from all <followers>
    // which were not asked yet and forget the dumps of nodes which are no longer followers.
    _resync(followers)
    {
        this.synced = false;
        if (!this.resync_state)
        {
            this.resync_state = {
                // node_id => dump, or null while the dump is pending (or has failed)
                dumps: {},
                // node_id => { result } of the load request sent to that node
                loads: {},
            };
        }
        const seen = {};
        for (const f of followers)
        {
            seen[f] = true;
            if (f != this.cfg.node_id && !(f in this.resync_state.dumps))
            {
                const client = this._getPeer(f);
                if (client)
                {
                    this.resync_state.dumps[f] = null;
                    this._peerRequest(client, { request: {}, handler: 'dump' }, this.cfg.dump_timeout||5000).then(res =>
                    {
                        if (this.resync_state && client.raft_node_id &&
                            (client.raft_node_id in this.resync_state.dumps))
                        {
                            if (res.error)
                            {
                                console.error(client.raft_node_id+' dump failed with error: '+res.error);
                            }
                            else
                            {
                                this._log(
                                    'Got dump from '+client.raft_node_id+' with stored term '+res.term+
                                    ', mod_revision '+res.mod_revision+', compact_revision '+res.compact_revision
                                );
                            }
                            // NOTE(review): a failed dump stays null, so the resync remains
                            // pending until the set of followers changes
                            this.resync_state.dumps[client.raft_node_id] = res.error ? null : res;
                            this._continueResync();
                        }
                    });
                }
            }
        }
        for (const f in this.resync_state.dumps)
        {
            if (!seen[f])
            {
                delete this.resync_state.dumps[f];
            }
        }
        this._continueResync();
    }
    // Leader side, step 2 of resync: once all dumps are received, merge the databases
    // of the nodes with the maximum stored term into the local one and send the
    // result to every follower.
    _continueResync()
    {
        if (!this.resync_state ||
            Object.values(this.resync_state.dumps).filter(d => !d).length > 0)
        {
            // Some dump(s) are still pending
            return;
        }
        this.resync_state.dumps[this.cfg.node_id] = { ...this.antietcd.etctree.dump(), term: this.antietcd.stored_term };
        let max_term = -1, with_max = [];
        for (const follower in this.resync_state.dumps)
        {
            const dump = this.resync_state.dumps[follower];
            if (dump.term > max_term)
            {
                max_term = dump.term;
                with_max = [ follower ];
            }
            else if (dump.term == max_term)
            {
                with_max.push(follower);
            }
        }
        if (max_term < 0 || with_max.length == 0)
        {
            throw new Error('BUG: no max term during resync');
        }
        this._log('Local term '+this.antietcd.stored_term+', max follower term '+max_term+' at nodes '+with_max.join(', '));
        with_max = with_max.filter(w => w != this.cfg.node_id);
        // Merge databases of all nodes with maximum term
        // Force other nodes to replicate the merged DB, throwing away their own states
        for (let i = 0; i < with_max.length; i++)
        {
            // The local database is replaced only by the first dump and only when
            // the local stored term is not the maximum one; other dumps are merged in
            const update_only = !(i == 0 && this.antietcd.stored_term != max_term);
            this._log(update_only ? 'Updating database from node '+with_max[i]+' state' : 'Copying node '+with_max[i]+' state');
            this.antietcd.etctree.load(this.resync_state.dumps[with_max[i]], update_only);
        }
        let wait = 0;
        const load_request = { term: this.raft.term, load: this.antietcd.etctree.dump() };
        for (const follower in this.resync_state.dumps)
        {
            if (follower != this.cfg.node_id)
            {
                const dump = this.resync_state.dumps[follower];
                // Always true because max_term is the maximum over all dumps
                if (dump.term <= max_term)
                {
                    const client = this._getPeer(follower);
                    if (!client)
                    {
                        this._log('Lost peer connection during resync - restarting election');
                        this.raft.start();
                        return;
                    }
                    this._log('Copying state to '+follower);
                    const loadstate = this.resync_state.loads[follower] = {};
                    wait++;
                    this._peerRequest(client, load_request, this.cfg.load_timeout||5000).then(res =>
                    {
                        if (!this.resync_state || this.resync_state.loads[follower] !== loadstate)
                        {
                            // Reply belongs to an abandoned or superseded resync round
                            return;
                        }
                        if ((!res || res.error) && (follower in this.resync_state.dumps))
                        {
                            // A current follower did not load the merged database, so it
                            // must not be counted as synchronized
                            this._log('Failed to copy state to '+follower+' ('+(res ? res.error : 'no result')+') - restarting election');
                            this.raft.start();
                            return;
                        }
                        loadstate.result = res;
                        this._finishResync();
                    });
                }
            }
        }
        if (!wait)
        {
            this._finishResync();
        }
    }
    // Leader side, step 3 of resync: when every dump and every load has completed,
    // remember the new term, mark the node as synced and wake up waiting requests.
    _finishResync()
    {
        if (!this.resync_state ||
            Object.values(this.resync_state.dumps).filter(d => !d).length > 0 ||
            Object.values(this.resync_state.loads).filter(d => !d.result).length > 0)
        {
            return;
        }
        // All current peers have copied the database, we can proceed
        this.antietcd.stored_term = this.raft.term;
        this.synced = true;
        runCallbacks(this, 'wait_sync', []);
        this._log(
            'Synchronized with followers, new term is '+this.raft.term+
            ', mod_revision '+this.antietcd.etctree.mod_revision+', compact_revision '+this.antietcd.etctree.compact_revision
        );
    }
    // Return a truthy value when API call <path> with body <data> must be handled
    // by the leader as a write: everything except kv_range, and kv_txn only when
    // it has compares or put/delete operations.
    _isWrite(path, data)
    {
        if (path == 'kv_txn')
        {
            return (data.compare && data.compare.length ||
                data.success && data.success.filter(f => f.request_put || f.requestPut || f.request_delete_range || f.requestDeleteRange).length ||
                data.failure && data.failure.filter(f => f.request_put || f.requestPut || f.request_delete_range || f.requestDeleteRange).length);
        }
        return path != 'kv_range';
    }
    // Decide where API call <path> with body <data> should be executed.
    // <leaderonly> is 0 or one of LEADER_ONLY, NO_WAIT_QUORUM, READ_FROM_FOLLOWER.
    // Waits for the initial sync for at most wait_quorum_timeout (default 30000 ms).
    // Returns null when the request should be handled locally, or the leader's
    // reply when this node is a follower and the request was forwarded.
    // Throws RequestError(503) when the state requirement is not met, the sync
    // wait times out or the leader is unavailable.
    async checkRaftState(path, leaderonly, data)
    {
        if (!this.raft)
        {
            return null;
        }
        if (leaderonly == LEADER_ONLY && this.raft.state != TinyRaft.LEADER)
        {
            throw new RequestError(503, 'Not leader');
        }
        if (leaderonly == NO_WAIT_QUORUM && this.raft.state == TinyRaft.CANDIDATE)
        {
            throw new RequestError(503, 'Quorum not available');
        }
        if (!this.synced)
        {
            // Wait for quorum / initial sync with timeout
            await new Promise((ok, no) =>
            {
                let timer_id = null;
                const on_sync = () =>
                {
                    // Cancel the timeout, otherwise every waiter leaves a pending timer behind
                    clearTimeout(timer_id);
                    ok();
                };
                this.wait_sync.push(on_sync);
                timer_id = setTimeout(() =>
                {
                    this.wait_sync = this.wait_sync.filter(cb => cb != on_sync);
                    no(new RequestError(503, 'Quorum not available'));
                }, this.cfg.wait_quorum_timeout||30000);
            });
        }
        if (this.raft.state == TinyRaft.FOLLOWER &&
            (this._isWrite(path, data) || !this.cfg.stale_read && !(leaderonly & READ_FROM_FOLLOWER)))
        {
            // Forward to leader
            return await this._forwardToLeader(path, data);
        }
        return null;
    }
    // Execute API call <handler> with body <data> on the current leader.
    // Returns the leader's reply, or { error: 'timeout' } after forward_timeout
    // (default 1000 ms). Throws RequestError(503) when the leader is not connected.
    async _forwardToLeader(handler, data)
    {
        const client = this._getPeer(this.raft.leader);
        if (!client)
        {
            throw new RequestError(503, 'Leader is unavailable');
        }
        return await this._peerRequest(client, { handler, request: data }, this.cfg.forward_timeout||1000);
    }
    // Dispatch cluster message <msg> received from websocket <client>.
    // The message kind is determined by the first present key of:
    // raft, identify, load, replicate, request, reply, compact.
    handleWsMsg(client, msg)
    {
        if (msg.raft)
        {
            // Raft messages are accepted only from identified peers
            if (client.raft_node_id)
            {
                this.raft.onReceive(client.raft_node_id, msg.raft);
            }
        }
        else if (msg.identify)
        {
            if (msg.identify.key === this.cfg.cluster_key &&
                msg.identify.node_id != this.cfg.node_id)
            {
                client.raft_node_id = msg.identify.node_id;
                this._log('Got a connection from '+client.raft_node_id);
            }
        }
        else if (msg.load)
        {
            this._handleLoadMsg(client, msg).catch(console.error);
        }
        else if (msg.replicate)
        {
            this._handleReplicateMsg(client, msg).catch(console.error);
        }
        else if (msg.request)
        {
            this._handleRequestMsg(client, msg).catch(console.error);
        }
        else if (msg.reply)
        {
            this._completeRequest(client.id, msg.request_id, msg.reply);
        }
        else if (msg.compact)
        {
            this._handleCompactMsg(client, msg);
        }
    }
    // Execute API call <msg.handler> requested by a peer and send the result back.
    // A failure is logged and reported to the peer as { error: <message> }.
    async _handleRequestMsg(client, msg)
    {
        try
        {
            const res = await this.antietcd.api(msg.handler, msg.request);
            client.socket.send(JSON.stringify({ request_id: msg.request_id, reply: res }));
        }
        catch (e)
        {
            console.error(e);
            client.socket.send(JSON.stringify({ request_id: msg.request_id, reply: { error: e.message } }));
        }
    }
    // Follower side of resync: replace the local database with <msg.load>, persist it,
    // remember the term and mark the node as synced.
    // Accepted only from the current leader and for the current term, otherwise
    // LEADER_MISMATCH is returned to the sender.
    async _handleLoadMsg(client, msg)
    {
        if (client.raft_node_id && this.raft.state == TinyRaft.FOLLOWER &&
            this.raft.leader === client.raft_node_id && this.raft.term == msg.term)
        {
            this.antietcd.etctree.load(msg.load);
            if (this.antietcd.persistence)
            {
                await this.antietcd.persistence.persist();
            }
            this.antietcd.stored_term = msg.term;
            this.synced = true;
            runCallbacks(this, 'wait_sync', []);
            this._log(
                'Synchronized with leader, new term is '+this.raft.term+
                ', mod_revision '+this.antietcd.etctree.mod_revision+', compact_revision '+this.antietcd.etctree.compact_revision
            );
            client.socket.send(JSON.stringify({ request_id: msg.request_id, reply: {} }));
        }
        else
        {
            client.socket.send(JSON.stringify({ request_id: msg.request_id, reply: { error: LEADER_MISMATCH } }));
        }
    }
    // Follower side of replication: apply change <msg.replicate> and acknowledge it.
    // Accepted only from the current leader and for the current term, otherwise
    // LEADER_MISMATCH is returned to the sender.
    async _handleReplicateMsg(client, msg)
    {
        if (client.raft_node_id && this.raft.state == TinyRaft.FOLLOWER &&
            this.raft.leader === client.raft_node_id && this.raft.term == msg.term)
        {
            await this.antietcd.etctree.apply_replication(msg.replicate);
            client.socket.send(JSON.stringify({ request_id: msg.request_id, reply: {} }));
        }
        else
        {
            client.socket.send(JSON.stringify({ request_id: msg.request_id, reply: { error: LEADER_MISMATCH } }));
        }
    }
    // Follower side of compaction: compact the local database up to
    // <msg.compact.revision> and acknowledge it.
    // Accepted only from the current leader and for the current term, otherwise
    // LEADER_MISMATCH is returned to the sender.
    _handleCompactMsg(client, msg)
    {
        if (client.raft_node_id && this.raft.state == TinyRaft.FOLLOWER &&
            this.raft.leader === client.raft_node_id && this.raft.term == msg.term)
        {
            this.antietcd.etctree.compact(msg.compact.revision);
            this._log('Compacted deletions up to '+msg.compact.revision);
            client.socket.send(JSON.stringify({ request_id: msg.request_id, reply: {} }));
        }
        else
        {
            client.socket.send(JSON.stringify({ request_id: msg.request_id, reply: { error: LEADER_MISMATCH } }));
        }
    }
    // Return the ready outgoing connection to node <to>, or null if there is none.
    // Throws Error when <to> is this node itself.
    _getPeer(to)
    {
        if (to == this.cfg.node_id)
        {
            throw new Error('BUG: attempt to get connection to self');
        }
        const client_id = this.cluster_connections[to];
        if (!client_id)
        {
            return null;
        }
        const client = this.antietcd.clients[client_id];
        if (!client || !client.ready)
        {
            return null;
        }
        return client;
    }
    // TinyRaft <send> callback: deliver Raft message <msg> to node <to>.
    // The message is silently dropped when the peer is not connected.
    _sendRaftMessage(to, msg)
    {
        const client = this._getPeer(to);
        if (client)
        {
            client.socket.send(JSON.stringify({ raft: msg }));
        }
    }
}
// Expose the checkRaftState() mode flags as static members of the exported class
Object.assign(AntiCluster, { LEADER_ONLY, NO_WAIT_QUORUM, READ_FROM_FOLLOWER });
module.exports = AntiCluster;