UNPKG

@cocalc/hub

Version:
245 lines 10.4 kB
"use strict"; /* * This file is part of CoCalc: Copyright © 2020 Sagemath, Inc. * License: AGPLv3 s.t. "Commons Clause" – see LICENSE.md for details */ var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; Object.defineProperty(exports, "__esModule", { value: true }); exports.setup_health_checks = exports.process_alive = exports.set_agent_endpoint = void 0; // endpoints for various health checks const logger_1 = __importDefault(require("@cocalc/backend/logger")); const { new_counter } = require("@cocalc/hub/metrics-recorder"); const record_connect_error_1 = require("@cocalc/database/postgres/record-connect-error"); const misc_1 = require("@cocalc/util/misc"); const net_1 = require("net"); const validator_1 = require("validator"); const hub_register_1 = require("./hub_register"); const logger = (0, logger_1.default)("hub:healthcheck"); const { debug: L } = logger; const HEALTHCHECKS = new_counter("healthchecks_total", "test healthcheck counter", ["status"]); // self termination is only activated, if there is a COCALC_HUB_SELF_TERMINATE environment variable // it's value is an interval in hours, minimum and maximum, for how long it should be alive // and a drain period in minutes at the end. // e.g. "24,48,15" for an uptime between 1 and 2 days and 15 minutes of draining function init_self_terminate() { const D = logger.extend("init_self_terminate").debug; const startup = Date.now(); const conf = process.env.COCALC_HUB_SELF_TERMINATE; if (conf == null) { D("COCALC_HUB_SELF_TERMINATE env var not set, hence no self-termination"); return { startup }; } const [from_str, to_str, drain_str] = conf.trim().split(","); if (!(0, validator_1.isFloat)(from_str, { gt: 0 })) throw new Error("COCALC_HUB_SELF_TERMINATE/from not a positive float"); if (!(0, validator_1.isFloat)(to_str, { gt: 0 })) throw new Error("COCALC_HUB_SELF_TERMINATE/to not a positive float"); if (!(0, validator_1.isFloat)(drain_str, { gt: 0 })) throw new Error("COCALC_HUB_SELF_TERMINATE/drain not a positive float"); const from = parseFloat(from_str); const to = parseFloat(to_str); const drain_h = parseFloat(drain_str) / 60; // minutes to hours D("parsed data:", { from, to, drain_h }); if (from > to) throw Error("COCALC_HUB_SELF_TERMINATE 'from' must be smaller than 'to', e.g. '24,48,15'"); const uptime = Math.random() * (to - from); // hours const hours2ms = 1000 * 60 * 60; const shutdown = startup + (from + uptime) * hours2ms; const drain = shutdown - drain_h * hours2ms; if (startup > drain) { throw new Error(`COCALC_HUB_SELF_TERMINATE: startup must be smaller than drain – ${startup}>${drain}`); } D({ startup: new Date(startup).toISOString(), drain: new Date(drain).toISOString(), shutdown: new Date(shutdown).toISOString(), uptime: (0, misc_1.seconds2hms)((hours2ms * uptime) / 1000), draintime: (0, misc_1.seconds2hms)((drain_h * hours2ms) / 1000), }); return { startup, shutdown, drain }; } const { startup, shutdown, drain } = init_self_terminate(); let agent_port = 0; let agent_host = "0.0.0.0"; function set_agent_endpoint(port, host) { L(`set_agent_endpoint ${agent_host}:${agent_port}`); agent_port = port; agent_host = host; } exports.set_agent_endpoint = set_agent_endpoint; let agent_check_server; // HAProxy agent-check TCP endpoint // https://cbonte.github.io/haproxy-dconv/2.0/configuration.html#5.2-agent-check // for development, set the env var in your startup script or terminal init file // export COCALC_HUB_SELF_TERMINATE=.1,.2,1 // and then query it like that // $ telnet 0.0.0.0 $(cat $COCALC_ROOT/dev/project/ports/agent-port) function setup_agent_check() { if (agent_port == 0 || drain == null) { L("setup_agent_check: agent_port not set, no agent checks"); return; } // TODO this could also return a "weight" for this server, based on load values // there is also "drain", but we set it to "10%" to avoid a nasty situation, when all endpoints are draining. // ATTN: weight must be set as well, which is poorly documented here: // https://cbonte.github.io/haproxy-dconv/2.0/configuration.html#5.2-weight agent_check_server = (0, net_1.createServer)((c) => { let msg = Date.now() < drain ? "ready up 100%" : "10%"; c.write(msg + "\r\n"); c.destroy(); }); agent_check_server.listen(agent_port, agent_host); L(`setup_agent_check: listening on ${agent_host}:${agent_port}`); } // this could be directly in setup_health_checks, but we also need it in proxy.coffee // proxy.coffee must be rewritten and restructured first – just wrapping it with a router // didn't work at all for me function process_alive() { let txt = "alive: YES"; let is_dead = true; if (!(0, hub_register_1.database_is_working)()) { // this will stop haproxy from routing traffic to us // until db connection starts working again. txt = "alive: NO – database not working"; } else if (shutdown != null && Date.now() > shutdown) { txt = "alive: NO – shutdown initiated"; } else { is_dead = false; } const code = is_dead ? 404 : 200; return { txt, code }; } exports.process_alive = process_alive; function checkConcurrent(db) { const c = db.concurrent(); if (c >= db._concurrent_warn) { return { status: `hub not healthy, since concurrent ${c} >= ${db._concurrent_warn}`, abort: true, }; } else { return { status: `concurrent ${c} < ${db._concurrent_warn}` }; } } function checkUptime() { const now = Date.now(); const uptime = (0, misc_1.seconds2hms)((now - startup) / 1000); if (shutdown != null && drain != null) { if (now >= shutdown) { const msg = `uptime ${uptime} – expired, terminating now`; L(msg); return { status: msg, abort: true }; } else { const until = (0, misc_1.seconds2hms)((shutdown - now) / 1000); const drain_str = drain > now ? `draining in ${(0, misc_1.seconds2hms)((drain - now) / 1000)}` : "draining now"; const msg = `uptime ${uptime}${drain_str} – terminating in ${until}`; L(msg); return { status: msg }; } } else { const msg = `uptime ${uptime} – no self-termination`; L(msg); return { status: msg }; } } // if there are is no connection to the database for that many minutes, // declare the hub unhealthy const DB_ERRORS_THRESHOLD_MIN = parseInt(process.env.COCALC_DB_ERRORS_THRESHOLD_MIN ?? "5"); function checkDBConnectivity() { if (DB_ERRORS_THRESHOLD_MIN <= 0) { return { status: "db connectivity check disabled" }; } const num = (0, record_connect_error_1.howLongDisconnectedMins)(); if (num == null) { return { status: "no DB connection problems", abort: false }; } // round num to 2 decimal places const numStr = num.toFixed(2); const above = num >= DB_ERRORS_THRESHOLD_MIN; const status = above ? `DB problems for ${numStr} >= ${DB_ERRORS_THRESHOLD_MIN} mins` : `DB problems for ${numStr} < ${DB_ERRORS_THRESHOLD_MIN} mins`; return { status, abort: above }; } // same note as above for process_alive() async function process_health_check(db, extra = []) { let any_abort = false; let txt = "healthchecks:\n"; for (const test of [ () => checkConcurrent(db), checkUptime, checkDBConnectivity, ...extra, ]) { try { const { status, abort = false } = await test(); const statusTxt = abort ? "FAIL" : "OK"; txt += `${status}${statusTxt}\n`; any_abort = any_abort || abort; L(`process_health_check: ${status}${statusTxt}`); } catch (err) { L(`process_health_check ERRROR: ${err}`); HEALTHCHECKS.labels("ERROR").inc(); } } const code = any_abort ? 404 : 200; HEALTHCHECKS.labels(any_abort ? "FAIL" : "OK").inc(); return { code, txt }; } async function setup_health_checks(opts) { const { db, extra, router } = opts; setup_agent_check(); // used by HAPROXY for testing that this hub is OK to receive traffic router.get("/alive", (_, res) => { const { code, txt } = process_alive(); res.type("txt"); res.status(code); res.send(txt); }); // this is a more general check than concurrent-warn // additionally to checking the database condition, it also self-terminates // this hub if it is running for quite some time. beyond that, in the future // there could be even more checks on top of that. router.get("/healthcheck", async (_, res) => { const { txt, code } = await process_health_check(db, extra); res.status(code); res.type("txt"); res.send(txt); }); // /concurrent-warn -- could be used by kubernetes to decide whether or not to kill the container; if // below the warn thresh, returns number of concurrent connection; if hits warn, then // returns 404 error, meaning hub may be unhealthy. Kubernetes will try a few times before // killing the container. Will also return 404 if there is no working database connection. router.get("/concurrent-warn", (_, res) => { res.type("txt"); if (!(0, hub_register_1.database_is_working)()) { L("/concurrent-warn: not healthy, since database connection not working"); res.status(404).end(); return; } const c = db.concurrent(); if (c >= db._concurrent_warn) { L(`/concurrent-warn: not healthy, since concurrent ${c} >= ${db._concurrent_warn}`); res.status(404).end(); return; } res.send(`${c}`); }); // Return number of concurrent connections (could be useful) router.get("/concurrent", (_, res) => { res.type("txt"); res.send(`${db.concurrent()}`); }); } exports.setup_health_checks = setup_health_checks; //# sourceMappingURL=health-checks.js.map