@cocalc/hub
Version:
CoCalc: Backend webserver component
245 lines • 10.4 kB
JavaScript
;
/*
* This file is part of CoCalc: Copyright © 2020 Sagemath, Inc.
* License: AGPLv3 s.t. "Commons Clause" – see LICENSE.md for details
*/
// Interop helper emitted for default imports: a module flagged with
// `__esModule` is returned unchanged, anything else is wrapped as
// `{ default: mod }`. An already existing `this.__importDefault` wins.
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.setup_health_checks = exports.process_alive = exports.set_agent_endpoint = void 0;
// endpoints for various health checks
const logger_1 = __importDefault(require("@cocalc/backend/logger"));
const { new_counter } = require("@cocalc/hub/metrics-recorder");
const record_connect_error_1 = require("@cocalc/database/postgres/record-connect-error");
const misc_1 = require("@cocalc/util/misc");
const net_1 = require("net");
const validator_1 = require("validator");
const hub_register_1 = require("./hub_register");
// Logger for this module; `L` is its `debug` function and is used for all
// short log lines below.
const logger = (0, logger_1.default)("hub:healthcheck");
const { debug: L } = logger;
// Counter obtained from the metrics recorder. process_health_check increments
// it with one of the "status" label values "OK", "FAIL" or "ERROR".
const HEALTHCHECKS = new_counter("healthchecks_total", "test healthcheck counter", ["status"]);
// Self-termination is only activated if there is a COCALC_HUB_SELF_TERMINATE environment variable.
// Its value is "<from>,<to>,<drain>": the minimum and maximum lifetime in hours,
// and a drain period in minutes at the end.
// e.g. "24,48,15" for an uptime between 1 and 2 days and 15 minutes of draining
//
// Returns { startup } when the variable is not set, otherwise
// { startup, shutdown, drain }; all three are epoch timestamps in milliseconds.
// Throws an Error when the value is malformed or inconsistent.
function init_self_terminate() {
    const D = logger.extend("init_self_terminate").debug;
    const startup = Date.now();
    const conf = process.env.COCALC_HUB_SELF_TERMINATE;
    if (conf == null) {
        D("COCALC_HUB_SELF_TERMINATE env var not set, hence no self-termination");
        return { startup };
    }
    // Trim every field so that "24, 48, 15" is accepted. Missing fields default
    // to "" so they fail the validation below with a descriptive message,
    // instead of a TypeError from the validator (which only accepts strings).
    const [from_str = "", to_str = "", drain_str = ""] = conf
        .split(",")
        .map((part) => part.trim());
    if (!(0, validator_1.isFloat)(from_str, { gt: 0 }))
        throw new Error("COCALC_HUB_SELF_TERMINATE/from not a positive float");
    if (!(0, validator_1.isFloat)(to_str, { gt: 0 }))
        throw new Error("COCALC_HUB_SELF_TERMINATE/to not a positive float");
    if (!(0, validator_1.isFloat)(drain_str, { gt: 0 }))
        throw new Error("COCALC_HUB_SELF_TERMINATE/drain not a positive float");
    const from = parseFloat(from_str);
    const to = parseFloat(to_str);
    const drain_h = parseFloat(drain_str) / 60; // minutes to hours
    D("parsed data:", { from, to, drain_h });
    if (from > to)
        throw new Error("COCALC_HUB_SELF_TERMINATE 'from' must be smaller than 'to', e.g. '24,48,15'");
    // random extra lifetime on top of `from`, so the total is between `from` and `to`
    const uptime = Math.random() * (to - from); // hours
    const hours2ms = 1000 * 60 * 60;
    const shutdown = startup + (from + uptime) * hours2ms;
    const drain = shutdown - drain_h * hours2ms;
    // a drain period longer than the chosen lifetime would start before startup
    if (startup > drain) {
        throw new Error(`COCALC_HUB_SELF_TERMINATE: startup must be smaller than drain – ${startup}>${drain}`);
    }
    D({
        startup: new Date(startup).toISOString(),
        drain: new Date(drain).toISOString(),
        shutdown: new Date(shutdown).toISOString(),
        uptime: (0, misc_1.seconds2hms)((hours2ms * uptime) / 1000),
        draintime: (0, misc_1.seconds2hms)((drain_h * hours2ms) / 1000),
    });
    return { startup, shutdown, drain };
}
// Computed once when this module is loaded. `shutdown` and `drain` are
// undefined when COCALC_HUB_SELF_TERMINATE is not set, because
// init_self_terminate then returns only `startup`.
const { startup, shutdown, drain } = init_self_terminate();
// Endpoint of the agent-check server. A port of 0 means "not configured":
// setup_agent_check returns early in that case.
let agent_port = 0;
let agent_host = "0.0.0.0";
// Stores the port and host the agent-check TCP server should listen on.
// This only records the values; setup_agent_check() reads them when it runs.
//   port: TCP port (0 keeps the agent check disabled)
//   host: address to bind to
function set_agent_endpoint(port, host) {
    agent_port = port;
    agent_host = host;
    // log after assigning, so the message shows the new endpoint and not the previous one
    L(`set_agent_endpoint ${agent_host}:${agent_port}`);
}
exports.set_agent_endpoint = set_agent_endpoint;
let agent_check_server;
// HAProxy agent-check TCP endpoint
// https://cbonte.github.io/haproxy-dconv/2.0/configuration.html#5.2-agent-check
// for development, set the env var in your startup script or terminal init file
// export COCALC_HUB_SELF_TERMINATE=.1,.2,1
// and then query it like this:
// $ telnet 0.0.0.0 $(cat $COCALC_ROOT/dev/project/ports/agent-port)
//
// The server is only started when an agent port has been set and
// self-termination is configured (i.e. a drain time exists).
function setup_agent_check() {
    if (agent_port == 0) {
        L("setup_agent_check: agent_port not set, no agent checks");
        return;
    }
    if (drain == null) {
        // without a drain time there is nothing to report to the agent check
        L("setup_agent_check: self-termination not configured, no agent checks");
        return;
    }
    // TODO this could also return a "weight" for this server, based on load values
    // there is also "drain", but we set it to "10%" to avoid a nasty situation, when all endpoints are draining.
    // ATTN: weight must be set as well, which is poorly documented here:
    // https://cbonte.github.io/haproxy-dconv/2.0/configuration.html#5.2-weight
    agent_check_server = (0, net_1.createServer)((c) => {
        // a socket 'error' without a listener would be thrown as an uncaught exception
        c.on("error", (err) => L(`setup_agent_check: socket error: ${err}`));
        const msg = Date.now() < drain ? "ready up 100%" : "10%";
        c.write(`${msg}\r\n`);
        c.destroy();
    });
    agent_check_server.listen(agent_port, agent_host);
    L(`setup_agent_check: listening on ${agent_host}:${agent_port}`);
}
// this could be directly in setup_health_checks, but we also need it in proxy.coffee
// proxy.coffee must be rewritten and restructured first – just wrapping it with a router
// didn't work at all for me
// Liveness verdict, used by the /alive route.
// Returns { txt, code }: code is 200 only when the database is working and the
// shutdown time (if there is one) has not passed; otherwise it is 404 and txt
// names the reason.
function process_alive() {
    if (!(0, hub_register_1.database_is_working)()) {
        // this will stop haproxy from routing traffic to us
        // until db connection starts working again.
        return { txt: "alive: NO – database not working", code: 404 };
    }
    const expired = shutdown != null && Date.now() > shutdown;
    if (expired) {
        return { txt: "alive: NO – shutdown initiated", code: 404 };
    }
    return { txt: "alive: YES", code: 200 };
}
exports.process_alive = process_alive;
// Compares the value of db.concurrent() with db._concurrent_warn.
// Returns { status, abort: true } once the value has reached the threshold,
// otherwise just { status }.
function checkConcurrent(db) {
    const current = db.concurrent();
    const limit = db._concurrent_warn;
    const belowLimit = !(current >= limit);
    if (belowLimit) {
        return { status: `concurrent ${current} < ${limit}` };
    }
    return {
        status: `hub not healthy, since concurrent ${current} >= ${limit}`,
        abort: true,
    };
}
// Reports how long this hub has been running and, if self-termination is
// configured, how far away draining and shutdown are.
// Returns { status, abort: true } once the shutdown time has been reached,
// otherwise { status }. Every status line is also written to the debug log.
function checkUptime() {
    const now = Date.now();
    const toHms = (ms) => (0, misc_1.seconds2hms)(ms / 1000);
    const uptime = toHms(now - startup);
    if (shutdown == null || drain == null) {
        const status = `uptime ${uptime} – no self-termination`;
        L(status);
        return { status };
    }
    if (now >= shutdown) {
        const status = `uptime ${uptime} – expired, terminating now`;
        L(status);
        return { status, abort: true };
    }
    const until = toHms(shutdown - now);
    let drainInfo = "draining now";
    if (drain > now) {
        drainInfo = `draining in ${toHms(drain - now)}`;
    }
    const status = `uptime ${uptime} – ${drainInfo} – terminating in ${until}`;
    L(status);
    return { status };
}
// If there is no connection to the database for that many minutes,
// declare the hub unhealthy. A value <= 0 disables the check.
const DEFAULT_DB_ERRORS_THRESHOLD_MIN = 5;
// Parses the raw env value as a base-10 integer. An unset, empty or
// non-numeric value yields the default instead of NaN: with NaN every
// comparison is false, so the check would run but could never fail.
function parseDbErrorsThresholdMin(raw) {
    const mins = parseInt(raw ?? "", 10);
    return Number.isNaN(mins) ? DEFAULT_DB_ERRORS_THRESHOLD_MIN : mins;
}
const DB_ERRORS_THRESHOLD_MIN = parseDbErrorsThresholdMin(process.env.COCALC_DB_ERRORS_THRESHOLD_MIN);
// Health check based on how long the database has been disconnected.
// Returns { status } when the check is disabled (threshold <= 0), and
// { status, abort } otherwise; abort is true once the disconnected time has
// reached DB_ERRORS_THRESHOLD_MIN minutes.
function checkDBConnectivity() {
    const threshold = DB_ERRORS_THRESHOLD_MIN;
    if (threshold <= 0) {
        return { status: "db connectivity check disabled" };
    }
    const mins = (0, record_connect_error_1.howLongDisconnectedMins)();
    if (mins == null) {
        return { status: "no DB connection problems", abort: false };
    }
    const abort = mins >= threshold;
    const relation = abort ? ">=" : "<";
    // the duration is shown with 2 decimal places
    const status = `DB problems for ${mins.toFixed(2)} ${relation} ${threshold} mins`;
    return { status, abort };
}
// same note as above for process_alive()
// Runs the built-in checks followed by the `extra` ones, one after another,
// and collects one "<status> – OK|FAIL" line per check.
// Returns { code, txt }: code is 404 if any check asked to abort, else 200.
// A check that throws is logged and counted under the "ERROR" label, but it
// adds no line to txt and does not affect the code.
async function process_health_check(db, extra = []) {
    const checks = [
        () => checkConcurrent(db),
        checkUptime,
        checkDBConnectivity,
        ...extra,
    ];
    const lines = ["healthchecks:"];
    let failed = false;
    for (const check of checks) {
        try {
            const { status, abort = false } = await check();
            const verdict = abort ? "FAIL" : "OK";
            lines.push(`${status} – ${verdict}`);
            if (abort) {
                failed = true;
            }
            L(`process_health_check: ${status} – ${verdict}`);
        }
        catch (err) {
            L(`process_health_check ERRROR: ${err}`);
            HEALTHCHECKS.labels("ERROR").inc();
        }
    }
    const code = failed ? 404 : 200;
    HEALTHCHECKS.labels(failed ? "FAIL" : "OK").inc();
    // every line, including the header, ends with a newline
    return { code, txt: lines.join("\n") + "\n" };
}
// Starts the agent check and registers the health routes on opts.router:
// /alive, /healthcheck, /concurrent-warn and /concurrent.
//   opts.db:     object providing concurrent() and _concurrent_warn
//   opts.extra:  optional additional check functions for /healthcheck
//   opts.router: router on which the GET routes are registered
async function setup_health_checks(opts) {
    const { db, extra, router } = opts;
    setup_agent_check();

    // used by HAPROXY for testing that this hub is OK to receive traffic
    const onAlive = (_req, res) => {
        const alive = process_alive();
        res.type("txt");
        res.status(alive.code);
        res.send(alive.txt);
    };

    // this is a more general check than concurrent-warn
    // additionally to checking the database condition, it also self-terminates
    // this hub if it is running for quite some time. beyond that, in the future
    // there could be even more checks on top of that.
    const onHealthcheck = async (_req, res) => {
        const result = await process_health_check(db, extra);
        res.status(result.code);
        res.type("txt");
        res.send(result.txt);
    };

    // /concurrent-warn -- could be used by kubernetes to decide whether or not to kill the container; if
    // below the warn thresh, returns number of concurrent connection; if hits warn, then
    // returns 404 error, meaning hub may be unhealthy. Kubernetes will try a few times before
    // killing the container. Will also return 404 if there is no working database connection.
    const onConcurrentWarn = (_req, res) => {
        res.type("txt");
        if (!(0, hub_register_1.database_is_working)()) {
            L("/concurrent-warn: not healthy, since database connection not working");
            res.status(404).end();
        }
        else {
            const c = db.concurrent();
            if (c >= db._concurrent_warn) {
                L(`/concurrent-warn: not healthy, since concurrent ${c} >= ${db._concurrent_warn}`);
                res.status(404).end();
            }
            else {
                res.send(`${c}`);
            }
        }
    };

    // Return number of concurrent connections (could be useful)
    const onConcurrent = (_req, res) => {
        res.type("txt");
        res.send(`${db.concurrent()}`);
    };

    router.get("/alive", onAlive);
    router.get("/healthcheck", onHealthcheck);
    router.get("/concurrent-warn", onConcurrentWarn);
    router.get("/concurrent", onConcurrent);
}
exports.setup_health_checks = setup_health_checks;
//# sourceMappingURL=health-checks.js.map