@cocalc/project
Version:
CoCalc: project daemon
274 lines (243 loc) • 10.2 kB
text/coffeescript
#########################################################################
# This file is part of CoCalc: Copyright © 2020 Sagemath, Inc.
# License: AGPLv3 s.t. "Commons Clause" – see LICENSE.md for details
#########################################################################
###
Some code specific to running a project in the KuCalc environment.
###
fs = require('fs')
async = require('async')
misc = require('@cocalc/util/misc')
misc_node = require('@cocalc/backend/misc_node')
path = require('path')
{execSync} = require('child_process')
{defaults} = misc = require('@cocalc/util/misc')
{start_ts, session_id} = require('./consts')
# global variable
PROJECT_ID = undefined
PREFIX = 'cocalc_project_'
# Prometheus client setup -- https://github.com/siimon/prom-client
prom_client = require('prom-client')
# additionally, record GC statistics
# https://www.npmjs.com/package/prometheus-gc-stats
## I'm commenting this out because the package prometheus-gc-stats
## on npm very explicitly says it does not support prom-client
## version 13, which is what we have installed everywhere. That
## version is a significant breaking change from version 12, so
## I'm also not comfortable reverting back. Harald I think force
## upgraded prom-client to version 13 in this commit: b31e087ea2c640f494db15b652d9d0f86e7bd8a5
# require('prometheus-gc-stats')()()
# collect some recommended default metrics every 10 seconds
prom_client.collectDefaultMetrics(timeout: 10 * 1000)
# --- end prometheus setup
# This gets **changed** to true, if a certain
# command line flag is passed in.
exports.IN_KUCALC = false
# status information
current_status = {}
exports.init = (client) ->
# update project status every 30s
# TODO: could switch to faster when it's changing and slower when it isn't.
f = -> update_project_status(client)
f()
setInterval(f, 30000)
update_project_status = (client, cb) ->
dbg = client.dbg("update_status")
dbg()
status = undefined
async.series([
(cb) ->
compute_status (err, s) ->
status = s
if not err
current_status = s
cb(err)
(cb) ->
client.query
query :
projects : {project_id:client.client_id(), status: status}
cb : cb
], (err) ->
cb?(err)
)
exports.compute_status = compute_status = (cb) ->
status =
time : (new Date()).getTime()
memory : {rss: 0}
disk_MB : 0
cpu : {}
start_ts : start_ts
session_id : session_id
processes : {}
async.parallel([
(cb) ->
compute_status_disk(status, cb)
(cb) ->
cgroup_stats(status, cb)
(cb) ->
processes_info(status, cb)
(cb) ->
compute_status_tmp(status, cb)
], (err) ->
cb(err, status)
)
compute_status_disk = (status, cb) ->
disk_usage "$HOME", (err, x) ->
status.disk_MB = x
cb(err)
processes_info = (status, cb) ->
cols = ['pid','lstart','time','rss','args']
misc_node.execute_code
command : 'ps'
args : ['--no-header', '-o', cols.join(','), '-u', 'user']
bash : false
cb : (err, out) ->
if err or out.exit_code != 0
cb(err)
else
cnt = -1 # no need to account for the ps process itself!
# TODO parsing anything out of ps is really hard :-(
# but we want to know how many sage, jupyter, console, etc. instances are running.
for line in out.stdout.split('\n')
if line.length > 0
cnt += 1
status.processes.count = cnt
cb()
# NOTE: we use tmpfs for /tmp, so RAM usage is the **sum** of /tmp and what
# processes use.
compute_status_tmp = (status, cb) ->
disk_usage "/tmp", (err, x) ->
status.memory.rss += 1000*x
cb(err)
# this grabs the memory stats directly from the sysfs cgroup files
# the actual usage is the sum of the rss values plus cache, but we leave cache aside
cgroup_stats = (status, cb) ->
async.parallel({
memory : (cb) ->
fs.readFile '/sys/fs/cgroup/memory/memory.stat', 'utf8', (err, data) ->
if err
cb(err)
return
stats = {}
for line in data.split('\n')
[key, value] = line.split(' ')
try
stats[key] = parseInt(value)
cb(null, stats)
cpu : (cb) ->
fs.readFile '/sys/fs/cgroup/cpu,cpuacct/cpuacct.usage', 'utf8', (err, data) ->
if err
cb(err)
return
try
cb(null, parseFloat(data) / Math.pow(10, 9))
catch
cb(null, 0.0)
oom : (cb) ->
fs.readFile '/sys/fs/cgroup/memory/memory.oom_control', 'utf8', (err, data) ->
if err
cb(err)
return
try
for line in data.split('\n')
# search string includes a trailing space, otherwise it matches 'oom_kill_disable'!
if misc.startswith(line, 'oom_kill ')
cb(null, parseInt(line.split(' ')[1]))
return
cb(null, 0)
}, (err, res) ->
kib = 1024 # convert to kibibyte
# total_rss includes total_rss_huge
# Ref: https://www.kernel.org/doc/Documentation/cgroup-v1/memory.txt
status.memory.rss += (res.memory.total_rss ? 0) / kib
status.memory.cache = (res.memory.total_cache ? 0) / kib
status.memory.limit = (res.memory.hierarchical_memory_limit ? 0) / kib
status.cpu.usage = res.cpu
status.oom_kills = res.oom
cb()
)
disk_usage = (path, cb) ->
misc_node.execute_code
command : "df -BM #{path} | tail -1 | awk '{gsub(\"M\",\"\");print $3}'"
bash : true
cb : (err, out) ->
if err
cb(err)
else
cb(undefined, parseInt(out.stdout))
# Every 60s, check if we can reach google's internal network -- in kucalc on GCE, this must be blocked.
# If we receive some information, exit with status code 99.
exports.init_gce_firewall_test = (logger, interval_ms=60*1000) ->
return # temporarily disabled
if not exports.IN_KUCALC
logger?.warn("not running firewall test -- not in kucalc")
return
URI = 'http://metadata.google.internal/computeMetadata/v1/'
test_firewall = ->
logger?.log("test_firewall")
request = require('request')
request(
timeout : 3000
headers :
'Metadata-Flavor' : 'Google'
uri: URI
method: 'GET'
, (err, res, body) ->
if err?.code == 'ETIMEDOUT'
logger?.log('test_firewall: timeout -> no action')
else
logger?.warn('test_firewall', res)
logger?.warn('test_firewall', body)
if res? or body?
logger?.warn('test_firewall: request went through and got a response -> exiting with code 99')
process.exit(99)
else
logger?.warn('test_firewall: request went through with no response -> no action')
)
test_firewall()
setInterval(test_firewall, interval_ms)
return
get_bugs_total = require('./bug-counter').default
exports.prometheus_metrics = (project_id) ->
labels = "project_id=\"#{project_id}\",session_id=\"#{session_id}\""
"""
# HELP cocalc_project_bugs_total The total number of caught bugs.
# TYPE cocalc_project_bugs_total counter
cocalc_project_bugs_total{#{labels}} #{get_bugs_total()}
# HELP cocalc_project_start_time when the project/session started
# TYPE cocalc_project_start_time counter
cocalc_project_start_time{#{labels}} #{start_ts}
# HELP cocalc_project_cpu_usage_seconds
# TYPE cocalc_project_cpu_usage_seconds counter
cocalc_project_cpu_usage_seconds{#{labels}} #{current_status.cpu?.usage ? 0.0}
# HELP cocalc_project_disk_usage_mb
# TYPE cocalc_project_disk_usage_mb gauge
cocalc_project_disk_usage_mb{#{labels}} #{current_status.disk_MB ? 0.0}
# HELP cocalc_project_memory_usage_ki
# TYPE cocalc_project_memory_usage_ki gauge
cocalc_project_memory_usage_ki{#{labels}} #{current_status.memory?.rss ? 0.0}
# HELP cocalc_project_memory_limit_ki
# TYPE cocalc_project_memory_limit_ki gauge
cocalc_project_memory_limit_ki{#{labels}} #{current_status.memory?.limit ? 0.0}
# HELP cocalc_project_running_processes_total
# TYPE cocalc_project_running_processes_total gauge
cocalc_project_running_processes_total{#{labels}} #{current_status.processes?.count ? 0}
# HELP cocalc_project_oom_kills_total
# TYPE cocalc_project_oom_kills_total counter
cocalc_project_oom_kills_total{#{labels}} #{current_status.oom_kills ? 0}
""" + '\n' # makes sure the response ends with a newline!
# called inside raw_server
exports.init_health_metrics = (raw_server, project_id) ->
return if not exports.IN_KUCALC
PROJECT_ID = project_id
# Setup health and metrics (no url base prefix needed)
raw_server.use '/health', (req, res) ->
res.setHeader("Content-Type", "text/plain")
res.setHeader('Cache-Control', 'no-cache, no-store')
res.send('OK')
# prometheus text format -- https://prometheus.io/docs/instrumenting/exposition_formats/#text-format-details
raw_server.use '/metrics', (req, res) ->
res.setHeader("Content-Type", "text/plain; version=0.0.4")
res.header('Cache-Control', 'no-cache, no-store')
part1 = exports.prometheus_metrics(project_id)
res.send(part1 + '\n' + (await prom_client.register.metrics()) + '\n')