UNPKG

cdpc

Version:

child process management

2,095 lines (1,658 loc) 67.5 kB
'use strict' const process = require('node:process'); const {spawn} = require('node:child_process') const path = require('node:path') const fs = require('node:fs') const os = require('node:os') const crypto = require('node:crypto') const nps = require('./nps.js') const cgroup = require('./cgroup.js') let fsp = fs.promises let cpuTotal = os.cpus().length /** * 可用CPU用于进程的CPU资源占用控制 */ let availCPU = 1 if (cpuTotal == 2 || cpuTotal == 3) { availCPU = cpuTotal - 0.5 } else if (cpuTotal > 3) { availCPU = cpuTotal - 1 } /** * name * file * command * args * options * restart: always | count | fail | none * restartLimit * restartDelay default 100 * user * group */ /** * 我将尽可能把所有功能放在一个文件里,并分块以注释分隔和说明。 */ let outError = (err) => { setTimeout(() => { if (typeof err === 'string') { console.error(`\x1b[1;35m${err}\x1b[0m`) } else { console.error(err) } }, 120) } let _stat_index = { pgrp : 4, session : 5, utime : 13, stime : 14, cutime : 15, cstime : 16, vsize : 22, rss : 23, } /** * 多数情况下,默认的页大小都是4K,但是这跟系统以及硬件环境有关。不排除会有特殊情况出现,比如配置启用了hugepagesize。 * 如何计算页面大小:从开始启动,程序就获取自身的信息,通过获取status和stat文件的信息,计算出页面大小。 * 主要目的是仍然利用stat获取进程的负载信息,这里有cpu时间片,而status中无法获取。 * @param {number|string} pid * @param {object} pobj * @returns {object} */ async function parse_linux_proc_stat(pid, pobj) { if (!pid && pid !== 0) { return pobj } let comm = await fsp.readFile(`/proc/${pid}/comm`, {encoding: 'utf8'}) let data = await fsp.readFile(`/proc/${pid}/stat`, {encoding: 'utf8'}) let dlines = data.replace(comm, '$').split(' ').filter(p => p.length > 0) pobj.utime = parseInt(dlines[_stat_index.utime]) pobj.stime = parseInt(dlines[_stat_index.stime]) pobj.cutime = parseInt(dlines[_stat_index.cutime]) pobj.cstime = parseInt(dlines[_stat_index.cstime]) pobj.rss = parseInt(dlines[_stat_index.rss]) pobj.pgrp = parseInt(dlines[_stat_index.pgrp]) pobj.session = parseInt(dlines[_stat_index.session]) return pobj } async function get_linux_pagesize() { let obj = {} try { await parse_linux_proc_stat(process.pid, obj) let p = nps.parse_linux_status(process.pid) let pagesize = p.rss / obj.rss + 0.2 return isNaN(pagesize) ? false : parseInt(pagesize) } catch (err) { return false } } /** user (1) Time spent in user mode. nice (2) Time spent in user mode with low priority (nice). system (3) Time spent in system mode. idle (4) Time spent in the idle task. iowait (since Linux 2.5.41) (5) Time waiting for I/O to complete. irq (since Linux 2.6.0) (6) Time servicing interrupts. softirq (since Linux 2.6.0 (7) Time servicing softirqs. steal (since Linux 2.6.11) (8) Stolen time, which is the time spent in other operating systems when running in a irtualized environment guest (since Linux 2.6.24) (9) Time spent running a virtual CPU for guest operating systems under the control of the Linux kernel. guest_nice (since Linux 2.6.33) (10) Time spent running a niced guest. See: man 5 proc. */ async function parse_linux_cpu(options = {all: false}) { let data = await fsp.readFile('/proc/stat', {encoding: 'utf8'}) let dlines = data.split('\n') let cone = dlines[0].split(' ').filter(p => p.length) let tmpval = 0 for (let i = cone.length - 1; i > 0; i--) { tmpval += parseInt(cone[i]) } let totalCPULoad = { total: tmpval, idle: parseInt(cone[4]) } if (!options || !options.all) { return totalCPULoad } let cpus = [] cpus.push(totalCPULoad) let dtmp for (let i = 0; i < cpuTotal; i++) { tmpval = 0 dtmp = dlines[i+1].split(' ').filter(p => p.length) for (let k = dtmp.length - 1; k > 0; k--) { tmpval += parseInt(dtmp[k]) } cpus.push({ total: tmpval, idle: parseInt(dtmp[4]) }) } return cpus } /** * 计算总的CPU负载 * @param {object} c1 * @param {object} c2 */ function caclt_cpu_load(c1, c2) { let idle = c2.idle - c1.idle let total = c2.total - c1.total return (total * 1.0 - (idle * 1.0)) / total } function caclt_proc_total_time(p) { return p.utime + p.stime + p.cutime + p.cstime } /** * 计算单个进程的CPU负载 * @param {object} p1 * @param {object} p2 * @param {object} c1 * @param {object} c2 */ function caclt_proc_cpu(p1, p2, c1, c2) { let p1_total = caclt_proc_total_time(p1) let p2_total = caclt_proc_total_time(p2) return ((p2_total - p1_total) * 1.0 * cpuTotal) / (c2.total - c1.total) } function fmt_percent(n) { return (n * 100).toFixed(2) } let _linux_meminfo = {} async function parse_linux_mem() { let data = await fsp.readFile('/proc/meminfo', {encoding: 'utf8'}) let dlines = data.split('\n') let t let ktmp let k dlines.forEach((a,ind) => { if (!a) return t = a.split(' ').filter(p => p.length > 0) k = t[0].substring(0, t[0].length - 1) ktmp = _linux_meminfo[k] if (ktmp) { ktmp.value = parseInt(t[1]) ktmp.unit = t.length > 2 ? t[2].toLowerCase() : '' } else { _linux_meminfo[k] = { value: parseInt(t[1]), unit: t.length > 2 ? t[2].toLowerCase() : '' } } }) } function fmt_mem_value(v, unit='kb') { if (unit === 'kb') { return ((v * 1.0) / 1024).toFixed(2) } if (unit === 'mb') { return v.toFixed(2) } if (unit === 'b') { return (v * 1.0 / 1024 / 1024).toFixed(2) } } //判断一个数字在某个2的整数次幂范围,并返回最接近的整数。 function auto_pagesize(n) { if (n <= 4) return 4 let t = 2 let left_t,right_t for (let i = 2; i < 22; i++) { left_t = 2**i if (n === left_t) return n right_t = 2**(i+1) if (n === right_t) return n if (n > left_t && n < right_t) { return ((right_t - n) > (n - left_t)) ? left_t : right_t } } return n } /** * 解析进程的网络收发数据。 Inter-| Receive | Transmit face |bytes packets errs drop fifo frame compressed multicast|bytes packets errs drop fifo colls carrier compressed lo: 49225601 141541 0 0 0 0 0 0 49225601 141541 0 0 0 0 0 0 eth0: 358472071 1977867 0 0 0 0 0 0 918282959 2145431 0 0 0 0 0 0 * @param {int} pid */ async function parse_linux_pid_net(pid, rtdata, rtindex=[1,2,9,10]) { try { if (!pid && pid !== 0) return false let netfile = `/proc/${pid}/net/dev` let netdata = await fsp.readFile(netfile, {encoding: 'utf8'}) let datalines = netdata.split('\n').slice(2) let last_recv = 0 let last_tran = 0 if (!rtdata) { rtdata = { recvBytes: 0, recvPackets: 0, transmitBytes: 0, transmitPackets: 0, getTime: 0, duration: 0, recvRate: 0, transmitRate: 0, devData: {} } } else { last_recv = rtdata.recvBytes last_tran = rtdata.transmitBytes rtdata.recvBytes = 0 rtdata.recvPackets = 0 rtdata.transmitBytes = 0 rtdata.transmitPackets = 0 } let dl, recv,recvpack,tran,tranpack let [rind, rpind, tind, tpind] = rtindex let tm = Date.now() let tmdiff = 0 for (let d of datalines) { dl = d.split(' ').filter(p => p.length > 0) if (dl.length < 11) continue recv = parseInt(dl[rind]) || 0 recvpack = parseInt(dl[rpind]) || 0 tran = parseInt(dl[tind]) || 0 tranpack = parseInt(dl[tpind]) || 0 rtdata.recvBytes += recv rtdata.recvPackets += recvpack rtdata.transmitBytes += tran rtdata.transmitPackets += tranpack rtdata.devData[dl[0]] = { recvBytes: recv, recvPackets: recvpack, transmitBytes: tran, transmitPackets: tranpack, } } rtdata.duration = tm - rtdata.getTime rtdata.getTime = tm rtdata.recvRate = parseInt((rtdata.recvBytes - last_recv) * 1000 / rtdata.duration) rtdata.transmitRate = parseInt((rtdata.transmitBytes - last_tran) * 1000 / rtdata.duration) return rtdata } catch(err) { return false } } async function parse_linux_socket(obj) { let data = await fsp.readFile(`/proc/net/sockstat`, {encoding: 'utf8'}) let dlines = data.split('\n').filter(p => p.length > 0) let tcps = dlines[1].split(' ').filter(p => p.length > 0) tcps.length > 2 && (obj.tcp.inuse = parseInt(tcps[2]) || 0) let udps = dlines[2].split(' ').filter(p => p.length > 0) udps.length > 2 && (obj.udp.inuse = parseInt(udps[2]) || 0) } async function parse_linux_socket6(obj) { let data = await fsp.readFile(`/proc/net/sockstat6`, {encoding: 'utf8'}) let dlines = data.split('\n').filter(p => p.length > 0) let tcps = dlines[0].split(' ').filter(p => p.length > 0) tcps.length > 2 && (obj.tcp.inuse = parseInt(tcps[2]) || 0) let udps = dlines[1].split(' ').filter(p => p.length > 0) udps.length > 2 && (obj.udp.inuse = parseInt(tcps[2]) || 0) } /** -- start cdpc -- */ function _checkAppName(name) { if (['__all__'].indexOf(name) >= 0) return false //16个字符用于名字,4个字符用于编号,8个字符作为备用空间,绝对足够使用。 return (/^[a-z1-9_][a-z0-9_-]{0,28}$/i).test(name) } function _realTimeout(timeout, stopTimeout) { let real_timeout = (isNaN(timeout) || timeout < 5) ? 5000 : timeout if (stopTimeout && stopTimeout >= 5 && stopTimeout <= 30000) { real_timeout = stopTimeout } return real_timeout } class CDPC { constructor(options={}) { this.cpus = os.cpus() this.cpuTotal = this.cpus.length this.state = { PREPARE: 'p', EXIT: 'e', RUNNING: 'r', PAUSE: 's', ERROR: '!' } this.nps = nps.nps this.anps = nps.anps this.stateName = {} for (let k in this.state) this.stateName[ this.state[k] ] = k this.command = process.argv[0] this.args = process.argv.slice(1) this.maxCount = 365435296162 //cache monitor timer this.monitorTimer = null this._procLoadCurPage = 1 this._totalPage = 0 /** * 记录在某些服务运行后才运行的依赖关系。 * app1 : [ * 'x', 'y', 'z' * ] * 表示x y z 服务要在app1运行后才运行。 * 此时,要对每个依赖的服务记录引用计数,一旦一个所依赖的服务运行,则引用计数 - 1。 */ this.relationAfter = {} this.childs = {} this.appName = {} //记录哪些App需要监控。 this.monitorChilds = [] this.notExit = false this.notExitButSpread = false this.childDetached = false this.beforeStartCallback = false this.userFile = '/etc/passwd' this.groupFile = '/etc/group' this.config = '' this.debug = false this.eventDir = '/tmp/cdpc_watch_wxm' this.notWatch = false this.showColor = false //若设置则表示要把负载信息写入到文件。 //若设置为--mem,则表示在内存存储,如何使用需要自己实现。 this.loadInfoFile = '' //text | json | html | colortext this.loadInfoType = 'text' this.errorHandle = (err, errname = '--ERR--') => { if (errname === '--ERR-CONFIG--') { this.debug && outError(this.lastErrorInfo) return } this.debug && console.error(errname, err) } if (!options || options.toString() !== '[object Object]') { options = {} } //SIGHUP 用于命令程序的配置读取。 this.signals = [ 'SIGINT', 'SIGTERM', 'SIGABRT', 'SIGQUIT' ] this.sigCount = { SIGTERM: 0, SIGINT: 0, SIGABRT: 0, SIGQUIT: 0 } //不太优雅的选项处理,不打算抽离出专门的选项处理模块。 for (let k in options) { switch (k) { case 'signalHandle': case 'onExit': case 'errorHandle': case 'beforeStartCallback': if (typeof options[k] === 'function') this[k] = options[k]; break case 'userFile': case 'groupFile': case 'config': if (typeof options[k] === 'string') { try { fs.accessSync(options[k]) this[k] = options[k] } catch (err) { console.error(err) } } break case 'eventDir': if (typeof options[k] === 'string') this[k] = options[k]; break case 'signalNotExit': case 'notExit': this.notExit = !!options[k] break case 'debug': case 'notWatch': case 'notExitButSpread': case 'childDetached': this[k] = !!options[k] break case 'loadInfoFile': if (typeof options[k] === 'string') this[k] = options[k]; break case 'loadInfoType': if (['text', 'json'].indexOf(options[k]) >= 0) this[k] = options[k]; break case 'showColor': this[k]= !!options[k] break default:; } }//end for process.on('exit', this.onExit.bind(this)) for (let sig of this.signals) { process.on(sig, this.signalHandle.bind(this)) this.sigCount[sig] = 0 } //若没有监听SIGHUP,则默认监听此信号用于重新加载配置文件,此操作会先停止所有服务,并重新启动。 if (process.listenerCount('SIGHUP') <= 0) { process.on('SIGHUP', async sig => { await this.reLoadConfig() }) } if (process.listenerCount('SIGALRM') <= 0) { process.on('SIGALRM', sig => {}) } this.lastErrorInfo = '' this.watchHandleTime = 490 this.watchHandleTable = {} this.watchEvents = [ 'stop', 'start', 'resume', 'pause', 'restart', 'remove', 'restartCount', 'resetCount', 'forceRemove', 'safeRemove', 'disable', 'enable', 'state' ] this.linuxUsers = {} this.linuxGroups = {} this.startWatch() this.initMonitorData() this.initCgroup() this.processData = this.nps() this.processDataTime = Date.now() } getProcessData() { let tm = Date.now() if (tm - this.processDataTime > 1000) { this.processData = this.nps() this.processDataTime = tm } return this.processData } initCgroup() { if (!process.geteuid || process.platform !== 'linux') return false if (process.geteuid() !== 0) return false this.cgroup = new cgroup() this.removeCgroup = async (name) => { let cgrp_path = this.cgroup.findPath(name) if (!cgrp_path) return false return new Promise((rv, rj) => { this.runChilds({ autoRemove: true, restart: 'count', restartLimit: 0, monitor: false, command: 'rmdir', args: [name], cwd: this.cgroup.cgdir, user: 'root', callback: (ch, cm, chs) => { ch.on('exit', (code) => { rv(code === 0 ? {ok: true} : {ok: false}) }) ch.on('error', err => { rj(err) }) } }) }).catch (err => { return {ok: false, message: err.message} }) } } //子进程被创建后会自动放入到父进程所在的cgroup async addToCgroup(chk, cname) { if (!this.cgroup || process.geteuid() ) return { ok: false, message: '非root用户不具备操作权限' } if (typeof chk === 'string') { chk = this.find(chk) } if (!chk || typeof chk !== 'object') return { ok: false, message: '不是合法的子进程'} if (!chk.child) return {ok: false, message: '未运行'} try { return this.cgroup.addPids(cname, [chk.child.pid]) } catch (err) { return {ok: false, message: err.message} } return {ok: true} } initMonitorData() { this.lastCPULoad = null this.loadinfo = { cpu: 0, cpun: 0, cpus: [], cpuns: [], socket: {tcp:{}, udp: {}}, socket6: {tcp:{}, udp: {}}, mem: _linux_meminfo } this._procPagesize = 5 this._curProcList = [] this._procLoopCount = 0 this._memPagesize = 4 this._lastCPULoad = null this._curCPULoad = null this.__child_number = 0 //采用步进式策略,定时器会非常快速的执行,但是不会每次都获取监控信息。 this.stepSlice = 5 this.maxStep = 100 this.stepCount = 100 this.dynamicMaxStep = 105 this.useDynamicStep = true //动态步进的每次前进间隔 this.dynamicStep = 1 if (process.platform !== 'linux') return false get_linux_pagesize().then(psize => { if (!psize || psize < 4) return false; //页面文件大小只能是2的整数次幂:4 8 16 ... Linux支持huagePagesize到2M,也可能极端情况会有1G。 this._memPagesize = auto_pagesize(psize) }) } setStepSlice(tslice) { if (typeof tslice !== 'number' || tslice < 1 || tslice > 100) return false this.stepSlice = tslice return this.stepSlice } setMaxStep(max_step=100, dyn_step=0) { if (isNaN(max_step)) return false if (isNaN(dyn_step)) dyn_step = 0 if (max_step >= 1 && max_step < 500) { this.maxStep = max_step } if (dyn_step > 0) { this.dynamicMaxStep = dyn_step } } setDynamicStep(max_step=0) { if (isNaN(max_step)) return false this.dynamicMaxStep = max_step return this.dynamicMaxStep } killChilds(sig, quiet=false) { let ch; let count = 0; for (let k in this.childs) { ch = this.childs[k]; if (ch.state !== this.state.EXIT && ch.child && ch.child.kill) { count++; try { ch.child.kill(sig); } catch (err) { !quiet && this.errorHandle(err, '--ERR-CHILD-KILL--'); } } } return count; } async signalHandle(sig) { if (this.notExit) { return } //为了保证一些终止信号在退出时不会重复发送。 ;(this.sigCount[sig] !== undefined) && (this.sigCount[sig] += 1); if (this.signals.indexOf(sig) >= 0) { //此处需要考虑是否替换为killAllChilds,目前运行良好,暂时可以不更新。 let count = this.killChilds(sig) //如果只是扩散但是不退出,则不必继续检测,直接返回。 if (this.notExitButSpread) { return } //此处不能立即退出,要给子进程留出时间进行清理工作。 for (let i = 0; i < 50; i++) { if (count <= 0) { process.exit(0) } await new Promise((rv, rj) => { setTimeout(() => { rv() }, 10) }) this.debug && i > 45 && console.log('存在没有终止的进程···') count = this.killChilds(sig) } if (count > 0) { this.debug && console.log('使用SIGKILL终止进程') this.killChilds('SIGKILL', true) } process.exit(0) } } /** * 当程序直接收到信号退出时,并不会执行此函数, * 所以一个进程是因为信号异常终止,其子进程会成为守护进程。 * */ onExit(code) { try { if (code === 0) { for (let k in this.sigCount) { if (this.sigCount[k] > 0) { this.sigCount[k] = 0 return } } this.killChilds('SIGTERM') //process.kill(process.pid, 'SIGTERM') } else { this.killChilds('SIGKILL') } } catch (err) { this.errorHandle(err, '--ERR-EXIT--') } } strong(unexp = null, unrej = null) { if (!unexp || typeof unexp !== 'function') { unexp = (err, orgi) => { this.errorHandle(err, `--ERR-${orgi}--`) } } process.on('uncaughtException', unexp) if (!unrej || typeof unrej !== 'function') { unrej = (reason, promise) => { this.errorHandle(reason, '--ERR-PROMISE--') } } process.on('unhandledRejection', unrej) } tnps(callback, timeout=1000) { return setInterval(() => { ;(typeof callback === 'function') && this.anps().then(data => {callback(data)}) }, timeout) } /** * 只会kill掉所有子进程 */ killAllChilds(pid, sig='SIGTERM') { if (process.platform !== 'linux') { return } try { let pst = nps.nps() pst.getAllChilds(pid, true).forEach(p => { process.kill(p, sig) }) } catch (err) { this.errorHandle(err, '--ERR-KILL-ALL-CHILDS--') } } initEventsDir() { this.watchEvents.forEach(d => { let dfile = `${this.eventDir}/${d}` try { fs.accessSync(dfile) let fst = fs.statSync(dfile) if (!fst.isDirectory()) { fs.unlinkSync(dfile) fs.mkdirSync(dfile, {mode: 0o755}) } } catch (err) { fs.mkdirSync(dfile, {mode: 0o755}) } }) } async startWatch() { if (this.notWatch) return; let evtst = true try { fs.accessSync(this.eventDir) } catch (err) { evtst = false } if (!evtst) { try { fs.mkdirSync(this.eventDir) } catch (err) { this.errorHandle(err, '--ERR-MKDIR-EVENT--') return; } } this.initEventsDir() this.watchEvents.filter(x => { if (['state', 'childs'].indexOf(x) < 0) return x }).forEach(d => { let evt_dir = `${this.eventDir}/${d}` let event_name = d fs.watch(evt_dir, (evt, fname) => { if (evt === 'rename') { try { fs.accessSync(`${evt_dir}/${fname}`) } catch (err) { return false } } if (!this.has(fname) && fname !== '__all__') { fs.unlink(`${evt_dir}/${fname}`, err => {err && this.errorHandle(err, '--ERR-REMOVE-NOT-HAS--')}) return false } let ek = event_name + '_' + fname let tm = Date.now() if (!this.watchHandleTable[ek]) { this.watchHandleTable[ek] = {time: tm} } else { if (tm < (this.watchHandleTable[ek].time + this.watchHandleTime + 10) ) { return false } } this.watchHandleTable[ek].time = tm let applist = [] if (fname !== '__all__') { applist.push(fname) } else { for (let k in this.appName) { applist.push(k) } } for (let a of applist) { this[event_name](a) } }) }) /** * 由于watch在测试发现,会连续触发两次,并且如果是rename事件不必处理,只需要对change事件处理。 * */ fs.watch(this.eventDir, (evt, fname) => { if (['load', 'reload'].indexOf(fname) < 0) return false let tm = Date.now() if (!this.watchHandleTable[fname]) { this.watchHandleTable[fname] = {time: tm} } else { if (tm < (this.watchHandleTable[fname].time + this.watchHandleTime + 10) ) { return false } } this.watchHandleTable[fname].time = tm if (fname === 'load') { try { let data = fs.readFileSync(`${this.eventDir}/${fname}`, {encoding: 'utf8'}); data = data.trim() this.loadConfig(data) } catch (err) { this.errorHandle(err, '--ERR-LOAD--') } } else if (fname === 'reload') { this.reLoadConfig() } }) } /** * reload事件触发不会导致程序重启,这要求在重新设定配置的过程中,必须要识别正在运行的程序 * @param {string} filename * @returns */ async reLoadConfig(filename = '') { if (!filename && !this.config) return false try { await fsp.access(filename || this.config, fs.constants.F_OK | fs.constants.R_OK) } catch (err) { this.errorHandle(err, '--ERR-RELOAD-CONFIG--') return false } return this.loadConfig(filename, true) } getUserId(uname) { if (typeof uname === 'string') { uname = [uname] } else if (!Array.isArray(uname)) { throw new Error('指定的用户必须是字符串或字符串数组。') } for (let name of uname) { if (this.linuxUsers[name]) return this.linuxUsers[name] } try { let data = fs.readFileSync(this.userFile, {encoding: 'utf8'}) let dlines = data.split('\n') .filter(p => p.length > 0) .map(a => { return a.split(':') }); for (let name of uname) { for (let d of dlines) { if (d[0] === name) { this.linuxUsers[name] = { uid: parseInt(d[2]), gid: parseInt(d[3]) } return this.linuxUsers[name] } } } } catch (err) { this.errorHandle(err, '--ERR-USER-ID--') return null } return null } getGroupId(grp) { if (typeof grp === 'string') { grp = [grp] } else if (!Array.isArray(grp)) { throw new Error('指定的用户组必须是字符串或字符串数组。') } for (let g of grp) { if (this.linuxGroups[g]) return this.linuxGroups[g] } try { let data = fs.readFileSync(this.groupFile, {encoding: 'utf8'}) let dlines = data.split('\n') .filter(p => p.length > 0) .map(a => { return a.split(':') }); for (let g of grp) { for (let d of dlines) { if (d[0] === g) { this.linuxGroups[g] = { gid: parseInt(d[2]) } return this.linuxGroups[g] } } } } catch (err) { this.errorHandle(err, '--ERR-GROUP-ID--') return null } return null } readConfig(filename = '', options = {}) { if (typeof filename === 'object') { options = filename filename = '' } let real_file = filename || this.config if (!real_file) return { ok: false, errmsg: 'filename is null' } let fst try { fs.accessSync(real_file) fst = fs.statSync(real_file) } catch (err) { return { ok: false, errmsg: 'file is not exists' } } let flist = [] if (fst.isFile()) { flist.push(real_file) } else if (fst.isDirectory()) { let files = fs.readdirSync(real_file, {withFileTypes: true}) let t for (let f of files) { if (!f.isFile()) continue t = f.name if (options && options.ignore && (options.ignore instanceof Array)) { if (options.ignore.indexOf(t) >= 0) continue } if (t.substring(t.length - 5) === '.json' || t.substring(t.length - 3) === '.js') { flist.push(`${real_file}/${t}`) } } } let cfglist = [] let data let fpath for (let f of flist) { try { fpath = path.resolve(f) //清理模块缓存,让require重新读取配置文件。 delete require.cache[fpath] data = require(f) if ( !Array.isArray(data) ) { data.configPath = fpath cfglist.push(data) } else { data.forEach(d => { d.configPath = fpath }) cfglist = cfglist.concat(data) } } catch (err) { this.errorHandle(err, '--ERR-READ-CONFIG--') } } return { ok: true, data: cfglist } } loadConfig(filename='', reload=false) { if (typeof filename === 'boolean') { reload = filename filename = '' } let r = this.readConfig(filename) if (!r.ok) { return r } /** * 重新加载模式,会遇到一些问题: * - 应用的name改了,或者是文件路径有所更改。 * - 实际的文件其实已经不存在了,其实就是这个服务的配置文件已经不存在了。 * 所有这些情况,都没有什么更好的办法,直接全部重启服务即可。 */ try { this.runChilds(r.data, reload) } catch (err) { this.errorHandle(err, '--ERR-LOAD-CONFIG--') return { ok: false, errmsg: err.message } } return { ok: true, errmsg: 'ok' } } run(config, reload=false) { return this.runChilds(config, reload) } runChilds(config, reload=false) { if ( !Array.isArray(config) ) { config = [ config ] } for (let cfg of config) { if ( this.checkConfig(cfg, reload) ) { this.tryMakeChild(cfg, reload) } else { this.errorHandle(this.lastErrorInfo, '--ERR-CONFIG--') } } } /** * 注意:如果依赖的命令不存在会导致问题,比如Node.js版本变化,旧的版本已经被删除了。 * 检测配置文件参数。 * @param {object} cfg */ checkConfig(cfg, reload=false) { if (!cfg.args || !Array.isArray(cfg.args)) cfg.args = [] if (!cfg.options) cfg.options = {} //这表示扩展,因为cfg.options.env默认是process.env if (cfg.env && typeof cfg.env === 'object') { if (!cfg.options.env || typeof cfg.options.env !== 'object') { cfg.options.env = { ...process.env } } for (let k in cfg.env) { cfg.options.env[k] = cfg.env[k] } } if (this.childDetached) { cfg.options.detached = true } if (!cfg.step || typeof cfg.step !== 'number' || cfg.step < 0) cfg.step = 0 if (!cfg.limit || typeof cfg.limit !== 'object') { cfg.limit = null } this.fmtLimit(cfg) if (!cfg.after || (!Array.isArray(cfg.after) && typeof cfg.after !== 'string' ) ) { cfg.after = null } else if (typeof cfg.after === 'string') { cfg.after = [ cfg.after ] } if (!cfg.name) { cfg.name = '' } else { if (cfg.name.length > 28) { cfg.name = cfg.name.substring(0, 28) } if (!_checkAppName(cfg.name)) { this.lastErrorInfo = `命名不合法:${cfg.name},支持字母数字下划线减号,并且以字母或数字开头,长度不超过28。` return false } } if (cfg.detail && cfg.detail.length > 50) { cfg.detail = cfg.detail.substring(0, 50) } if (this.appName[cfg.name]) { if (!reload) { this.lastErrorInfo = `${cfg.name}:应用名称冲突。(${cfg.name} conflict.)` return false } } if (cfg.file) { try { fs.accessSync(cfg.file) } catch (err) { this.debug && outError(err) return false } if (!cfg.command) { let extname = cfg.file.substring(cfg.file.length - 3) let extname2 = cfg.file.substring(cfg.file.length - 4) if (extname === '.js' || extname2 === '.cjs' || extname2 === '.mjs') { cfg.command = 'node' cfg.commandList = [this.command] } else if (extname === '.sh') { cfg.command = 'bash' cfg.commandList = [ 'sh' ] } else if (extname === '.py') { cfg.command = 'python' } else { this.lastErrorInfo = `${cfg.name} ${cfg.file}没有指定运行脚本的命令。` return false } } if (!cfg.command) { this.lastErrorInfo = `${cfg.name} 未指定命令` return false } let cwd = path.resolve( path.dirname(cfg.file) ) let cfile = path.basename(cfg.file) if (!cfg.options.cwd) { cfg.options.cwd = cwd } cfg.realfile = `${cwd}/${cfile}` if (cfg.args.indexOf(cfg.realfile) < 0) cfg.args.unshift(cfg.realfile); } if (!cfg.onlyArgs || !Array.isArray(cfg.onlyArgs)) { cfg.onlyArgs = [...cfg.args] } if (cfg.restartDelay === undefined || typeof cfg.restartDelay !== 'number') { cfg.restartDelay = 1000 } if (cfg.restartLimit !== undefined) { if (typeof cfg.restartLimit !== 'number') { cfg.restartLimit = 1 } else if (cfg.restartLimit > this.maxCount) { cfg.restartLimit = this.maxCount } } if (cfg.restart === undefined || ['count', 'fail', 'always', 'none', 'fail-count'].indexOf(cfg.restart) < 0) { cfg.restart = 'always' } if (cfg.stopTimeout === undefined || typeof cfg.stopTimeout !== 'number' || cfg.stopTimeout < 0) cfg.stopTimeout = 0; if (cfg.onceMode) { cfg.autoRemove = true cfg.restart = 'count' cfg.restartLimit = 0 } if (process.platform !== 'win32') { if (cfg.user) { let ug = this.getUserId(cfg.user) if (ug) { cfg.options.uid = ug.uid cfg.options.gid = ug.gid } } if (cfg.group) { let g = this.getGroupId(cfg.group) if (g) { cfg.options.gid = g.gid } } if (process.geteuid() === 0) return true if (cfg.options.uid !== undefined || cfg.options.gid !== undefined) { this.lastErrorInfo = `必须以root用户运行才可以改变子进程的uid和gid。\n\t${JSON.stringify(cfg.options)}` return false } } return true } fmtLimit(cfg) { if (cfg.limit) { //最大内存(KB)、最大内存的基础值(KB)、最长运行时间(ms)、频率(f/s)、一天最大允许的运行次数。 ;[ 'maxrss', 'rssOffset', 'maxtime', 'frequency', 'maxdaylimit', 'maxRestart' ].forEach(x => { if (cfg.limit[x] === undefined || typeof cfg.limit[x] !== 'number') { cfg.limit[x] = 0 } cfg.limit.rssRestartCount = 0 }) } } /** * * @param {object} chk * 根据配置生成名称,这样的名称在同样的配置上是唯一的。 */ makeName(chk) { let h = crypto.createHash('sm3') let args = Array.isArray(chk.args) ? [...chk.args] : [] args.sort((a,b) => {return a > b ? 1 : -1}) let data = `${chk.command}${args.join('::')}` h.update(data) let hname = h.digest('hex') let final_name = hname.substring(0, 18) if (this.appName[final_name]) { return this.serialName(final_name + '_') } return final_name } serialName(prefix='child_') { if (this.__child_number < 10000) { this.__child_number++ } else { this.__child_number = parseInt(Math.random() * 10000) + 10001 } return `${prefix}${this.__child_number}` } //查找依赖的应用,并更新afterCount setAfterCount(chd, op) { let name = chd.name; let tapp; if (!this.relationAfter[name]) return false; let rk = this.relationAfter[name]; for (let a of rk) { tapp = this.find(a); if (!tapp) continue; if (op === this.state.EXIT) { // 修改依赖引用计数,若是此时应用已经运行则无影响, // 但是若应用退出,此时如果之前依赖的服务已经退出,则必须要等到依赖的服务运行后才会继续重启。 tapp.afterCount += 1; } else if (op === this.state.RUNNING) { tapp.afterCount -= 1; if (tapp.afterCount <= 0) { //因为tapp是在外层,导致在异步情况下,最终会都去运行最后一个应用。 let tmp_app = tapp; queueMicrotask(() => { this.startChild(tmp_app); }); } } } return true; } appEventFile(evt, name) { ;(typeof name === 'object') && (name = name.name); return `${this.eventDir}/${evt}/${name}` } appStateFile(chk) { return this.eventDir + '/state/' + chk.name } async writeChildState(chk) { if (this.notWatch) return false let cname = this.appStateFile(chk) let str_state = '' switch (chk.state) { case this.state.ERROR: str_state = 'error' break case this.state.PREPARE: str_state = 'prepare' break case this.state.RUNNING: str_state = 'running' break case this.state.EXIT: str_state = 'exit' break case this.state.PAUSE: str_state = 'pause' break } if (chk.disabled) { str_state += '(disabled)' } str_state = `${str_state} ${chk.child ? (chk.child.pid||'0') : '0'} ${(new Date()).toLocaleString().replaceAll('/', '-')}` try { await fsp.writeFile(cname, str_state, {encoding: 'utf8'}) } catch(err) { return false } return true } /** * 更新子进程的command,这个操作目的在于如果旧的命令被删除,则需要自动更新。 * */ setChildCommand(name, command) { let ap = this.find(name) if (!ap) return false let old_ck = `${ap.command}\x00${ap.args.join('\x00')}` let ck = `${command}\x00${ap.args.join('\x00')}` ap.command = command delete this.childs[old_ck] this.childs[ck] = ap this.appName[name] = ck return ap } /** * * @param {object} cfg * @param {boolean} reload * @returns {object} */ tryMakeChild(cfg, reload=false) { if (!cfg.name) cfg.name = this.makeName(cfg) let ck = `${cfg.command}\x00${cfg.args.join('\x00')}` let child = null if (this.childs[ck]) { if (reload) { //重新加载,则删除旧的应用,重新创建新的应用。 child = this.childs[ck].child delete this.childs[ck] } else if (this.childs[ck].state !== this.state.EXIT) { return this.childs[ck] } } if (cfg.monitor === undefined) cfg.monitor = true try { this.beforeStartCallback && this.beforeStartCallback(cfg) } catch (err) {} //避免开发过程重置,需要再次进行格式化 this.fmtLimit(cfg) if (!this.childs[ck]) { this.childs[ck] = { name: cfg.name, detail: cfg.detail || '', command: cfg.command, //用于备份运行的命令 commandList: cfg.commandList && Array.isArray(cfg.commandList) ? cfg.commandList : [], commandIndex: 0, cmdline: ck, args: cfg.args, env: cfg.env, options: cfg.options, restartCache: cfg.restart, restart: cfg.restart, restartDelay: cfg.restartDelay, after: cfg.after || null, afterCount: 0, code: null, signal: null, lockForStart: false, lockStartTime: 0, cgroup: cfg.cgroup || '', //退出或重启的原因,一个字符串描述: event|op|detail cause: '', restartLimit: cfg.restartLimit, state: this.state.PREPARE, restartCount: 0, //是否正在重启的标志 restarting: false, //是否自动删除,只有在restart为count模式才起作用。 autoRemove: !!cfg.autoRemove, child: child, get pid() { return this.child ? this.child.pid : null }, callback: cfg.callback || null, onError: cfg.onError || null, monitor: !!cfg.monitor, lockReload: !!cfg.lockReload, stopTimer: null, stopTimeout: cfg.stopTimeout, configPath: cfg.configPath || '', disabled: !!cfg.disabled, onceMode: !!cfg.onceMode, //only为true表示只能有一个服务运行,比如数据库服务。 only: !!cfg.only, onlyArgs: cfg.onlyArgs || cfg.args, //是否强制模式,此模式下,会对检测已经运行的服务进行kill处理。 force: !!cfg.force, //资源限制,最开始针对云函数服务加入。 limit: cfg.limit || {maxrss: 0, rssOffset: 0, maxtime: 0, frequency: 0, maxdaylimit: 0, rssRestartCount: 0}, //运行时状态,主要还是针对云函数服务,用于识别是不是正在运行云函数,也可以作为其他使用,这是可以自定义的。 runstate: cfg.runstate || {}, monitorNetData: !!cfg.monitorNetData, loadinfo: { last: {}, cur: {}, cpu: 0, mem: 0, net: null } } //这种就是新的配置参数变化导致ck不一致,被标识为两个应用,但是名称一样。 if (this.appName[cfg.name] && ck !== this.appName[cfg.name]) { let old_ck = this.appName[cfg.name] let old_chk = this.childs[old_ck] this.childs[ck].child = old_chk.child delete this.childs[old_ck] let ind = this.monitorChilds.indexOf(old_ck) if (ind >= 0) { this.monitorChilds.splice(ind, 1) } } cfg.name && (this.appName[cfg.name] = ck); this.childs[ck].monitor && this.monitorChilds.indexOf(ck) < 0 && this.monitorChilds.push(ck); this.childs[ck].child && (this.childs[ck].state = this.state.RUNNING) } else { this.childs[ck].state = this.state.PREPARE } let chd = this.childs[ck] //如果child正在运行,则更改状态 if (chd.child && chd.child.pid && chd.child.exitCode === null && chd.state === this.state.EXIT) { chd.state = this.state.RUNNING } if (chd.state === this.state.RUNNING || chd.state === this.state.PAUSE) { return chd } if (!chd.after) return this.startChild(this.childs[ck]) chd.afterCount = chd.after.length //记录关系 for (let a of chd.after) { if (this.relationAfter[a]) { ;(this.relationAfter[a].indexOf(chd.name) < 0) && this.relationAfter[a].push(chd.name) } else this.relationAfter[a] = [ chd.name ] } let tapp = null for (let a of chd.after) { tapp = this.find(a) if (!tapp) { this.errorHandle(`${a} 不存在的应用导致依赖此应用的服务无法运行`, '--ERR-AFTER-NOT-FOUND--') continue } if (tapp && this.state.RUNNING === tapp.state) { chd.afterCount -= 1 } } if (chd.afterCount <= 0) return this.startChild(chd) return chd } startChild(chk) { let tm = Date.now() if (chk.lockForStart && tm < (100 + chk.lockStartTime)) return chk //目的在于避免重复启动 chk.lockForStart = true chk.lockStartTime = tm chk.state = this.state.PREPARE this.writeChildState(chk) if (chk.disabled) { chk.lockForStart = false chk.state = this.state.EXIT this.writeChildState(chk) return chk } if (chk.child && chk.child.pid) { chk.state = this.state.RUNNING chk.lockForStart = false return chk } //如果是only模式,则需要检测是否已经运行了。 /** * 如果已经存在服务,则当前检测会看到已经存在, * 但是因为不知道服务是由systemd管理的还是其他管理的服务, * 也可能是因为重启cdpcd导致的一些进程并不随着服务的退出而退出。 * 默认的模式下,不做处理,直接返回,这样的默认状态就是没有运行的,相关操作: * - 重启、启动、停止操作都不会有实质影响,因为记录状态总是EXIT。 * */ if (chk.only) { this.getProcessData() try { let nametable = {} nametable[chk.command] = true if (chk.command.indexOf('/') >= 0) { let ns = chk.command.split('/').filter(p => p.length > 0) nametable[ns[ns.length-1]] = true } chk.commandList.forEach(x => { !nametable[x] && (nametable[x] = true) }) let cmds = this.processData.getChildsByCMDArgs(nametable, chk.onlyArgs) if (cmds.length > 0) { if (chk.force) { process.kill(cmds[0].pid, 'SIGKILL') } else { chk.cause = `RUNNING|CMDLINE-CHECK|服务已经运行,并且不属于cdpc管理。` this.debug && console.error(chk.cause) return chk } } } catch (err) { this.errorHandle(err, '--ERR-CMDLINE-CHECK--') chk.cause = `${err.code||'CMDLINE'}|CMDLINE-CHECK|服务已经运行,并且不属于cdpc管理,无法强制退出。` return chk } } chk.cause = '' let ch = spawn(chk.command, chk.args, chk.options) chk.child = ch ch.on('spawn', (stdout, stderr) => { let cur = this.appName[chk.name] ? (this.childs[ this.appName[chk.name] ] || chk) : chk cur.lockForStart = false cur.restarting = false cur.state = this.state.RUNNING cur.cgroup && this.addToCgroup(cur, cur.cgroup) this.setAfterCount(cur, this.state.RUNNING) this.writeChildState(cur) }) let self = this ch.on('exit', (code, sig) => { let cur = this.appName[chk.name] ? (this.childs[ this.appName[chk.name] ] || chk) : chk cur.lockForStart = false cur.child = null cur.code = code cur.signal = sig cur.state = this.state.EXIT this.setAfterCount(cur, this.state.EXIT) cur.restart !== 'remove' && this.writeChildState(cur) //在触发exit事件的时候,后续添加的exit事件函数仍然会被执行,只有此阶段结束,才会在事件循环的下一个环节执行此微任务。 queueMicrotask(() => { try { ch && ch.removeAllListeners() ch = null } catch (err){} }) if (cur.restart === 'none' || cur.restart === 'remove') return //若收到信号退出,进程不做监听处理,code为null if ((cur.restart === 'fail' || cur.restart === 'fail-count') && cur.code === 0) { cur.autoRemove && self.remove(cur.name) return } if (cur.restart === 'count' || cur.restart === 'fail-count') { if (cur.restartCount >= cur.restartLimit) { cur.autoRemove && self.remove(cur.name) return } } (cur.restartCount < this.maxCount) && (cur.restartCount += 1) if (cur.restartDelay <= 0) { this.startChild(cur) } else { setTimeout(() => { this.startChild(cur) }, cur.restartDelay) } }) //一个错误事件不一定会导致触发exit事件,若通过callback自定义error事件处理需要注意。 ch.on('error', err => { let cur = this.appName[chk.name] ? (this.childs[ this.appName[chk.name] ] || chk) : chk cur.lockForStart = false cur.state = this.state.ERROR this.writeChildState(cur) cur.cause = `${err.code||'ERROR'}|ERROR|${err.message}` if (err.code === 'ENOENT') { //命令没有发现,此时检查是否有备用命令 if (cur.commandList && cur.commandList.length > 0) { let backup_command = cur.commandList[cur.commandIndex % cur.commandList.length] cur.commandIndex++ //已经重试过2次,不再重新尝试运行 if (cur.commandIndex >= cur.commandList.length * 2) { return this.errorHandle(err, '--ERR-CHILD--') } if (backup_command) { this.setChildCommand(cur.name, backup_command) return this.startChild(cur) } } } this.errorHandle(err, '--ERR-CHILD--') }) if (chk.onError && typeof chk.onError === 'function') { ch.on('error', chk.onError) } if (chk.callback && typeof chk.callback === 'function') { chk.callback(ch, this, chk) } return chk } matchCommandName(command, name, args) { if (!name || !command) return false if (command === name) return true let ind = command.lastIndexOf(name) if (command.substring(ind) === name) return true return false } /** * * @param {string} cmd * @param {function|null} callback * @returns */ findByCommand(cmd, callback=null) { let cb = typeof callback === 'function' ? callback : this.matchCommandName let chs = [] for (let k in this.childs) { if (cb(this.childs[k].command, cmd, this.childs[k].args)) { chs.push(this.childs[k].name) } } return chs } find(name) { if (name && typeof name === 'object') return name let ck = this.appName[name] if (!ck) return null return this.childs[ck] || null } disable(name) { let chk = this.find(name) if (!chk) return false chk.disabled = true this.writeChildState(chk) return chk } enable(name) { let chk = this.find(name) if (!chk) return false chk.disabled = false this.writeChildState(chk) return chk } doAllChilds(op='kill', sig='SIGTERM') { let ch; for (let k in this.childs) { ch = this.childs[k]; try { switch (op) { case 'kill': if (ch.state !== this.state.EXIT && ch.child && ch.child.kill) ch.child.kill(sig); break case 'stop': case 'remove': case 'safeRemove': ;(ch.state !== this.state.EXIT) && this[op](ch.name) break } } catch(err) { this.errorHandle(err, `--ERR-CHILD-${op.toUpperCase()}--`) } }//end for } remove(name) { let chk = this.find(name) if (!chk) return false if (chk.state === this.state.RUNNING) { chk.restart = 'remove' try { this.killAllChilds(chk.child.pid, 'SIGKILL') ;(chk.state === this.state.RUNNING) && chk.child.kill('SIGKILL') } catch (err){} chk.state = this.state.EXIT } let ck = this.appName[name] let ind = this.monitorChilds.indexOf(ck) ind >= 0 && this.monitorChilds.splice(ind, 1) delete this.appName[name] delete this.childs[ck] fs.unlink(this.appStateFile(chk), err => { this.debug && err && console.error(err) }) return chk } safeRemove(name, timeout=1000, callback=null) { if (typeof timeout === 'function') { callback = timeout timeout = 1000 } let chk = this.find(name) if (!chk) return false this.stop(name, timeout) setTimeout(() => { this.remove(name) try { callback && (typeof callback === 'function') && callback(chk) } catch (err) { this.debug && this.errorHandle(err, '--ERR-SAFE-REMOVE-CALLBACK--') } }, _realTimeout(timeout, chk.stopTimeout)) return chk } add(cfg) { if (!this.checkConfig(cfg)) return false return this.tryMakeChild(cfg) } has(name) { if (this.appName[name] && this.childs[ this.appName[name] ]) { return true } return false } stop(name, timeout=1000, callback=null) { let chk = this.find(name) if (!chk) return false if (!chk.child || chk.child.exitCode !== null) { chk.restart = 'none' chk.state = this.state.EXIT this.writeChildState(chk) if (callback && typeof callback === 'function') { return callback(chk) } return true } if (typeof timeout === 'function') { callback = timeout timeout = 5000 } let pid = chk.child.pid chk.restart = 'none' try { if (chk.state === this.state.PAUSE) { process.kill(chk.child.pid, 'SIGCONT') } this.killAllChilds(chk.child.pid, 'SIGTERM') process.kill(chk.child.pid, 'SIGTERM') } catch (err) { this.errorHandle(err, '--ERR-STOP--') } let callback_run = false queueMicrotask(() => { if (chk.state === this.state.EXIT) { this.writeChildState(chk) if (callback && (typeof callback === 'function')) { callback_run = true callback(chk) } } }) let check_callback = () => { chk.stopTimer = null try { if (chk.child && chk.child.exitCode === null && chk.child.pid === pid) { this.killAllChilds(pid, 'SIGKILL') process.kill(pid, 'SIGKILL') } } catch (err) { this.errorHandle(err, '--ERR-STOP-KILL--') } try { chk.state = this.state.EXIT this.writeChildState(chk) !callback_run && callback && (typeof callback === 'function') && callback(chk) } catch (err) { this.errorHandle(err, '--ERR-STOP-CALLBA