cdpc
Version:
child process management
2,047 lines (1,660 loc) • 88.8 kB
JavaScript
'use strict'
const process = require('node:process');
const {spawn} = require('node:child_process')
const path = require('node:path')
const fs = require('node:fs')
const os = require('node:os')
const crypto = require('node:crypto')
const nps = require('./nps.js')
const cgroup = require('./cgroup.js')
// Promise-based fs API used by the /proc readers in this file.
let fsp = fs.promises
// Number of logical CPUs reported by the OS.
let cpuTotal = os.cpus().length
/**
 * CPU budget used when limiting the CPU usage of processes:
 * 1 by default, half a core held back on 2-3 core machines,
 * one full core held back on machines with more than 3 cores.
 */
let availCPU = cpuTotal > 3
  ? cpuTotal - 1
  : ((cpuTotal === 2 || cpuTotal === 3) ? cpuTotal - 0.5 : 1)
/**
* name
* file
* command
* args
* options
* restart: always | count | fail | fail-count | none
* restartLimit
* restartDelay default 1000
* user
* group
*/
/**
* 我将尽可能把所有功能放在一个文件里,并分块以注释分隔和说明。
*/
// Print an error to stderr after a 120 ms delay. Plain strings are wrapped in
// ANSI colour codes (\x1b[1;35m ... \x1b[0m); anything else is passed to
// console.error unchanged.
let outError = (err) => {
  const emit = () => {
    const payload = (typeof err === 'string') ? `\x1b[1;35m${err}\x1b[0m` : err
    console.error(payload)
  }
  setTimeout(emit, 120)
}
/*
 * Positions of the /proc/<pid>/stat fields this file needs.
 *
 * The indexes are relative to the array parse_linux_proc_stat builds AFTER it
 * cuts off everything up to and including the last ')' of the comm field, so
 * index 0 is the first field following comm. Counted over the whole line
 * (pid at index 0) every value would be 2 higher:
 * pgrp 4, session 5, utime 13, stime 14, cutime 15, cstime 16, vsize 22, rss 23.
 */
let _stat_index = {
pgrp : 2,
session : 3,
utime : 11,
stime : 12,
cutime : 13,
cstime : 14,
vsize : 20,
rss : 21,
}
/**
 * Read /proc/<pid>/stat and copy the counters used for load monitoring
 * (CPU times, rss, process group and session) into `pobj`.
 *
 * The comm field may itself contain spaces or parentheses, so the line is cut
 * after the LAST ')' and only the remainder is split on spaces; `_stat_index`
 * holds positions relative to that remainder.
 *
 * @param {number|string} pid  Process id. A missing pid (undefined, null, '', NaN)
 *                             leaves `pobj` untouched.
 * @param {object} pobj  Receives utime, stime, cutime, cstime, rss, pgrp, session.
 * @returns {Promise<object|false>} `pobj`, or false when the stat line contains no ')'.
 *          Rejects with the fs error when the stat file cannot be read.
 */
async function parse_linux_proc_stat(pid, pobj) {
  if (!pid && pid !== 0) {
    return pobj
  }
  // Only the stat file is needed; nothing here uses /proc/<pid>/comm, so it
  // is not read (one less file access and one less way to fail per sample).
  const data = await fsp.readFile(`/proc/${pid}/stat`, {encoding: 'utf8'})
  const lastParen = data.lastIndexOf(')')
  if (lastParen === -1) return false
  // Skip ") ": the text after the last ')' starts with the field that follows comm.
  const fields = data.substring(lastParen + 2).trim().split(' ')
  for (const key of ['utime', 'stime', 'cutime', 'cstime', 'rss', 'pgrp', 'session']) {
    pobj[key] = parseInt(fields[_stat_index[key]], 10)
  }
  return pobj
}
/**
 * Estimate the memory page size from this process's own numbers: the rss
 * reported by nps.parse_linux_status divided by the rss field of
 * /proc/<pid>/stat, plus 0.2, truncated by parseInt.
 * NOTE(review): the status value is presumably in kB and the stat value in
 * pages, which would make the result kB per page — confirm against nps.js.
 *
 * @returns {Promise<number|false>} the truncated estimate, or false when the
 *          ratio is NaN or anything throws.
 */
async function get_linux_pagesize() {
  const statInfo = {}
  try {
    await parse_linux_proc_stat(process.pid, statInfo)
    const statusInfo = nps.parse_linux_status(process.pid)
    const estimate = statusInfo.rss / statInfo.rss + 0.2
    if (isNaN(estimate)) {
      return false
    }
    return parseInt(estimate)
  } catch (err) {
    return false
  }
}
/**
 * Read the CPU time counters from /proc/stat.
 *
 * Columns of a "cpu" line, as described in `man 5 proc`:
 *   (1) user        time spent in user mode
 *   (2) nice        time spent in user mode with low priority
 *   (3) system      time spent in system mode
 *   (4) idle        time spent in the idle task
 *   (5) iowait      time waiting for I/O to complete (since Linux 2.5.41)
 *   (6) irq         time servicing interrupts (since Linux 2.6.0)
 *   (7) softirq     time servicing softirqs (since Linux 2.6.0)
 *   (8) steal       time spent in other operating systems when running in a
 *                   virtualized environment (since Linux 2.6.11)
 *   (9) guest       time spent running a virtual CPU for guest operating
 *                   systems (since Linux 2.6.24)
 *   (10) guest_nice time spent running a niced guest (since Linux 2.6.33)
 *
 * `total` is the sum of every column after the label; `idle` is column 4.
 *
 * @param {object} [options]  `{all: true}` also returns one entry per CPU.
 * @returns {Promise<object|object[]>} `{total, idle}` for the aggregate line,
 *          or, with `all`, an array holding the aggregate followed by the
 *          next `cpuTotal` lines of the file.
 */
async function parse_linux_cpu(options = {all: false}) {
  const raw = await fsp.readFile('/proc/stat', {encoding: 'utf8'})
  const rows = raw.split('\n')
  // Turn one "cpuN v1 v2 ..." row into {total, idle}; the label at index 0 is skipped.
  const summarize = (row) => {
    const cols = row.split(' ').filter(p => p.length)
    let sum = 0
    for (let i = 1; i < cols.length; i++) {
      sum += parseInt(cols[i])
    }
    return {
      total: sum,
      idle: parseInt(cols[4])
    }
  }
  const overall = summarize(rows[0])
  if (!options || !options.all) {
    return overall
  }
  const perCpu = [overall]
  for (let i = 0; i < cpuTotal; i++) {
    perCpu.push(summarize(rows[i + 1]))
  }
  return perCpu
}
/**
 * Overall CPU utilisation between two samples.
 * @param {object} c1  earlier sample, `{total, idle}`
 * @param {object} c2  later sample, `{total, idle}`
 * @returns {number} non-idle share of the elapsed total (not finite when
 *          `total` did not change between the samples)
 */
function caclt_cpu_load(c1, c2) {
  const idleDelta = c2.idle - c1.idle
  const totalDelta = c2.total - c1.total
  const busyDelta = totalDelta - idleDelta
  return busyDelta / totalDelta
}
// Sum of a process sample's four CPU-time counters (utime, stime, cutime, cstime).
function caclt_proc_total_time(p) {
  const { utime, stime, cutime, cstime } = p
  return utime + stime + cutime + cstime
}
/**
 * CPU load of a single process between two samples.
 * @param {object} p1  earlier process sample (utime/stime/cutime/cstime)
 * @param {object} p2  later process sample
 * @param {object} c1  earlier system sample (`total`)
 * @param {object} c2  later system sample (`total`)
 * @returns {number} process time delta scaled by `cpuTotal`, divided by the
 *          system total delta
 */
function caclt_proc_cpu(p1, p2, c1, c2) {
  const before = caclt_proc_total_time(p1)
  const after = caclt_proc_total_time(p2)
  const systemDelta = c2.total - c1.total
  return ((after - before) * cpuTotal) / systemDelta
}
// Format a ratio as a percentage string with two decimals (0.5 -> "50.00").
function fmt_percent(n) {
  const scaled = n * 100
  return scaled.toFixed(2)
}
// Latest /proc/meminfo snapshot, keyed by field name: {value, unit}.
let _linux_meminfo = {}
/**
 * Refresh `_linux_meminfo` from /proc/meminfo.
 *
 * Each non-empty line is split on spaces; the first token minus its last
 * character (the ':') is the key, the second token is parsed as the value and
 * the third, lower-cased, is the unit ('' when absent). Existing entries are
 * updated in place so references to them stay valid.
 *
 * @returns {Promise<void>}
 */
async function parse_linux_mem() {
  const raw = await fsp.readFile('/proc/meminfo', {encoding: 'utf8'})
  for (const line of raw.split('\n')) {
    if (!line) continue
    const cols = line.split(' ').filter(p => p.length > 0)
    const key = cols[0].substring(0, cols[0].length - 1)
    const entry = _linux_meminfo[key]
    const value = parseInt(cols[1])
    const unit = cols.length > 2 ? cols[2].toLowerCase() : ''
    if (entry) {
      entry.value = value
      entry.unit = unit
    } else {
      _linux_meminfo[key] = {
        value: value,
        unit: unit
      }
    }
  }
}
/**
 * Format a memory amount as megabytes with two decimals.
 * @param {number} v  amount expressed in `unit`
 * @param {string} [unit='kb']  'kb', 'mb' or 'b'
 * @returns {string|undefined} formatted value; undefined for any other unit
 */
function fmt_mem_value(v, unit='kb') {
  switch (unit) {
    case 'kb':
      return (v / 1024).toFixed(2)
    case 'mb':
      return v.toFixed(2)
    case 'b':
      return (v / 1024 / 1024).toFixed(2)
    default:
      return undefined
  }
}
/**
 * Snap a number to the nearest power of two.
 *
 * Values up to 4 yield 4. For values between 2^2 and 2^22 the closer of the
 * two surrounding powers is returned (a tie goes to the larger one). Anything
 * outside that range is returned unchanged.
 */
function auto_pagesize(n) {
  if (n <= 4) return 4
  for (let exp = 2; exp < 22; exp++) {
    const lower = 2 ** exp
    const upper = 2 ** (exp + 1)
    if (n === lower || n === upper) return n
    if (n > lower && n < upper) {
      const distUp = upper - n
      const distDown = n - lower
      return distUp > distDown ? lower : upper
    }
  }
  return n
}
/**
 * Sample the per-interface traffic counters in /proc/<pid>/net/dev and
 * aggregate them.
 *
 * @param {number|string} pid
 * @param {object} [rtdata]  Result of a previous call. When given it is updated
 *        in place and its previous byte totals / timestamp are used to derive
 *        the rates; when omitted a fresh record (getTime 0) is created.
 * @param {number[]} [rtindex=[1,2,9,10]]  Column indexes of received bytes,
 *        received packets, transmitted bytes and transmitted packets.
 * @returns {Promise<object|false>} the record (totals, per-device `devData`,
 *          `duration` in ms since the previous sample, `recvRate` /
 *          `transmitRate` in bytes per second), or false when pid is missing
 *          or anything fails.
 */
async function parse_linux_pid_net(pid, rtdata, rtindex=[1,2,9,10]) {
  try {
    if (!pid && pid !== 0) return false
    const netdata = await fsp.readFile(`/proc/${pid}/net/dev`, {encoding: 'utf8'})
    // The first two lines are the column headers.
    const datalines = netdata.split('\n').slice(2)
    let last_recv = 0
    let last_tran = 0
    if (!rtdata) {
      rtdata = {
        recvBytes: 0,
        recvPackets: 0,
        transmitBytes: 0,
        transmitPackets: 0,
        getTime: 0,
        duration: 0,
        recvRate: 0,
        transmitRate: 0,
        devData: {}
      }
    } else {
      last_recv = rtdata.recvBytes
      last_tran = rtdata.transmitBytes
      rtdata.recvBytes = 0
      rtdata.recvPackets = 0
      rtdata.transmitBytes = 0
      rtdata.transmitPackets = 0
    }
    const [rind, rpind, tind, tpind] = rtindex
    const tm = Date.now()
    for (const line of datalines) {
      const cols = line.split(' ').filter(p => p.length > 0)
      if (cols.length < 11) continue
      const recv = parseInt(cols[rind], 10) || 0
      const recvpack = parseInt(cols[rpind], 10) || 0
      const tran = parseInt(cols[tind], 10) || 0
      const tranpack = parseInt(cols[tpind], 10) || 0
      rtdata.recvBytes += recv
      rtdata.recvPackets += recvpack
      rtdata.transmitBytes += tran
      rtdata.transmitPackets += tranpack
      rtdata.devData[cols[0]] = {
        recvBytes: recv,
        recvPackets: recvpack,
        transmitBytes: tran,
        transmitPackets: tranpack,
      }
    }
    rtdata.duration = tm - rtdata.getTime
    rtdata.getTime = tm
    // Math.trunc instead of parseInt: parseInt stringifies its argument first,
    // so a tiny quotient such as 2.9e-7 (typical for the first sample, whose
    // duration is measured from 0) would parse as 2. A zero duration would
    // divide by zero, so it is reported as rate 0.
    const perSecond = (delta) => (
      rtdata.duration > 0 ? Math.trunc(delta * 1000 / rtdata.duration) : 0
    )
    rtdata.recvRate = perSecond(rtdata.recvBytes - last_recv)
    rtdata.transmitRate = perSecond(rtdata.transmitBytes - last_tran)
    return rtdata
  } catch(err) {
    return false
  }
}
/**
 * Read /proc/net/sockstat and store the in-use socket counts on `obj`.
 * The third token of the 2nd non-empty line goes to `obj.tcp.inuse`, the third
 * token of the 3rd non-empty line to `obj.udp.inuse` (0 when not numeric).
 * @param {object} obj  must already contain `tcp` and `udp` objects
 */
async function parse_linux_socket(obj) {
  const raw = await fsp.readFile(`/proc/net/sockstat`, {encoding: 'utf8'})
  const rows = raw.split('\n').filter(p => p.length > 0)
  const tcpCols = rows[1].split(' ').filter(p => p.length > 0)
  if (tcpCols.length > 2) {
    obj.tcp.inuse = parseInt(tcpCols[2]) || 0
  }
  const udpCols = rows[2].split(' ').filter(p => p.length > 0)
  if (udpCols.length > 2) {
    obj.udp.inuse = parseInt(udpCols[2]) || 0
  }
}
/**
 * Read /proc/net/sockstat6 and store the in-use socket counts on `obj`.
 * The third token of the 1st non-empty line goes to `obj.tcp.inuse`, the third
 * token of the 2nd non-empty line to `obj.udp.inuse` (0 when not numeric).
 * @param {object} obj  must already contain `tcp` and `udp` objects
 */
async function parse_linux_socket6(obj) {
  const raw = await fsp.readFile(`/proc/net/sockstat6`, {encoding: 'utf8'})
  const rows = raw.split('\n').filter(p => p.length > 0)
  const tcpCols = rows[0].split(' ').filter(p => p.length > 0)
  if (tcpCols.length > 2) {
    obj.tcp.inuse = parseInt(tcpCols[2]) || 0
  }
  const udpCols = rows[1].split(' ').filter(p => p.length > 0)
  // The UDP count comes from the UDP row's own third token (an earlier
  // revision mistakenly read it from the TCP row).
  if (udpCols.length > 2) {
    obj.udp.inuse = parseInt(udpCols[2]) || 0
  }
}
/** -- start cdpc -- */
/**
 * Validate a service name.
 *
 * '__all__' is reserved. Otherwise the name must be 1-50 characters: a
 * letter, digit or underscore first, then letters, digits, '_', '@' or '-'.
 * '@' is allowed for names such as user@<username> that tell sources apart
 * (e.g. config file names of authorised users).
 *
 * @param {string} name
 * @returns {boolean}
 */
function _checkAppName(name) {
  if (name === '__all__') {
    return false
  }
  const pattern = /^[a-z0-9_][a-z0-9_@-]{0,49}$/i
  return pattern.test(name)
}
/**
 * Derive a service name from a config file path.
 *
 * Takes the part after the last '/', drops a trailing .js/.cjs/.mjs/.json
 * extension (case-insensitive) and uses the rest as the name as-is. No
 * sanitising is done: the file name itself has to satisfy _checkAppName.
 *
 * @param {string} fpath
 * @returns {string|null} the name, or null when it is not a valid service name
 */
function _deriveNameFromFile(fpath) {
  const strippable = ['js', 'cjs', 'mjs', 'json']
  let candidate = fpath.substring(fpath.lastIndexOf('/') + 1)
  const dotAt = candidate.lastIndexOf('.')
  if (dotAt > 0 && strippable.indexOf(candidate.substring(dotAt + 1).toLowerCase()) >= 0) {
    candidate = candidate.substring(0, dotAt)
  }
  return _checkAppName(candidate) ? candidate : null
}
/**
 * Pick the effective timeout.
 * `stopTimeout` wins when it lies within 5..30000; otherwise `timeout` is
 * used, falling back to 5000 when it is not a number or below 5.
 */
function _realTimeout(timeout, stopTimeout) {
  const fallback = (isNaN(timeout) || timeout < 5) ? 5000 : timeout
  const useStopTimeout = Boolean(stopTimeout) && stopTimeout >= 5 && stopTimeout <= 30000
  return useStopTimeout ? stopTimeout : fallback
}
class CDPC {
/**
 * Build a manager instance: set the defaults, apply the recognised options,
 * install the process-level exit/signal listeners, then start the
 * event-directory watcher, the monitoring state and cgroup support.
 *
 * @param {object} [options]  Recognised keys are the `case` labels of the
 *        switch below; unknown keys and values of the wrong type are ignored.
 */
constructor(options={}) {
this.cpus = os.cpus()
this.cpuTotal = this.cpus.length
// Child state codes; stateName (built below) is the reverse lookup code -> name.
this.state = {
PREPARE: 'p',
EXIT: 'e',
RUNNING: 'r',
PAUSE: 's',
ERROR: '!'
}
this.nps = nps.nps
this.anps = nps.anps
this.stateName = {}
for (let k in this.state) this.stateName[ this.state[k] ] = k
this.command = process.argv[0]
this.args = process.argv.slice(1)
// Upper bound applied to cfg.restartLimit in checkConfig.
this.maxCount = 365435296162
//cache monitor timer
this.monitorTimer = null
/**
* Records "run after" dependencies between services.
* app1 : [
* 'x', 'y', 'z'
* ]
* means services x, y and z may only run once app1 is running.
* Each dependent service keeps a reference count; whenever one of the
* services it depends on starts running, the count is decremented by 1.
*/
this.relationAfter = {}
// [REFACTOR] childs is now keyed by name (it used to be keyed by ck).
// The chk.ck field still holds the cmdline hash, used for change detection (_cleanupOldByName / reload).
// The former appName index is gone entirely; use childs[name] directly.
this.childs = {}
// monitorChilds now stores names (it used to store ck).
this.monitorChilds = []
this.notExit = false
this.notExitButSpread = false
// detached is a spawn capability that cdpc always passes through (per-child cfg.options.detached).
// allowDetached is the instance-level policy gate: false by default (not permitted); set it to
// true to permit detached children and their adoption/recovery. checkConfig resets a detached
// request that is not permitted back to false.
this.allowDetached = false
this.beforeStartCallback = false
// [Package A] callback run after loadConfig finishes; cache of the most recent load result.
this.onLoadConfig = false
this.lastConfigResult = null
// Base directory of the cgroup subtree, empty by default (cgroup.js falls back to /sys/fs/cgroup).
// When set, limit groups are created under this directory instead of the cgroup root, so that
// processes do not leave the systemd service cgroup.
this.cgroupBaseDir = ''
this.userFile = '/etc/passwd'
this.groupFile = '/etc/group'
this.config = ''
this.debug = false
this.eventDir = '/tmp/cdpc_watch_wxm'
this.notWatch = false
this.showColor = false
// When set, load information is written to this file.
// When set to --mem it is kept in memory; how to consume it is left to the caller.
this.loadInfoFile = ''
//text | json | html | colortext
this.loadInfoType = 'text'
// Default error sink: prints only in debug mode. Config errors print the
// text stored in lastErrorInfo rather than the err argument.
this.errorHandle = (err, errname = '--ERR--') => {
if (errname === '--ERR-CONFIG--') {
this.debug && outError(this.lastErrorInfo)
return
}
this.debug && console.error(errname, err)
}
if (!options || options.toString() !== '[object Object]') {
options = {}
}
// SIGHUP is reserved for re-reading the configuration of the command program.
this.signals = [
'SIGINT', 'SIGTERM', 'SIGABRT', 'SIGQUIT'
]
this.sigCount = {
SIGTERM: 0,
SIGINT: 0,
SIGABRT: 0,
SIGQUIT: 0
}
// Option handling is not very elegant; there is no plan to move it into a dedicated module.
for (let k in options) {
switch (k) {
// Function-valued options replace the method/property of the same name.
case 'signalHandle':
case 'onExit':
case 'errorHandle':
case 'beforeStartCallback':
case 'onLoadConfig':
if (typeof options[k] === 'function') this[k] = options[k];
break
// Path options are accepted only when the path is accessible right now.
case 'userFile':
case 'groupFile':
case 'config':
if (typeof options[k] === 'string') {
try {
fs.accessSync(options[k])
this[k] = options[k]
} catch (err) {
console.error(err)
}
}
break
case 'eventDir':
if (typeof options[k] === 'string') this[k] = options[k];
break
// signalNotExit is an alias of notExit.
case 'signalNotExit':
case 'notExit':
this.notExit = !!options[k]
break
case 'debug':
case 'notWatch':
case 'notExitButSpread':
case 'allowDetached':
this[k] = !!options[k]
break
case 'loadInfoFile':
if (typeof options[k] === 'string') this[k] = options[k];
break
// Only 'text' and 'json' are accepted here.
case 'loadInfoType':
if (['text', 'json'].indexOf(options[k]) >= 0)
this[k] = options[k];
break
case 'showColor':
this[k]= !!options[k]
break
case 'cgroupBaseDir':
if (typeof options[k] === 'string') this[k] = options[k];
break
default:;
}
}//end for
process.on('exit', this.onExit.bind(this))
for (let sig of this.signals) {
process.on(sig, this.signalHandle.bind(this))
this.sigCount[sig] = 0
}
// If nothing listens for SIGHUP yet, listen for it by default and reload the config file.
// (Original note: "this stops all services first and restarts them" — NOTE(review): the
// reload path in loadConfig looks more selective than that; confirm.)
if (process.listenerCount('SIGHUP') <= 0) {
process.on('SIGHUP', async sig => {
await this.reLoadConfig()
})
}
// Install a no-op SIGALRM listener when none exists.
if (process.listenerCount('SIGALRM') <= 0) {
process.on('SIGALRM', sig => {})
}
this.lastErrorInfo = ''
// Debounce window used by the watchers in startWatch, and their per-key timestamps.
this.watchHandleTime = 490
this.watchHandleTable = {}
// One sub-directory of eventDir is created per entry (see initEventsDir).
this.watchEvents = [
'stop', 'start', 'resume', 'pause', 'restart', 'remove',
'restartCount', 'resetCount', 'forceRemove', 'safeRemove',
'disable', 'enable', 'state'
]
// Caches filled by getUserId / getGroupId.
this.linuxUsers = {}
this.linuxGroups = {}
this.startWatch()
this.initMonitorData()
this.initCgroup()
this.processData = this.nps()
this.processDataTime = Date.now()
}
getProcessData() {
let tm = Date.now()
if (tm - this.processDataTime > 1000) {
this.processData = this.nps()
this.processDataTime = tm
}
return this.processData
}
initCgroup() {
if (!process.geteuid || process.platform !== 'linux') return false
if (process.geteuid() !== 0) return false
this.cgroup = new cgroup(this.cgroupBaseDir || undefined)
this.removeCgroup = async (name) => {
let cgrp_path = this.cgroup.findPath(name)
if (!cgrp_path) return false
return new Promise((rv, rj) => {
this.runChilds({
autoRemove: true,
restart: 'count',
restartLimit: 0,
monitor: false,
command: 'rmdir',
args: [name],
cwd: this.cgroup.cgdir,
user: 'root',
callback: (ch, cm, chs) => {
ch.on('exit', (code) => {
rv(code === 0 ? {ok: true} : {ok: false})
})
ch.on('error', err => {
rj(err)
})
}
})
}).catch (err => {
return {ok: false, message: err.message}
})
}
}
//子进程被创建后会自动放入到父进程所在的cgroup
async addToCgroup(chk, cname) {
if (!this.cgroup || process.geteuid() ) return {
ok: false,
message: '非root用户不具备操作权限'
}
if (typeof chk === 'string') {
chk = this.find(chk)
}
if (!chk || typeof chk !== 'object') return { ok: false, message: '不是合法的子进程'}
if (!chk.child) return {ok: false, message: '未运行'}
try {
return this.cgroup.addPids(cname, [chk.child.pid])
} catch (err) {
return {ok: false, message: err.message}
}
return {ok: true}
}
initMonitorData() {
this.lastCPULoad = null
this.loadinfo = {
cpu: 0,
cpun: 0,
cpus: [],
cpuns: [],
socket: {tcp:{}, udp: {}},
socket6: {tcp:{}, udp: {}},
mem: _linux_meminfo
}
// 滚动显示相关变量(替代分页)
this.scrollIndex = 0 // 当前滚动位置
this.headerLines = 6 // 头部信息占用行数(CPU/MEM/Socket/CPU列表)
this.linesPerService = 3 // 每个服务占用行数
this._memPagesize = 4
this._lastCPULoad = null
this._curCPULoad = null
this.__child_number = 0
//采用步进式策略,定时器会非常快速的执行,但是不会每次都获取监控信息。
this.stepSlice = 100
this.maxStep = 10
this.stepCount = 10
this.dynamicMaxStep = 15
this.useDynamicStep = true
//动态步进的每次前进间隔
this.dynamicStep = 1
if (process.platform !== 'linux') return false
get_linux_pagesize().then(psize => {
if (!psize || psize < 4) return false;
//页面文件大小只能是2的整数次幂:4 8 16 ... Linux支持huagePagesize到2M,也可能极端情况会有1G。
this._memPagesize = auto_pagesize(psize)
})
}
/**
* 初始化终端交互:键盘滚动、窗口自适应
*/
_initTerminal() {
if (!process.stdout.isTTY) return false
const readline = require('readline')
// 启用原始模式以捕获方向键
process.stdin.setRawMode(true)
process.stdin.resume()
process.stdin.setEncoding('utf8')
readline.emitKeypressEvents(process.stdin)
process.stdin.on('keypress', (str, key) => {
if (!key) return
const maxScroll = Math.max(0, this.monitorChilds.length - this._getVisibleCount())
if (key.name === 'up') {
this.scrollIndex = Math.max(0, this.scrollIndex - 1)
} else if (key.name === 'down') {
this.scrollIndex = Math.min(maxScroll, this.scrollIndex + 1)
} else if (key.name === 'q' || (key.ctrl && key.name === 'c')) {
// q 或 Ctrl+C 退出
process.stdin.setRawMode(false)
process.exit(0)
}
})
// 窗口大小变化时调整滚动位置
process.stdout.on('resize', () => {
const maxScroll = Math.max(0, this.monitorChilds.length - this._getVisibleCount())
this.scrollIndex = Math.min(this.scrollIndex, maxScroll)
})
return true
}
/**
* 计算当前终端可见的服务数量
*/
_getVisibleCount() {
if (!process.stdout.isTTY) return 10
const available = process.stdout.rows - this.headerLines - 1 // 减去底部状态栏
return Math.max(1, Math.floor(available / this.linesPerService))
}
setStepSlice(tslice) {
if (typeof tslice !== 'number' || tslice < 1 || tslice > 200) return false
this.stepSlice = tslice
return this.stepSlice
}
setMaxStep(max_step=100, dyn_step=0) {
if (isNaN(max_step)) return false
if (isNaN(dyn_step)) dyn_step = 0
if (max_step >= 1 && max_step < 500) {
this.maxStep = max_step
}
if (dyn_step > 0) {
this.dynamicMaxStep = dyn_step
}
}
setDynamicStep(max_step=0) {
if (isNaN(max_step)) return false
this.dynamicMaxStep = max_step
return this.dynamicMaxStep
}
killChilds(sig, quiet=false) {
let ch;
let count = 0;
for (let k in this.childs) {
ch = this.childs[k];
if (ch.state !== this.state.EXIT && ch.child && ch.child.kill) {
count++;
try {
ch.child.kill(sig);
} catch (err) {
!quiet && this.errorHandle(err, '--ERR-CHILD-KILL--');
}
}
}
return count;
}
/**
 * Handler installed for every signal in this.signals: forwards the signal to
 * the children, waits briefly for them to exit, then exits this process.
 * Does nothing when notExit is set; only forwards (no exit) when
 * notExitButSpread is set.
 * @param {string} sig  signal name
 */
async signalHandle(sig) {
if (this.notExit) {
return
}
// Count the signal so that a termination signal is not sent a second time on exit (see onExit).
;(this.sigCount[sig] !== undefined) && (this.sigCount[sig] += 1);
if (this.signals.indexOf(sig) >= 0) {
// Whether to switch to killAllChilds here is still open; this works well so far, so it is left as is.
let count = this.killChilds(sig)
// Forward-only mode: no need to keep checking, return right away.
if (this.notExitButSpread) {
return
}
// Do not exit immediately; the children need time to clean up.
// Up to 50 rounds of 10 ms each, re-sending the signal every round and
// exiting as soon as killChilds reports nothing left to signal.
for (let i = 0; i < 50; i++) {
if (count <= 0) {
process.exit(0)
}
await new Promise((rv, rj) => {
setTimeout(() => { rv() }, 10)
})
this.debug && i > 45 && console.log('存在没有终止的进程···')
count = this.killChilds(sig)
}
// Still alive after the grace period: force-kill, ignoring kill errors.
if (count > 0) {
this.debug && console.log('使用SIGKILL终止进程')
this.killChilds('SIGKILL', true)
}
process.exit(0)
}
}
/**
* 当程序直接收到信号退出时,并不会执行此函数,
* 所以一个进程是因为信号异常终止,其子进程会成为守护进程。
* */
onExit(code) {
try {
if (code === 0) {
for (let k in this.sigCount) {
if (this.sigCount[k] > 0) {
this.sigCount[k] = 0
return
}
}
this.killChilds('SIGTERM')
} else {
this.killChilds('SIGKILL')
}
// [FIX] 退出时清理所有 detached 进程的 PID 文件(正常退出说明已 kill)
this._cleanupAllPidFiles()
} catch (err) {
this.errorHandle(err, '--ERR-EXIT--')
}
}
// [FIX] 清理所有 PID 文件
_cleanupAllPidFiles() {
let pidDir = `${this.eventDir}/pids`
try {
fs.accessSync(pidDir)
let files = fs.readdirSync(pidDir)
for (let f of files) {
try {
fs.unlinkSync(`${pidDir}/${f}`)
} catch (e) {}
}
} catch (err) {
// pids 目录不存在则忽略
}
}
strong(unexp = null, unrej = null) {
if (!unexp || typeof unexp !== 'function') {
unexp = (err, orgi) => {
this.errorHandle(err, `--ERR-${orgi}--`)
}
}
process.on('uncaughtException', unexp)
if (!unrej || typeof unrej !== 'function') {
unrej = (reason, promise) => {
this.errorHandle(reason, '--ERR-PROMISE--')
}
}
process.on('unhandledRejection', unrej)
}
tnps(callback, timeout=1000) {
return setInterval(() => {
;(typeof callback === 'function') && this.anps().then(data => {callback(data)})
}, timeout)
}
/**
* 只会kill掉所有子进程
*/
killAllChilds(pid, sig='SIGTERM') {
if (process.platform !== 'linux') {
return
}
try {
let pst = nps.nps()
pst.getAllChilds(pid, true).forEach(p => {
process.kill(p, sig)
})
} catch (err) {
this.errorHandle(err, '--ERR-KILL-ALL-CHILDS--')
}
}
initEventsDir() {
this.watchEvents.forEach(d => {
let dfile = `${this.eventDir}/${d}`
try {
fs.accessSync(dfile)
let fst = fs.statSync(dfile)
if (!fst.isDirectory()) {
fs.unlinkSync(dfile)
fs.mkdirSync(dfile, {mode: 0o755})
}
} catch (err) {
fs.mkdirSync(dfile, {mode: 0o755})
}
})
// [FIX] 初始化 pids 目录,用于 detached 进程 PID 持久化
let pidDir = `${this.eventDir}/pids`
try {
fs.accessSync(pidDir)
let fst = fs.statSync(pidDir)
if (!fst.isDirectory()) {
fs.unlinkSync(pidDir)
fs.mkdirSync(pidDir, {mode: 0o755})
}
} catch (err) {
fs.mkdirSync(pidDir, {mode: 0o755})
}
}
/**
 * Start the file-based control channel (skipped when notWatch is set).
 *
 * Creates eventDir and its sub-directories if needed, then:
 *  - watches <eventDir>/<event> for every watched event except 'state': a file
 *    named after a service (or '__all__') triggers this[event](name) for that
 *    service (or for every registered child);
 *  - watches eventDir itself for files named 'load' (whose content is passed
 *    to loadConfig) and 'reload' (calls reLoadConfig).
 * Repeated notifications for the same key within watchHandleTime + 10 ms are
 * dropped.
 */
async startWatch() {
if (this.notWatch) return;
let evtst = true
try {
fs.accessSync(this.eventDir)
} catch (err) {
evtst = false
}
if (!evtst) {
try {
fs.mkdirSync(this.eventDir)
} catch (err) {
this.errorHandle(err, '--ERR-MKDIR-EVENT--')
return;
}
}
this.initEventsDir()
// 'state' (and 'childs') directories are not watched for commands.
this.watchEvents.filter(x => {
if (['state', 'childs'].indexOf(x) < 0) return x
}).forEach(d => {
let evt_dir = `${this.eventDir}/${d}`
let event_name = d
fs.watch(evt_dir, (evt, fname) => {
// A 'rename' for a file that is no longer accessible is ignored.
if (evt === 'rename') {
try {
fs.accessSync(`${evt_dir}/${fname}`)
} catch (err) {
return false
}
}
// Files that name no known service are deleted and ignored.
if (!this.has(fname) && fname !== '__all__') {
fs.unlink(`${evt_dir}/${fname}`, err => {err && this.errorHandle(err, '--ERR-REMOVE-NOT-HAS--')})
return false
}
// Debounce per event + file name.
let ek = event_name + '_' + fname
let tm = Date.now()
if (!this.watchHandleTable[ek]) {
this.watchHandleTable[ek] = {time: tm}
} else {
if (tm < (this.watchHandleTable[ek].time + this.watchHandleTime + 10) ) {
return false
}
}
this.watchHandleTable[ek].time = tm
// '__all__' expands to every registered child name.
let applist = []
if (fname !== '__all__') {
applist.push(fname)
} else {
for (let k in this.childs) {
applist.push(k)
}
}
// Dispatch to the method named after the event directory.
for (let a of applist) {
this[event_name](a)
}
})
})
/**
* Testing showed that watch fires twice in a row, and that rename events
* need no handling — only change events have to be processed.
* */
fs.watch(this.eventDir, (evt, fname) => {
if (['load', 'reload'].indexOf(fname) < 0) return false
let tm = Date.now()
if (!this.watchHandleTable[fname]) {
this.watchHandleTable[fname] = {time: tm}
} else {
if (tm < (this.watchHandleTable[fname].time + this.watchHandleTime + 10) ) {
return false
}
}
this.watchHandleTable[fname].time = tm
if (fname === 'load') {
try {
// The trimmed content of the 'load' file is handed to loadConfig as the file name.
let data = fs.readFileSync(`${this.eventDir}/${fname}`, {encoding: 'utf8'});
data = data.trim()
this.loadConfig(data)
} catch (err) {
this.errorHandle(err, '--ERR-LOAD--')
}
} else if (fname === 'reload') {
this.reLoadConfig()
}
})
}
/**
* reload事件触发不会导致程序重启,这要求在重新设定配置的过程中,必须要识别正在运行的程序
* @param {string} filename
* @returns
*/
async reLoadConfig(filename = '') {
if (!filename && !this.config) return false
try {
await fsp.access(filename || this.config, fs.constants.F_OK | fs.constants.R_OK)
} catch (err) {
this.errorHandle(err, '--ERR-RELOAD-CONFIG--')
return false
}
return this.loadConfig(filename, true)
}
getUserId(uname) {
if (typeof uname === 'string') {
uname = [uname]
} else if (!Array.isArray(uname)) {
throw new Error('指定的用户必须是字符串或字符串数组。')
}
for (let name of uname) {
if (this.linuxUsers[name]) return this.linuxUsers[name]
}
try {
let data = fs.readFileSync(this.userFile, {encoding: 'utf8'})
let dlines = data.split('\n')
.filter(p => p.length > 0)
.map(a => {
return a.split(':')
});
for (let name of uname) {
for (let d of dlines) {
if (d[0] === name) {
this.linuxUsers[name] = {
uid: parseInt(d[2]),
gid: parseInt(d[3])
}
return this.linuxUsers[name]
}
}
}
} catch (err) {
this.errorHandle(err, '--ERR-USER-ID--')
return null
}
return null
}
getGroupId(grp) {
if (typeof grp === 'string') {
grp = [grp]
} else if (!Array.isArray(grp)) {
throw new Error('指定的用户组必须是字符串或字符串数组。')
}
for (let g of grp) {
if (this.linuxGroups[g]) return this.linuxGroups[g]
}
try {
let data = fs.readFileSync(this.groupFile, {encoding: 'utf8'})
let dlines = data.split('\n')
.filter(p => p.length > 0)
.map(a => {
return a.split(':')
});
for (let g of grp) {
for (let d of dlines) {
if (d[0] === g) {
this.linuxGroups[g] = {
gid: parseInt(d[2])
}
return this.linuxGroups[g]
}
}
}
} catch (err) {
this.errorHandle(err, '--ERR-GROUP-ID--')
return null
}
return null
}
readConfig(filename = '', options = {}) {
if (typeof filename === 'object') {
options = filename
filename = ''
}
let real_file = filename || this.config
if (!real_file) return {
ok: false,
errmsg: 'filename is null'
}
let fst
try {
fs.accessSync(real_file)
fst = fs.statSync(real_file)
} catch (err) {
return {
ok: false,
errmsg: 'file is not exists'
}
}
let flist = []
if (fst.isFile()) {
flist.push(real_file)
}
else if (fst.isDirectory()) {
let files = fs.readdirSync(real_file, {withFileTypes: true})
let t
for (let f of files) {
if (!f.isFile()) continue
t = f.name
if (options && options.ignore && (options.ignore instanceof Array)) {
if (options.ignore.indexOf(t) >= 0) continue
}
if (t.substring(t.length - 5) === '.json' || t.substring(t.length - 3) === '.js')
{
flist.push(`${real_file}/${t}`)
}
}
}
let cfglist = []
// [FIX] 收集逐文件/逐配置项的跳过原因,调用方可结构化拿到。
let skipped = []
let data
let fpath
for (let f of flist) {
try {
fpath = path.resolve(f)
//清理模块缓存,让require重新读取配置文件。
delete require.cache[fpath]
data = require(f)
if ( !Array.isArray(data) ) {
data.configPath = fpath
// [FIX] 单 cfg 对象未指定 name 时,用文件名派生。
// 文件名必须符合 _checkAppName,否则拒绝该 cfg 并记入 skipped。
if (!data.name) {
let derived = _deriveNameFromFile(fpath)
if (!derived) {
skipped.push({
file: fpath,
code: 'BAD_FILENAME_AS_NAME',
message: `配置文件未指定 name,且文件名 "${fpath.substring(fpath.lastIndexOf('/')+1)}" 不符合命名规则,无法作为服务名。`
})
continue
}
data.name = derived
}
cfglist.push(data)
} else {
// 数组模式:每项必须显式 name(文件名只能对应一个服务)。
// 缺 name 的项被丢进 skipped,其他项照常加载。
data.forEach((d, idx) => {
d.configPath = fpath
if (!d.name) {
skipped.push({
file: fpath,
index: idx,
code: 'ARRAY_ITEM_MISSING_NAME',
message: `配置文件以数组导出时,第 ${idx} 项必须显式指定 name。`
})
return
}
cfglist.push(d)
})
}
} catch (err) {
this.errorHandle(err, '--ERR-READ-CONFIG--')
skipped.push({
file: fpath || f,
code: 'READ_ERROR',
message: err.message
})
}
}
return {
ok: true,
data: cfglist,
skipped: skipped
}
}
/**
 * Read the configuration and start (or, in reload mode, reconcile) the
 * services it describes.
 *
 * @param {string|boolean} [filename]  config path; a boolean here is taken as `reload`
 * @param {boolean} [reload=false]  reconcile with the services already registered
 * @returns {object} the report produced by _loadResult:
 *          `{ok, errmsg, loaded, skipped, removed, time}`
 */
loadConfig(filename='', reload=false) {
if (typeof filename === 'boolean') {
reload = filename
filename = ''
}
let r = this.readConfig(filename)
if (!r.ok) {
return this._loadResult({
ok: false,
errmsg: r.errmsg,
loaded: [],
skipped: r.skipped || [],
removed: []
})
}
let removed = []
/**
* Reload semantics (as agreed in the design):
* - file deleted or renamed: the old service is removed and the configuration
*   is started as a new service under its new name.
* - file content changed: handled by the reload branch of tryMakeChild (kill + re-spawn).
*
* [FIX] The previous implementation ran runChilds first and then removed the
* difference, which hid a trap: ck reuse could make the removal hit the new
* process (same cmdline, renamed).
* New flow: remove the name difference first, then runChilds, so the old
* name is already gone when tryMakeChild is entered.
*/
try {
if (reload) {
let namesBefore = new Set(Object.keys(this.childs))
// Note: cfg.name was derived/validated in readConfig, so every name here is a legal one.
let namesAfter = new Set(
r.data.filter(cfg => cfg.name).map(cfg => cfg.name)
)
for (let oldName of namesBefore) {
// lockReload: services added programmatically that are not synced with the config
// directory (e.g. a built-in web server) are outside the config directory's namespace;
// the reload difference removal must skip them, otherwise one reload would delete them.
let curChk = this.childs[oldName]
if (curChk && curChk.lockReload) continue
if (!namesAfter.has(oldName)) {
// [FIX] The synchronous remove (SIGKILL + immediate registry cleanup) must be used here, not safeRemove.
// safeRemove clears the registry inside a setTimeout, so the runChilds that follows would still see
// the old chk occupying the ck slot and, in the "same ck, renamed" case, treat the new service as a reload no-op.
// For a graceful shutdown, run an explicit `cdpc stop` before editing the config and reloading.
this.debug && console.log(`[reload] 服务 ${oldName} 已从配置中移除,同步清理`)
this.remove(oldName)
removed.push(oldName)
}
}
}
var runResult = this.runChilds(r.data, reload)
} catch (err) {
this.errorHandle(err, '--ERR-LOAD-CONFIG--')
return this._loadResult({
ok: false,
errmsg: err.message,
loaded: [],
skipped: r.skipped || [],
removed: removed
})
}
// loaded = the names that passed checkConfig and were handed to tryMakeChild.
// Items failing checkConfig (illegal name, name conflict, missing file, ...) are merged into skipped.
return this._loadResult({
ok: true,
errmsg: 'ok',
loaded: runResult.ok,
skipped: (r.skipped || []).concat(runResult.failures),
removed: removed
})
}
/**
* 统一处理 loadConfig 的返回:缓存最近一次结果,并触发 onLoadConfig 回调。
* 让调用方(含 watch 触发的 reload)都能拿到结构化的加载报告。
*/
_loadResult(result) {
result.time = Date.now()
this.lastConfigResult = result
if (typeof this.onLoadConfig === 'function') {
try {
this.onLoadConfig(result)
} catch (err) {
this.errorHandle(err, '--ERR-ON-LOAD-CONFIG--')
}
}
return result
}
run(config, reload=false) {
return this.runChilds(config, reload)
}
/**
* @returns {object} { ok: [成功处理的 name], failures: [{file,name,code,message}] }
*/
runChilds(config, reload=false) {
if ( !Array.isArray(config) ) {
config = [ config ]
}
let ok = []
let failures = []
for (let cfg of config) {
if ( this.checkConfig(cfg, reload) ) {
this.tryMakeChild(cfg, reload)
ok.push(cfg.name)
} else {
this.errorHandle(this.lastErrorInfo, '--ERR-CONFIG--')
failures.push({
file: cfg.configPath || '',
name: cfg.name || '',
code: 'CONFIG_INVALID',
message: this.lastErrorInfo
})
}
}
return { ok, failures }
}
/**
* 注意:如果依赖的命令不存在会导致问题,比如Node.js版本变化,旧的版本已经被删除了。
* 检测配置文件参数。
* @param {object} cfg
*/
checkConfig(cfg, reload=false) {
if (!cfg.args || !Array.isArray(cfg.args)) cfg.args = []
if (!cfg.options) cfg.options = {}
//这表示扩展,因为cfg.options.env默认是process.env
if (cfg.env && typeof cfg.env === 'object') {
if (!cfg.options.env || typeof cfg.options.env !== 'object') {
cfg.options.env = {
...process.env
}
}
for (let k in cfg.env) {
cfg.options.env[k] = cfg.env[k]
}
}
// detached 是 spawn 的能力,cdpc 始终透传;是否放行由实例策略 allowDetached
// 决定(默认不允许)。配置写了 detached:true 但未放行 → 归正为 false。
if (cfg.options.detached && !this.allowDetached) {
cfg.options.detached = false
}
if (!cfg.step || typeof cfg.step !== 'number' || cfg.step < 0) cfg.step = 0
if (!cfg.limit || typeof cfg.limit !== 'object') {
cfg.limit = null
}
this.fmtLimit(cfg)
if (!cfg.after || (!Array.isArray(cfg.after) && typeof cfg.after !== 'string' ) ) {
cfg.after = null
} else if (typeof cfg.after === 'string') {
cfg.after = [ cfg.after ]
}
if (!cfg.name) {
cfg.name = ''
} else {
// [FIX] 不再静默截断超长 name,直接由 _checkAppName 拒绝,让用户感知到。
if (!_checkAppName(cfg.name)) {
this.lastErrorInfo =
`命名不合法:${cfg.name},支持字母数字下划线减号,并且以字母或数字开头,长度不超过 50。`
return false
}
}
if (cfg.detail && cfg.detail.length > 50) {
cfg.detail = cfg.detail.substring(0, 50)
}
if (this.childs[cfg.name]) {
if (!reload) {
this.lastErrorInfo = `${cfg.name}:应用名称冲突。(${cfg.name} conflict.)`
return false
}
}
if (cfg.file) {
try {
fs.accessSync(cfg.file)
} catch (err) {
this.debug && outError(err)
return false
}
if (!cfg.command) {
let extname = cfg.file.substring(cfg.file.length - 3)
let extname2 = cfg.file.substring(cfg.file.length - 4)
if (extname === '.js' || extname2 === '.cjs' || extname2 === '.mjs') {
cfg.command = 'node'
cfg.commandList = [this.command]
} else if (extname === '.sh') {
cfg.command = 'bash'
cfg.commandList = [ 'sh' ]
} else if (extname === '.py') {
cfg.command = 'python'
} else {
this.lastErrorInfo = `${cfg.name} ${cfg.file}没有指定运行脚本的命令。`
return false
}
}
if (!cfg.command) {
this.lastErrorInfo = `${cfg.name} 未指定命令`
return false
}
let cwd = path.resolve( path.dirname(cfg.file) )
let cfile = path.basename(cfg.file)
if (!cfg.options.cwd) {
cfg.options.cwd = cwd
}
cfg.realfile = `${cwd}/${cfile}`
if (cfg.args.indexOf(cfg.realfile) < 0)
cfg.args.unshift(cfg.realfile);
}
if (!cfg.onlyArgs || !Array.isArray(cfg.onlyArgs)) {
cfg.onlyArgs = [...cfg.args]
}
if (cfg.restartDelay === undefined || typeof cfg.restartDelay !== 'number') {
cfg.restartDelay = 1000
}
if (cfg.restartLimit !== undefined) {
if (typeof cfg.restartLimit !== 'number') {
cfg.restartLimit = 1
} else if (cfg.restartLimit > this.maxCount) {
cfg.restartLimit = this.maxCount
}
}
if (cfg.restart === undefined || ['count', 'fail', 'always', 'none', 'fail-count'].indexOf(cfg.restart) < 0) {
cfg.restart = 'always'
}
if (cfg.stopTimeout === undefined || typeof cfg.stopTimeout !== 'number' || cfg.stopTimeout < 0)
cfg.stopTimeout = 0;
if (cfg.onceMode) {
cfg.autoRemove = true
cfg.restart = 'count'
cfg.restartLimit = 0
}
if (process.platform !== 'win32') {
if (cfg.user) {
let ug = this.getUserId(cfg.user)
if (ug) {
cfg.options.uid = ug.uid
cfg.options.gid = ug.gid
}
}
if (cfg.group) {
let g = this.getGroupId(cfg.group)
if (g) {
cfg.options.gid = g.gid
}
}
if (process.geteuid() === 0) return true
if (cfg.options.uid !== undefined || cfg.options.gid !== undefined) {
this.lastErrorInfo =
`必须以root用户运行才可以改变子进程的uid和gid。\n\t${JSON.stringify(cfg.options)}`
return false
}
}
return true
}
fmtLimit(cfg) {
if (cfg.limit) {
//最大内存(KB)、最大内存的基础值(KB)、最长运行时间(ms)、频率(f/s)、一天最大允许的运行次数。
;[
'maxrss', 'rssOffset', 'maxtime', 'frequency', 'maxdaylimit', 'maxRestart'
].forEach(x => {
if (cfg.limit[x] === undefined || typeof cfg.limit[x] !== 'number') {
cfg.limit[x] = 0
}
cfg.limit.rssRestartCount = 0
})
}
}
/**
*
* @param {object} chk
* 根据配置生成名称,这样的名称在同样的配置上是唯一的。
*/
makeName(chk) {
// [FIX] 用 basename(command) 派生 name,比哈希命名更具人类可读性,
// 重启后同一配置生成相同 name(可重现)。
let base = chk.command || 'child'
// 取 basename:去掉路径前缀
let lastSlash = base.lastIndexOf('/')
if (lastSlash >= 0) base = base.substring(lastSlash + 1)
// sanitize:保留 _checkAppName 合法字符集,去开头减号
base = base.replace(/[^a-zA-Z0-9_-]/g, '').replace(/^-+/, '')
if (!base || !_checkAppName(base)) base = 'child'
// 找最小可用编号:base / base-2 / base-3 ...
// - 确定性:daemon 重启后同一组配置生成的 name 完全一致
// - 不饱和:删过 base-3 后新加的仍然是 base-3
if (!this.childs[base]) return base
let n = 2
while (this.childs[`${base}-${n}`]) n++
return `${base}-${n}`
}
serialName(prefix='child_') {
if (this.__child_number < 10000) {
this.__child_number++
} else {
this.__child_number = parseInt(Math.random() * 10000) + 10001
}
return `${prefix}${this.__child_number}`
}
//查找依赖的应用,并更新afterCount
setAfterCount(chd, op) {
let name = chd.name;
let tapp;
if (!this.relationAfter[name]) return false;
let rk = this.relationAfter[name];
for (let a of rk) {
tapp = this.find(a);
if (!tapp) continue;
if (op === this.state.EXIT) {
// 修改依赖引用计数,若是此时应用已经运行则无影响,
// 但是若应用退出,此时如果之前依赖的服务已经退出,则必须要等到依赖的服务运行后才会继续重启。
tapp.afterCount += 1;
} else if (op === this.state.RUNNING) {
tapp.afterCount -= 1;
if (tapp.afterCount <= 0) {
//因为tapp是在外层,导致在异步情况下,最终会都去运行最后一个应用。
let tmp_app = tapp;
queueMicrotask(() => {
this.startChild(tmp_app);
});
}
}
}
return true;
}
appEventFile(evt, name) {
;(typeof name === 'object') && (name = name.name);
return `${this.eventDir}/${evt}/${name}`
}
appStateFile(chk) {
return this.eventDir + '/state/' + chk.name
}
// [FIX] PID 文件路径
_pidFilePath(name) {
return `${this.eventDir}/pids/${name}.pid`
}
async writeChildState(chk) {
if (this.notWatch) return false
let cname = this.appStateFile(chk)
let str_state = ''
switch (chk.state) {
case this.state.ERROR:
str_state = 'error'
break
case this.state.PREPARE:
str_state = 'prepare'
break
case this.state.RUNNING:
str_state = 'running'
break
case this.state.EXIT:
str_state = 'exit'
break
case this.state.PAUSE:
str_state = 'pause'
break
}
if (chk.disabled) {
str_state += '(disabled)'
}
// [包B] 优先用真实 child pid,其次用被接管进程 pid
let realPid = chk.child ? (chk.child.pid || '0') : (chk.adoptedPid || '0')
str_state = `${str_state} ${realPid} ${(new Date()).toLocaleString().replaceAll('/', '-')}`
try {
await fsp.writeFile(cname, str_state, {encoding: 'utf8'})
} catch(err) {
return false
}
return true
}
/**
* 更新子进程的command(ENOENT fallback 用)。
* [REFACTOR] childs 按 name 索引后,只需更新 chk 内字段,无需搬家。
*/
setChildCommand(name, command) {
let ap = this.find(name)
if (!ap) return false
ap.command = command
ap.ck = `${command}\x00${ap.args.join('\x00')}`
return ap
}
/**
* [REFACTOR] 统一清理旧进程:在 tryMakeChild 进入主逻辑前调用。
* 现仅处理"name 相同但 ck 变化"的场景(改了 command/args 等)。
* 同 ck 时直接返回,由 tryMakeChild 的 name-hit 分支处理"原样保留"。
* @param {string} name
* @param {string} newCk
*/
_cleanupOldByName(name, newCk) {
let old_chk = this.childs[name]
if (!old_chk || old_chk.ck === newCk) return
// ck 变化:停止旧进程并清理 chk
if (old_chk.child) {
old_chk.restart = 'none'
try {
this.killAllChilds(old_chk.child.pid, 'SIGTERM')
old_chk.child.kill('SIGTERM')
} catch (err) {
try { old_chk.child.kill('SIGKILL') } catch(e) {}
}
}
// [包B] 若旧 chk 是被接管的 detached 进程:终止真实 pid + 停轮询
if (old_chk.adoptedPid) {
try {
this.killAllChilds(old_chk.adoptedPid, 'SIGTERM')
process.kill(old_chk.adoptedPid, 'SIGTERM')
} catch (e) {}
old_chk.adoptedPid = 0
}
if (old_chk._adoptTimer) {
clearInterval(old_chk._adoptTimer)
old_chk._adoptTimer = null
}
// 清理 monitorChilds(已按 name 索引)
let ind = this.monitorChilds.indexOf(name)
if (ind >= 0) this.monitorChilds.splice(ind, 1)
delete this.childs[name]
// 清理 relationAfter 中对该 name 的旧引用
for (let depName in this.relationAfter) {
let arr = this.relationAfter[depName]
let idx = arr.indexOf(name)
if (idx >= 0) arr.splice(idx, 1)
}
// 清理旧的 PID 文件
this._removePidFile(name)
}
// [FIX] 写入 PID 文件(仅 detached 进程)
_writePidFile(name, pid) {
if (this.notWatch) return
try {
let pidFile = this._pidFilePath(name)
fs.writeFileSync(pidFile, `${pid}`, {encoding: 'utf8'})
} catch (err) {
this.debug && this.errorHandle(err, '--ERR-WRITE-PID-FILE--')
}
}
// [FIX] 删除 PID 文件
_removePidFile(name) {
try {
let pidFile = this._pidFilePath(name)
fs.unlinkSync(pidFile)
} catch (err) {
// 文件不存在则忽略
}
}
/**
* [包B] 检查并处理 detached 进程的"接管恢复",在 startChild 的 spawn 之前调用。
*
* cdpcd 重启后,上次以 detached 方式启动的子进程会被 PID 1 收养而存活。
* 新实例需要:识别遗留进程 → 接管(adopt)→ 状态置 RUNNING + 轮询存活,
* 而不是错误地置为 EXIT,也不是盲目再起一个导致端口/资源冲突。
*
* @param {object} chk
* @returns {boolean} true 表示已处理(不应再 spawn);状态由本函数内部设置
*/
_recoverDetachedProcess(chk) {
if (!chk.options || !chk.options.detached) return false
let pidFile = this._pidFilePath(chk.name)
let oldPid
try {
let pidStr = fs.readFileSync(pidFile, {encoding: 'utf8'}).trim()
oldPid = parseInt(pidStr)
if (isNaN(oldPid) || oldPid <= 0) {
fs.unlinkSync(pidFile)
return false
}
} catch (err) {
// PID 文件不存在,无需恢复
return false
}
// 检查旧进程是否仍在运行
let alive = true
let permLimited = false
try {
process.kill(oldPid, 0) // 信号0:仅探测存在性
} catch (err) {
if (err.code === 'ESRCH') {
alive = false
} else if (err.code === 'EPERM') {
permLimited = true // 进程存在,但无权限信号控制
} else {
try { fs.unlinkSync(pidFile) } catch(e) {}
return false
}
}
if (!alive) {
try { fs.unlinkSync(pidFile) } catch(e) {}
return false
}
// [包B] 校验 pid 确实是我们的进程(避免 PID 文件过期后 pid 被无关进程复用)
let cmdbase = chk.command.indexOf('/') >= 0
? chk.command.substring(chk.command.lastIndexOf('/') + 1)
: chk.command
let cmdlineMatch = false
try {
let cmdline = fs.readFileSync(`/proc/${oldPid}/cmdline`, {encoding: 'utf8'})
if (cmdline.indexOf(cmdbase) >= 0) cmdlineMatch = true
} catch (e) {
// 读不到 cmdline:EPERM 场景保守认为是本进程,否则视为 PID 文件过期
cmdlineMatch = permLimited
}
if (!cmdlineMatch) {
this.debug && console.log(`[detached-recover] PID 文件过期 (${chk.name}, pid:${oldPid})`)
try { fs.unlinkSync(pidFile) } catch(e) {}
return false
}
// force:杀掉遗留进程,重新 spawn(用户主动抢回管理权)
if (chk.force) {
this.debug && console.log(
`[detached-recover] force:终止遗留 detached 进程 PID:${oldPid} (${chk.name})`
)
try {
this.killAllChilds(oldPid, 'SIGKILL')
process.kill(oldPid, 'SIGKILL')
} catch (err) {
this.debug && this.errorHandle(err, '--ERR-DETACHED-FORCE-KILL--')
}
try { fs.unlinkSync(pidFile) } catch(e) {}
return false
}
// 非 force:接管遗留进程
this._adoptDetachedProcess(chk, oldPid, permLimited)
return true
}
/**
* [包B] 接管一个遗留的 detached 进程:状态置 RUNNING、记录真实 pid、启动存活轮询。
* 因为不是本实例 spawn 的,拿不到 ChildProcess 句柄,只能轮询 /proc 感知退出。
*/
_adoptDetachedProcess(chk, pid, permLimited=false) {
chk.adoptedPid = pid
chk.state = this.state.RUNNING
chk.lockForStart = false
chk.cause = permLimited
? `ADOPTED|DETACHED-RECOVER|接管遗留 detached 进程(PID:${pid}),无权限信号控制`
: `ADOPTED|DETACHED-RECOVER|接管了上次遗留的 detached 进程(PID:${pid})`
if (chk.monitor && this.monitorChilds.indexOf(chk.name) < 0) {
this.monitorChilds.push(chk.name)
}
this._startAdoptPoll(chk, pid)
this.debug && console.log(`[detached-recover] 已接管 ${chk.name} (PID:${pid})`)
}
/**
* [包B] 轮询被接管进程的存活;退出后触发 _onAdoptedExit 走重启策略。
*/
_startAdoptPoll(chk, pid) {
if (chk._adoptTimer) {
clearInterval(chk._adoptTimer)
chk._adoptTimer = null
}
chk._adoptTimer = setInterval(() => {
// chk 已不再被本实例管理 → 停止轮询
if (this.childs[chk.name] !== chk) {
clearInterval(chk._adoptTimer)
chk._adoptTimer = null
return
}
let alive = true
try {
process.kill(pid, 0)
} catch (err) {
if (err.code === 'ESRCH') alive = false
// EPERM 表示进程仍存在
}
if (!alive) {
clearInterval(chk._adoptTimer)
chk._adoptTimer = null
chk.adoptedPid = 0
this._onAdoptedExit(chk)
}
}, 2000)
if (chk._adoptTimer.unref) chk._adoptTimer.unref()
}
/**
* [包B] 被接管进程退出后的处理:状态转 EXIT,按 restart 策略决定是否重启。
* 注意:被接管进程没有退出码,fail / fail-count 策略一律视为需要重启。
*/
_onAdoptedExit(chk) {
chk.state = this.state.EXIT
chk.cause = 'EXIT|ADOPT-POLL|被接管的 detached 进程已退出'
this.writeChildState(chk)
this._removePidFile(chk.name)
if (chk.restart === 'none' || chk.restart === 'remove') return
if (chk.restart === 'count' || chk.restart === 'fail-count') {
if (chk.restartCount >= chk.restartLimit) {
chk.autoRemove && this.remove(chk.name)
return
}
}
;(chk.restartCount < this.maxCount) && (chk.restartCount += 1)
if (chk.restartDelay <= 0) {
this.startChild(chk)
} else {
setTimeout(() => { this.startChild(chk) }, chk.restartDelay)
}
}
/**
*
* @param {object} cfg
* @param {boolean} reload
* @returns {object}
*/
tryMakeChild(cfg, reload=false) {
if (!cfg.name) cfg.name = this.makeName(cfg)
let ck = `${cfg.command}\x00${cfg.args.join('\x00')}`
// [FIX] 在所有分支